/*
 * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/opcodes.hpp"
#include "opto/output.hpp"
#include "opto/subnode.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// C2 compiled method's prolog code.
void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {

  // WARNING: Initial instruction MUST be 5 bytes or longer so that
  // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes,
  // the frame allocation can be either 3 or 6 bytes. So if we don't do
  // stack bang then we must use the 6 byte frame allocation even if
  // we have no frame. :-(
  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them.  But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack.  But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp so that on return to the interpreter rbp will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // Create frame (force generation of a 4 byte immediate value)
    subptr_imm32(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
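    // After the addptr below, rbp points at the saved-rbp slot (rsp + framesize),
    // mirroring the frame layout that 'push rbp; mov rbp, rsp' produces on the
    // stack-bang path above.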
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifndef _LP64
  // If method sets FPU control word do it now
  if (fp_mode_24b) {
    fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
  }
  if (UseSSE >= 2 && VerifyFPU) {
    verify_FPU(0, "FPU stack must be clean on entry");
  }
#endif

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif

  if (!is_stub) {
    BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
#ifdef _LP64
    if (BarrierSet::barrier_set()->barrier_set_nmethod() != NULL) {
      // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
      Label dummy_slow_path;
      Label dummy_continuation;
      Label* slow_path = &dummy_slow_path;
      Label* continuation = &dummy_continuation;
      if (!Compile::current()->output()->in_scratch_emit_size()) {
        // Use real labels from actual stub when not emitting code for the purpose of measuring its size
        C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
        Compile::current()->output()->add_stub(stub);
        slow_path = &stub->entry();
        continuation = &stub->continuation();
      }
      bs->nmethod_entry_barrier(this, slow_path, continuation);
    }
#else
    // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
    bs->nmethod_entry_barrier(this, NULL /* slow_path */, NULL /* continuation */);
#endif
  }
}

inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

#if INCLUDE_RTM_OPT

// Update rtm_counters based on abort status
// input: abort_status
//        rtm_counters (RTMLockingCounters*)
// flags are killed
void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {

  atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  if (PrintPreciseRTMLockingStatistics) {
    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
      Label check_abort;
      testl(abort_status, (1<<i));
      jccb(Assembler::equal, check_abort);
      atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
      bind(check_abort);
    }
  }
}

// Branch if (random & (count-1) != 0), count is 2^n
// tmp, scr and flags are killed
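// For example, the call sites below pass count == RTMTotalCountIncrRate, so with
// a rate of 64 (the default) the branch is taken about 63 times out of 64 and
// the counter update that follows the call runs on roughly 1/64 of acquisitions.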
void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  assert(tmp == rax, "");
  assert(scr == rdx, "");
  rdtsc(); // modifies EDX:EAX
  andptr(tmp, count-1);
  jccb(Assembler::notZero, brLabel);
}

// Perform abort ratio calculation, set no_rtm bit if high ratio
// input:  rtm_counters_Reg (RTMLockingCounters* address)
// tmpReg, rtm_counters_Reg and flags are killed
void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
                                                    Register rtm_counters_Reg,
                                                    RTMLockingCounters* rtm_counters,
                                                    Metadata* method_data) {
  Label L_done, L_check_always_rtm1, L_check_always_rtm2;

  if (RTMLockingCalculationDelay > 0) {
    // Delay calculation
    movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()));
    testptr(tmpReg, tmpReg);
    jccb(Assembler::equal, L_done);
  }
  // Abort ratio calculation only if abort_count > RTMAbortThreshold
  //   Aborted transactions = abort_count * 100
  //   All transactions = total_count *  RTMTotalCountIncrRate
  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
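  //   E.g. with the default RTMAbortRatio == 50 and RTMTotalCountIncrRate == 64,
  //   no_rtm is set once abort_count * 100 >= total_count * 64 * 50, i.e. once
  //   estimated aborts reach half of the estimated total transactions.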

  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
  cmpptr(tmpReg, RTMAbortThreshold);
  jccb(Assembler::below, L_check_always_rtm2);
  imulptr(tmpReg, tmpReg, 100);

  Register scrReg = rtm_counters_Reg;
  movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
  imulptr(scrReg, scrReg, RTMAbortRatio);
  cmpptr(tmpReg, scrReg);
  jccb(Assembler::below, L_check_always_rtm1);
  if (method_data != NULL) {
    // set rtm_state to "no rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
  }
  jmpb(L_done);
  bind(L_check_always_rtm1);
  // Reload RTMLockingCounters* address
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  bind(L_check_always_rtm2);
  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
  jccb(Assembler::below, L_done);
  if (method_data != NULL) {
    // set rtm_state to "always rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
  }
  bind(L_done);
}

// Update counters and perform abort ratio calculation
// input:  abort_status_Reg
// rtm_counters_Reg, flags are killed
void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
                                      Register rtm_counters_Reg,
                                      RTMLockingCounters* rtm_counters,
                                      Metadata* method_data,
                                      bool profile_rtm) {

  assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
  // update rtm counters based on rax value at abort
  // reads abort_status_Reg, updates flags
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
  if (profile_rtm) {
    // Save abort status because abort_status_Reg is used by following code.
    if (RTMRetryCount > 0) {
      push(abort_status_Reg);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
    // restore abort status
    if (RTMRetryCount > 0) {
      pop(abort_status_Reg);
    }
  }
}

// Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
// inputs: retry_count_Reg
//       : abort_status_Reg
// output: retry_count_Reg decremented by 1
// flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
  Label doneRetry;
  assert(abort_status_Reg == rax, "");
  // The abort reason bits are in eax (see all states in rtmLocking.hpp)
  // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
  // if reason is in 0x6 and retry count != 0 then retry
  andptr(abort_status_Reg, 0x6);
  jccb(Assembler::zero, doneRetry);
  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  pause();
  decrementl(retry_count_Reg);
  jmp(retryLabel);
  bind(doneRetry);
}

// Spin and retry if lock is busy,
// inputs: box_Reg (monitor address)
//       : retry_count_Reg
// output: retry_count_Reg decremented by 1
//       : clear z flag if retry count exceeded
// tmp_Reg, scr_Reg, flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
                                               Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
  Label SpinLoop, SpinExit, doneRetry;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  decrementl(retry_count_Reg);
  movptr(scr_Reg, RTMSpinLoopCount);

  bind(SpinLoop);
  pause();
  decrementl(scr_Reg);
  jccb(Assembler::lessEqual, SpinExit);
  movptr(tmp_Reg, Address(box_Reg, owner_offset));
  testptr(tmp_Reg, tmp_Reg);
  jccb(Assembler::notZero, SpinLoop);

  bind(SpinExit);
  jmp(retryLabel);
  bind(doneRetry);
  incrementl(retry_count_Reg); // clear z flag
}

// Use RTM for normal stack locks
// Input: objReg (object to lock)
void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
                                          Register retry_on_abort_count_Reg,
                                          RTMLockingCounters* stack_rtm_counters,
                                          Metadata* method_data, bool profile_rtm,
                                          Label& DONE_LABEL, Label& IsInflated) {
  assert(UseRTMForStackLocks, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;

  if (RTMRetryCount > 0) {
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
    atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
  andptr(tmpReg, markWord::lock_mask_in_place);     // look at 2 lock bits
  cmpptr(tmpReg, markWord::unlocked_value);         // bits = 01 unlocked
  jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked

  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (UseRTMXendForLockBusy) {
    xend();
    movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
  }
  bind(L_decrement_retry);
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }
}

// Use RTM for inflating locks
// inputs: objReg (object to lock)
//         boxReg (on-stack box address (displaced header location) - KILLED)
//         tmpReg (ObjectMonitor address + markWord::monitor_value)
void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
                                             Register scrReg, Register retry_on_busy_count_Reg,
                                             Register retry_on_abort_count_Reg,
                                             RTMLockingCounters* rtm_counters,
                                             Metadata* method_data, bool profile_rtm,
                                             Label& DONE_LABEL) {
  assert(UseRTMLocking, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
  movptr(boxReg, tmpReg); // Save ObjectMonitor address

  if (RTMRetryCount > 0) {
    movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  movptr(tmpReg, Address(tmpReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jcc(Assembler::zero, DONE_LABEL);
  if (UseRTMXendForLockBusy) {
    xend();
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
  }
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }

  movptr(tmpReg, Address(boxReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jccb(Assembler::notZero, L_decrement_retry);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
#ifdef _LP64
  Register threadReg = r15_thread;
#else
  get_thread(scrReg);
  Register threadReg = scrReg;
#endif
  lock();
  cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg

  if (RTMRetryCount > 0) {
    // success done else retry
    jccb(Assembler::equal, DONE_LABEL);
    bind(L_decrement_retry);
    // Spin and retry if lock is busy.
    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
  }
  else {
    bind(L_decrement_retry);
  }
}

#endif //  INCLUDE_RTM_OPT

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time.  These methods would accept arguments as
// (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods.  That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native would be
// to park() or unpark() threads.  We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) issue explicit barriers or fence operations.
//
// TODO:
//
// *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
//    the lock operators would typically be faster than reifying Self.
//
// *  Ideally I'd define the primitives as:
//       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
//    Furthermore the register assignments are overconstrained, possibly resulting in
//    sub-optimal code near the synchronization site.
//
// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
//    Alternately, use a better sp-proximity test.
//
// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//    Either one is sufficient to uniquely identify a thread.
//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty.
//    Avoid the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
//
// *  use jccb and jmpb instead of jcc and jmp to improve code density.
//    But beware of excessive branch density on AMD Opterons.
//
// *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//    or failure of the fast path.  If the fast path fails then we pass
//    control to the slow path, typically in C.  In fast_lock and
//    fast_unlock we often branch to DONE_LABEL, just to find that C2
//    will emit a conditional branch immediately after the node.
//    So we have branches to branches and lots of ICC.ZF games.
//    Instead, it might be better to have C2 pass a "FailureLabel"
//    into fast_lock and fast_unlock.  In the case of success, control
//    will drop through the node.  ICC.ZF is undefined at exit.
//    In the case of failure, the node will branch directly to the
//    FailureLabel


// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax,: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg,
                                  RTMLockingCounters* rtm_counters,
                                  RTMLockingCounters* stack_rtm_counters,
                                  Metadata* method_data,
                                  bool use_rtm, bool profile_rtm) {
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");

  if (use_rtm) {
    assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
  } else {
    assert(cx1Reg == noreg, "");
    assert(cx2Reg == noreg, "");
    assert_different_registers(objReg, boxReg, tmpReg, scrReg);
  }

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmpReg, objReg, scrReg);
    movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
    testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
    jcc(Assembler::notZero, DONE_LABEL);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(!UseHeavyMonitors, "+UseHeavyMonitors and +UseRTMForStackLocks are mutually exclusive");
    rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
                      stack_rtm_counters, method_data, profile_rtm,
                      DONE_LABEL, IsInflated);
  }
#endif // INCLUDE_RTM_OPT

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
  jccb(Assembler::notZero, IsInflated);

  if (!UseHeavyMonitors) {
    // Attempt stack-locking ...
    orptr (tmpReg, markWord::unlocked_value);
    movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
    lock();
    cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
    jcc(Assembler::equal, COUNT);           // Success

    // Recursive locking.
    // The object is stack-locked: markword contains stack pointer to BasicLock.
    // Locked by current thread if difference with current SP is less than one page.
    subptr(tmpReg, rsp);
    // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
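    // In effect: with a 4K page the 64-bit mask is 7 - 4096 == 0x...FFFFF007, so
    // the AND result is zero only if the difference is 8-byte aligned and below
    // one page; the 32-bit mask 0xFFFFF003 checks 4-byte alignment the same way.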
    andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
    movptr(Address(boxReg, 0), tmpReg);
  } else {
    // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
    testptr(objReg, objReg);
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#if INCLUDE_RTM_OPT
  // Use the same RTM locking code in 32- and 64-bit VM.
  if (use_rtm) {
    rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
                         rtm_counters, method_data, profile_rtm, DONE_LABEL);
  } else {
#endif // INCLUDE_RTM_OPT

#ifndef _LP64
  // The object is inflated.

  // boxReg refers to the on-stack BasicLock in the current frame.
  // We'd like to write:
  //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
  // This is convenient but results in an ST-before-CAS penalty.  The following CAS suffers
  // additional latency as we have another ST in the store buffer that must drain.

  // avoid ST-before-CAS
  // register juggle because we need tmpReg for cmpxchgptr below
  movptr(scrReg, boxReg);
  movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]

  // Optimistic form: consider XORL tmpReg,tmpReg
  movptr(tmpReg, NULL_WORD);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Ideally, I'd manifest "Self" with get_thread and then attempt
  // to CAS the register containing Self into m->Owner.
  // But we don't have enough registers, so instead we can either try to CAS
  // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
  // we later store "Self" into m->Owner.  Transiently storing a stack address
  // (rsp or the address of the box) into  m->owner is harmless.
  // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
  lock();
  cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
  // If we weren't able to swing _owner from NULL to the BasicLock
  // then take the slow path.
  jccb  (Assembler::notZero, NO_COUNT);
  // update _owner from BasicLock to thread
  get_thread (scrReg);                    // beware: clobbers ICCs
  movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
  xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success

  // If the CAS fails we can either retry or pass control to the slow path.
  // We use the latter tactic.
  // Pass the CAS result in the icc.ZFlag into DONE_LABEL
  // If the CAS was successful ...
  //   Self has acquired the lock
  //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  // Intentional fall-through into DONE_LABEL ...
#else // _LP64
  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
  // Propagate ICC.ZF from CAS above into DONE_LABEL.
  jccb(Assembler::equal, COUNT);          // CAS above succeeded; propagate ZF = 1 (success)

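  // On CAS failure cmpxchg leaves the current owner in rax (== tmpReg), so we
  // can compare it against r15_thread to detect a recursive acquisition.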
  cmpptr(r15_thread, rax);                // Check if we are already the owner (recursive lock)
  jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
#endif // _LP64
#if INCLUDE_RTM_OPT
  } // use_rtm()
#endif
  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0

  bind(COUNT);
  // Count monitors in fast path
#ifndef _LP64
  get_thread(tmpReg);
  incrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
#else // _LP64
  incrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
#endif

  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);

  // At NO_COUNT the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed.  Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame).  Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by
//      the frame, the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa.  The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner == Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.

void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, COUNT, NO_COUNT;

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(!UseHeavyMonitors, "+UseHeavyMonitors and +UseRTMForStackLocks are mutually exclusive");
    Label L_regular_unlock;
    movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
    andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
    cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
    jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
    xend();                                                           // otherwise end...
    jmp(DONE_LABEL);                                                  // ... and we're done
    bind(L_regular_unlock);
  }
#endif

  if (!UseHeavyMonitors) {
    cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
    jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
  if (!UseHeavyMonitors) {
    testptr(tmpReg, markWord::monitor_value);                         // Inflated?
    jccb   (Assembler::zero, Stacked);
  }

  // It's inflated.
#if INCLUDE_RTM_OPT
  if (use_rtm) {
    Label L_regular_inflated_unlock;
    int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
    movptr(boxReg, Address(tmpReg, owner_offset));
    testptr(boxReg, boxReg);
    jccb(Assembler::notZero, L_regular_inflated_unlock);
    xend();
    jmpb(DONE_LABEL);
    bind(L_regular_inflated_unlock);
  }
#endif

  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  // Refer to the comments in synchronizer.cpp for how we might encode extra
  // state in _succ so we can avoid fetching EntryList|cxq.
  //
  // If there's no contention try a 1-0 exit.  That is, exit without
  // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock.  Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
// IA32's memory-model is TSO, so STs are ordered with respect to
// each other and there's no need for an explicit barrier (fence).
// See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
#ifndef _LP64
  // Note that we could employ various encoding schemes to reduce
  // the number of loads below (currently 4) to just 2 or 3.
  // Refer to the comments in synchronizer.cpp.
  // In practice the chain of fetches doesn't seem to impact performance, however.
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);
#else // _LP64
  // It's inflated
  Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath;

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
  jccb(Assembler::equal, LNotRecursive);

  // Recursive inflated unlock
  decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jmpb(LSuccess);

  bind(LNotRecursive);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  jccb  (Assembler::notZero, CheckSucc);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);

  // Try to avoid passing control into the slow_path ...
  bind  (CheckSucc);

  // The following optional optimization can be elided if necessary
  // Effectively: if (succ == null) goto slow path
  // The code reduces the window for a race, however,
  // and thus benefits performance.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
  jccb  (Assembler::zero, LGoSlowPath);

  xorptr(boxReg, boxReg);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);

  // Memory barrier/fence
  // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
  // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
  // This is faster on Nehalem and AMD Shanghai/Barcelona.
  // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
  // We might also restructure (ST Owner=0;barrier;LD _Succ) to
  // (mov box,0; xchgq box, &m->Owner; LD _succ) .
  lock(); addl(Address(rsp, 0), 0);

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
  jccb  (Assembler::notZero, LSuccess);

  // Rare inopportune interleaving - race.
  // The successor vanished in the small window above.
  // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
  // We need to ensure progress and succession.
  // Try to reacquire the lock.
  // If that fails then the new owner is responsible for succession and this
  // thread needs to take no further action and can exit via the fast path (success).
  // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because it generates more
  // coherence traffic on the lock *and* artificially extends the critical section
  // length by passing control into the slow path.

  // box is really RAX -- the following CMPXCHG depends on that binding
  // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
  lock();
  cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // There's no successor so we tried to regrab the lock.
  // If that didn't work, then another thread grabbed the
  // lock so we're done (and exit was a success).
  jccb  (Assembler::notEqual, LSuccess);
  // Intentional fall-through into slow path

  bind  (LGoSlowPath);
  orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
  jmpb  (DONE_LABEL);

  bind  (LSuccess);
  testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
  jmpb  (DONE_LABEL);

#endif
  if (!UseHeavyMonitors) {
    bind  (Stacked);
    movptr(tmpReg, Address(boxReg, 0));      // re-fetch
    lock();
    cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
    // Intentional fall-thru into DONE_LABEL
  }
  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT);

  bind(COUNT);
  // Count monitors in fast path
#ifndef _LP64
  get_thread(tmpReg);
  decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
#else // _LP64
  decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
#endif

  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);
}

//-------------------------------------------------------------------------------------------
// Generic instructions support for use in .ad files C2 code generation

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");

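  // There is no packed 64-bit min/max instruction before AVX-512, so the T_LONG
  // cases below emulate it: pcmpgtq (SSE4.2) builds a per-lane mask in xmm0, and
  // blendvpd (SSE4.1, implicit xmm0 mask) selects the smaller/larger lane from src.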
  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src);  // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src);  // xmm0 as mask
    }
  }
}

void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
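      // vpminsq/vpmaxsq are AVX-512 instructions; their 128/256-bit forms also
      // require AVX512VL, hence the guard below. Otherwise fall back to the
      // compare-and-blend emulation.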
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

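  // The blends below order the inputs on the sign bit so that min(-0.0, +0.0)
  // yields -0.0, then vminps/vmaxps compute the result, and a final UNORD
  // compare re-injects NaN from the first operand, matching Java's Math.min/max
  // semantics (NaN propagates, and -0.0 orders below +0.0).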
  if (!is_double_word && is_min) {
    vblendvps(atmp, a, b, a, vlen_enc);
    vblendvps(btmp, b, a, a, vlen_enc);
    vminps(tmp, atmp, btmp, vlen_enc);
    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
  } else if (!is_double_word && !is_min) {
    vblendvps(btmp, b, a, b, vlen_enc);
    vblendvps(atmp, a, b, b, vlen_enc);
    vmaxps(tmp, atmp, btmp, vlen_enc);
    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
  } else if (is_double_word && is_min) {
    vblendvpd(atmp, a, b, a, vlen_enc);
    vblendvpd(btmp, b, a, a, vlen_enc);
    vminpd(tmp, atmp, btmp, vlen_enc);
    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    vblendvpd(btmp, b, a, b, vlen_enc);
    vblendvpd(atmp, a, b, b, vlen_enc);
    vmaxpd(tmp, atmp, btmp, vlen_enc);
    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
  }
}

void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

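  // Same scheme as vminmax_fp above, but AVX-512 lets us extract the sign bits
  // directly into a mask register (evpmovd2m/evpmovq2m) and blend under that
  // mask, so no extra xmm scratch register is needed for the sign-select step.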
  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}

// Float/Double signum
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

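  // Note that movflt/movdbl do not modify flags, so the single ucomiss/ucomisd
  // below routes every case: equal -> return the argument (+0.0/-0.0), parity ->
  // return the argument (NaN), above -> return 1.0, otherwise flip the sign of
  // 1.0 to produce -1.0.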
  if (opcode == Op_SignumF) {
    assert(UseSSE > 0, "required");
    ucomiss(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  } else if (opcode == Op_SignumD) {
    assert(UseSSE > 1, "required");
    ucomisd(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }

  bind(DONE_LABEL);
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
  if (sign) {
    pmovsxbw(dst, src);
  } else {
    pmovzxbw(dst, src);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbw(dst, src, vector_len);
  } else {
    vpmovzxbw(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbd(dst, src, vector_len);
  } else {
    vpmovzxbd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxwd(dst, src, vector_len);
  } else {
    vpmovzxwd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     int shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprold(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprord(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     XMMRegister shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprolvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolvq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprorvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorvq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVI) {
    psrad(dst, shift);
  } else if (opcode == Op_LShiftVI) {
    pslld(dst, shift);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    psrld(dst, shift);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVI:  psrad(dst, shift); break;
    case Op_LShiftVI:  pslld(dst, shift); break;
    case Op_URShiftVI: psrld(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVI) {
    vpsrad(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVI) {
    vpslld(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    vpsrld(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
    case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
    case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  psraw(dst, shift); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  psllw(dst, shift);   break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: psrlw(dst, shift);  break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1321     case Op_LShiftVL:  psllq(dst, shift); break;
1322     case Op_URShiftVL: psrlq(dst, shift); break;
1323 
1324     default: assert(false, "%s", NodeClassNames[opcode]);
1325   }
1326 }
1327 
1328 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1329   if (opcode == Op_RShiftVL) {
    psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1331   } else if (opcode == Op_LShiftVL) {
1332     psllq(dst, shift);
1333   } else {
1334     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1335     psrlq(dst, shift);
1336   }
1337 }
1338 
1339 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1340   switch (opcode) {
1341     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1342     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1343     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1344 
1345     default: assert(false, "%s", NodeClassNames[opcode]);
1346   }
1347 }
1348 
1349 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1350   if (opcode == Op_RShiftVL) {
1351     evpsraq(dst, nds, shift, vector_len);
1352   } else if (opcode == Op_LShiftVL) {
1353     vpsllq(dst, nds, shift, vector_len);
1354   } else {
1355     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1356     vpsrlq(dst, nds, shift, vector_len);
1357   }
1358 }
1359 
1360 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1361   switch (opcode) {
1362     case Op_RShiftVB:  // fall-through
1363     case Op_RShiftVS:  // fall-through
1364     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1365 
1366     case Op_LShiftVB:  // fall-through
1367     case Op_LShiftVS:  // fall-through
1368     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1369 
1370     case Op_URShiftVB: // fall-through
1371     case Op_URShiftVS: // fall-through
1372     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1373 
1374     default: assert(false, "%s", NodeClassNames[opcode]);
1375   }
1376 }
1377 
1378 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1379   switch (opcode) {
1380     case Op_RShiftVB:  // fall-through
1381     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1382 
1383     case Op_LShiftVB:  // fall-through
1384     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1385 
1386     case Op_URShiftVB: // fall-through
1387     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1388 
1389     default: assert(false, "%s", NodeClassNames[opcode]);
1390   }
1391 }
1392 
1393 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1394   assert(UseAVX >= 2, "required");
1395   switch (opcode) {
1396     case Op_RShiftVL: {
1397       if (UseAVX > 2) {
1398         assert(tmp == xnoreg, "not used");
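        // evpsravq at 128/256-bit width requires AVX512VL; without it, encode
        // at 512-bit width (the upper lanes are don't-care here).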
1399         if (!VM_Version::supports_avx512vl()) {
1400           vlen_enc = Assembler::AVX_512bit;
1401         }
1402         evpsravq(dst, src, shift, vlen_enc);
1403       } else {
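        // AVX2 lacks a 64-bit arithmetic right shift, so emulate it with
        // logical shifts: with m = 0x8000000000000000 >>> s, the identity
        // (x >>> s ^ m) - m == x >> s sign-extends the shifted value.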
1404         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1405         vpsrlvq(dst, src, shift, vlen_enc);
1406         vpsrlvq(tmp, tmp, shift, vlen_enc);
1407         vpxor(dst, dst, tmp, vlen_enc);
1408         vpsubq(dst, dst, tmp, vlen_enc);
1409       }
1410       break;
1411     }
1412     case Op_LShiftVL: {
1413       assert(tmp == xnoreg, "not used");
1414       vpsllvq(dst, src, shift, vlen_enc);
1415       break;
1416     }
1417     case Op_URShiftVL: {
1418       assert(tmp == xnoreg, "not used");
1419       vpsrlvq(dst, src, shift, vlen_enc);
1420       break;
1421     }
1422     default: assert(false, "%s", NodeClassNames[opcode]);
1423   }
1424 }
1425 
// Variable shift src by shift using vtmp as a TEMP, giving word result in dst
1427 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1428   assert(opcode == Op_LShiftVB ||
1429          opcode == Op_RShiftVB ||
1430          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1431   bool sign = (opcode != Op_URShiftVB);
1432   assert(vector_len == 0, "required");
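  // x86 has no SIMD byte shifts: widen the bytes to dwords, shift at dword
  // granularity, mask back into byte range, and pack the result down to words.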
1433   vextendbd(sign, dst, src, 1);
1434   vpmovzxbd(vtmp, shift, 1);
1435   varshiftd(opcode, dst, dst, vtmp, 1);
1436   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1437   vextracti128_high(vtmp, dst);
1438   vpackusdw(dst, dst, vtmp, 0);
1439 }
1440 
// Variable shift src by shift using vtmp as a TEMP, giving byte result in dst
1442 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1443   assert(opcode == Op_LShiftVB ||
1444          opcode == Op_RShiftVB ||
1445          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1446   bool sign = (opcode != Op_URShiftVB);
1447   int ext_vector_len = vector_len + 1;
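  // Same widen/shift/mask/pack scheme at word granularity; vpackuswb packs
  // within each 128-bit lane, so the 256-bit case needs a vpermq fix-up.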
1448   vextendbw(sign, dst, src, ext_vector_len);
1449   vpmovzxbw(vtmp, shift, ext_vector_len);
1450   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1451   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1452   if (vector_len == 0) {
1453     vextracti128_high(vtmp, dst);
1454     vpackuswb(dst, dst, vtmp, vector_len);
1455   } else {
1456     vextracti64x4_high(vtmp, dst);
1457     vpackuswb(dst, dst, vtmp, vector_len);
1458     vpermq(dst, dst, 0xD8, vector_len);
1459   }
1460 }
1461 
1462 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1463   switch(typ) {
1464     case T_BYTE:
1465       pinsrb(dst, val, idx);
1466       break;
1467     case T_SHORT:
1468       pinsrw(dst, val, idx);
1469       break;
1470     case T_INT:
1471       pinsrd(dst, val, idx);
1472       break;
1473     case T_LONG:
1474       pinsrq(dst, val, idx);
1475       break;
1476     default:
1477       assert(false,"Should not reach here.");
1478       break;
1479   }
1480 }
1481 
1482 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1483   switch(typ) {
1484     case T_BYTE:
1485       vpinsrb(dst, src, val, idx);
1486       break;
1487     case T_SHORT:
1488       vpinsrw(dst, src, val, idx);
1489       break;
1490     case T_INT:
1491       vpinsrd(dst, src, val, idx);
1492       break;
1493     case T_LONG:
1494       vpinsrq(dst, src, val, idx);
1495       break;
1496     default:
1497       assert(false,"Should not reach here.");
1498       break;
1499   }
1500 }
1501 
1502 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1503   switch(typ) {
1504     case T_INT:
1505       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1506       break;
1507     case T_FLOAT:
1508       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1509       break;
1510     case T_LONG:
1511       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1512       break;
1513     case T_DOUBLE:
1514       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1515       break;
1516     default:
1517       assert(false,"Should not reach here.");
1518       break;
1519   }
1520 }
1521 
1522 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1523   switch(typ) {
1524     case T_INT:
1525       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1526       break;
1527     case T_FLOAT:
1528       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1529       break;
1530     case T_LONG:
1531       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1532       break;
1533     case T_DOUBLE:
1534       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1535       break;
1536     default:
1537       assert(false,"Should not reach here.");
1538       break;
1539   }
1540 }
1541 
1542 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1543   switch(typ) {
1544     case T_INT:
1545       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1546       break;
1547     case T_FLOAT:
1548       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1549       break;
1550     case T_LONG:
1551       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1552       break;
1553     case T_DOUBLE:
1554       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1555       break;
1556     default:
1557       assert(false,"Should not reach here.");
1558       break;
1559   }
1560 }
1561 
1562 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1563   if (vlen_in_bytes <= 16) {
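    // Boolean lanes hold 0 or 1; 0 - x maps them to 0 or -1 (all bits set),
    // and the sign-extending widenings below spread that to the element size.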
1564     pxor (dst, dst);
1565     psubb(dst, src);
1566     switch (elem_bt) {
1567       case T_BYTE:   /* nothing to do */ break;
1568       case T_SHORT:  pmovsxbw(dst, dst); break;
1569       case T_INT:    pmovsxbd(dst, dst); break;
1570       case T_FLOAT:  pmovsxbd(dst, dst); break;
1571       case T_LONG:   pmovsxbq(dst, dst); break;
1572       case T_DOUBLE: pmovsxbq(dst, dst); break;
1573 
1574       default: assert(false, "%s", type2name(elem_bt));
1575     }
1576   } else {
1577     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1578     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1579 
1580     vpxor (dst, dst, dst, vlen_enc);
1581     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1582 
1583     switch (elem_bt) {
1584       case T_BYTE:   /* nothing to do */            break;
1585       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1586       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1587       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1588       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1589       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1590 
1591       default: assert(false, "%s", type2name(elem_bt));
1592     }
1593   }
1594 }
1595 
1596 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1597   if (novlbwdq) {
1598     vpmovsxbd(xtmp, src, vlen_enc);
1599     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1600             Assembler::eq, true, vlen_enc, noreg);
1601   } else {
1602     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1603     vpsubb(xtmp, xtmp, src, vlen_enc);
1604     evpmovb2m(dst, xtmp, vlen_enc);
1605   }
1606 }
1607 
1608 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) {
1609   switch (vlen_in_bytes) {
1610     case 4:  movdl(dst, src);   break;
1611     case 8:  movq(dst, src);    break;
1612     case 16: movdqu(dst, src);  break;
1613     case 32: vmovdqu(dst, src); break;
1614     case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1615     default: ShouldNotReachHere();
1616   }
1617 }
1618 
1619 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1620   assert(rscratch != noreg || always_reachable(src), "missing");
1621 
1622   if (reachable(src)) {
1623     load_vector(dst, as_Address(src), vlen_in_bytes);
1624   } else {
1625     lea(rscratch, src);
1626     load_vector(dst, Address(rscratch, 0), vlen_in_bytes);
1627   }
1628 }
1629 
1630 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1631   int vlen_enc = vector_length_encoding(vlen);
1632   if (VM_Version::supports_avx()) {
1633     if (bt == T_LONG) {
1634       if (VM_Version::supports_avx2()) {
1635         vpbroadcastq(dst, src, vlen_enc);
1636       } else {
1637         vmovddup(dst, src, vlen_enc);
1638       }
1639     } else if (bt == T_DOUBLE) {
1640       if (vlen_enc != Assembler::AVX_128bit) {
1641         vbroadcastsd(dst, src, vlen_enc, noreg);
1642       } else {
1643         vmovddup(dst, src, vlen_enc);
1644       }
1645     } else {
1646       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1647         vpbroadcastd(dst, src, vlen_enc);
1648       } else {
1649         vbroadcastss(dst, src, vlen_enc);
1650       }
1651     }
1652   } else if (VM_Version::supports_sse3()) {
1653     movddup(dst, src);
1654   } else {
1655     movq(dst, src);
1656     if (vlen == 16) {
1657       punpcklqdq(dst, dst);
1658     }
1659   }
1660 }
1661 
1662 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1663   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
1664   int offset = exact_log2(type2aelembytes(bt)) << 6;
1665   if (is_floating_point_type(bt)) {
1666     offset += 128;
1667   }
1668   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1669   load_vector(dst, addr, vlen_in_bytes);
1670 }
1671 
1672 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1673 
1674 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1675   int vector_len = Assembler::AVX_128bit;
1676 
1677   switch (opcode) {
1678     case Op_AndReductionV:  pand(dst, src); break;
1679     case Op_OrReductionV:   por (dst, src); break;
1680     case Op_XorReductionV:  pxor(dst, src); break;
1681     case Op_MinReductionV:
1682       switch (typ) {
1683         case T_BYTE:        pminsb(dst, src); break;
1684         case T_SHORT:       pminsw(dst, src); break;
1685         case T_INT:         pminsd(dst, src); break;
1686         case T_LONG:        assert(UseAVX > 2, "required");
1687                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1688         default:            assert(false, "wrong type");
1689       }
1690       break;
1691     case Op_MaxReductionV:
1692       switch (typ) {
1693         case T_BYTE:        pmaxsb(dst, src); break;
1694         case T_SHORT:       pmaxsw(dst, src); break;
1695         case T_INT:         pmaxsd(dst, src); break;
1696         case T_LONG:        assert(UseAVX > 2, "required");
1697                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1698         default:            assert(false, "wrong type");
1699       }
1700       break;
1701     case Op_AddReductionVF: addss(dst, src); break;
1702     case Op_AddReductionVD: addsd(dst, src); break;
1703     case Op_AddReductionVI:
1704       switch (typ) {
1705         case T_BYTE:        paddb(dst, src); break;
1706         case T_SHORT:       paddw(dst, src); break;
1707         case T_INT:         paddd(dst, src); break;
1708         default:            assert(false, "wrong type");
1709       }
1710       break;
1711     case Op_AddReductionVL: paddq(dst, src); break;
1712     case Op_MulReductionVF: mulss(dst, src); break;
1713     case Op_MulReductionVD: mulsd(dst, src); break;
1714     case Op_MulReductionVI:
1715       switch (typ) {
1716         case T_SHORT:       pmullw(dst, src); break;
1717         case T_INT:         pmulld(dst, src); break;
1718         default:            assert(false, "wrong type");
1719       }
1720       break;
1721     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1722                             evpmullq(dst, dst, src, vector_len); break;
1723     default:                assert(false, "wrong opcode");
1724   }
1725 }
1726 
1727 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1728   int vector_len = Assembler::AVX_256bit;
1729 
1730   switch (opcode) {
1731     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1732     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1733     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1734     case Op_MinReductionV:
1735       switch (typ) {
1736         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1737         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1738         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1739         case T_LONG:        assert(UseAVX > 2, "required");
1740                             vpminsq(dst, src1, src2, vector_len); break;
1741         default:            assert(false, "wrong type");
1742       }
1743       break;
1744     case Op_MaxReductionV:
1745       switch (typ) {
1746         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1747         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1748         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1749         case T_LONG:        assert(UseAVX > 2, "required");
1750                             vpmaxsq(dst, src1, src2, vector_len); break;
1751         default:            assert(false, "wrong type");
1752       }
1753       break;
1754     case Op_AddReductionVI:
1755       switch (typ) {
1756         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1757         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1758         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1759         default:            assert(false, "wrong type");
1760       }
1761       break;
1762     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1763     case Op_MulReductionVI:
1764       switch (typ) {
1765         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1766         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1767         default:            assert(false, "wrong type");
1768       }
1769       break;
1770     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
1771     default:                assert(false, "wrong opcode");
1772   }
1773 }
1774 
1775 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1776                                   XMMRegister dst, XMMRegister src,
1777                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1778   switch (opcode) {
1779     case Op_AddReductionVF:
1780     case Op_MulReductionVF:
1781       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1782       break;
1783 
1784     case Op_AddReductionVD:
1785     case Op_MulReductionVD:
1786       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1787       break;
1788 
1789     default: assert(false, "wrong opcode");
1790   }
1791 }
1792 
1793 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1794                              Register dst, Register src1, XMMRegister src2,
1795                              XMMRegister vtmp1, XMMRegister vtmp2) {
1796   switch (vlen) {
1797     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1798     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1799     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1800     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1801 
1802     default: assert(false, "wrong vector length");
1803   }
1804 }
1805 
1806 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1807                              Register dst, Register src1, XMMRegister src2,
1808                              XMMRegister vtmp1, XMMRegister vtmp2) {
1809   switch (vlen) {
1810     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1811     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1812     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1813     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1814 
1815     default: assert(false, "wrong vector length");
1816   }
1817 }
1818 
1819 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1820                              Register dst, Register src1, XMMRegister src2,
1821                              XMMRegister vtmp1, XMMRegister vtmp2) {
1822   switch (vlen) {
1823     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1824     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1825     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1826     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1827 
1828     default: assert(false, "wrong vector length");
1829   }
1830 }
1831 
1832 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1833                              Register dst, Register src1, XMMRegister src2,
1834                              XMMRegister vtmp1, XMMRegister vtmp2) {
1835   switch (vlen) {
1836     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1837     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1838     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1839     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1840 
1841     default: assert(false, "wrong vector length");
1842   }
1843 }
1844 
1845 #ifdef _LP64
1846 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1847                              Register dst, Register src1, XMMRegister src2,
1848                              XMMRegister vtmp1, XMMRegister vtmp2) {
1849   switch (vlen) {
1850     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1851     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1852     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1853 
1854     default: assert(false, "wrong vector length");
1855   }
1856 }
1857 #endif // _LP64
1858 
1859 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1860   switch (vlen) {
1861     case 2:
1862       assert(vtmp2 == xnoreg, "");
1863       reduce2F(opcode, dst, src, vtmp1);
1864       break;
1865     case 4:
1866       assert(vtmp2 == xnoreg, "");
1867       reduce4F(opcode, dst, src, vtmp1);
1868       break;
1869     case 8:
1870       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1871       break;
1872     case 16:
1873       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1874       break;
1875     default: assert(false, "wrong vector length");
1876   }
1877 }
1878 
1879 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1880   switch (vlen) {
1881     case 2:
1882       assert(vtmp2 == xnoreg, "");
1883       reduce2D(opcode, dst, src, vtmp1);
1884       break;
1885     case 4:
1886       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1887       break;
1888     case 8:
1889       reduce8D(opcode, dst, src, vtmp1, vtmp2);
1890       break;
1891     default: assert(false, "wrong vector length");
1892   }
1893 }
1894 
1895 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1896   if (opcode == Op_AddReductionVI) {
1897     if (vtmp1 != src2) {
1898       movdqu(vtmp1, src2);
1899     }
1900     phaddd(vtmp1, vtmp1);
1901   } else {
1902     pshufd(vtmp1, src2, 0x1);
1903     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1904   }
1905   movdl(vtmp2, src1);
1906   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1907   movdl(dst, vtmp1);
1908 }
1909 
1910 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1911   if (opcode == Op_AddReductionVI) {
1912     if (vtmp1 != src2) {
1913       movdqu(vtmp1, src2);
1914     }
1915     phaddd(vtmp1, src2);
1916     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1917   } else {
1918     pshufd(vtmp2, src2, 0xE);
1919     reduce_operation_128(T_INT, opcode, vtmp2, src2);
1920     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1921   }
1922 }
1923 
1924 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1925   if (opcode == Op_AddReductionVI) {
1926     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
1927     vextracti128_high(vtmp2, vtmp1);
1928     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
1929     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1930   } else {
1931     vextracti128_high(vtmp1, src2);
1932     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1933     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1934   }
1935 }
1936 
1937 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1938   vextracti64x4_high(vtmp2, src2);
1939   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
1940   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1941 }
1942 
1943 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
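  // Fold 8 bytes by repeatedly halving the live prefix (4+4, 2+2, 1+1),
  // then widen the result to int and fold in the accumulator from src1.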
1944   pshufd(vtmp2, src2, 0x1);
1945   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1946   movdqu(vtmp1, vtmp2);
1947   psrldq(vtmp1, 2);
1948   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1949   movdqu(vtmp2, vtmp1);
1950   psrldq(vtmp2, 1);
1951   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1952   movdl(vtmp2, src1);
1953   pmovsxbd(vtmp1, vtmp1);
1954   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1955   pextrb(dst, vtmp1, 0x0);
1956   movsbl(dst, dst);
1957 }
1958 
1959 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1960   pshufd(vtmp1, src2, 0xE);
1961   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
1962   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1963 }
1964 
1965 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1966   vextracti128_high(vtmp2, src2);
1967   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1968   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1969 }
1970 
1971 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1972   vextracti64x4_high(vtmp1, src2);
1973   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
1974   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1975 }
1976 
1977 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1978   pmovsxbw(vtmp2, src2);
1979   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1980 }
1981 
1982 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1983   if (UseAVX > 1) {
1984     int vector_len = Assembler::AVX_256bit;
1985     vpmovsxbw(vtmp1, src2, vector_len);
1986     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1987   } else {
1988     pmovsxbw(vtmp2, src2);
1989     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
    pshufd(vtmp2, src2, 0xE);  // bring the high 8 bytes into the low qword
    pmovsxbw(vtmp2, vtmp2);    // widen them for the second 8-way reduction
1992     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1993   }
1994 }
1995 
1996 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1997   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
1998     int vector_len = Assembler::AVX_512bit;
1999     vpmovsxbw(vtmp1, src2, vector_len);
2000     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2001   } else {
    assert(UseAVX >= 2, "required");
2003     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2004     vextracti128_high(vtmp2, src2);
2005     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2006   }
2007 }
2008 
2009 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2010   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2011   vextracti64x4_high(vtmp2, src2);
2012   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2013 }
2014 
2015 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2016   if (opcode == Op_AddReductionVI) {
2017     if (vtmp1 != src2) {
2018       movdqu(vtmp1, src2);
2019     }
2020     phaddw(vtmp1, vtmp1);
2021     phaddw(vtmp1, vtmp1);
2022   } else {
2023     pshufd(vtmp2, src2, 0x1);
2024     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2025     movdqu(vtmp1, vtmp2);
2026     psrldq(vtmp1, 2);
2027     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2028   }
2029   movdl(vtmp2, src1);
2030   pmovsxwd(vtmp1, vtmp1);
2031   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2032   pextrw(dst, vtmp1, 0x0);
2033   movswl(dst, dst);
2034 }
2035 
2036 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2037   if (opcode == Op_AddReductionVI) {
2038     if (vtmp1 != src2) {
2039       movdqu(vtmp1, src2);
2040     }
2041     phaddw(vtmp1, src2);
2042   } else {
2043     pshufd(vtmp1, src2, 0xE);
2044     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2045   }
2046   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2047 }
2048 
2049 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2050   if (opcode == Op_AddReductionVI) {
2051     int vector_len = Assembler::AVX_256bit;
2052     vphaddw(vtmp2, src2, src2, vector_len);
2053     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2054   } else {
2055     vextracti128_high(vtmp2, src2);
2056     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2057   }
2058   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2059 }
2060 
2061 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2062   int vector_len = Assembler::AVX_256bit;
2063   vextracti64x4_high(vtmp1, src2);
2064   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2065   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2066 }
2067 
2068 #ifdef _LP64
2069 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2070   pshufd(vtmp2, src2, 0xE);
2071   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2072   movdq(vtmp1, src1);
2073   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2074   movdq(dst, vtmp1);
2075 }
2076 
2077 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2078   vextracti128_high(vtmp1, src2);
2079   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2080   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2081 }
2082 
2083 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2084   vextracti64x4_high(vtmp2, src2);
2085   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2086   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2087 }
2088 
2089 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
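  // Build a k-mask with the low 'len' bits set: BZHI clears -1 from bit 'len' upward.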
2090   mov64(temp, -1L);
2091   bzhiq(temp, temp, len);
2092   kmovql(dst, temp);
2093 }
2094 #endif // _LP64
2095 
2096 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2097   reduce_operation_128(T_FLOAT, opcode, dst, src);
2098   pshufd(vtmp, src, 0x1);
2099   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2100 }
2101 
2102 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2103   reduce2F(opcode, dst, src, vtmp);
2104   pshufd(vtmp, src, 0x2);
2105   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2106   pshufd(vtmp, src, 0x3);
2107   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2108 }
2109 
2110 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2111   reduce4F(opcode, dst, src, vtmp2);
2112   vextractf128_high(vtmp2, src);
2113   reduce4F(opcode, dst, vtmp2, vtmp1);
2114 }
2115 
2116 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2117   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2118   vextracti64x4_high(vtmp1, src);
2119   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2120 }
2121 
2122 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2123   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2124   pshufd(vtmp, src, 0xE);
2125   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2126 }
2127 
2128 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2129   reduce2D(opcode, dst, src, vtmp2);
2130   vextractf128_high(vtmp2, src);
2131   reduce2D(opcode, dst, vtmp2, vtmp1);
2132 }
2133 
2134 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2135   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2136   vextracti64x4_high(vtmp1, src);
2137   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2138 }
2139 
2140 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2141   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2142 }
2143 
2144 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2145   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2146 }
2147 
2148 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2149                                  int vec_enc) {
2150   switch(elem_bt) {
2151     case T_INT:
2152     case T_FLOAT:
2153       vmaskmovps(dst, src, mask, vec_enc);
2154       break;
2155     case T_LONG:
2156     case T_DOUBLE:
2157       vmaskmovpd(dst, src, mask, vec_enc);
2158       break;
2159     default:
2160       fatal("Unsupported type %s", type2name(elem_bt));
2161       break;
2162   }
2163 }
2164 
2165 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2166                                  int vec_enc) {
2167   switch(elem_bt) {
2168     case T_INT:
2169     case T_FLOAT:
2170       vmaskmovps(dst, src, mask, vec_enc);
2171       break;
2172     case T_LONG:
2173     case T_DOUBLE:
2174       vmaskmovpd(dst, src, mask, vec_enc);
2175       break;
2176     default:
2177       fatal("Unsupported type %s", type2name(elem_bt));
2178       break;
2179   }
2180 }
2181 
2182 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2183                                           XMMRegister dst, XMMRegister src,
2184                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2185                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2186   int permconst[] = {1, 14};
2187   XMMRegister wsrc = src;
2188   XMMRegister wdst = xmm_0;
2189   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2190 
2191   int vlen_enc = Assembler::AVX_128bit;
2192   if (vlen == 16) {
2193     vlen_enc = Assembler::AVX_256bit;
2194   }
2195 
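  // Each iteration folds the upper half into the lower half: cross-lane
  // extracts handle the 512->256 and 256->128 steps, and in-lane permutes
  // (permconst) fold the last four and two elements within 128 bits.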
2196   for (int i = log2(vlen) - 1; i >=0; i--) {
2197     if (i == 0 && !is_dst_valid) {
2198       wdst = dst;
2199     }
2200     if (i == 3) {
2201       vextracti64x4_high(wtmp, wsrc);
2202     } else if (i == 2) {
2203       vextracti128_high(wtmp, wsrc);
2204     } else { // i = [0,1]
2205       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2206     }
2207     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2208     wsrc = wdst;
2209     vlen_enc = Assembler::AVX_128bit;
2210   }
2211   if (is_dst_valid) {
2212     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2213   }
2214 }
2215 
2216 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2217                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2218                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2219   XMMRegister wsrc = src;
2220   XMMRegister wdst = xmm_0;
2221   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2222   int vlen_enc = Assembler::AVX_128bit;
2223   if (vlen == 8) {
2224     vlen_enc = Assembler::AVX_256bit;
2225   }
2226   for (int i = log2(vlen) - 1; i >=0; i--) {
2227     if (i == 0 && !is_dst_valid) {
2228       wdst = dst;
2229     }
2230     if (i == 1) {
2231       vextracti128_high(wtmp, wsrc);
2232     } else if (i == 2) {
2233       vextracti64x4_high(wtmp, wsrc);
2234     } else {
2235       assert(i == 0, "%d", i);
2236       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2237     }
2238     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2239     wsrc = wdst;
2240     vlen_enc = Assembler::AVX_128bit;
2241   }
2242   if (is_dst_valid) {
2243     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2244   }
2245 }
2246 
2247 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2248   switch (bt) {
2249     case T_BYTE:  pextrb(dst, src, idx); break;
2250     case T_SHORT: pextrw(dst, src, idx); break;
2251     case T_INT:   pextrd(dst, src, idx); break;
2252     case T_LONG:  pextrq(dst, src, idx); break;
2253 
2254     default:
2255       assert(false,"Should not reach here.");
2256       break;
2257   }
2258 }
2259 
2260 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2261   int esize =  type2aelembytes(typ);
2262   int elem_per_lane = 16/esize;
2263   int lane = elemindex / elem_per_lane;
2264   int eindex = elemindex % elem_per_lane;
2265 
2266   if (lane >= 2) {
2267     assert(UseAVX > 2, "required");
2268     vextractf32x4(dst, src, lane & 3);
2269     return dst;
2270   } else if (lane > 0) {
2271     assert(UseAVX > 0, "required");
2272     vextractf128(dst, src, lane);
2273     return dst;
2274   } else {
2275     return src;
2276   }
2277 }
2278 
2279 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2280   int esize =  type2aelembytes(typ);
2281   int elem_per_lane = 16/esize;
2282   int eindex = elemindex % elem_per_lane;
2283   assert(is_integral_type(typ),"required");
2284 
2285   if (eindex == 0) {
2286     if (typ == T_LONG) {
2287       movq(dst, src);
2288     } else {
2289       movdl(dst, src);
2290       if (typ == T_BYTE)
2291         movsbl(dst, dst);
2292       else if (typ == T_SHORT)
2293         movswl(dst, dst);
2294     }
2295   } else {
2296     extract(typ, dst, src, eindex);
2297   }
2298 }
2299 
2300 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2301   int esize =  type2aelembytes(typ);
2302   int elem_per_lane = 16/esize;
2303   int eindex = elemindex % elem_per_lane;
2304   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2305 
2306   if (eindex == 0) {
2307     movq(dst, src);
2308   } else {
2309     if (typ == T_FLOAT) {
2310       if (UseAVX == 0) {
2311         movdqu(dst, src);
2312         shufps(dst, dst, eindex);
2313       } else {
2314         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2315       }
2316     } else {
2317       if (UseAVX == 0) {
2318         movdqu(dst, src);
2319         psrldq(dst, eindex*esize);
2320       } else {
2321         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2322       }
2323       movq(dst, dst);
2324     }
2325   }
2326   // Zero upper bits
2327   if (typ == T_FLOAT) {
2328     if (UseAVX == 0) {
2329       assert(vtmp != xnoreg, "required.");
2330       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2331       pand(dst, vtmp);
2332     } else {
2333       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2334     }
2335   }
2336 }
2337 
2338 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2339   switch(typ) {
2340     case T_BYTE:
2341     case T_BOOLEAN:
2342       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2343       break;
2344     case T_SHORT:
2345     case T_CHAR:
2346       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2347       break;
2348     case T_INT:
2349     case T_FLOAT:
2350       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2351       break;
2352     case T_LONG:
2353     case T_DOUBLE:
2354       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2355       break;
2356     default:
2357       assert(false,"Should not reach here.");
2358       break;
2359   }
2360 }
2361 
2362 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2363   assert(rscratch != noreg || always_reachable(src2), "missing");
2364 
2365   switch(typ) {
2366     case T_BOOLEAN:
2367     case T_BYTE:
2368       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2369       break;
2370     case T_CHAR:
2371     case T_SHORT:
2372       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2373       break;
2374     case T_INT:
2375     case T_FLOAT:
2376       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2377       break;
2378     case T_LONG:
2379     case T_DOUBLE:
2380       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2381       break;
2382     default:
2383       assert(false,"Should not reach here.");
2384       break;
2385   }
2386 }
2387 
2388 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2389   switch(typ) {
2390     case T_BYTE:
2391       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2392       break;
2393     case T_SHORT:
2394       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2395       break;
2396     case T_INT:
2397     case T_FLOAT:
2398       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2399       break;
2400     case T_LONG:
2401     case T_DOUBLE:
2402       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2403       break;
2404     default:
2405       assert(false,"Should not reach here.");
2406       break;
2407   }
2408 }
2409 
2410 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2411   assert(vlen_in_bytes <= 32, "");
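  // ptest/vptest set ZF when the AND of the operands is all zero and CF when
  // the ANDN is all zero; vtestps tests only the sign bit of each 4-byte
  // lane, which suffices for vectors of 0/-1 mask elements.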
2412   int esize = type2aelembytes(bt);
2413   if (vlen_in_bytes == 32) {
2414     assert(vtmp == xnoreg, "required.");
2415     if (esize >= 4) {
2416       vtestps(src1, src2, AVX_256bit);
2417     } else {
2418       vptest(src1, src2, AVX_256bit);
2419     }
2420     return;
2421   }
2422   if (vlen_in_bytes < 16) {
    // Duplicate the lower part to fill the whole register;
    // no need to do so for src2.
2425     assert(vtmp != xnoreg, "required");
2426     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2427     pshufd(vtmp, src1, shuffle_imm);
2428   } else {
2429     assert(vtmp == xnoreg, "required");
2430     vtmp = src1;
2431   }
2432   if (esize >= 4 && VM_Version::supports_avx()) {
2433     vtestps(vtmp, src2, AVX_128bit);
2434   } else {
2435     ptest(vtmp, src2);
2436   }
2437 }
2438 
2439 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2440   assert(UseAVX >= 2, "required");
2441 #ifdef ASSERT
2442   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2443   bool is_bw_supported = VM_Version::supports_avx512bw();
2444   if (is_bw && !is_bw_supported) {
2445     assert(vlen_enc != Assembler::AVX_512bit, "required");
2446     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2447            "XMM register should be 0-15");
2448   }
2449 #endif // ASSERT
2450   switch (elem_bt) {
2451     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2452     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2453     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2454     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2455     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2456     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2457     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2458   }
2459 }
2460 
2461 #ifdef _LP64
2462 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2463   assert(UseAVX >= 2, "required");
2464   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2465   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2466   if ((UseAVX > 2) &&
2467       (!is_bw || VM_Version::supports_avx512bw()) &&
2468       (!is_vl || VM_Version::supports_avx512vl())) {
2469     switch (elem_bt) {
2470       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2471       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2472       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2473       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2474       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2475     }
2476   } else {
2477     assert(vlen_enc != Assembler::AVX_512bit, "required");
2478     assert((dst->encoding() < 16),"XMM register should be 0-15");
2479     switch (elem_bt) {
2480       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2481       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2482       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2483       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2484       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2485       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2486       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2487     }
2488   }
2489 }
2490 #endif
2491 
2492 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2493   switch (to_elem_bt) {
2494     case T_SHORT:
2495       vpmovsxbw(dst, src, vlen_enc);
2496       break;
2497     case T_INT:
2498       vpmovsxbd(dst, src, vlen_enc);
2499       break;
2500     case T_FLOAT:
2501       vpmovsxbd(dst, src, vlen_enc);
2502       vcvtdq2ps(dst, dst, vlen_enc);
2503       break;
2504     case T_LONG:
2505       vpmovsxbq(dst, src, vlen_enc);
2506       break;
2507     case T_DOUBLE: {
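      // vcvtdq2pd doubles the element width, so the intermediate dword vector
      // only needs half the width of the destination.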
2508       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2509       vpmovsxbd(dst, src, mid_vlen_enc);
2510       vcvtdq2pd(dst, dst, vlen_enc);
2511       break;
2512     }
2513     default:
2514       fatal("Unsupported type %s", type2name(to_elem_bt));
2515       break;
2516   }
2517 }
2518 
2519 //-------------------------------------------------------------------------------------------
2520 
2521 // IndexOf for constant substrings with size >= 8 chars
2522 // which don't need to be loaded through stack.
2523 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2524                                          Register cnt1, Register cnt2,
2525                                          int int_cnt2,  Register result,
2526                                          XMMRegister vec, Register tmp,
2527                                          int ae) {
2528   ShortBranchVerifier sbv(this);
2529   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2530   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2531 
2532   // This method uses the pcmpestri instruction with bound registers
2533   //   inputs:
2534   //     xmm - substring
2535   //     rax - substring length (elements count)
2536   //     mem - scanned string
2537   //     rdx - string length (elements count)
2538   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2539   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2540   //   outputs:
2541   //     rcx - matched index in string
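  //     flags - CF set when any candidate match is found, OF set when the
  //             match starts at element 0 (i.e. rcx == 0)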
2542   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2543   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2544   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2545   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2546   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2547 
2548   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2549         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2550         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2551 
2552   // Note, inline_string_indexOf() generates checks:
2553   // if (substr.count > string.count) return -1;
2554   // if (substr.count == 0) return 0;
2555   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2556 
2557   // Load substring.
2558   if (ae == StrIntrinsicNode::UL) {
2559     pmovzxbw(vec, Address(str2, 0));
2560   } else {
2561     movdqu(vec, Address(str2, 0));
2562   }
2563   movl(cnt2, int_cnt2);
2564   movptr(result, str1); // string addr
2565 
2566   if (int_cnt2 > stride) {
2567     jmpb(SCAN_TO_SUBSTR);
2568 
    // Reload substr for rescan; this code is executed
    // only for large substrings (> 8 chars).
2571     bind(RELOAD_SUBSTR);
2572     if (ae == StrIntrinsicNode::UL) {
2573       pmovzxbw(vec, Address(str2, 0));
2574     } else {
2575       movdqu(vec, Address(str2, 0));
2576     }
2577     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2578 
2579     bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not, so we need to search
    // again. Start from the next element after the previous match.
2583 
    // cnt2 is the number of substring elements remaining and
    // cnt1 is the number of string elements remaining when the compare failed.
2586     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2587     subl(cnt1, cnt2);
2588     addl(cnt1, int_cnt2);
2589     movl(cnt2, int_cnt2); // Now restore cnt2
2590 
2591     decrementl(cnt1);     // Shift to next element
2592     cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2594 
2595     addptr(result, (1<<scale1));
2596 
2597   } // (int_cnt2 > 8)
2598 
2599   // Scan string for start of substr in 16-byte vectors
2600   bind(SCAN_TO_SUBSTR);
2601   pcmpestri(vec, Address(result, 0), mode);
2602   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2603   subl(cnt1, stride);
2604   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2605   cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2607   addptr(result, 16);
2608   jmpb(SCAN_TO_SUBSTR);
2609 
2610   // Found a potential substr
2611   bind(FOUND_CANDIDATE);
2612   // Matched whole vector if first element matched (tmp(rcx) == 0).
2613   if (int_cnt2 == stride) {
2614     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2615   } else { // int_cnt2 > 8
2616     jccb(Assembler::overflow, FOUND_SUBSTR);
2617   }
2618   // After pcmpestri tmp(rcx) contains matched element index
2619   // Compute start addr of substr
2620   lea(result, Address(result, tmp, scale1));
2621 
2622   // Make sure string is still long enough
2623   subl(cnt1, tmp);
2624   cmpl(cnt1, cnt2);
2625   if (int_cnt2 == stride) {
2626     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2627   } else { // int_cnt2 > 8
2628     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2629   }
  // Left less than substring.
2631 
2632   bind(RET_NOT_FOUND);
2633   movl(result, -1);
2634   jmp(EXIT);
2635 
2636   if (int_cnt2 > stride) {
2637     // This code is optimized for the case when whole substring
2638     // is matched if its head is matched.
2639     bind(MATCH_SUBSTR_HEAD);
2640     pcmpestri(vec, Address(result, 0), mode);
    // Reload only the string if it does not match
2642     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2643 
2644     Label CONT_SCAN_SUBSTR;
2645     // Compare the rest of substring (> 8 chars).
2646     bind(FOUND_SUBSTR);
2647     // First 8 chars are already matched.
2648     negptr(cnt2);
2649     addptr(cnt2, stride);
2650 
2651     bind(SCAN_SUBSTR);
2652     subl(cnt1, stride);
2653     cmpl(cnt2, -stride); // Do not read beyond substring
2654     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2655     // Back-up strings to avoid reading beyond substring:
2656     // cnt1 = cnt1 - cnt2 + 8
2657     addl(cnt1, cnt2); // cnt2 is negative
2658     addl(cnt1, stride);
2659     movl(cnt2, stride); negptr(cnt2);
2660     bind(CONT_SCAN_SUBSTR);
2661     if (int_cnt2 < (int)G) {
2662       int tail_off1 = int_cnt2<<scale1;
2663       int tail_off2 = int_cnt2<<scale2;
2664       if (ae == StrIntrinsicNode::UL) {
2665         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2666       } else {
2667         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2668       }
2669       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2670     } else {
2671       // calculate index in register to avoid integer overflow (int_cnt2*2)
2672       movl(tmp, int_cnt2);
2673       addptr(tmp, cnt2);
2674       if (ae == StrIntrinsicNode::UL) {
2675         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2676       } else {
2677         movdqu(vec, Address(str2, tmp, scale2, 0));
2678       }
2679       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2680     }
2681     // Need to reload the string pointers if the whole vector did not match
2682     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2683     addptr(cnt2, stride);
2684     jcc(Assembler::negative, SCAN_SUBSTR);
2685     // Fall through if found full substring
2686 
2687   } // (int_cnt2 > 8)
2688 
2689   bind(RET_FOUND);
2690   // Found result if we matched full small substring.
2691   // Compute substr offset
2692   subptr(result, str1);
2693   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2694     shrl(result, 1); // index
2695   }
2696   bind(EXIT);
2697 
2698 } // string_indexofC8
2699 
2700 // Small strings are loaded through the stack if they cross a page boundary.
2701 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2702                                        Register cnt1, Register cnt2,
2703                                        int int_cnt2,  Register result,
2704                                        XMMRegister vec, Register tmp,
2705                                        int ae) {
2706   ShortBranchVerifier sbv(this);
2707   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2708   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2709 
2710   //
2711   // int_cnt2 is the length of a small (< 8 chars) constant substring,
2712   // or (-1) for a non-constant substring, in which case its length
2713   // is in the cnt2 register.
2714   //
2715   // Note, inline_string_indexOf() generates checks:
2716   // if (substr.count > string.count) return -1;
2717   // if (substr.count == 0) return 0;
2718   //
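  // Illustrative Java-level sketch of the result computed here (element
  // widths depend on 'ae'; this is a sketch, not the exact intrinsic contract):
  //   outer:
  //   for (int i = 0; i + cnt2 <= cnt1; i++) {
  //     for (int j = 0; j < cnt2; j++) {
  //       if (str1[i + j] != str2[j]) continue outer;
  //     }
  //     return i;
  //   }
  //   return -1;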
2719   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2720   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2721   // This method uses the pcmpestri instruction with bound registers
2722   //   inputs:
2723   //     xmm - substring
2724   //     rax - substring length (elements count)
2725   //     mem - scanned string
2726   //     rdx - string length (elements count)
2727   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2728   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2729   //   outputs:
2730   //     rcx - matched index in string
2731   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2732   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2733   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2734   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2735 
2736   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2737         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2738         FOUND_CANDIDATE;
2739 
2740   { //========================================================
2741     // We don't know where these strings are located
2742     // and we can't read beyond them. Load them through the stack.
2743     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2744 
2745     movptr(tmp, rsp); // save old SP
2746 
2747     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2748       if (int_cnt2 == (1>>scale2)) { // One byte
2749         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2750         load_unsigned_byte(result, Address(str2, 0));
2751         movdl(vec, result); // move 32 bits
2752       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2753         // Not enough header space in 32-bit VM: 12+3 = 15.
2754         movl(result, Address(str2, -1));
2755         shrl(result, 8);
2756         movdl(vec, result); // move 32 bits
2757       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2758         load_unsigned_short(result, Address(str2, 0));
2759         movdl(vec, result); // move 32 bits
2760       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2761         movdl(vec, Address(str2, 0)); // move 32 bits
2762       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2763         movq(vec, Address(str2, 0));  // move 64 bits
2764       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
2765         // Array header size is 12 bytes in 32-bit VM
2766         // + 6 bytes for 3 chars == 18 bytes,
2767         // enough space to load vec and shift.
2768         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2769         if (ae == StrIntrinsicNode::UL) {
2770           int tail_off = int_cnt2-8;
2771           pmovzxbw(vec, Address(str2, tail_off));
2772           psrldq(vec, -2*tail_off);
2773         }
2774         else {
2775           int tail_off = int_cnt2*(1<<scale2);
2776           movdqu(vec, Address(str2, tail_off-16));
2777           psrldq(vec, 16-tail_off);
2778         }
2779       }
2780     } else { // not constant substring
2781       cmpl(cnt2, stride);
2782       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2783 
2784       // We can read beyond the string if str2+16 does not cross a page
2785       // boundary, since heaps are aligned and mapped by pages.
2786       assert(os::vm_page_size() < (int)G, "default page should be small");
2787       movl(result, str2); // We need only low 32 bits
2788       andl(result, ((int)os::vm_page_size()-1));
2789       cmpl(result, ((int)os::vm_page_size()-16));
2790       jccb(Assembler::belowEqual, CHECK_STR);
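      // The test above is equivalent to this C-style sketch (illustrative):
      //   ((uintptr_t)str2 & (page_size - 1)) <= page_size - 16
      // i.e. a full 16-byte load starting at str2 stays within its page.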
2791 
2792       // Move small strings to the stack to allow loading 16 bytes into vec.
2793       subptr(rsp, 16);
2794       int stk_offset = wordSize-(1<<scale2);
2795       push(cnt2);
2796 
2797       bind(COPY_SUBSTR);
2798       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2799         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2800         movb(Address(rsp, cnt2, scale2, stk_offset), result);
2801       } else if (ae == StrIntrinsicNode::UU) {
2802         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2803         movw(Address(rsp, cnt2, scale2, stk_offset), result);
2804       }
2805       decrement(cnt2);
2806       jccb(Assembler::notZero, COPY_SUBSTR);
2807 
2808       pop(cnt2);
2809       movptr(str2, rsp);  // New substring address
2810     } // non constant
2811 
2812     bind(CHECK_STR);
2813     cmpl(cnt1, stride);
2814     jccb(Assembler::aboveEqual, BIG_STRINGS);
2815 
2816     // Check cross page boundary.
2817     movl(result, str1); // We need only low 32 bits
2818     andl(result, ((int)os::vm_page_size()-1));
2819     cmpl(result, ((int)os::vm_page_size()-16));
2820     jccb(Assembler::belowEqual, BIG_STRINGS);
2821 
2822     subptr(rsp, 16);
2823     int stk_offset = -(1<<scale1);
2824     if (int_cnt2 < 0) { // not constant
2825       push(cnt2);
2826       stk_offset += wordSize;
2827     }
2828     movl(cnt2, cnt1);
2829 
2830     bind(COPY_STR);
2831     if (ae == StrIntrinsicNode::LL) {
2832       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
2833       movb(Address(rsp, cnt2, scale1, stk_offset), result);
2834     } else {
2835       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
2836       movw(Address(rsp, cnt2, scale1, stk_offset), result);
2837     }
2838     decrement(cnt2);
2839     jccb(Assembler::notZero, COPY_STR);
2840 
2841     if (int_cnt2 < 0) { // not constant
2842       pop(cnt2);
2843     }
2844     movptr(str1, rsp);  // New string address
2845 
2846     bind(BIG_STRINGS);
2847     // Load substring.
2848     if (int_cnt2 < 0) { // -1
2849       if (ae == StrIntrinsicNode::UL) {
2850         pmovzxbw(vec, Address(str2, 0));
2851       } else {
2852         movdqu(vec, Address(str2, 0));
2853       }
2854       push(cnt2);       // substr count
2855       push(str2);       // substr addr
2856       push(str1);       // string addr
2857     } else {
2858       // Small (< 8 chars) constant substrings are loaded already.
2859       movl(cnt2, int_cnt2);
2860     }
2861     push(tmp);  // original SP
2862 
2863   } // Finished loading
2864 
2865   //========================================================
2866   // Start search
2867   //
2868 
2869   movptr(result, str1); // string addr
2870 
2871   if (int_cnt2  < 0) {  // Only for non constant substring
2872     jmpb(SCAN_TO_SUBSTR);
2873 
2874     // SP saved at sp+0
2875     // String saved at sp+1*wordSize
2876     // Substr saved at sp+2*wordSize
2877     // Substr count saved at sp+3*wordSize
2878 
2879     // Reload substr for rescan; this code
2880     // is executed only for large substrings (> 8 chars).
2881     bind(RELOAD_SUBSTR);
2882     movptr(str2, Address(rsp, 2*wordSize));
2883     movl(cnt2, Address(rsp, 3*wordSize));
2884     if (ae == StrIntrinsicNode::UL) {
2885       pmovzxbw(vec, Address(str2, 0));
2886     } else {
2887       movdqu(vec, Address(str2, 0));
2888     }
2889     // We came here after the beginning of the substring was
2890     // matched but the rest of it was not, so we need to search
2891     // again. Start from the next element after the previous match.
2892     subptr(str1, result); // Restore counter
2893     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2894       shrl(str1, 1);
2895     }
2896     addl(cnt1, str1);
2897     decrementl(cnt1);   // Shift to next element
2898     cmpl(cnt1, cnt2);
2899     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2900 
2901     addptr(result, (1<<scale1));
2902   } // non constant
2903 
2904   // Scan string for start of substr in 16-byte vectors
2905   bind(SCAN_TO_SUBSTR);
2906   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2907   pcmpestri(vec, Address(result, 0), mode);
2908   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2909   subl(cnt1, stride);
2910   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2911   cmpl(cnt1, cnt2);
2912   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2913   addptr(result, 16);
2914 
2915   bind(ADJUST_STR);
2916   cmpl(cnt1, stride); // Do not read beyond string
2917   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2918   // Back-up string to avoid reading beyond string.
2919   lea(result, Address(result, cnt1, scale1, -16));
2920   movl(cnt1, stride);
2921   jmpb(SCAN_TO_SUBSTR);
2922 
2923   // Found a potential substr
2924   bind(FOUND_CANDIDATE);
2925   // After pcmpestri tmp(rcx) contains matched element index
2926 
2927   // Make sure string is still long enough
2928   subl(cnt1, tmp);
2929   cmpl(cnt1, cnt2);
2930   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
2931   // Left less than substring.
2932 
2933   bind(RET_NOT_FOUND);
2934   movl(result, -1);
2935   jmp(CLEANUP);
2936 
2937   bind(FOUND_SUBSTR);
2938   // Compute start addr of substr
2939   lea(result, Address(result, tmp, scale1));
2940   if (int_cnt2 > 0) { // Constant substring
2941     // Repeat search for small substring (< 8 chars)
2942     // from new point without reloading substring.
2943     // Have to check that we don't read beyond string.
2944     cmpl(tmp, stride-int_cnt2);
2945     jccb(Assembler::greater, ADJUST_STR);
2946     // Fall through if matched whole substring.
2947   } else { // non constant
2948     assert(int_cnt2 == -1, "should be != 0");
2949 
2950     addl(tmp, cnt2);
2951     // Found result if we matched whole substring.
2952     cmpl(tmp, stride);
2953     jcc(Assembler::lessEqual, RET_FOUND);
2954 
2955     // Repeat search for small substring (<= 8 chars)
2956     // from new point 'str1' without reloading substring.
2957     cmpl(cnt2, stride);
2958     // Have to check that we don't read beyond string.
2959     jccb(Assembler::lessEqual, ADJUST_STR);
2960 
2961     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
2962     // Compare the rest of substring (> 8 chars).
2963     movptr(str1, result);
2964 
2965     cmpl(tmp, cnt2);
2966     // First 8 chars are already matched.
2967     jccb(Assembler::equal, CHECK_NEXT);
2968 
2969     bind(SCAN_SUBSTR);
2970     pcmpestri(vec, Address(str1, 0), mode);
2971     // Need to reload the string pointers if the whole vector did not match
2972     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2973 
2974     bind(CHECK_NEXT);
2975     subl(cnt2, stride);
2976     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
2977     addptr(str1, 16);
2978     if (ae == StrIntrinsicNode::UL) {
2979       addptr(str2, 8);
2980     } else {
2981       addptr(str2, 16);
2982     }
2983     subl(cnt1, stride);
2984     cmpl(cnt2, stride); // Do not read beyond substring
2985     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
2986     // Back-up strings to avoid reading beyond substring.
2987 
2988     if (ae == StrIntrinsicNode::UL) {
2989       lea(str2, Address(str2, cnt2, scale2, -8));
2990       lea(str1, Address(str1, cnt2, scale1, -16));
2991     } else {
2992       lea(str2, Address(str2, cnt2, scale2, -16));
2993       lea(str1, Address(str1, cnt2, scale1, -16));
2994     }
2995     subl(cnt1, cnt2);
2996     movl(cnt2, stride);
2997     addl(cnt1, stride);
2998     bind(CONT_SCAN_SUBSTR);
2999     if (ae == StrIntrinsicNode::UL) {
3000       pmovzxbw(vec, Address(str2, 0));
3001     } else {
3002       movdqu(vec, Address(str2, 0));
3003     }
3004     jmp(SCAN_SUBSTR);
3005 
3006     bind(RET_FOUND_LONG);
3007     movptr(str1, Address(rsp, wordSize));
3008   } // non constant
3009 
3010   bind(RET_FOUND);
3011   // Compute substr offset
3012   subptr(result, str1);
3013   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3014     shrl(result, 1); // index
3015   }
3016   bind(CLEANUP);
3017   pop(rsp); // restore SP
3018 
3019 } // string_indexof
3020 
3021 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3022                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3023   ShortBranchVerifier sbv(this);
3024   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3025 
3026   int stride = 8;
3027 
3028   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3029         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3030         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3031         FOUND_SEQ_CHAR, DONE_LABEL;
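  // Java-level sketch of the computed result (illustrative only):
  //   for (int i = 0; i < cnt1; i++) {
  //     if (str1[i] == ch) return i;
  //   }
  //   return -1;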
3032 
3033   movptr(result, str1);
3034   if (UseAVX >= 2) {
3035     cmpl(cnt1, stride);
3036     jcc(Assembler::less, SCAN_TO_CHAR);
3037     cmpl(cnt1, 2*stride);
3038     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3039     movdl(vec1, ch);
3040     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3041     vpxor(vec2, vec2);
3042     movl(tmp, cnt1);
3043     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3044     andl(cnt1,0x0000000F);  //tail count (in chars)
3045 
3046     bind(SCAN_TO_16_CHAR_LOOP);
3047     vmovdqu(vec3, Address(result, 0));
3048     vpcmpeqw(vec3, vec3, vec1, 1);
3049     vptest(vec2, vec3);
3050     jcc(Assembler::carryClear, FOUND_CHAR);
3051     addptr(result, 32);
3052     subl(tmp, 2*stride);
3053     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3054     jmp(SCAN_TO_8_CHAR);
3055     bind(SCAN_TO_8_CHAR_INIT);
3056     movdl(vec1, ch);
3057     pshuflw(vec1, vec1, 0x00);
3058     pshufd(vec1, vec1, 0);
3059     pxor(vec2, vec2);
3060   }
3061   bind(SCAN_TO_8_CHAR);
3062   cmpl(cnt1, stride);
3063   jcc(Assembler::less, SCAN_TO_CHAR);
3064   if (UseAVX < 2) {
3065     movdl(vec1, ch);
3066     pshuflw(vec1, vec1, 0x00);
3067     pshufd(vec1, vec1, 0);
3068     pxor(vec2, vec2);
3069   }
3070   movl(tmp, cnt1);
3071   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3072   andl(cnt1,0x00000007);  //tail count (in chars)
3073 
3074   bind(SCAN_TO_8_CHAR_LOOP);
3075   movdqu(vec3, Address(result, 0));
3076   pcmpeqw(vec3, vec1);
3077   ptest(vec2, vec3);
3078   jcc(Assembler::carryClear, FOUND_CHAR);
3079   addptr(result, 16);
3080   subl(tmp, stride);
3081   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3082   bind(SCAN_TO_CHAR);
3083   testl(cnt1, cnt1);
3084   jcc(Assembler::zero, RET_NOT_FOUND);
3085   bind(SCAN_TO_CHAR_LOOP);
3086   load_unsigned_short(tmp, Address(result, 0));
3087   cmpl(ch, tmp);
3088   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3089   addptr(result, 2);
3090   subl(cnt1, 1);
3091   jccb(Assembler::zero, RET_NOT_FOUND);
3092   jmp(SCAN_TO_CHAR_LOOP);
3093 
3094   bind(RET_NOT_FOUND);
3095   movl(result, -1);
3096   jmpb(DONE_LABEL);
3097 
3098   bind(FOUND_CHAR);
3099   if (UseAVX >= 2) {
3100     vpmovmskb(tmp, vec3);
3101   } else {
3102     pmovmskb(tmp, vec3);
3103   }
3104   bsfl(ch, tmp);
3105   addptr(result, ch);
3106 
3107   bind(FOUND_SEQ_CHAR);
3108   subptr(result, str1);
3109   shrl(result, 1);
3110 
3111   bind(DONE_LABEL);
3112 } // string_indexof_char
3113 
3114 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3115                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3116   ShortBranchVerifier sbv(this);
3117   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3118 
3119   int stride = 16;
3120 
3121   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3122         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3123         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3124         FOUND_SEQ_CHAR, DONE_LABEL;
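  // Same Java-level result as string_indexof_char above, but over byte
  // (Latin-1) elements, hence the 16-element vector stride here.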
3125 
3126   movptr(result, str1);
3127   if (UseAVX >= 2) {
3128     cmpl(cnt1, stride);
3129     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3130     cmpl(cnt1, stride*2);
3131     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3132     movdl(vec1, ch);
3133     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3134     vpxor(vec2, vec2);
3135     movl(tmp, cnt1);
3136     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3137     andl(cnt1,0x0000001F);  //tail count (in chars)
3138 
3139     bind(SCAN_TO_32_CHAR_LOOP);
3140     vmovdqu(vec3, Address(result, 0));
3141     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3142     vptest(vec2, vec3);
3143     jcc(Assembler::carryClear, FOUND_CHAR);
3144     addptr(result, 32);
3145     subl(tmp, stride*2);
3146     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3147     jmp(SCAN_TO_16_CHAR);
3148 
3149     bind(SCAN_TO_16_CHAR_INIT);
3150     movdl(vec1, ch);
3151     pxor(vec2, vec2);
3152     pshufb(vec1, vec2);
3153   }
3154 
3155   bind(SCAN_TO_16_CHAR);
3156   cmpl(cnt1, stride);
3157   jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left
3158   if (UseAVX < 2) {
3159     movdl(vec1, ch);
3160     pxor(vec2, vec2);
3161     pshufb(vec1, vec2);
3162   }
3163   movl(tmp, cnt1);
3164   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3165   andl(cnt1,0x0000000F);  //tail count (in bytes)
3166 
3167   bind(SCAN_TO_16_CHAR_LOOP);
3168   movdqu(vec3, Address(result, 0));
3169   pcmpeqb(vec3, vec1);
3170   ptest(vec2, vec3);
3171   jcc(Assembler::carryClear, FOUND_CHAR);
3172   addptr(result, 16);
3173   subl(tmp, stride);
3174   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...
3175 
3176   bind(SCAN_TO_CHAR_INIT);
3177   testl(cnt1, cnt1);
3178   jcc(Assembler::zero, RET_NOT_FOUND);
3179   bind(SCAN_TO_CHAR_LOOP);
3180   load_unsigned_byte(tmp, Address(result, 0));
3181   cmpl(ch, tmp);
3182   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3183   addptr(result, 1);
3184   subl(cnt1, 1);
3185   jccb(Assembler::zero, RET_NOT_FOUND);
3186   jmp(SCAN_TO_CHAR_LOOP);
3187 
3188   bind(RET_NOT_FOUND);
3189   movl(result, -1);
3190   jmpb(DONE_LABEL);
3191 
3192   bind(FOUND_CHAR);
3193   if (UseAVX >= 2) {
3194     vpmovmskb(tmp, vec3);
3195   } else {
3196     pmovmskb(tmp, vec3);
3197   }
3198   bsfl(ch, tmp);
3199   addptr(result, ch);
3200 
3201   bind(FOUND_SEQ_CHAR);
3202   subptr(result, str1);
3203 
3204   bind(DONE_LABEL);
3205 } // stringL_indexof_char
3206 
3207 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3208   switch (eltype) {
3209   case T_BOOLEAN: return sizeof(jboolean);
3210   case T_BYTE:  return sizeof(jbyte);
3211   case T_SHORT: return sizeof(jshort);
3212   case T_CHAR:  return sizeof(jchar);
3213   case T_INT:   return sizeof(jint);
3214   default:
3215     ShouldNotReachHere();
3216     return -1;
3217   }
3218 }
3219 
3220 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3221   switch (eltype) {
3222   // T_BOOLEAN used as surrogate for unsigned byte
3223   case T_BOOLEAN: movzbl(dst, src);   break;
3224   case T_BYTE:    movsbl(dst, src);   break;
3225   case T_SHORT:   movswl(dst, src);   break;
3226   case T_CHAR:    movzwl(dst, src);   break;
3227   case T_INT:     movl(dst, src);     break;
3228   default:
3229     ShouldNotReachHere();
3230   }
3231 }
3232 
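// Both overloads below load eight consecutive elements, i.e.
// 8 * arrays_hashcode_elsize(eltype) bytes, into dst as one vector.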
3233 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3234   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3235 }
3236 
3237 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3238   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3239 }
3240 
3241 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3242   const int vlen = Assembler::AVX_256bit;
3243   switch (eltype) {
3244   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3245   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3246   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3247   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3248   case T_INT:
3249     // do nothing
3250     break;
3251   default:
3252     ShouldNotReachHere();
3253   }
3254 }
3255 
3256 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3257                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3258                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3259                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3260                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3261                                         BasicType eltype) {
3262   ShortBranchVerifier sbv(this);
3263   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3264   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3265   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3266 
3267   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3268         SHORT_UNROLLED_LOOP_EXIT,
3269         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3270         UNROLLED_VECTOR_LOOP_BEGIN,
3271         END;
3272   switch (eltype) {
3273   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3274   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3275   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3276   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3277   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3278   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3279   }
3280 
3281   // "Rename" the registers for readability of the code below
3282   XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3283               vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3284               vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3285 
3286   const int elsize = arrays_hashcode_elsize(eltype);
3287 
3288   /*
3289     if (cnt1 >= 2) {
3290       if (cnt1 >= 32) {
3291         UNROLLED VECTOR LOOP
3292       }
3293       UNROLLED SCALAR LOOP
3294     }
3295     SINGLE SCALAR
3296    */
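  /*
    The vectorized loop relies on unrolling the polynomial hash
    (illustrative identity, matching the scalar h = 31*h + a[i] recurrence):
      h' = h * 31^32 + a[0]*31^31 + a[1]*31^30 + ... + a[31]*31^0
    so 'vnext' broadcasts 31^32 (element 0 of the powers-of-31 table) and
    the 'vcoef' vectors hold the per-lane powers 31^31 .. 31^0.
   */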
3297 
3298   cmpl(cnt1, 32);
3299   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3300 
3301   // cnt1 >= 32 && generate_vectorized_loop
3302   xorl(index, index);
3303 
3304   // vresult = IntVector.zero(I256);
3305   for (int idx = 0; idx < 4; idx++) {
3306     vpxor(vresult[idx], vresult[idx]);
3307   }
3308   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3309   Register bound = tmp2;
3310   Register next = tmp3;
3311   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3312   movl(next, Address(tmp2, 0));
3313   movdl(vnext, next);
3314   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3315 
3316   // index = 0;
3317   // bound = cnt1 & ~(32 - 1);
3318   movl(bound, cnt1);
3319   andl(bound, ~(32 - 1));
3320   // for (; index < bound; index += 32) {
3321   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3322   // result *= next;
3323   imull(result, next);
3324   // Loop fission to front-load the cost of fetching from memory; OOO execution
3325   // can then hopefully do a better job of prefetching.
3326   for (int idx = 0; idx < 4; idx++) {
3327     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3328   }
3329   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3330   for (int idx = 0; idx < 4; idx++) {
3331     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3332     arrays_hashcode_elvcast(vtmp[idx], eltype);
3333     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3334   }
3335   // index += 32;
3336   addl(index, 32);
3337   // index < bound;
3338   cmpl(index, bound);
3339   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3340   // }
3341 
3342   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3343   subl(cnt1, bound);
3344   // release bound
3345 
3346   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3347   for (int idx = 0; idx < 4; idx++) {
3348     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3349     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3350     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3351   }
3352   // result += vresult.reduceLanes(ADD);
3353   for (int idx = 0; idx < 4; idx++) {
3354     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3355   }
3356 
3357   // } else if (cnt1 < 32) {
3358 
3359   bind(SHORT_UNROLLED_BEGIN);
3360   // int i = 1;
3361   movl(index, 1);
3362   cmpl(index, cnt1);
3363   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3364 
3365   // for (; i < cnt1 ; i += 2) {
3366   bind(SHORT_UNROLLED_LOOP_BEGIN);
3367   movl(tmp3, 961);
3368   imull(result, tmp3);
3369   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3370   movl(tmp3, tmp2);
3371   shll(tmp3, 5);
3372   subl(tmp3, tmp2);
3373   addl(result, tmp3);
3374   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3375   addl(result, tmp3);
3376   addl(index, 2);
3377   cmpl(index, cnt1);
3378   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
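  // Each iteration above computes result = result*31*31 + a[i-1]*31 + a[i]
  // (961 == 31*31, and x*31 is formed as (x << 5) - x).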
3379 
3380   // }
3381   // if (i >= cnt1) {
3382   bind(SHORT_UNROLLED_LOOP_EXIT);
3383   jccb(Assembler::greater, END);
3384   movl(tmp2, result);
3385   shll(result, 5);
3386   subl(result, tmp2);
3387   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3388   addl(result, tmp3);
3389   // }
3390   bind(END);
3391 
3392   BLOCK_COMMENT("} // arrays_hashcode");
3393 
3394 } // arrays_hashcode
3395 
3396 // helper function for string_compare
3397 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3398                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3399                                            Address::ScaleFactor scale2, Register index, int ae) {
3400   if (ae == StrIntrinsicNode::LL) {
3401     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3402     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3403   } else if (ae == StrIntrinsicNode::UU) {
3404     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3405     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3406   } else {
3407     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3408     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3409   }
3410 }
3411 
3412 // Compare strings, used for char[] and byte[].
3413 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3414                                        Register cnt1, Register cnt2, Register result,
3415                                        XMMRegister vec1, int ae, KRegister mask) {
3416   ShortBranchVerifier sbv(this);
3417   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3418   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3419   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3420   int stride2x2 = 0x40;
3421   Address::ScaleFactor scale = Address::no_scale;
3422   Address::ScaleFactor scale1 = Address::no_scale;
3423   Address::ScaleFactor scale2 = Address::no_scale;
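  // Java-level sketch of the result (illustrative, String.compareTo-style):
  //   int lim = Math.min(cnt1, cnt2);
  //   for (int k = 0; k < lim; k++) {
  //     if (str1[k] != str2[k]) return str1[k] - str2[k];
  //   }
  //   return cnt1 - cnt2;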
3424 
3425   if (ae != StrIntrinsicNode::LL) {
3426     stride2x2 = 0x20;
3427   }
3428 
3429   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3430     shrl(cnt2, 1);
3431   }
3432   // Compute the minimum of the string lengths, and save the
3433   // difference of the string lengths on the stack.
3434   // Use a conditional move to avoid a branch.
3435   movl(result, cnt1);
3436   subl(cnt1, cnt2);
3437   push(cnt1);
3438   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3439 
3440   // Is the minimum length zero?
3441   testl(cnt2, cnt2);
3442   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3443   if (ae == StrIntrinsicNode::LL) {
3444     // Load first bytes
3445     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3446     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3447   } else if (ae == StrIntrinsicNode::UU) {
3448     // Load first characters
3449     load_unsigned_short(result, Address(str1, 0));
3450     load_unsigned_short(cnt1, Address(str2, 0));
3451   } else {
3452     load_unsigned_byte(result, Address(str1, 0));
3453     load_unsigned_short(cnt1, Address(str2, 0));
3454   }
3455   subl(result, cnt1);
3456   jcc(Assembler::notZero,  POP_LABEL);
3457 
3458   if (ae == StrIntrinsicNode::UU) {
3459     // Divide length by 2 to get number of chars
3460     shrl(cnt2, 1);
3461   }
3462   cmpl(cnt2, 1);
3463   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3464 
3465   // Check if the strings start at the same location, and set up scale and stride
3466   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3467     cmpptr(str1, str2);
3468     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3469     if (ae == StrIntrinsicNode::LL) {
3470       scale = Address::times_1;
3471       stride = 16;
3472     } else {
3473       scale = Address::times_2;
3474       stride = 8;
3475     }
3476   } else {
3477     scale1 = Address::times_1;
3478     scale2 = Address::times_2;
3479     // scale not used
3480     stride = 8;
3481   }
3482 
3483   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3484     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3485     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3486     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3487     Label COMPARE_TAIL_LONG;
3488     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3489 
3490     int pcmpmask = 0x19;
3491     if (ae == StrIntrinsicNode::LL) {
3492       pcmpmask &= ~0x01;
3493     }
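    // pcmpmask 0x19 == 0b011001 (per the SDM imm8 encoding): bits[1:0] = 01
    // unsigned words, bits[3:2] = 10 "equal each", bits[5:4] = 01 negative
    // polarity; clearing bit 0 switches the element format to unsigned bytes.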
3494 
3495     // Set up to compare 16-char (32-byte) vectors,
3496     // starting from the first character again because it has an aligned address.
3497     if (ae == StrIntrinsicNode::LL) {
3498       stride2 = 32;
3499     } else {
3500       stride2 = 16;
3501     }
3502     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3503       adr_stride = stride << scale;
3504     } else {
3505       adr_stride1 = 8;  //stride << scale1;
3506       adr_stride2 = 16; //stride << scale2;
3507     }
3508 
3509     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3510     // rax and rdx are used by pcmpestri as element counters
3511     movl(result, cnt2);
3512     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3513     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3514 
3515     // Fast path: compare the first two 8-char vectors.
3516     bind(COMPARE_16_CHARS);
3517     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3518       movdqu(vec1, Address(str1, 0));
3519     } else {
3520       pmovzxbw(vec1, Address(str1, 0));
3521     }
3522     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3523     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3524 
3525     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3526       movdqu(vec1, Address(str1, adr_stride));
3527       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3528     } else {
3529       pmovzxbw(vec1, Address(str1, adr_stride1));
3530       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3531     }
3532     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3533     addl(cnt1, stride);
3534 
3535     // Compare the characters at index in cnt1
3536     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3537     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3538     subl(result, cnt2);
3539     jmp(POP_LABEL);
3540 
3541     // Setup the registers to start vector comparison loop
3542     bind(COMPARE_WIDE_VECTORS);
3543     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3544       lea(str1, Address(str1, result, scale));
3545       lea(str2, Address(str2, result, scale));
3546     } else {
3547       lea(str1, Address(str1, result, scale1));
3548       lea(str2, Address(str2, result, scale2));
3549     }
3550     subl(result, stride2);
3551     subl(cnt2, stride2);
3552     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3553     negptr(result);
3554 
3555     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
3556     bind(COMPARE_WIDE_VECTORS_LOOP);
3557 
3558 #ifdef _LP64
3559     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3560       cmpl(cnt2, stride2x2);
3561       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3562       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3563       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3564 
3565       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3566       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3567         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3568         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3569       } else {
3570         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3571         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3572       }
3573       kortestql(mask, mask);
3574       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3575       addptr(result, stride2x2);  // update since we already compared at this addr
3576       subl(cnt2, stride2x2);      // and sub the size too
3577       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3578 
3579       vpxor(vec1, vec1);
3580       jmpb(COMPARE_WIDE_TAIL);
3581     }//if (VM_Version::supports_avx512vlbw())
3582 #endif // _LP64
3583 
3584 
3585     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3586     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3587       vmovdqu(vec1, Address(str1, result, scale));
3588       vpxor(vec1, Address(str2, result, scale));
3589     } else {
3590       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3591       vpxor(vec1, Address(str2, result, scale2));
3592     }
3593     vptest(vec1, vec1);
3594     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3595     addptr(result, stride2);
3596     subl(cnt2, stride2);
3597     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3598     // clean upper bits of YMM registers
3599     vpxor(vec1, vec1);
3600 
3601     // compare wide vectors tail
3602     bind(COMPARE_WIDE_TAIL);
3603     testptr(result, result);
3604     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3605 
3606     movl(result, stride2);
3607     movl(cnt2, result);
3608     negptr(result);
3609     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3610 
3611     // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3612     bind(VECTOR_NOT_EQUAL);
3613     // clean upper bits of YMM registers
3614     vpxor(vec1, vec1);
3615     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3616       lea(str1, Address(str1, result, scale));
3617       lea(str2, Address(str2, result, scale));
3618     } else {
3619       lea(str1, Address(str1, result, scale1));
3620       lea(str2, Address(str2, result, scale2));
3621     }
3622     jmp(COMPARE_16_CHARS);
3623 
3624     // Compare tail chars, length between 1 and 15 chars
3625     bind(COMPARE_TAIL_LONG);
3626     movl(cnt2, result);
3627     cmpl(cnt2, stride);
3628     jcc(Assembler::less, COMPARE_SMALL_STR);
3629 
3630     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3631       movdqu(vec1, Address(str1, 0));
3632     } else {
3633       pmovzxbw(vec1, Address(str1, 0));
3634     }
3635     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3636     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3637     subptr(cnt2, stride);
3638     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3639     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3640       lea(str1, Address(str1, result, scale));
3641       lea(str2, Address(str2, result, scale));
3642     } else {
3643       lea(str1, Address(str1, result, scale1));
3644       lea(str2, Address(str2, result, scale2));
3645     }
3646     negptr(cnt2);
3647     jmpb(WHILE_HEAD_LABEL);
3648 
3649     bind(COMPARE_SMALL_STR);
3650   } else if (UseSSE42Intrinsics) {
3651     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3652     int pcmpmask = 0x19;
3653     // Set up to compare 8-char (16-byte) vectors,
3654     // starting from the first character again because it has an aligned address.
3655     movl(result, cnt2);
3656     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3657     if (ae == StrIntrinsicNode::LL) {
3658       pcmpmask &= ~0x01;
3659     }
3660     jcc(Assembler::zero, COMPARE_TAIL);
3661     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3662       lea(str1, Address(str1, result, scale));
3663       lea(str2, Address(str2, result, scale));
3664     } else {
3665       lea(str1, Address(str1, result, scale1));
3666       lea(str2, Address(str2, result, scale2));
3667     }
3668     negptr(result);
3669 
3670     // pcmpestri
3671     //   inputs:
3672     //     vec1- substring
3673     //     rax - negative string length (elements count)
3674     //     mem - scanned string
3675     //     rdx - string length (elements count)
3676     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3677     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3678     //   outputs:
3679     //     rcx - first mismatched element index
3680     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3681 
3682     bind(COMPARE_WIDE_VECTORS);
3683     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3684       movdqu(vec1, Address(str1, result, scale));
3685       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3686     } else {
3687       pmovzxbw(vec1, Address(str1, result, scale1));
3688       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3689     }
3690     // After pcmpestri cnt1(rcx) contains mismatched element index
3691 
3692     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3693     addptr(result, stride);
3694     subptr(cnt2, stride);
3695     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3696 
3697     // compare wide vectors tail
3698     testptr(result, result);
3699     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3700 
3701     movl(cnt2, stride);
3702     movl(result, stride);
3703     negptr(result);
3704     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3705       movdqu(vec1, Address(str1, result, scale));
3706       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3707     } else {
3708       pmovzxbw(vec1, Address(str1, result, scale1));
3709       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3710     }
3711     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3712 
3713     // Mismatched characters in the vectors
3714     bind(VECTOR_NOT_EQUAL);
3715     addptr(cnt1, result);
3716     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3717     subl(result, cnt2);
3718     jmpb(POP_LABEL);
3719 
3720     bind(COMPARE_TAIL); // limit is zero
3721     movl(cnt2, result);
3722     // Fallthru to tail compare
3723   }
3724   // Shift str2 and str1 to the end of the arrays, negate min
3725   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3726     lea(str1, Address(str1, cnt2, scale));
3727     lea(str2, Address(str2, cnt2, scale));
3728   } else {
3729     lea(str1, Address(str1, cnt2, scale1));
3730     lea(str2, Address(str2, cnt2, scale2));
3731   }
3732   decrementl(cnt2);  // first character was compared already
3733   negptr(cnt2);
3734 
3735   // Compare the rest of the elements
3736   bind(WHILE_HEAD_LABEL);
3737   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3738   subl(result, cnt1);
3739   jccb(Assembler::notZero, POP_LABEL);
3740   increment(cnt2);
3741   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3742 
3743   // Strings are equal up to min length.  Return the length difference.
3744   bind(LENGTH_DIFF_LABEL);
3745   pop(result);
3746   if (ae == StrIntrinsicNode::UU) {
3747     // Divide diff by 2 to get number of chars
3748     sarl(result, 1);
3749   }
3750   jmpb(DONE_LABEL);
3751 
3752 #ifdef _LP64
3753   if (VM_Version::supports_avx512vlbw()) {
3754 
3755     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3756 
3757     kmovql(cnt1, mask);
3758     notq(cnt1);
3759     bsfq(cnt2, cnt1);
3760     if (ae != StrIntrinsicNode::LL) {
3761       // Divide diff by 2 to get number of chars
3762       sarl(cnt2, 1);
3763     }
3764     addq(result, cnt2);
3765     if (ae == StrIntrinsicNode::LL) {
3766       load_unsigned_byte(cnt1, Address(str2, result));
3767       load_unsigned_byte(result, Address(str1, result));
3768     } else if (ae == StrIntrinsicNode::UU) {
3769       load_unsigned_short(cnt1, Address(str2, result, scale));
3770       load_unsigned_short(result, Address(str1, result, scale));
3771     } else {
3772       load_unsigned_short(cnt1, Address(str2, result, scale2));
3773       load_unsigned_byte(result, Address(str1, result, scale1));
3774     }
3775     subl(result, cnt1);
3776     jmpb(POP_LABEL);
3777   }//if (VM_Version::supports_avx512vlbw())
3778 #endif // _LP64
3779 
3780   // Discard the stored length difference
3781   bind(POP_LABEL);
3782   pop(cnt1);
3783 
3784   // That's it
3785   bind(DONE_LABEL);
3786   if(ae == StrIntrinsicNode::UL) {
3787     negl(result);
3788   }
3789 
3790 }
3791 
3792 // Search for Non-ASCII character (Negative byte value) in a byte array,
3793 // return the index of the first such character, otherwise the length
3794 // of the array segment searched.
3795 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3796 //   @IntrinsicCandidate
3797 //   public static int countPositives(byte[] ba, int off, int len) {
3798 //     for (int i = off; i < off + len; i++) {
3799 //       if (ba[i] < 0) {
3800 //         return i - off;
3801 //       }
3802 //     }
3803 //     return len;
3804 //   }
3805 void C2_MacroAssembler::count_positives(Register ary1, Register len,
3806   Register result, Register tmp1,
3807   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
3808   // rsi: byte array
3809   // rcx: len
3810   // rax: result
3811   ShortBranchVerifier sbv(this);
3812   assert_different_registers(ary1, len, result, tmp1);
3813   assert_different_registers(vec1, vec2);
3814   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
3815 
3816   movl(result, len); // copy
3817   // len == 0
3818   testl(len, len);
3819   jcc(Assembler::zero, DONE);
3820 
3821   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
3822     VM_Version::supports_avx512vlbw() &&
3823     VM_Version::supports_bmi2()) {
3824 
3825     Label test_64_loop, test_tail, BREAK_LOOP;
3826     Register tmp3_aliased = len;
3827 
3828     movl(tmp1, len);
3829     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
3830 
3831     andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
3832     andl(len, ~(64 - 1));    // vector count (in chars)
3833     jccb(Assembler::zero, test_tail);
3834 
3835     lea(ary1, Address(ary1, len, Address::times_1));
3836     negptr(len);
3837 
3838     bind(test_64_loop);
3839     // Check whether our 64 elements of size byte contain negatives
3840     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
3841     kortestql(mask1, mask1);
3842     jcc(Assembler::notZero, BREAK_LOOP);
3843 
3844     addptr(len, 64);
3845     jccb(Assembler::notZero, test_64_loop);
3846 
3847     bind(test_tail);
3848     // bail out when there is nothing to be done
3849     testl(tmp1, -1);
3850     jcc(Assembler::zero, DONE);
3851 
3852     // ~(~0 << len), i.e. a mask with 'len' low bits set (built in up to two steps in the 32-bit scenario)
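    // e.g. tmp1 == 3 yields the mask 0b111, selecting the 3 tail bytes.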
3853 #ifdef _LP64
3854     mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
3855     shlxq(tmp3_aliased, tmp3_aliased, tmp1);
3856     notq(tmp3_aliased);
3857     kmovql(mask2, tmp3_aliased);
3858 #else
3859     Label k_init;
3860     jmp(k_init);
3861 
3862     // We cannot read 64 bits into a general purpose register on 32-bit
3863     // platforms, so we emit the data required to compose 64 1's into the
3864     // instruction stream: a 64-byte series of the values 0..63, used below
3865     // as the compare target against the tail count contained in the tmp1
3866     // register. The result is a k register holding tmp1 consecutive 1's,
3867     // counting from the least significant bit.
3868     address tmp = pc();
3869     emit_int64(0x0706050403020100);
3870     emit_int64(0x0F0E0D0C0B0A0908);
3871     emit_int64(0x1716151413121110);
3872     emit_int64(0x1F1E1D1C1B1A1918);
3873     emit_int64(0x2726252423222120);
3874     emit_int64(0x2F2E2D2C2B2A2928);
3875     emit_int64(0x3736353433323130);
3876     emit_int64(0x3F3E3D3C3B3A3938);
3877 
3878     bind(k_init);
3879     lea(len, InternalAddress(tmp));
3880     // create mask to test for negative byte inside a vector
3881     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
3882     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
3883 
3884 #endif
3885     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
3886     ktestq(mask1, mask2);
3887     jcc(Assembler::zero, DONE);
3888 
3889     bind(BREAK_LOOP);
3890     // At least one byte in the last 64 bytes is negative.
3891     // Set up to look at the last 64 bytes as if they were a tail
3892     lea(ary1, Address(ary1, len, Address::times_1));
3893     addptr(result, len);
3894     // Ignore the very last byte: if all others are positive,
3895     // it must be negative, so we can skip right to the 2+1 byte
3896     // end comparison at this point
3897     orl(result, 63);
3898     movl(len, 63);
3899     // Fallthru to tail compare
3900   } else {
3901 
3902     if (UseAVX >= 2 && UseSSE >= 2) {
3903       // With AVX2, use 32-byte vector compare
3904       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
3905 
3906       // Compare 32-byte vectors
3907       testl(len, 0xffffffe0);   // vector count (in bytes)
3908       jccb(Assembler::zero, TAIL_START);
3909 
3910       andl(len, 0xffffffe0);
3911       lea(ary1, Address(ary1, len, Address::times_1));
3912       negptr(len);
3913 
3914       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
3915       movdl(vec2, tmp1);
3916       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
3917 
3918       bind(COMPARE_WIDE_VECTORS);
3919       vmovdqu(vec1, Address(ary1, len, Address::times_1));
3920       vptest(vec1, vec2);
3921       jccb(Assembler::notZero, BREAK_LOOP);
3922       addptr(len, 32);
3923       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3924 
3925       testl(result, 0x0000001f);   // any bytes remaining?
3926       jcc(Assembler::zero, DONE);
3927 
3928       // Quick test using the already prepared vector mask
3929       movl(len, result);
3930       andl(len, 0x0000001f);
3931       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
3932       vptest(vec1, vec2);
3933       jcc(Assembler::zero, DONE);
3934       // There are negative bytes; jump to the tail to determine exactly where
3935       jmpb(TAIL_START);
3936 
3937       bind(BREAK_LOOP);
3938       // At least one byte in the last 32-byte vector is negative.
3939       // Set up to look at the last 32 bytes as if they were a tail
3940       lea(ary1, Address(ary1, len, Address::times_1));
3941       addptr(result, len);
3942       // Ignore the very last byte: if all others are positive,
3943       // it must be negative, so we can skip right to the 2+1 byte
3944       // end comparison at this point
3945       orl(result, 31);
3946       movl(len, 31);
3947       // Fallthru to tail compare
3948     } else if (UseSSE42Intrinsics) {
3949       // With SSE4.2, use double quad vector compare
3950       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
3951 
3952       // Compare 16-byte vectors
3953       testl(len, 0xfffffff0);   // vector count (in bytes)
3954       jcc(Assembler::zero, TAIL_START);
3955 
3956       andl(len, 0xfffffff0);
3957       lea(ary1, Address(ary1, len, Address::times_1));
3958       negptr(len);
3959 
3960       movl(tmp1, 0x80808080);
3961       movdl(vec2, tmp1);
3962       pshufd(vec2, vec2, 0);
3963 
3964       bind(COMPARE_WIDE_VECTORS);
3965       movdqu(vec1, Address(ary1, len, Address::times_1));
3966       ptest(vec1, vec2);
3967       jccb(Assembler::notZero, BREAK_LOOP);
3968       addptr(len, 16);
3969       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3970 
3971       testl(result, 0x0000000f); // len is zero, any bytes remaining?
3972       jcc(Assembler::zero, DONE);
3973 
3974       // Quick test using the already prepared vector mask
3975       movl(len, result);
3976       andl(len, 0x0000000f);   // tail count (in bytes)
3977       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
3978       ptest(vec1, vec2);
3979       jcc(Assembler::zero, DONE);
3980       jmpb(TAIL_START);
3981 
3982       bind(BREAK_LOOP);
3983       // At least one byte in the last 16-byte vector is negative.
3984       // Set up and look at the last 16 bytes as if they were a tail
3985       lea(ary1, Address(ary1, len, Address::times_1));
3986       addptr(result, len);
3987       // Ignore the very last byte: if all others are positive,
3988       // it must be negative, so we can skip right to the 2+1 byte
3989       // end comparison at this point
3990       orl(result, 15);
3991       movl(len, 15);
3992       // Fallthru to tail compare
3993     }
3994   }
3995 
3996   bind(TAIL_START);
3997   // Compare 4-byte vectors
3998   andl(len, 0xfffffffc); // vector count (in bytes)
3999   jccb(Assembler::zero, COMPARE_CHAR);
4000 
4001   lea(ary1, Address(ary1, len, Address::times_1));
4002   negptr(len);
4003 
4004   bind(COMPARE_VECTORS);
4005   movl(tmp1, Address(ary1, len, Address::times_1));
4006   andl(tmp1, 0x80808080);
4007   jccb(Assembler::notZero, TAIL_ADJUST);
4008   addptr(len, 4);
4009   jccb(Assembler::notZero, COMPARE_VECTORS);
4010 
4011   // Compare trailing char (final 2-3 bytes), if any
4012   bind(COMPARE_CHAR);
4013 
4014   testl(result, 0x2);   // tail  char
4015   jccb(Assembler::zero, COMPARE_BYTE);
4016   load_unsigned_short(tmp1, Address(ary1, 0));
4017   andl(tmp1, 0x00008080);
4018   jccb(Assembler::notZero, CHAR_ADJUST);
4019   lea(ary1, Address(ary1, 2));
4020 
4021   bind(COMPARE_BYTE);
4022   testl(result, 0x1);   // tail  byte
4023   jccb(Assembler::zero, DONE);
4024   load_unsigned_byte(tmp1, Address(ary1, 0));
4025   testl(tmp1, 0x00000080);
4026   jccb(Assembler::zero, DONE);
4027   subptr(result, 1);
4028   jmpb(DONE);
4029 
4030   bind(TAIL_ADJUST);
4031   // There is a byte with its sign bit set in the last 4-byte block.
4032   // Adjust result and check the next three bytes.
4033   addptr(result, len);
4034   orl(result, 3);
4035   lea(ary1, Address(ary1, len, Address::times_1));
4036   jmpb(COMPARE_CHAR);
4037 
4038   bind(CHAR_ADJUST);
4039   // We are looking at a char + optional byte tail, and found that one
4040   // of the bytes in the char is negative. Adjust the result, check the
4041   // first byte and readjust if needed.
4042   andl(result, 0xfffffffc);
4043   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4044   jccb(Assembler::notZero, DONE);
4045   addptr(result, 1);
4046 
4047   // That's it
4048   bind(DONE);
4049   if (UseAVX >= 2 && UseSSE >= 2) {
4050     // clean upper bits of YMM registers
4051     vpxor(vec1, vec1);
4052     vpxor(vec2, vec2);
4053   }
4054 }
4055 
4056 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
4057 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4058                                       Register limit, Register result, Register chr,
4059                                       XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
4060   ShortBranchVerifier sbv(this);
4061   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4062 
4063   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4064   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
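  // Java-level sketch (illustrative): for is_array_equ this computes
  // java.util.Arrays.equals(ary1, ary2); otherwise it compares 'limit'
  // elements starting at the two given addresses (substring case).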
4065 
4066   if (is_array_equ) {
4067     // Check the input args
4068     cmpoop(ary1, ary2);
4069     jcc(Assembler::equal, TRUE_LABEL);
4070 
4071     // Need additional checks for arrays_equals.
4072     testptr(ary1, ary1);
4073     jcc(Assembler::zero, FALSE_LABEL);
4074     testptr(ary2, ary2);
4075     jcc(Assembler::zero, FALSE_LABEL);
4076 
4077     // Check the lengths
4078     movl(limit, Address(ary1, length_offset));
4079     cmpl(limit, Address(ary2, length_offset));
4080     jcc(Assembler::notEqual, FALSE_LABEL);
4081   }
4082 
4083   // count == 0
4084   testl(limit, limit);
4085   jcc(Assembler::zero, TRUE_LABEL);
4086 
4087   if (is_array_equ) {
4088     // Load array address
4089     lea(ary1, Address(ary1, base_offset));
4090     lea(ary2, Address(ary2, base_offset));
4091   }
4092 
4093   if (is_array_equ && is_char) {
4094     // arrays_equals when used for char[].
4095     shll(limit, 1);      // byte count != 0
4096   }
4097   movl(result, limit); // copy
4098 
4099   if (UseAVX >= 2) {
4100     // With AVX2, use 32-byte vector compare
4101     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4102 
4103     // Compare 32-byte vectors
4104     andl(result, 0x0000001f);  //   tail count (in bytes)
4105     andl(limit, 0xffffffe0);   // vector count (in bytes)
4106     jcc(Assembler::zero, COMPARE_TAIL);
4107 
4108     lea(ary1, Address(ary1, limit, Address::times_1));
4109     lea(ary2, Address(ary2, limit, Address::times_1));
4110     negptr(limit);
4111 
4112 #ifdef _LP64
    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // try a 64-byte fast loop
4114       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4115 
4116       cmpl(limit, -64);
4117       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4118 
4119       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4120 
4121       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4122       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4123       kortestql(mask, mask);
4124       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4125       addptr(limit, 64);  // update since we already compared at this addr
4126       cmpl(limit, -64);
4127       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4128 
4129       // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instructions and just continue via the non-wide path:
4131       //  cmpl(limit, 0);
4132       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
4133       // But since we stopped at the points ary{1,2}+limit which are
4134       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4135       // (|limit| <= 32 and result < 32),
4136       // we may just compare the last 64 bytes.
4137       //
      addptr(result, -64);   // it is safe, because we just came from this area
4139       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4140       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4141       kortestql(mask, mask);
4142       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4143 
4144       jmp(TRUE_LABEL);
4145 
4146       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4147 
    } // if (VM_Version::supports_avx512vlbw())
4149 #endif //_LP64
4150     bind(COMPARE_WIDE_VECTORS);
4151     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
4152     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4153     vpxor(vec1, vec2);
4154 
4155     vptest(vec1, vec1);
4156     jcc(Assembler::notZero, FALSE_LABEL);
4157     addptr(limit, 32);
4158     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4159 
4160     testl(result, result);
4161     jcc(Assembler::zero, TRUE_LABEL);
4162 
4163     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
4164     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4165     vpxor(vec1, vec2);
4166 
4167     vptest(vec1, vec1);
4168     jccb(Assembler::notZero, FALSE_LABEL);
4169     jmpb(TRUE_LABEL);
4170 
4171     bind(COMPARE_TAIL); // limit is zero
4172     movl(limit, result);
4173     // Fallthru to tail compare
4174   } else if (UseSSE42Intrinsics) {
4175     // With SSE4.2, use double quad vector compare
4176     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4177 
4178     // Compare 16-byte vectors
4179     andl(result, 0x0000000f);  //   tail count (in bytes)
4180     andl(limit, 0xfffffff0);   // vector count (in bytes)
4181     jcc(Assembler::zero, COMPARE_TAIL);
4182 
4183     lea(ary1, Address(ary1, limit, Address::times_1));
4184     lea(ary2, Address(ary2, limit, Address::times_1));
4185     negptr(limit);
4186 
4187     bind(COMPARE_WIDE_VECTORS);
4188     movdqu(vec1, Address(ary1, limit, Address::times_1));
4189     movdqu(vec2, Address(ary2, limit, Address::times_1));
4190     pxor(vec1, vec2);
4191 
4192     ptest(vec1, vec1);
4193     jcc(Assembler::notZero, FALSE_LABEL);
4194     addptr(limit, 16);
4195     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4196 
4197     testl(result, result);
4198     jcc(Assembler::zero, TRUE_LABEL);
4199 
4200     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4201     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4202     pxor(vec1, vec2);
4203 
4204     ptest(vec1, vec1);
4205     jccb(Assembler::notZero, FALSE_LABEL);
4206     jmpb(TRUE_LABEL);
4207 
4208     bind(COMPARE_TAIL); // limit is zero
4209     movl(limit, result);
4210     // Fallthru to tail compare
4211   }
4212 
4213   // Compare 4-byte vectors
4214   andl(limit, 0xfffffffc); // vector count (in bytes)
4215   jccb(Assembler::zero, COMPARE_CHAR);
4216 
4217   lea(ary1, Address(ary1, limit, Address::times_1));
4218   lea(ary2, Address(ary2, limit, Address::times_1));
4219   negptr(limit);
4220 
4221   bind(COMPARE_VECTORS);
4222   movl(chr, Address(ary1, limit, Address::times_1));
4223   cmpl(chr, Address(ary2, limit, Address::times_1));
4224   jccb(Assembler::notEqual, FALSE_LABEL);
4225   addptr(limit, 4);
4226   jcc(Assembler::notZero, COMPARE_VECTORS);
4227 
4228   // Compare trailing char (final 2 bytes), if any
4229   bind(COMPARE_CHAR);
4230   testl(result, 0x2);   // tail  char
4231   jccb(Assembler::zero, COMPARE_BYTE);
4232   load_unsigned_short(chr, Address(ary1, 0));
4233   load_unsigned_short(limit, Address(ary2, 0));
4234   cmpl(chr, limit);
4235   jccb(Assembler::notEqual, FALSE_LABEL);
4236 
4237   if (is_array_equ && is_char) {
4238     bind(COMPARE_BYTE);
4239   } else {
4240     lea(ary1, Address(ary1, 2));
4241     lea(ary2, Address(ary2, 2));
4242 
4243     bind(COMPARE_BYTE);
4244     testl(result, 0x1);   // tail  byte
4245     jccb(Assembler::zero, TRUE_LABEL);
4246     load_unsigned_byte(chr, Address(ary1, 0));
4247     load_unsigned_byte(limit, Address(ary2, 0));
4248     cmpl(chr, limit);
4249     jccb(Assembler::notEqual, FALSE_LABEL);
4250   }
4251   bind(TRUE_LABEL);
4252   movl(result, 1);   // return true
4253   jmpb(DONE);
4254 
4255   bind(FALSE_LABEL);
4256   xorl(result, result); // return false
4257 
4258   // That's it
4259   bind(DONE);
4260   if (UseAVX >= 2) {
4261     // clean upper bits of YMM registers
4262     vpxor(vec1, vec1);
4263     vpxor(vec2, vec2);
4264   }
4265 }
4266 
4267 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4268                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4269   switch(ideal_opc) {
4270     case Op_LShiftVS:
4271       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4272     case Op_LShiftVI:
4273       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4274     case Op_LShiftVL:
4275       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4276     case Op_RShiftVS:
4277       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4278     case Op_RShiftVI:
4279       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4280     case Op_RShiftVL:
4281       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4282     case Op_URShiftVS:
4283       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4284     case Op_URShiftVI:
4285       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4286     case Op_URShiftVL:
4287       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4288     case Op_RotateRightV:
4289       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4290     case Op_RotateLeftV:
4291       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4292     default:
4293       fatal("Unsupported masked operation"); break;
4294   }
4295 }
4296 
4297 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4298                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4299                                     bool is_varshift) {
4300   switch (ideal_opc) {
4301     case Op_AddVB:
4302       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4303     case Op_AddVS:
4304       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4305     case Op_AddVI:
4306       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4307     case Op_AddVL:
4308       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4309     case Op_AddVF:
4310       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4311     case Op_AddVD:
4312       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4313     case Op_SubVB:
4314       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4315     case Op_SubVS:
4316       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4317     case Op_SubVI:
4318       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4319     case Op_SubVL:
4320       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4321     case Op_SubVF:
4322       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4323     case Op_SubVD:
4324       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4325     case Op_MulVS:
4326       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4327     case Op_MulVI:
4328       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4329     case Op_MulVL:
4330       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4331     case Op_MulVF:
4332       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4333     case Op_MulVD:
4334       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4335     case Op_DivVF:
4336       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4337     case Op_DivVD:
4338       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4339     case Op_SqrtVF:
4340       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4341     case Op_SqrtVD:
4342       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4343     case Op_AbsVB:
4344       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4345     case Op_AbsVS:
4346       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4347     case Op_AbsVI:
4348       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4349     case Op_AbsVL:
4350       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4351     case Op_FmaVF:
4352       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4353     case Op_FmaVD:
4354       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4355     case Op_VectorRearrange:
4356       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4357     case Op_LShiftVS:
4358       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4359     case Op_LShiftVI:
4360       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4361     case Op_LShiftVL:
4362       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4363     case Op_RShiftVS:
4364       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4365     case Op_RShiftVI:
4366       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4367     case Op_RShiftVL:
4368       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4369     case Op_URShiftVS:
4370       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4371     case Op_URShiftVI:
4372       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4373     case Op_URShiftVL:
4374       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4375     case Op_RotateLeftV:
4376       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4377     case Op_RotateRightV:
4378       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4379     case Op_MaxV:
4380       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4381     case Op_MinV:
4382       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4383     case Op_XorV:
4384       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4385     case Op_OrV:
4386       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4387     case Op_AndV:
4388       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4389     default:
4390       fatal("Unsupported masked operation"); break;
4391   }
4392 }
4393 
4394 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4395                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4396   switch (ideal_opc) {
4397     case Op_AddVB:
4398       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4399     case Op_AddVS:
4400       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4401     case Op_AddVI:
4402       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4403     case Op_AddVL:
4404       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4405     case Op_AddVF:
4406       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4407     case Op_AddVD:
4408       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4409     case Op_SubVB:
4410       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4411     case Op_SubVS:
4412       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4413     case Op_SubVI:
4414       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4415     case Op_SubVL:
4416       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4417     case Op_SubVF:
4418       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4419     case Op_SubVD:
4420       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4421     case Op_MulVS:
4422       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4423     case Op_MulVI:
4424       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4425     case Op_MulVL:
4426       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4427     case Op_MulVF:
4428       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4429     case Op_MulVD:
4430       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4431     case Op_DivVF:
4432       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4433     case Op_DivVD:
4434       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4435     case Op_FmaVF:
4436       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4437     case Op_FmaVD:
4438       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4439     case Op_MaxV:
4440       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4441     case Op_MinV:
4442       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4443     case Op_XorV:
4444       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4445     case Op_OrV:
4446       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4447     case Op_AndV:
4448       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4449     default:
4450       fatal("Unsupported masked operation"); break;
4451   }
4452 }
4453 
4454 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4455                                   KRegister src1, KRegister src2) {
4456   BasicType etype = T_ILLEGAL;
4457   switch(mask_len) {
4458     case 2:
4459     case 4:
4460     case 8:  etype = T_BYTE; break;
4461     case 16: etype = T_SHORT; break;
4462     case 32: etype = T_INT; break;
4463     case 64: etype = T_LONG; break;
4464     default: fatal("Unsupported type"); break;
4465   }
4466   assert(etype != T_ILLEGAL, "");
4467   switch(ideal_opc) {
4468     case Op_AndVMask:
4469       kand(etype, dst, src1, src2); break;
4470     case Op_OrVMask:
4471       kor(etype, dst, src1, src2); break;
4472     case Op_XorVMask:
4473       kxor(etype, dst, src1, src2); break;
4474     default:
4475       fatal("Unsupported masked operation"); break;
4476   }
4477 }
4478 
4479 /*
 * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4481  * If src is NaN, the result is 0.
4482  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4483  * the result is equal to the value of Integer.MIN_VALUE.
4484  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4485  * the result is equal to the value of Integer.MAX_VALUE.
4486  */
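// Hedged per-lane summary of the semantics above (illustrative only; the model
// name is not part of the build, and min_jint/max_jint stand for
// Integer.MIN_VALUE/MAX_VALUE):
//
//   jint f2i_special_model(float f, jint raw /* cvttps2dq lane result */) {
//     if (f != f)                      return 0;        // NaN
//     if (raw == min_jint && f > 0.0f) return max_jint; // +Inf / large +ve
//     return raw;                                       // incl. -Inf -> min_jint
//   }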
4487 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4488                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4489                                                                    Register rscratch, AddressLiteral float_sign_flip,
4490                                                                    int vec_enc) {
4491   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4492   Label done;
4493   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4494   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4495   vptest(xtmp2, xtmp2, vec_enc);
4496   jccb(Assembler::equal, done);
4497 
4498   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4499   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4500 
4501   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4502   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4503   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4504 
  // Recompute the mask for the remaining special values.
4506   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4507   // Extract SRC values corresponding to TRUE mask lanes.
4508   vpand(xtmp4, xtmp2, src, vec_enc);
  // Flip mask bits so that the MSB of MASK lanes corresponding to +ve special
  // values is set.
4511   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4512 
4513   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4514   bind(done);
4515 }
4516 
4517 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4518                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4519                                                                     Register rscratch, AddressLiteral float_sign_flip,
4520                                                                     int vec_enc) {
4521   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4522   Label done;
4523   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4524   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4525   kortestwl(ktmp1, ktmp1);
4526   jccb(Assembler::equal, done);
4527 
4528   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4529   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4530   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4531 
4532   kxorwl(ktmp1, ktmp1, ktmp2);
4533   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4534   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4535   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4536   bind(done);
4537 }
4538 
4539 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4540                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4541                                                                      Register rscratch, AddressLiteral double_sign_flip,
4542                                                                      int vec_enc) {
4543   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4544 
4545   Label done;
4546   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4547   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
4548   kortestwl(ktmp1, ktmp1);
4549   jccb(Assembler::equal, done);
4550 
4551   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4552   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4553   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4554 
4555   kxorwl(ktmp1, ktmp1, ktmp2);
4556   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4557   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4558   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4559   bind(done);
4560 }
4561 
4562 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4563                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4564                                                                      Register rscratch, AddressLiteral float_sign_flip,
4565                                                                      int vec_enc) {
4566   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4567   Label done;
4568   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4569   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4570   kortestwl(ktmp1, ktmp1);
4571   jccb(Assembler::equal, done);
4572 
4573   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4574   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4575   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4576 
4577   kxorwl(ktmp1, ktmp1, ktmp2);
4578   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4579   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4580   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4581   bind(done);
4582 }
4583 
4584 /*
 * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4586  * If src is NaN, the result is 0.
4587  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
4588  * the result is equal to the value of Long.MIN_VALUE.
4589  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
4590  * the result is equal to the value of Long.MAX_VALUE.
4591  */
4592 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4593                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4594                                                                       Register rscratch, AddressLiteral double_sign_flip,
4595                                                                       int vec_enc) {
4596   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4597 
4598   Label done;
4599   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4600   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
4601   kortestwl(ktmp1, ktmp1);
4602   jccb(Assembler::equal, done);
4603 
4604   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4605   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4606   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4607 
4608   kxorwl(ktmp1, ktmp1, ktmp2);
4609   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4610   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4611   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4612   bind(done);
4613 }
4614 
4615 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
4616                                                              XMMRegister xtmp, int index, int vec_enc) {
4617    assert(vec_enc < Assembler::AVX_512bit, "");
4618    if (vec_enc == Assembler::AVX_256bit) {
4619      vextractf128_high(xtmp, src);
4620      vshufps(dst, src, xtmp, index, vec_enc);
4621    } else {
4622      vshufps(dst, src, zero, index, vec_enc);
4623    }
4624 }
4625 
4626 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4627                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
4628                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
4629   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4630 
4631   Label done;
4632   // Compare the destination lanes with float_sign_flip
4633   // value to get mask for all special values.
4634   movdqu(xtmp1, float_sign_flip, rscratch);
4635   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
4636   ptest(xtmp2, xtmp2);
4637   jccb(Assembler::equal, done);
4638 
4639   // Flip float_sign_flip to get max integer value.
4640   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
4641   pxor(xtmp1, xtmp4);
4642 
  // Set destination lanes corresponding to unordered source lanes to zero.
4644   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
4645   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
4646 
  // Shuffle the mask vector and pack the lower double word from each quadword lane.
4648   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
4649   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
4650 
  // Recompute the mask for the remaining special values.
4652   pxor(xtmp2, xtmp3);
4653   // Extract mask corresponding to non-negative source lanes.
4654   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
4655 
  // Shuffle the mask vector and pack the lower double word from each quadword lane.
4657   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
4658   pand(xtmp3, xtmp2);
4659 
  // Replace destination lanes holding the special value (0x80000000) with max int
  // if the corresponding source lane holds a +ve value.
4662   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
4663   bind(done);
4664 }
4665 
4666 
4667 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
4668                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
4669   switch(to_elem_bt) {
4670     case T_SHORT:
4671       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
4672       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
4673       vpackusdw(dst, dst, zero, vec_enc);
4674       if (vec_enc == Assembler::AVX_256bit) {
4675         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
4676       }
4677       break;
4678     case  T_BYTE:
4679       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
4680       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
4681       vpackusdw(dst, dst, zero, vec_enc);
4682       if (vec_enc == Assembler::AVX_256bit) {
4683         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
4684       }
4685       vpackuswb(dst, dst, zero, vec_enc);
4686       break;
4687     default: assert(false, "%s", type2name(to_elem_bt));
4688   }
4689 }
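
// Note on the mask-then-pack idiom above: vpackusdw/vpackuswb saturate, so each
// int lane is first masked down to its low 16 (or 8) bits, which makes the pack
// behave as a plain truncating narrow. Hedged scalar model (illustrative only):
//
//   jshort lane16 = (jshort)(v & 0xFFFF);   // same as Java's (short)v
//   jbyte  lane8  = (jbyte)(v & 0xFF);      // same as Java's (byte)v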
4690 
4691 /*
 * Algorithm for vector D2L and F2I conversions:
 * a) Perform vector D2L/F2I cast.
 * b) Choose the fast path if none of the result vector lanes contains the value
 *    0x80000000. That value signifies that the source could have been any of the
 *    special floating point values (NaN, -Inf, Inf, Max, -Min).
 * c) Set the destination lane to zero if the source lane is NaN.
 * d) Replace 0x80000000 with MaxInt if the source lane contains a +ve value.
4699  */
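
// Hedged scalar sketch of steps a)-d) for F2I (illustrative only; cvtt() stands
// for the truncating convert, which on x86 returns the "integer indefinite"
// value 0x80000000 for NaN and out-of-range inputs):
//
//   void f2i_steps_model(const float* src, jint* dst, int n) {
//     bool any_special = false;
//     for (int i = 0; i < n; i++) {
//       dst[i] = cvtt(src[i]);                          // step a
//       any_special |= (dst[i] == min_jint);
//     }
//     if (!any_special) return;                         // step b: fast path
//     for (int i = 0; i < n; i++) {
//       if (dst[i] != min_jint) continue;
//       if (src[i] != src[i])   dst[i] = 0;             // step c: NaN
//       else if (src[i] > 0.0f) dst[i] = max_jint;      // step d: +ve lane
//     }
//   }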
4700 
4701 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4702                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4703                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
4704   int to_elem_sz = type2aelembytes(to_elem_bt);
4705   assert(to_elem_sz <= 4, "");
4706   vcvttps2dq(dst, src, vec_enc);
4707   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
4708   if (to_elem_sz < 4) {
4709     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4710     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
4711   }
4712 }
4713 
4714 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4715                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
4716                                             Register rscratch, int vec_enc) {
4717   int to_elem_sz = type2aelembytes(to_elem_bt);
4718   assert(to_elem_sz <= 4, "");
4719   vcvttps2dq(dst, src, vec_enc);
4720   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
4721   switch(to_elem_bt) {
4722     case T_INT:
4723       break;
4724     case T_SHORT:
4725       evpmovdw(dst, dst, vec_enc);
4726       break;
4727     case T_BYTE:
4728       evpmovdb(dst, dst, vec_enc);
4729       break;
4730     default: assert(false, "%s", type2name(to_elem_bt));
4731   }
4732 }
4733 
4734 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4735                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
4736                                             Register rscratch, int vec_enc) {
4737   evcvttps2qq(dst, src, vec_enc);
4738   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
4739 }
4740 
4741 // Handling for downcasting from double to integer or sub-word types on AVX2.
4742 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4743                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
4744                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
4745   int to_elem_sz = type2aelembytes(to_elem_bt);
4746   assert(to_elem_sz < 8, "");
4747   vcvttpd2dq(dst, src, vec_enc);
4748   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
4749                                               float_sign_flip, vec_enc);
4750   if (to_elem_sz < 4) {
4751     // xtmp4 holds all zero lanes.
4752     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
4753   }
4754 }
4755 
4756 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
4757                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
4758                                             KRegister ktmp2, AddressLiteral sign_flip,
4759                                             Register rscratch, int vec_enc) {
4760   if (VM_Version::supports_avx512dq()) {
4761     evcvttpd2qq(dst, src, vec_enc);
4762     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
4763     switch(to_elem_bt) {
4764       case T_LONG:
4765         break;
4766       case T_INT:
4767         evpmovsqd(dst, dst, vec_enc);
4768         break;
4769       case T_SHORT:
4770         evpmovsqd(dst, dst, vec_enc);
4771         evpmovdw(dst, dst, vec_enc);
4772         break;
4773       case T_BYTE:
4774         evpmovsqd(dst, dst, vec_enc);
4775         evpmovdb(dst, dst, vec_enc);
4776         break;
4777       default: assert(false, "%s", type2name(to_elem_bt));
4778     }
4779   } else {
4780     assert(type2aelembytes(to_elem_bt) <= 4, "");
4781     vcvttpd2dq(dst, src, vec_enc);
4782     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
4783     switch(to_elem_bt) {
4784       case T_INT:
4785         break;
4786       case T_SHORT:
4787         evpmovdw(dst, dst, vec_enc);
4788         break;
4789       case T_BYTE:
4790         evpmovdb(dst, dst, vec_enc);
4791         break;
4792       default: assert(false, "%s", type2name(to_elem_bt));
4793     }
4794   }
4795 }
4796 
4797 #ifdef _LP64
4798 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
4799                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4800                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val+0.5) operation with MXCSR.RC set to round towards -inf,
  // and restore the original MXCSR.RC mode afterwards.
4803   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4804 
4805   mov64(tmp, julong_cast(0.5L));
4806   evpbroadcastq(xtmp1, tmp, vec_enc);
4807   vaddpd(xtmp1, src , xtmp1, vec_enc);
4808   evcvtpd2qq(dst, xtmp1, vec_enc);
4809   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
                                                double_sign_flip, vec_enc);
4811 
4812   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4813 }
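
// Hedged scalar model of the rounding above (illustrative only): with MXCSR.RC
// forced to round-down, the convert realizes a floor(), so the sequence computes
//
//   jlong round_d2l_model(double v) { return (jlong) floor(v + 0.5); }
//
// for in-range values, with the special cases (NaN/infinities/out-of-range)
// fixed up afterwards.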
4814 
4815 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
4816                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4817                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val+0.5) operation with MXCSR.RC set to round towards -inf,
  // and restore the original MXCSR.RC mode afterwards.
4820   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4821 
4822   movl(tmp, jint_cast(0.5));
4823   movq(xtmp1, tmp);
4824   vbroadcastss(xtmp1, xtmp1, vec_enc);
4825   vaddps(xtmp1, src , xtmp1, vec_enc);
4826   vcvtps2dq(dst, xtmp1, vec_enc);
4827   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
4828                                               float_sign_flip, vec_enc);
4829 
4830   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4831 }
4832 
4833 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
4834                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4835                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
  // Perform the floor(val+0.5) operation with MXCSR.RC set to round towards -inf,
  // and restore the original MXCSR.RC mode afterwards.
4838   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4839 
4840   movl(tmp, jint_cast(0.5));
4841   movq(xtmp1, tmp);
4842   vbroadcastss(xtmp1, xtmp1, vec_enc);
4843   vaddps(xtmp1, src , xtmp1, vec_enc);
4844   vcvtps2dq(dst, xtmp1, vec_enc);
4845   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
4846 
4847   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4848 }
4849 #endif // _LP64
4850 
4851 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
4852                                              BasicType from_elem_bt, BasicType to_elem_bt) {
4853   switch (from_elem_bt) {
4854     case T_BYTE:
4855       switch (to_elem_bt) {
4856         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
4857         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
4858         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
4859         default: ShouldNotReachHere();
4860       }
4861       break;
4862     case T_SHORT:
4863       switch (to_elem_bt) {
4864         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
4865         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
4866         default: ShouldNotReachHere();
4867       }
4868       break;
4869     case T_INT:
4870       assert(to_elem_bt == T_LONG, "");
4871       vpmovzxdq(dst, src, vlen_enc);
4872       break;
4873     default:
4874       ShouldNotReachHere();
4875   }
4876 }
4877 
4878 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
4879                                            BasicType from_elem_bt, BasicType to_elem_bt) {
4880   switch (from_elem_bt) {
4881     case T_BYTE:
4882       switch (to_elem_bt) {
4883         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
4884         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
4885         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
4886         default: ShouldNotReachHere();
4887       }
4888       break;
4889     case T_SHORT:
4890       switch (to_elem_bt) {
4891         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
4892         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
4893         default: ShouldNotReachHere();
4894       }
4895       break;
4896     case T_INT:
4897       assert(to_elem_bt == T_LONG, "");
4898       vpmovsxdq(dst, src, vlen_enc);
4899       break;
4900     default:
4901       ShouldNotReachHere();
4902   }
4903 }
4904 
4905 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
4906                                          BasicType dst_bt, BasicType src_bt, int vlen) {
4907   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
4908   assert(vlen_enc != AVX_512bit, "");
4909 
4910   int dst_bt_size = type2aelembytes(dst_bt);
4911   int src_bt_size = type2aelembytes(src_bt);
4912   if (dst_bt_size > src_bt_size) {
4913     switch (dst_bt_size / src_bt_size) {
4914       case 2: vpmovsxbw(dst, src, vlen_enc); break;
4915       case 4: vpmovsxbd(dst, src, vlen_enc); break;
4916       case 8: vpmovsxbq(dst, src, vlen_enc); break;
4917       default: ShouldNotReachHere();
4918     }
4919   } else {
4920     assert(dst_bt_size < src_bt_size, "");
4921     switch (src_bt_size / dst_bt_size) {
4922       case 2: {
4923         if (vlen_enc == AVX_128bit) {
4924           vpacksswb(dst, src, src, vlen_enc);
4925         } else {
4926           vpacksswb(dst, src, src, vlen_enc);
4927           vpermq(dst, dst, 0x08, vlen_enc);
4928         }
4929         break;
4930       }
4931       case 4: {
4932         if (vlen_enc == AVX_128bit) {
4933           vpackssdw(dst, src, src, vlen_enc);
4934           vpacksswb(dst, dst, dst, vlen_enc);
4935         } else {
4936           vpackssdw(dst, src, src, vlen_enc);
4937           vpermq(dst, dst, 0x08, vlen_enc);
4938           vpacksswb(dst, dst, dst, AVX_128bit);
4939         }
4940         break;
4941       }
4942       case 8: {
4943         if (vlen_enc == AVX_128bit) {
4944           vpshufd(dst, src, 0x08, vlen_enc);
4945           vpackssdw(dst, dst, dst, vlen_enc);
4946           vpacksswb(dst, dst, dst, vlen_enc);
4947         } else {
4948           vpshufd(dst, src, 0x08, vlen_enc);
4949           vpermq(dst, dst, 0x08, vlen_enc);
4950           vpackssdw(dst, dst, dst, AVX_128bit);
4951           vpacksswb(dst, dst, dst, AVX_128bit);
4952         }
4953         break;
4954       }
4955       default: ShouldNotReachHere();
4956     }
4957   }
4958 }
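
// Hedged scalar model of the mask cast above (illustrative only): a vector mask
// lane is all-zeros or all-ones, so casting between lane sizes just
// re-materializes the same per-lane booleans at the new lane width:
//
//   void mask_cast_model(const bool* lanes, int n, int8_t* dst8, int64_t* dst64) {
//     for (int i = 0; i < n; i++) {
//       dst8[i]  = lanes[i] ? -1 : 0;   // narrow mask lane
//       dst64[i] = lanes[i] ? -1 : 0;   // wide mask lane
//     }
//   }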
4959 
4960 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
4961                                    bool merge, BasicType bt, int vlen_enc) {
4962   if (bt == T_INT) {
4963     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
4964   } else {
4965     assert(bt == T_LONG, "");
4966     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
4967   }
4968 }
4969 
4970 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
4971                                    bool merge, BasicType bt, int vlen_enc) {
4972   if (bt == T_INT) {
4973     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
4974   } else {
4975     assert(bt == T_LONG, "");
4976     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
4977   }
4978 }
4979 
4980 #ifdef _LP64
4981 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
4982                                                Register rtmp2, XMMRegister xtmp, int mask_len,
4983                                                int vec_enc) {
4984   int index = 0;
4985   int vindex = 0;
4986   mov64(rtmp1, 0x0101010101010101L);
4987   pdepq(rtmp1, src, rtmp1);
4988   if (mask_len > 8) {
4989     movq(rtmp2, src);
4990     vpxor(xtmp, xtmp, xtmp, vec_enc);
4991     movq(xtmp, rtmp1);
4992   }
4993   movq(dst, rtmp1);
4994 
4995   mask_len -= 8;
4996   while (mask_len > 0) {
    assert((mask_len & 0x7) == 0, "mask must be a multiple of 8");
4998     index++;
4999     if ((index % 2) == 0) {
5000       pxor(xtmp, xtmp);
5001     }
5002     mov64(rtmp1, 0x0101010101010101L);
5003     shrq(rtmp2, 8);
5004     pdepq(rtmp1, rtmp2, rtmp1);
5005     pinsrq(xtmp, rtmp1, index % 2);
5006     vindex = index / 2;
5007     if (vindex) {
      // Write the entire 16-byte vector when both 64-bit
      // lanes are updated, to save redundant instructions.
5010       if (index % 2) {
5011         vinsertf128(dst, dst, xtmp, vindex);
5012       }
5013     } else {
5014       vmovdqu(dst, xtmp);
5015     }
5016     mask_len -= 8;
5017   }
5018 }
5019 
5020 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5021   switch(opc) {
5022     case Op_VectorMaskTrueCount:
5023       popcntq(dst, tmp);
5024       break;
5025     case Op_VectorMaskLastTrue:
5026       if (VM_Version::supports_lzcnt()) {
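        // last true = 63 - lzcnt(mask); lzcnt(0) == 64, which yields -1 when no bit is set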
5027         lzcntq(tmp, tmp);
5028         movl(dst, 63);
5029         subl(dst, tmp);
5030       } else {
5031         movl(dst, -1);
5032         bsrq(tmp, tmp);
5033         cmov32(Assembler::notZero, dst, tmp);
5034       }
5035       break;
5036     case Op_VectorMaskFirstTrue:
5037       if (VM_Version::supports_bmi1()) {
5038         if (masklen < 32) {
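          // Set a sentinel bit at position masklen so the count below returns
          // masklen when no mask bit is set.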
5039           orl(tmp, 1 << masklen);
5040           tzcntl(dst, tmp);
5041         } else if (masklen == 32) {
5042           tzcntl(dst, tmp);
5043         } else {
5044           assert(masklen == 64, "");
5045           tzcntq(dst, tmp);
5046         }
5047       } else {
5048         if (masklen < 32) {
5049           orl(tmp, 1 << masklen);
5050           bsfl(dst, tmp);
5051         } else {
5052           assert(masklen == 32 || masklen == 64, "");
5053           movl(dst, masklen);
5054           if (masklen == 32)  {
5055             bsfl(tmp, tmp);
5056           } else {
5057             bsfq(tmp, tmp);
5058           }
5059           cmov32(Assembler::notZero, dst, tmp);
5060         }
5061       }
5062       break;
5063     case Op_VectorMaskToLong:
5064       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5065       break;
5066     default: assert(false, "Unhandled mask operation");
5067   }
5068 }
5069 
5070 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5071                                               int masklen, int masksize, int vec_enc) {
5072   assert(VM_Version::supports_popcnt(), "");
5073 
  if (VM_Version::supports_avx512bw()) {
5075     kmovql(tmp, mask);
5076   } else {
5077     assert(masklen <= 16, "");
5078     kmovwl(tmp, mask);
5079   }
5080 
  // A mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped.
5083   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5084     andq(tmp, (1 << masklen) - 1);
5085   }
5086 
5087   vector_mask_operation_helper(opc, dst, tmp, masklen);
5088 }
5089 
5090 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5091                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5092   assert(vec_enc == AVX_128bit && VM_Version::supports_avx() ||
5093          vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4), "");
5094   assert(VM_Version::supports_popcnt(), "");
5095 
5096   bool need_clip = false;
5097   switch(bt) {
5098     case T_BOOLEAN:
      // While masks of other types contain 0 or -1, boolean masks contain lane values of 0 or 1
5100       vpxor(xtmp, xtmp, xtmp, vec_enc);
5101       vpsubb(xtmp, xtmp, mask, vec_enc);
5102       vpmovmskb(tmp, xtmp, vec_enc);
5103       need_clip = masklen < 16;
5104       break;
5105     case T_BYTE:
5106       vpmovmskb(tmp, mask, vec_enc);
5107       need_clip = masklen < 16;
5108       break;
5109     case T_SHORT:
5110       vpacksswb(xtmp, mask, mask, vec_enc);
5111       if (masklen >= 16) {
5112         vpermpd(xtmp, xtmp, 8, vec_enc);
5113       }
5114       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5115       need_clip = masklen < 16;
5116       break;
5117     case T_INT:
5118     case T_FLOAT:
5119       vmovmskps(tmp, mask, vec_enc);
5120       need_clip = masklen < 4;
5121       break;
5122     case T_LONG:
5123     case T_DOUBLE:
5124       vmovmskpd(tmp, mask, vec_enc);
5125       need_clip = masklen < 2;
5126       break;
5127     default: assert(false, "Unhandled type, %s", type2name(bt));
5128   }
5129 
  // A mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped.
5132   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5133     // need_clip implies masklen < 32
5134     andq(tmp, (1 << masklen) - 1);
5135   }
5136 
5137   vector_mask_operation_helper(opc, dst, tmp, masklen);
5138 }
5139 
5140 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5141                                              Register rtmp2, int mask_len) {
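  // A hedged summary of the trick below (illustrative only): after clipping src
  // to mask_len bits, PEXT gathers one 1-bit from the all-ones source for every
  // set bit of the selector, so dst becomes a mask with popcount(src)
  // contiguous low bits set.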
5142   kmov(rtmp1, src);
5143   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5144   mov64(rtmp2, -1L);
5145   pextq(rtmp2, rtmp2, rtmp1);
5146   kmov(dst, rtmp2);
5147 }
5148 
5149 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5150                                                bool merge, BasicType bt, int vec_enc) {
5151   if (opcode == Op_CompressV) {
5152     switch(bt) {
5153     case T_BYTE:
5154       evpcompressb(dst, mask, src, merge, vec_enc);
5155       break;
5156     case T_CHAR:
5157     case T_SHORT:
5158       evpcompressw(dst, mask, src, merge, vec_enc);
5159       break;
5160     case T_INT:
5161       evpcompressd(dst, mask, src, merge, vec_enc);
5162       break;
5163     case T_FLOAT:
5164       evcompressps(dst, mask, src, merge, vec_enc);
5165       break;
5166     case T_LONG:
5167       evpcompressq(dst, mask, src, merge, vec_enc);
5168       break;
5169     case T_DOUBLE:
5170       evcompresspd(dst, mask, src, merge, vec_enc);
5171       break;
5172     default:
5173       fatal("Unsupported type %s", type2name(bt));
5174       break;
5175     }
5176   } else {
5177     assert(opcode == Op_ExpandV, "");
5178     switch(bt) {
5179     case T_BYTE:
5180       evpexpandb(dst, mask, src, merge, vec_enc);
5181       break;
5182     case T_CHAR:
5183     case T_SHORT:
5184       evpexpandw(dst, mask, src, merge, vec_enc);
5185       break;
5186     case T_INT:
5187       evpexpandd(dst, mask, src, merge, vec_enc);
5188       break;
5189     case T_FLOAT:
5190       evexpandps(dst, mask, src, merge, vec_enc);
5191       break;
5192     case T_LONG:
5193       evpexpandq(dst, mask, src, merge, vec_enc);
5194       break;
5195     case T_DOUBLE:
5196       evexpandpd(dst, mask, src, merge, vec_enc);
5197       break;
5198     default:
5199       fatal("Unsupported type %s", type2name(bt));
5200       break;
5201     }
5202   }
5203 }
5204 #endif
5205 
5206 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5207                                            KRegister ktmp1, int vec_enc) {
5208   if (opcode == Op_SignumVD) {
5209     vsubpd(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5211     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5212     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
    // if src is NaN, -0.0 or 0.0, return src.
5214     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5215     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5216   } else {
5217     assert(opcode == Op_SignumVF, "");
5218     vsubps(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5220     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5221     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
    // if src is NaN, -0.0 or 0.0, return src.
5223     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5224     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5225   }
5226 }
5227 
5228 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5229                                           XMMRegister xtmp1, int vec_enc) {
5230   if (opcode == Op_SignumVD) {
5231     vsubpd(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5233     vblendvpd(dst, one, dst, src, vec_enc);
    // if src is NaN, -0.0 or 0.0, return src.
5235     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5236     vblendvpd(dst, dst, src, xtmp1, vec_enc);
5237   } else {
5238     assert(opcode == Op_SignumVF, "");
5239     vsubps(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
5241     vblendvps(dst, one, dst, src, vec_enc);
    // if src is NaN, -0.0 or 0.0, return src.
5243     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5244     vblendvps(dst, dst, src, xtmp1, vec_enc);
5245   }
5246 }
5247 
5248 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5249   if (VM_Version::supports_avx512bw()) {
5250     if (mask_len > 32) {
5251       kmovql(dst, src);
5252     } else {
5253       kmovdl(dst, src);
5254       if (mask_len != 32) {
5255         kshiftrdl(dst, dst, 32 - mask_len);
5256       }
5257     }
5258   } else {
5259     assert(mask_len <= 16, "");
5260     kmovwl(dst, src);
5261     if (mask_len != 16) {
5262       kshiftrwl(dst, dst, 16 - mask_len);
5263     }
5264   }
5265 }
5266 
5267 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5268   int lane_size = type2aelembytes(bt);
5269   bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
5270   if ((is_LP64 || lane_size < 8) &&
5271       ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5272        (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
5273     movptr(rtmp, imm32);
5274     switch(lane_size) {
5275       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5276       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5277       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5278       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size); break;
5281     }
5282   } else {
5283     movptr(rtmp, imm32);
5284     LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
5285     switch(lane_size) {
5286       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5287       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5288       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5289       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size); break;
5292     }
5293   }
5294 }
5295 
5296 //
// The following is a lookup-table based popcount computation algorithm:
5298 //       Index   Bit set count
5299 //     [ 0000 ->   0,
5300 //       0001 ->   1,
5301 //       0010 ->   1,
5302 //       0011 ->   2,
5303 //       0100 ->   1,
5304 //       0101 ->   2,
5305 //       0110 ->   2,
5306 //       0111 ->   3,
5307 //       1000 ->   1,
5308 //       1001 ->   2,
//       1010 ->   2,
5310 //       1011 ->   3,
5311 //       1100 ->   2,
5312 //       1101 ->   3,
//       1110 ->   3,
//       1111 ->   4 ]
//  a. Count the number of 1s in the 4 LSB bits of each byte. These bits are used
//     as shuffle indices for lookup table access.
//  b. Right shift each byte of the vector lane by 4 positions.
//  c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used
//     as shuffle indices for lookup table access.
//  d. Add the bitset counts of the upper and lower 4 bits of each byte.
//  e. Unpack double words to quad words and compute the sum of absolute
//     differences of the bitset counts of all the bytes of a quadword.
//  f. Perform step e. for the upper 128-bit vector lane.
//  g. Pack the bitset counts of quadwords back to double words.
//  h. Unpacking and packing operations are not needed for 64-bit vector lanes.
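//
// Hedged scalar model of steps a)-d) for a single byte (illustrative only):
//
//   uint8_t popcount_byte_model(uint8_t b) {
//     static const uint8_t lut[16] = {0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4};
//     return lut[b & 0x0F] + lut[b >> 4];   // low-nibble count + high-nibble count
//   }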
5325 
5326 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5327                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5328   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5329   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5330   vpsrlw(dst, src, 4, vec_enc);
5331   vpand(dst, dst, xtmp1, vec_enc);
5332   vpand(xtmp1, src, xtmp1, vec_enc);
5333   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5334   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5335   vpshufb(dst, xtmp2, dst, vec_enc);
5336   vpaddb(dst, dst, xtmp1, vec_enc);
5337 }
5338 
5339 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5340                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5341   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5342   // Following code is as per steps e,f,g and h of above algorithm.
5343   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5344   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5345   vpsadbw(dst, dst, xtmp2, vec_enc);
5346   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5347   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5348   vpackuswb(dst, xtmp1, dst, vec_enc);
5349 }
5350 
5351 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5352                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5353   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5354   // Add the popcount of upper and lower bytes of word.
5355   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5356   vpsrlw(dst, xtmp1, 8, vec_enc);
5357   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5358   vpaddw(dst, dst, xtmp1, vec_enc);
5359 }
5360 
5361 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5362                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5363   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5364   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5365   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5366 }
5367 
5368 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5369                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5370   switch(bt) {
5371     case T_LONG:
5372       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5373       break;
5374     case T_INT:
5375       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5376       break;
5377     case T_CHAR:
5378     case T_SHORT:
5379       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5380       break;
5381     case T_BYTE:
5382     case T_BOOLEAN:
5383       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5384       break;
5385     default:
5386       fatal("Unsupported type %s", type2name(bt));
5387       break;
5388   }
5389 }
5390 
5391 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5392                                                       KRegister mask, bool merge, int vec_enc) {
5393   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5394   switch(bt) {
5395     case T_LONG:
5396       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5397       evpopcntq(dst, mask, src, merge, vec_enc);
5398       break;
5399     case T_INT:
5400       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5401       evpopcntd(dst, mask, src, merge, vec_enc);
5402       break;
5403     case T_CHAR:
5404     case T_SHORT:
5405       assert(VM_Version::supports_avx512_bitalg(), "");
5406       evpopcntw(dst, mask, src, merge, vec_enc);
5407       break;
5408     case T_BYTE:
5409     case T_BOOLEAN:
5410       assert(VM_Version::supports_avx512_bitalg(), "");
5411       evpopcntb(dst, mask, src, merge, vec_enc);
5412       break;
5413     default:
5414       fatal("Unsupported type %s", type2name(bt));
5415       break;
5416   }
5417 }
5418 
5419 #ifndef _LP64
5420 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
5421   assert(VM_Version::supports_avx512bw(), "");
5422   kmovdl(tmp, src);
5423   kunpckdql(dst, tmp, tmp);
5424 }
5425 #endif
5426 
5427 // The bit reversal algorithm first reverses the bits of each byte and then
5428 // performs a byte-level reversal for multi-byte primitive types (short/int/long).
5429 // The algorithm uses a lookup table to obtain the reversed bit sequence
5430 // corresponding to each 4-bit value. The reversed bit sequence of a byte is
5431 // thus obtained by swapping the reversed bit sequences of the upper and
5432 // lower nibbles of the byte.
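//
// Illustration only (not emitted code): a scalar sketch of the per-byte step,
// assuming RLUT[n] holds the 4-bit reversal of the nibble n:
//   static const uint8_t RLUT[16] = {0x0, 0x8, 0x4, 0xC, 0x2, 0xA, 0x6, 0xE,
//                                    0x1, 0x9, 0x5, 0xD, 0x3, 0xB, 0x7, 0xF};
//   uint8_t reverse8(uint8_t b) {
//     return (uint8_t)((RLUT[b & 0x0F] << 4) | RLUT[b >> 4]);
//   }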
5433 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5434                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
5435   if (VM_Version::supports_avx512vlbw()) {
5436 
5437     // Get the reverse bit sequence of lower nibble of each byte.
5438     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5439     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5440     evpandq(dst, xtmp2, src, vec_enc);
5441     vpshufb(dst, xtmp1, dst, vec_enc);
5442     vpsllq(dst, dst, 4, vec_enc);
5443 
5444     // Get the reverse bit sequence of upper nibble of each byte.
5445     vpandn(xtmp2, xtmp2, src, vec_enc);
5446     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5447     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5448 
5449     // OR the left-shifted reversed bit sequence of the lower nibble with the right-shifted
5450     // reversed bit sequence of the upper nibble to obtain the reversed bit sequence of each byte.
5451     evporq(xtmp2, dst, xtmp2, vec_enc);
5452     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5453 
5454   } else if (vec_enc == Assembler::AVX_512bit) {
5455     // Shift based bit reversal.
5456     assert(bt == T_LONG || bt == T_INT, "");
5457 
5458     // Swap lower and upper nibble of each byte.
5459     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
5460 
5461     // Swap two least and most significant bits of each nibble.
5462     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
5463 
5464     // Swap adjacent pair of bits.
5465     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5466     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
5467 
5468     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5469     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
5470   } else {
5471     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
5472     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5473 
5474     // Get the reverse bit sequence of lower nibble of each byte.
5475     vpand(dst, xtmp2, src, vec_enc);
5476     vpshufb(dst, xtmp1, dst, vec_enc);
5477     vpsllq(dst, dst, 4, vec_enc);
5478 
5479     // Get the reverse bit sequence of upper nibble of each byte.
5480     vpandn(xtmp2, xtmp2, src, vec_enc);
5481     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5482     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5483 
5484     // OR the left-shifted reversed bit sequence of the lower nibble with the right-shifted
5485     // reversed bit sequence of the upper nibble to obtain the reversed bit sequence of each byte.
5486     vpor(xtmp2, dst, xtmp2, vec_enc);
5487     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5488   }
5489 }
5490 
5491 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
5492                                                 XMMRegister xtmp, Register rscratch) {
5493   assert(VM_Version::supports_gfni(), "");
5494   assert(rscratch != noreg || always_reachable(mask), "missing");
5495 
5496   // Galois field instruction based bit reversal, based on the following algorithm:
5497   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
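  // Illustration only (not emitted code): GF2P8AFFINEQB forms each destination
  // bit as the parity of (matrix row AND source byte); with an anti-diagonal
  // matrix such as 0x8040201008040201 (the constant used by reverseI/reverseL
  // below) this reduces to reversing the bits of every byte. A scalar sketch,
  // glossing over the instruction's exact bit-ordering convention:
  //   uint8_t affine8(uint64_t matrix, uint8_t b) {
  //     uint8_t r = 0;
  //     for (int i = 0; i < 8; i++) {
  //       uint8_t row = (uint8_t)(matrix >> (8 * i));
  //       r |= (uint8_t)(__builtin_parity(row & b) << i);
  //     }
  //     return r;
  //   }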
5498   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
5499   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
5500   vector_reverse_byte(bt, dst, xtmp, vec_enc);
5501 }
5502 
5503 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
5504                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
5505   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
5506   evpandq(dst, xtmp1, src, vec_enc);
5507   vpsllq(dst, dst, nbits, vec_enc);
5508   vpandn(xtmp1, xtmp1, src, vec_enc);
5509   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
5510   evporq(dst, dst, xtmp1, vec_enc);
5511 }
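// Illustration only (not emitted code): a scalar equivalent of the swap above;
// e.g. nbits == 4 with bitmask 0x0F0F0F0F swaps the nibbles of every byte:
//   uint64_t swap_nbits(uint64_t x, int nbits, uint64_t mask) {
//     return ((x & mask) << nbits) | ((x & ~mask) >> nbits);
//   }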
5512 
5513 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5514                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5515   // Shift based bit reversal.
5516   assert(VM_Version::supports_evex(), "");
5517   switch(bt) {
5518     case T_LONG:
5519       // Swap upper and lower double word of each quad word.
5520       evprorq(xtmp1, k0, src, 32, true, vec_enc);
5521       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
5522       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5523       break;
5524     case T_INT:
5525       // Swap upper and lower word of each double word.
5526       evprord(xtmp1, k0, src, 16, true, vec_enc);
5527       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5528       break;
5529     case T_CHAR:
5530     case T_SHORT:
5531       // Swap upper and lower byte of each word.
5532       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
5533       break;
5534     case T_BYTE:
5535       evmovdquq(dst, k0, src, true, vec_enc);
5536       break;
5537     default:
5538       fatal("Unsupported type %s", type2name(bt));
5539       break;
5540   }
5541 }
5542 
5543 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5544   if (bt == T_BYTE) {
5545     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
5546       evmovdquq(dst, k0, src, true, vec_enc);
5547     } else {
5548       vmovdqu(dst, src);
5549     }
5550     return;
5551   }
5552   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
5553   // pre-computed shuffle indices.
5554   switch(bt) {
5555     case T_LONG:
5556       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
5557       break;
5558     case T_INT:
5559       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
5560       break;
5561     case T_CHAR:
5562     case T_SHORT:
5563       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
5564       break;
5565     default:
5566       fatal("Unsupported type %s", type2name(bt));
5567       break;
5568   }
5569   vpshufb(dst, src, dst, vec_enc);
5570 }
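// Illustration only (not emitted code): for T_INT the shuffle reverses the
// bytes within each 4-byte element; an assumed index layout for one 128-bit
// lane of the permutation mask would be:
//   const uint8_t perm_int[16] = { 3,  2,  1,  0,  7,  6,  5,  4,
//                                 11, 10,  9,  8, 15, 14, 13, 12};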
5571 
5572 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5573                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5574                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
5575   assert(is_integral_type(bt), "");
5576   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5577   assert(VM_Version::supports_avx512cd(), "");
5578   switch(bt) {
5579     case T_LONG:
5580       evplzcntq(dst, ktmp, src, merge, vec_enc);
5581       break;
5582     case T_INT:
5583       evplzcntd(dst, ktmp, src, merge, vec_enc);
5584       break;
5585     case T_SHORT:
5586       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
5587       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
5588       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
5589       vpunpckhwd(dst, xtmp1, src, vec_enc);
5590       evplzcntd(dst, ktmp, dst, merge, vec_enc);
5591       vpackusdw(dst, xtmp2, dst, vec_enc);
5592       break;
5593     case T_BYTE:
5594       // T1 = Compute leading zero counts of the lower 4 bits of each byte by
5595       // accessing the lookup table.
5596       // T2 = Compute leading zero counts of the upper 4 bits of each byte by
5597       // accessing the lookup table.
5598       // Add T1 to T2 if the upper 4 bits of the byte are all zeros.
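      // Illustration only (not emitted code): a scalar sketch, assuming
      // CLZ_LUT[n] holds the leading zero count of the 4-bit value n
      // (so CLZ_LUT[0] == 4):
      //   uint8_t clz8(uint8_t b) {
      //     uint8_t t1 = CLZ_LUT[b & 0x0F];
      //     uint8_t t2 = CLZ_LUT[b >> 4];
      //     return (b >> 4) == 0 ? (uint8_t)(t2 + t1) : t2;
      //   }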
5599       assert(VM_Version::supports_avx512bw(), "");
5600       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
5601       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
5602       vpand(xtmp2, dst, src, vec_enc);
5603       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5604       vpsrlw(xtmp3, src, 4, vec_enc);
5605       vpand(xtmp3, dst, xtmp3, vec_enc);
5606       vpshufb(dst, xtmp1, xtmp3, vec_enc);
5607       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
5608       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
5609       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
5610       break;
5611     default:
5612       fatal("Unsupported type %s", type2name(bt));
5613       break;
5614   }
5615 }
5616 
5617 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5618                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5619   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
5620   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5621   // T1 = Compute leading zero counts of the lower 4 bits of each byte by
5622   // accessing the lookup table.
5623   vpand(dst, xtmp2, src, vec_enc);
5624   vpshufb(dst, xtmp1, dst, vec_enc);
5625   // T2 = Compute leading zero counts of the upper 4 bits of each byte by
5626   // accessing the lookup table.
5627   vpsrlw(xtmp3, src, 4, vec_enc);
5628   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
5629   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
5630   // Add T1 to T2 if the upper 4 bits of the byte are all zeros.
5631   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
5632   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
5633   vpaddb(dst, dst, xtmp2, vec_enc);
5634   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
5635 }
5636 
5637 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5638                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5639   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5640   // Add the zero counts of the lower and upper bytes of a word if the
5641   // upper byte is zero.
5642   vpsrlw(xtmp3, src, 8, vec_enc);
5643   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
5644   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
5645   vpsllw(xtmp2, dst, 8, vec_enc);
5646   vpaddw(xtmp2, xtmp2, dst, vec_enc);
5647   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5648   vpsrlw(dst, dst, 8, vec_enc);
5649 }
5650 
5651 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5652                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
5653   // Since the IEEE 754 floating point format represents the mantissa in
5654   // normalized 1.x form, the biased exponent can be used to compute the
5655   // leading zero count as per the following formula:
5656   // LZCNT = 31 - (biased_exp - 127)
5657   // Special handling is needed for zero, max_int and negative source values.
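  // Illustration only (not emitted code): a scalar sketch for a strictly
  // positive input whose float conversion is exact (the special cases are
  // handled below); jint_cast reinterprets the float's bits as an int:
  //   int clz32(int x) {                      // x > 0
  //     int biased_exp = (jint_cast((float) x) >> 23) & 0xFF;
  //     return 31 - (biased_exp - 127);
  //   }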
5658 
5659   // Broadcast 0xFF
5660   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
5661   vpsrld(xtmp1, xtmp1, 24, vec_enc);
5662 
5663   // Extract biased exponent.
5664   vcvtdq2ps(dst, src, vec_enc);
5665   vpsrld(dst, dst, 23, vec_enc);
5666   vpand(dst, dst, xtmp1, vec_enc);
5667 
5668   // Broadcast 127.
5669   vpsrld(xtmp1, xtmp1, 1, vec_enc);
5670   // Exponent = biased_exp - 127
5671   vpsubd(dst, dst, xtmp1, vec_enc);
5672 
5673   // Exponent = Exponent + 1
5674   vpsrld(xtmp3, xtmp1, 6, vec_enc);
5675   vpaddd(dst, dst, xtmp3, vec_enc);
5676 
5677   // Replace negative exponents with zero; the exponent is negative when the
5678   // source lane contains a zero value.
5679   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5680   vblendvps(dst, dst, xtmp2, dst, vec_enc);
5681 
5682   // Rematerialize broadcast 32.
5683   vpslld(xtmp1, xtmp3, 5, vec_enc);
5684   // Exponent is 32 if corresponding source lane contains max_int value.
5685   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
5686   // LZCNT = 32 - exponent
5687   vpsubd(dst, xtmp1, dst, vec_enc);
5688 
5689   // Replace LZCNT with a value 1 if corresponding source lane
5690   // contains max_int value.
5691   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
5692 
5693   // Replace LZCNT with 0 if the source lane value is negative.
5694   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5695   vblendvps(dst, dst, xtmp2, src, vec_enc);
5696 }
5697 
5698 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5699                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5700   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5701   // Add the zero counts of the lower and upper words of a doubleword if the
5702   // upper word is zero.
5703   vpsrld(xtmp3, src, 16, vec_enc);
5704   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
5705   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
5706   vpslld(xtmp2, dst, 16, vec_enc);
5707   vpaddd(xtmp2, xtmp2, dst, vec_enc);
5708   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5709   vpsrld(dst, dst, 16, vec_enc);
5710   // Add the zero counts of the lower and upper doublewords of a quadword
5711   // if the upper doubleword is zero.
5712   vpsrlq(xtmp3, src, 32, vec_enc);
5713   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
5714   vpsllq(xtmp2, dst, 32, vec_enc);
5715   vpaddq(xtmp2, xtmp2, dst, vec_enc);
5716   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5717   vpsrlq(dst, dst, 32, vec_enc);
5718 }
5719 
5720 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
5721                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5722                                                        Register rtmp, int vec_enc) {
5723   assert(is_integral_type(bt), "unexpected type");
5724   assert(vec_enc < Assembler::AVX_512bit, "");
5725   switch(bt) {
5726     case T_LONG:
5727       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5728       break;
5729     case T_INT:
5730       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
5731       break;
5732     case T_SHORT:
5733       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5734       break;
5735     case T_BYTE:
5736       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5737       break;
5738     default:
5739       fatal("Unsupported type %s", type2name(bt));
5740       break;
5741   }
5742 }
5743 
5744 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
5745   switch(bt) {
5746     case T_BYTE:
5747       vpsubb(dst, src1, src2, vec_enc);
5748       break;
5749     case T_SHORT:
5750       vpsubw(dst, src1, src2, vec_enc);
5751       break;
5752     case T_INT:
5753       vpsubd(dst, src1, src2, vec_enc);
5754       break;
5755     case T_LONG:
5756       vpsubq(dst, src1, src2, vec_enc);
5757       break;
5758     default:
5759       fatal("Unsupported type %s", type2name(bt));
5760       break;
5761   }
5762 }
5763 
5764 // Trailing zero count computation is based on the leading zero count operation,
5765 // as per the following equation. All AVX3 targets support the AVX512CD feature,
5766 // which offers a direct vector instruction to compute the leading zero count.
5767 //      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
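//
// Illustration only (not emitted code): (x - 1) & ~x isolates the trailing
// zeros of x as a contiguous low mask, so:
//   uint32_t ctz32(uint32_t x) {
//     return 32 - clz32((x - 1) & ~x);   // x == 0 yields 32 - 0 == 32
//   }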
5768 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5769                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5770                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
5771   assert(is_integral_type(bt), "");
5772   // xtmp = -1
5773   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
5774   // xtmp = xtmp + src
5775   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
5776   // xtmp = xtmp & ~src
5777   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
5778   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
5779   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
5780   vpsub(bt, dst, xtmp4, dst, vec_enc);
5781 }
5782 
5783 // Trailing zero count computation for AVX2 targets is based on the popcount operation, as per the following equation:
5784 //      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
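//
// Illustration only (not emitted code): x | -x keeps the lowest set bit and
// everything above it, so its popcount is WIDTH - CTZ(x):
//   uint32_t ctz32(uint32_t x) {
//     return 32 - popcount32(x | (0 - x));   // x == 0 yields 32 - 0 == 32
//   }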
5785 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5786                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5787   assert(is_integral_type(bt), "");
5788   // xtmp = 0
5789   vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
5790   // xtmp = 0 - src
5791   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
5792   // xtmp = xtmp | src
5793   vpor(xtmp3, xtmp3, src, vec_enc);
5794   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
5795   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
5796   vpsub(bt, dst, xtmp1, dst, vec_enc);
5797 }
5798 
5799 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
5800   Label done;
5801   Label neg_divisor_fastpath;
5802   cmpl(divisor, 0);
5803   jccb(Assembler::less, neg_divisor_fastpath);
5804   xorl(rdx, rdx);
5805   divl(divisor);
5806   jmpb(done);
5807   bind(neg_divisor_fastpath);
5808   // Fastpath for divisor < 0:
5809   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
5810   // See Hacker's Delight (2nd ed), section 9.3; the same algorithm is used by java.lang.Long.divideUnsigned()
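  // Illustration only (not emitted code): with divisor < 0 (i.e. >= 2^31 when
  // treated as unsigned) the unsigned quotient can only be 0 or 1, and the
  // expression above is a branch-free form of (dividend >=u divisor):
  //   uint32_t q = (dividend & ~(dividend - divisor)) >> 31;   // 0 or 1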
5811   movl(rdx, rax);
5812   subl(rdx, divisor);
5813   if (VM_Version::supports_bmi1()) {
5814     andnl(rax, rdx, rax);
5815   } else {
5816     notl(rdx);
5817     andl(rax, rdx);
5818   }
5819   shrl(rax, 31);
5820   bind(done);
5821 }
5822 
5823 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
5824   Label done;
5825   Label neg_divisor_fastpath;
5826   cmpl(divisor, 0);
5827   jccb(Assembler::less, neg_divisor_fastpath);
5828   xorl(rdx, rdx);
5829   divl(divisor);
5830   jmpb(done);
5831   bind(neg_divisor_fastpath);
5832   // Fastpath when divisor < 0:
5833   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
5834   // See Hacker's Delight (2nd ed), section 9.3; the same algorithm is used by java.lang.Long.remainderUnsigned()
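  // Illustration only (not emitted code): a scalar sketch of the identity,
  // reusing the 0-or-1 quotient from the unsigned-division fastpath above:
  //   uint32_t q = (dividend & ~(dividend - divisor)) >> 31;   // 0 or 1
  //   uint32_t r = dividend - (q ? divisor : 0);               // remainder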
5835   movl(rdx, rax);
5836   subl(rax, divisor);
5837   if (VM_Version::supports_bmi1()) {
5838     andnl(rax, rax, rdx);
5839   } else {
5840     notl(rax);
5841     andl(rax, rdx);
5842   }
5843   sarl(rax, 31);
5844   andl(rax, divisor);
5845   subl(rdx, rax);
5846   bind(done);
5847 }
5848 
5849 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
5850   Label done;
5851   Label neg_divisor_fastpath;
5852 
5853   cmpl(divisor, 0);
5854   jccb(Assembler::less, neg_divisor_fastpath);
5855   xorl(rdx, rdx);
5856   divl(divisor);
5857   jmpb(done);
5858   bind(neg_divisor_fastpath);
5859   // Fastpath for divisor < 0:
5860   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
5861   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
5862   // See Hacker's Delight (2nd ed), section 9.3; the same algorithms are used by
5863   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
5864   movl(rdx, rax);
5865   subl(rax, divisor);
5866   if (VM_Version::supports_bmi1()) {
5867     andnl(rax, rax, rdx);
5868   } else {
5869     notl(rax);
5870     andl(rax, rdx);
5871   }
5872   movl(tmp, rax);
5873   shrl(rax, 31); // quotient
5874   sarl(tmp, 31);
5875   andl(tmp, divisor);
5876   subl(rdx, tmp); // remainder
5877   bind(done);
5878 }
5879 
5880 #ifdef _LP64
5881 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
5882                                  XMMRegister xtmp2, Register rtmp) {
5883   if (VM_Version::supports_gfni()) {
5884     // Galois field instruction based bit reversal, based on the following algorithm:
5885     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5886     mov64(rtmp, 0x8040201008040201L);
5887     movq(xtmp1, src);
5888     movq(xtmp2, rtmp);
5889     gf2p8affineqb(xtmp1, xtmp2, 0);
5890     movq(dst, xtmp1);
5891   } else {
5892     // Swap even and odd numbered bits.
5893     movl(rtmp, src);
5894     andl(rtmp, 0x55555555);
5895     shll(rtmp, 1);
5896     movl(dst, src);
5897     andl(dst, 0xAAAAAAAA);
5898     shrl(dst, 1);
5899     orl(dst, rtmp);
5900 
5901     // Swap LSB and MSB 2 bits of each nibble.
5902     movl(rtmp, dst);
5903     andl(rtmp, 0x33333333);
5904     shll(rtmp, 2);
5905     andl(dst, 0xCCCCCCCC);
5906     shrl(dst, 2);
5907     orl(dst, rtmp);
5908 
5909     // Swap LSB and MSB 4 bits of each byte.
5910     movl(rtmp, dst);
5911     andl(rtmp, 0x0F0F0F0F);
5912     shll(rtmp, 4);
5913     andl(dst, 0xF0F0F0F0);
5914     shrl(dst, 4);
5915     orl(dst, rtmp);
5916   }
5917   bswapl(dst);
5918 }
5919 
5920 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
5921                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
5922   if (VM_Version::supports_gfni()) {
5923     // Galois field instruction based bit reversal, based on the following algorithm:
5924     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5925     mov64(rtmp1, 0x8040201008040201L);
5926     movq(xtmp1, src);
5927     movq(xtmp2, rtmp1);
5928     gf2p8affineqb(xtmp1, xtmp2, 0);
5929     movq(dst, xtmp1);
5930   } else {
5931     // Swap even and odd numbered bits.
5932     movq(rtmp1, src);
5933     mov64(rtmp2, 0x5555555555555555L);
5934     andq(rtmp1, rtmp2);
5935     shlq(rtmp1, 1);
5936     movq(dst, src);
5937     notq(rtmp2);
5938     andq(dst, rtmp2);
5939     shrq(dst, 1);
5940     orq(dst, rtmp1);
5941 
5942     // Swap LSB and MSB 2 bits of each nibble.
5943     movq(rtmp1, dst);
5944     mov64(rtmp2, 0x3333333333333333L);
5945     andq(rtmp1, rtmp2);
5946     shlq(rtmp1, 2);
5947     notq(rtmp2);
5948     andq(dst, rtmp2);
5949     shrq(dst, 2);
5950     orq(dst, rtmp1);
5951 
5952     // Swap LSB and MSB 4 bits of each byte.
5953     movq(rtmp1, dst);
5954     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
5955     andq(rtmp1, rtmp2);
5956     shlq(rtmp1, 4);
5957     notq(rtmp2);
5958     andq(dst, rtmp2);
5959     shrq(dst, 4);
5960     orq(dst, rtmp1);
5961   }
5962   bswapq(dst);
5963 }
5964 
5965 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
5966   Label done;
5967   Label neg_divisor_fastpath;
5968   cmpq(divisor, 0);
5969   jccb(Assembler::less, neg_divisor_fastpath);
5970   xorl(rdx, rdx);
5971   divq(divisor);
5972   jmpb(done);
5973   bind(neg_divisor_fastpath);
5974   // Fastpath for divisor < 0:
5975   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
5976   // See Hacker's Delight (2nd ed), section 9.3; the same algorithm is used by java.lang.Long.divideUnsigned()
5977   movq(rdx, rax);
5978   subq(rdx, divisor);
5979   if (VM_Version::supports_bmi1()) {
5980     andnq(rax, rdx, rax);
5981   } else {
5982     notq(rdx);
5983     andq(rax, rdx);
5984   }
5985   shrq(rax, 63);
5986   bind(done);
5987 }
5988 
5989 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
5990   Label done;
5991   Label neg_divisor_fastpath;
5992   cmpq(divisor, 0);
5993   jccb(Assembler::less, neg_divisor_fastpath);
5994   xorq(rdx, rdx);
5995   divq(divisor);
5996   jmp(done);
5997   bind(neg_divisor_fastpath);
5998   // Fastpath when divisor < 0:
5999   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6000   // See Hacker's Delight (2nd ed), section 9.3; the same algorithm is used by java.lang.Long.remainderUnsigned()
6001   movq(rdx, rax);
6002   subq(rax, divisor);
6003   if (VM_Version::supports_bmi1()) {
6004     andnq(rax, rax, rdx);
6005   } else {
6006     notq(rax);
6007     andq(rax, rdx);
6008   }
6009   sarq(rax, 63);
6010   andq(rax, divisor);
6011   subq(rdx, rax);
6012   bind(done);
6013 }
6014 
6015 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6016   Label done;
6017   Label neg_divisor_fastpath;
6018   cmpq(divisor, 0);
6019   jccb(Assembler::less, neg_divisor_fastpath);
6020   xorq(rdx, rdx);
6021   divq(divisor);
6022   jmp(done);
6023   bind(neg_divisor_fastpath);
6024   // Fastpath for divisor < 0:
6025   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6026   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6027   // See Hacker's Delight (2nd ed), section 9.3; the same algorithms are used by
6028   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6029   movq(rdx, rax);
6030   subq(rax, divisor);
6031   if (VM_Version::supports_bmi1()) {
6032     andnq(rax, rax, rdx);
6033   } else {
6034     notq(rax);
6035     andq(rax, rdx);
6036   }
6037   movq(tmp, rax);
6038   shrq(rax, 63); // quotient
6039   sarq(tmp, 63);
6040   andq(tmp, divisor);
6041   subq(rdx, tmp); // remainder
6042   bind(done);
6043 }
6044 #endif
6045 
6046 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6047                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6048                                         int vlen_enc) {
6049   assert(VM_Version::supports_avx512bw(), "");
6050   // Byte shuffles are in-lane operations and the shuffle indices are taken
6051   // from the lower 4 bits of each shuffle lane, so all shuffle indices are
6052   // effectively normalized to the index range 0-15. This ensures that indices
6053   // which are multiples of 16 apart select the same relative position within
6054   // a 128-bit lane, i.e. elements corresponding to shuffle indices 16, 32 and
6055   // 48 all select element 0 of their respective 128-bit lanes.
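  // Illustration only (not emitted code): a scalar sketch of the selection
  // performed by the four compare/broadcast/shuffle rounds below:
  //   for (int i = 0; i < 64; i++) {
  //     int lane = shuffle[i] >> 4;      // which 128-bit lane (0..3)
  //     int idx  = shuffle[i] & 0x0F;    // position within that lane
  //     dst[i]   = src[16 * lane + idx];
  //   }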
6056   movl(rtmp, 16);
6057   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6058 
6059   // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
6060   // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
6061   // original shuffle indices and move the shuffled lanes corresponding to true
6062   // mask to destination vector.
6063   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6064   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6065   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6066 
6067   // Repeat the above steps with the lane comparison expression INDEX >= 16 && INDEX < 32,
6068   // broadcasting the second 128-bit lane.
6069   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6070   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6071   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6072   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6073   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6074 
6075   // Repeat the above steps with the lane comparison expression INDEX >= 32 && INDEX < 48,
6076   // broadcasting the third 128-bit lane.
6077   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6078   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6079   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6080   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6081   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6082 
6083   // Repeat the above steps with the lane comparison expression INDEX >= 48 && INDEX < 64,
6084   // broadcasting the fourth 128-bit lane.
6085   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6086   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6087   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6088   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6089   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6090 }
6091