1 /*
   2  * Copyright (c) 2020, 2022, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "gc/shared/barrierSet.hpp"
  29 #include "gc/shared/barrierSetAssembler.hpp"
  30 #include "oops/methodData.hpp"
  31 #include "opto/c2_MacroAssembler.hpp"
  32 #include "opto/intrinsicnode.hpp"
  33 #include "opto/output.hpp"
  34 #include "opto/opcodes.hpp"
  35 #include "opto/subnode.hpp"
  36 #include "runtime/objectMonitor.hpp"
  37 #include "runtime/stubRoutines.hpp"
  38 
  39 #ifdef PRODUCT
  40 #define BLOCK_COMMENT(str) /* nothing */
  41 #define STOP(error) stop(error)
  42 #else
  43 #define BLOCK_COMMENT(str) block_comment(str)
  44 #define STOP(error) block_comment(error); stop(error)
  45 #endif
  46 
  47 // C2 compiled method's prolog code.
  48 void C2_MacroAssembler::verified_entry(Compile* C, int sp_inc) {
  49   if (C->clinit_barrier_on_entry()) {
  50     assert(VM_Version::supports_fast_class_init_checks(), "sanity");
  51     assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started");
  52 
  53     Label L_skip_barrier;
  54     Register klass = rscratch1;
  55 
  56     mov_metadata(klass, C->method()->holder()->constant_encoding());
  57     clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
  58 
  59     jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
  60 
  61     bind(L_skip_barrier);
  62   }
  63 
  64   int framesize = C->output()->frame_size_in_bytes();
  65   int bangsize = C->output()->bang_size_in_bytes();
  66   bool fp_mode_24b = false;
  67   int stack_bang_size = C->output()->need_stack_bang(bangsize) ? bangsize : 0;
  68 
  69   // WARNING: Initial instruction MUST be 5 bytes or longer so that
  70   // NativeJump::patch_verified_entry will be able to patch out the entry
  71   // code safely. The push to verify stack depth is ok at 5 bytes;
  72   // the frame allocation can be either 3 or 6 bytes. So if we don't do a
  73   // stack bang then we must use the 6 byte frame allocation even if
  74   // we have no frame. :-(
  75   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  76 
  77   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  78   // Remove word for return addr
  79   framesize -= wordSize;
  80   stack_bang_size -= wordSize;
  81 
  82   // Calls to C2R adapters often do not accept exceptional returns.
  83   // We require that their callers bang for them.  But be careful, because
  84   // some VM calls (such as call site linkage) can use several kilobytes of
  85   // stack.  But the stack safety zone should account for that.
  86   // See bugs 4446381, 4468289, 4497237.
  87   if (stack_bang_size > 0) {
  88     generate_stack_overflow_check(stack_bang_size);
  89 
  90     // We always push rbp so that, on return to the interpreter, rbp will be
  91     // restored correctly and we can correct the stack.
  92     push(rbp);
  93     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  94     if (PreserveFramePointer) {
  95       mov(rbp, rsp);
  96     }
  97     // Remove word for ebp
  98     framesize -= wordSize;
  99 
 100     // Create frame
 101     if (framesize) {
 102       subptr(rsp, framesize);
 103     }
 104   } else {
 105     // Create frame (force generation of a 4 byte immediate value)
 106     subptr_imm32(rsp, framesize);
 107 
 108     // Save RBP register now.
 109     framesize -= wordSize;
 110     movptr(Address(rsp, framesize), rbp);
 111     // Save caller's stack pointer into RBP if the frame pointer is preserved.
 112     if (PreserveFramePointer) {
 113       movptr(rbp, rsp);
 114       if (framesize > 0) {
 115         addptr(rbp, framesize);
 116       }
 117     }
 118   }
 119 
 120   if (C->needs_stack_repair()) {
 121     // Save stack increment just below the saved rbp (also account for fixed framesize and rbp)
 122     assert((sp_inc & (StackAlignmentInBytes-1)) == 0, "stack increment not aligned");
 123     movptr(Address(rsp, framesize - wordSize), sp_inc + framesize + wordSize);
 124   }
 125 
 126   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
 127     framesize -= wordSize;
 128     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 129   }
 130 
 131 #ifndef _LP64
 132   // If method sets FPU control word do it now
 133   if (fp_mode_24b) {
 134     fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
 135   }
 136   if (UseSSE >= 2 && VerifyFPU) {
 137     verify_FPU(0, "FPU stack must be clean on entry");
 138   }
 139 #endif
 140 
 141 #ifdef ASSERT
 142   if (VerifyStackAtCalls) {
 143     Label L;
 144     push(rax);
 145     mov(rax, rsp);
 146     andptr(rax, StackAlignmentInBytes-1);
 147     cmpptr(rax, StackAlignmentInBytes-wordSize);
 148     pop(rax);
 149     jcc(Assembler::equal, L);
 150     STOP("Stack is not properly aligned!");
 151     bind(L);
 152   }
 153 #endif
 154 }
 155 
 156 void C2_MacroAssembler::entry_barrier() {
 157   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 158 #ifdef _LP64
 159   if (BarrierSet::barrier_set()->barrier_set_nmethod() != NULL) {
 160     // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 161     Label dummy_slow_path;
 162     Label dummy_continuation;
 163     Label* slow_path = &dummy_slow_path;
 164     Label* continuation = &dummy_continuation;
 165     if (!Compile::current()->output()->in_scratch_emit_size()) {
 166       // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 167       C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 168       Compile::current()->output()->add_stub(stub);
 169       slow_path = &stub->entry();
           continuation = &stub->continuation();
 170     }
 171     bs->nmethod_entry_barrier(this, slow_path, continuation);
 172   }
 173 #else
 174   // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
 175   bs->nmethod_entry_barrier(this, NULL /* slow_path */, NULL /* continuation */);
 176 #endif
 177 }
 178 
 179 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 180   switch (vlen_in_bytes) {
 181     case  4: // fall-through
 182     case  8: // fall-through
 183     case 16: return Assembler::AVX_128bit;
 184     case 32: return Assembler::AVX_256bit;
 185     case 64: return Assembler::AVX_512bit;
 186 
 187     default: {
 188       ShouldNotReachHere();
 189       return Assembler::AVX_NoVec;
 190     }
 191   }
 192 }
 193 
 194 #if INCLUDE_RTM_OPT
 195 
 196 // Update rtm_counters based on abort status
 197 // input: abort_status
 198 //        rtm_counters (RTMLockingCounters*)
 199 // flags are killed
 200 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
 201 
 202   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
 203   if (PrintPreciseRTMLockingStatistics) {
 204     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
 205       Label check_abort;
 206       testl(abort_status, (1<<i));
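           // ZF is set when bit i of the abort status is clear; in that case the jccb below
           // skips the matching abortX counter.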
 207       jccb(Assembler::equal, check_abort);
 208       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
 209       bind(check_abort);
 210     }
 211   }
 212 }
 213 
 214 // Branch if (random & (count-1) != 0), count is 2^n
 215 // tmp, scr and flags are killed
 216 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
 217   assert(tmp == rax, "");
 218   assert(scr == rdx, "");
 219   rdtsc(); // modifies EDX:EAX
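       // The low-order TSC bits left in EAX (== tmp) serve as the pseudo-random value; since
       // count is a power of two, masking with count-1 keeps only its low log2(count) bits.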
 220   andptr(tmp, count-1);
 221   jccb(Assembler::notZero, brLabel);
 222 }
 223 
 224 // Perform abort ratio calculation, set no_rtm bit if high ratio
 225 // input:  rtm_counters_Reg (RTMLockingCounters* address)
 226 // tmpReg, rtm_counters_Reg and flags are killed
 227 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
 228                                                     Register rtm_counters_Reg,
 229                                                     RTMLockingCounters* rtm_counters,
 230                                                     Metadata* method_data) {
 231   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
 232 
 233   if (RTMLockingCalculationDelay > 0) {
 234     // Delay calculation
 235     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()));
 236     testptr(tmpReg, tmpReg);
 237     jccb(Assembler::equal, L_done);
 238   }
 239   // Abort ratio calculation only if abort_count > RTMAbortThreshold
 240   //   Aborted transactions = abort_count * 100
 241   //   All transactions = total_count *  RTMTotalCountIncrRate
 242   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
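       //   Example (illustrative flag values): with RTMAbortRatio = 50 and RTMTotalCountIncrRate = 64,
       //   an abort_count of 1000 and a total_count of 40 compares 1000 * 100 = 100000 against
       //   40 * 64 * 50 = 128000, so the no_rtm bit is left clear.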
 243 
 244   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
 245   cmpptr(tmpReg, RTMAbortThreshold);
 246   jccb(Assembler::below, L_check_always_rtm2);
 247   imulptr(tmpReg, tmpReg, 100);
 248 
 249   Register scrReg = rtm_counters_Reg;
 250   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 251   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
 252   imulptr(scrReg, scrReg, RTMAbortRatio);
 253   cmpptr(tmpReg, scrReg);
 254   jccb(Assembler::below, L_check_always_rtm1);
 255   if (method_data != NULL) {
 256     // set rtm_state to "no rtm" in MDO
 257     mov_metadata(tmpReg, method_data);
 258     lock();
 259     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
 260   }
 261   jmpb(L_done);
 262   bind(L_check_always_rtm1);
 263   // Reload RTMLockingCounters* address
 264   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 265   bind(L_check_always_rtm2);
 266   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 267   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
 268   jccb(Assembler::below, L_done);
 269   if (method_data != NULL) {
 270     // set rtm_state to "always rtm" in MDO
 271     mov_metadata(tmpReg, method_data);
 272     lock();
 273     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
 274   }
 275   bind(L_done);
 276 }
 277 
 278 // Update counters and perform abort ratio calculation
 279 // input:  abort_status_Reg
 280 // rtm_counters_Reg, flags are killed
 281 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
 282                                       Register rtm_counters_Reg,
 283                                       RTMLockingCounters* rtm_counters,
 284                                       Metadata* method_data,
 285                                       bool profile_rtm) {
 286 
 287   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 288   // update rtm counters based on rax value at abort
 289   // reads abort_status_Reg, updates flags
 290   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 291   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
 292   if (profile_rtm) {
 293     // Save abort status because abort_status_Reg is used by following code.
 294     if (RTMRetryCount > 0) {
 295       push(abort_status_Reg);
 296     }
 297     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 298     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
 299     // restore abort status
 300     if (RTMRetryCount > 0) {
 301       pop(abort_status_Reg);
 302     }
 303   }
 304 }
 305 
 306 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
 307 // inputs: retry_count_Reg
 308 //       : abort_status_Reg
 309 // output: retry_count_Reg decremented by 1
 310 // flags are killed
 311 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
 312   Label doneRetry;
 313   assert(abort_status_Reg == rax, "");
 314   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
 315   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
 316   // if reason is in 0x6 and retry count != 0 then retry
 317   andptr(abort_status_Reg, 0x6);
 318   jccb(Assembler::zero, doneRetry);
 319   testl(retry_count_Reg, retry_count_Reg);
 320   jccb(Assembler::zero, doneRetry);
 321   pause();
 322   decrementl(retry_count_Reg);
 323   jmp(retryLabel);
 324   bind(doneRetry);
 325 }
 326 
 327 // Spin and retry if lock is busy,
 328 // inputs: box_Reg (monitor address)
 329 //       : retry_count_Reg
 330 // output: retry_count_Reg decremented by 1
 331 //       : clear z flag if retry count exceeded
 332 // tmp_Reg, scr_Reg, flags are killed
 333 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
 334                                                Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
 335   Label SpinLoop, SpinExit, doneRetry;
 336   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 337 
 338   testl(retry_count_Reg, retry_count_Reg);
 339   jccb(Assembler::zero, doneRetry);
 340   decrementl(retry_count_Reg);
 341   movptr(scr_Reg, RTMSpinLoopCount);
 342 
 343   bind(SpinLoop);
 344   pause();
 345   decrementl(scr_Reg);
 346   jccb(Assembler::lessEqual, SpinExit);
 347   movptr(tmp_Reg, Address(box_Reg, owner_offset));
 348   testptr(tmp_Reg, tmp_Reg);
 349   jccb(Assembler::notZero, SpinLoop);
 350 
 351   bind(SpinExit);
 352   jmp(retryLabel);
 353   bind(doneRetry);
 354   incrementl(retry_count_Reg); // clear z flag
 355 }
 356 
 357 // Use RTM for normal stack locks
 358 // Input: objReg (object to lock)
 359 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
 360                                          Register retry_on_abort_count_Reg,
 361                                          RTMLockingCounters* stack_rtm_counters,
 362                                          Metadata* method_data, bool profile_rtm,
 363                                          Label& DONE_LABEL, Label& IsInflated) {
 364   assert(UseRTMForStackLocks, "why call this otherwise?");
 365   assert(tmpReg == rax, "");
 366   assert(scrReg == rdx, "");
 367   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 368 
 369   if (RTMRetryCount > 0) {
 370     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 371     bind(L_rtm_retry);
 372   }
 373   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 374   testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral
 375   jcc(Assembler::notZero, IsInflated);
 376 
 377   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 378     Label L_noincrement;
 379     if (RTMTotalCountIncrRate > 1) {
 380       // tmpReg, scrReg and flags are killed
 381       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 382     }
 383     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
 384     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
 385     bind(L_noincrement);
 386   }
 387   xbegin(L_on_abort);
 388   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
 389   andptr(tmpReg, markWord::lock_mask_in_place);     // look at 2 lock bits
 390   cmpptr(tmpReg, markWord::unlocked_value);         // bits = 01 unlocked
 391   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
 392 
 393   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 394   if (UseRTMXendForLockBusy) {
 395     xend();
 396     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
 397     jmp(L_decrement_retry);
 398   }
 399   else {
 400     xabort(0);
 401   }
 402   bind(L_on_abort);
 403   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 404     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
 405   }
 406   bind(L_decrement_retry);
 407   if (RTMRetryCount > 0) {
 408     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 409     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 410   }
 411 }
 412 
 413 // Use RTM for inflating locks
 414 // inputs: objReg (object to lock)
 415 //         boxReg (on-stack box address (displaced header location) - KILLED)
 416 //         tmpReg (ObjectMonitor address + markWord::monitor_value)
 417 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
 418                                             Register scrReg, Register retry_on_busy_count_Reg,
 419                                             Register retry_on_abort_count_Reg,
 420                                             RTMLockingCounters* rtm_counters,
 421                                             Metadata* method_data, bool profile_rtm,
 422                                             Label& DONE_LABEL) {
 423   assert(UseRTMLocking, "why call this otherwise?");
 424   assert(tmpReg == rax, "");
 425   assert(scrReg == rdx, "");
 426   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 427   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 428 
 429   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 430   movptr(boxReg, tmpReg); // Save ObjectMonitor address
 431 
 432   if (RTMRetryCount > 0) {
 433     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
 434     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 435     bind(L_rtm_retry);
 436   }
 437   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 438     Label L_noincrement;
 439     if (RTMTotalCountIncrRate > 1) {
 440       // tmpReg, scrReg and flags are killed
 441       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 442     }
 443     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 444     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
 445     bind(L_noincrement);
 446   }
 447   xbegin(L_on_abort);
 448   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 449   movptr(tmpReg, Address(tmpReg, owner_offset));
 450   testptr(tmpReg, tmpReg);
 451   jcc(Assembler::zero, DONE_LABEL);
 452   if (UseRTMXendForLockBusy) {
 453     xend();
 454     jmp(L_decrement_retry);
 455   }
 456   else {
 457     xabort(0);
 458   }
 459   bind(L_on_abort);
 460   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 461   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 462     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
 463   }
 464   if (RTMRetryCount > 0) {
 465     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 466     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 467   }
 468 
 469   movptr(tmpReg, Address(boxReg, owner_offset));
 470   testptr(tmpReg, tmpReg);
 471   jccb(Assembler::notZero, L_decrement_retry);
 472 
 473   // Appears unlocked - try to swing _owner from null to non-null.
 474   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 475 #ifdef _LP64
 476   Register threadReg = r15_thread;
 477 #else
 478   get_thread(scrReg);
 479   Register threadReg = scrReg;
 480 #endif
 481   lock();
 482   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
 483 
 484   if (RTMRetryCount > 0) {
 485     // success done else retry
 486     jccb(Assembler::equal, DONE_LABEL);
 487     bind(L_decrement_retry);
 488     // Spin and retry if lock is busy.
 489     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
 490   }
 491   else {
 492     bind(L_decrement_retry);
 493   }
 494 }
 495 
 496 #endif //  INCLUDE_RTM_OPT
 497 
 498 // fast_lock and fast_unlock used by C2
 499 
 500 // Because the transitions from emitted code to the runtime
 501 // monitorenter/exit helper stubs are so slow it's critical that
 502 // we inline both the stack-locking fast path and the inflated fast path.
 503 //
 504 // See also: cmpFastLock and cmpFastUnlock.
 505 //
 506 // What follows is a specialized inline transliteration of the code
 507 // in enter() and exit(). If we're concerned about I$ bloat another
 508 // option would be to emit TrySlowEnter and TrySlowExit methods
 509 // at startup-time.  These methods would accept arguments as
 510 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 511 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 512 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 513 // In practice, however, the # of lock sites is bounded and is usually small.
 514 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
 515 // if the processor uses simple bimodal branch predictors keyed by EIP,
 516 // since the helper routines would be called from multiple synchronization
 517 // sites.
 518 //
 519 // An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
 520 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 521 // to those specialized methods.  That'd give us a mostly platform-independent
 522 // implementation that the JITs could optimize and inline at their pleasure.
 523 // Done correctly, the only time we'd need to cross to native code would be
 524 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 525 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 526 // (b) issue explicit barriers or fence operations.
 527 //
 528 // TODO:
 529 //
 530 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 531 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 532 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 533 //    the lock operators would typically be faster than reifying Self.
 534 //
 535 // *  Ideally I'd define the primitives as:
 536 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 537 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 538 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
 539 //    Instead, we're stuck with the rather awkward and brittle register assignments below.
 540 //    Furthermore the register assignments are overconstrained, possibly resulting in
 541 //    sub-optimal code near the synchronization site.
 542 //
 543 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 544 //    Alternately, use a better sp-proximity test.
 545 //
 546 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 547 //    Either one is sufficient to uniquely identify a thread.
 548 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 549 //
 550 // *  Intrinsify notify() and notifyAll() for the common cases where the
 551 //    object is locked by the calling thread but the waitlist is empty.
 552 //    This would avoid the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 553 //
 554 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 555 //    But beware of excessive branch density on AMD Opterons.
 556 //
 557 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 558 //    or failure of the fast path.  If the fast path fails then we pass
 559 //    control to the slow path, typically in C.  In fast_lock and
 560 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 561 //    will emit a conditional branch immediately after the node.
 562 //    So we have branches to branches and lots of ICC.ZF games.
 563 //    Instead, it might be better to have C2 pass a "FailureLabel"
 564 //    into fast_lock and fast_unlock.  In the case of success, control
 565 //    will drop through the node.  ICC.ZF is undefined at exit.
 566 //    In the case of failure, the node will branch directly to the
 567 //    FailureLabel
 568 
 569 
 570 // obj: object to lock
 571 // box: on-stack box address (displaced header location) - KILLED
 572 // rax,: tmp -- KILLED
 573 // scr: tmp -- KILLED
 574 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 575                                  Register scrReg, Register cx1Reg, Register cx2Reg,
 576                                  RTMLockingCounters* rtm_counters,
 577                                  RTMLockingCounters* stack_rtm_counters,
 578                                  Metadata* method_data,
 579                                  bool use_rtm, bool profile_rtm) {
 580   // Ensure the register assignments are disjoint
 581   assert(tmpReg == rax, "");
 582 
 583   if (use_rtm) {
 584     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
 585   } else {
 586     assert(cx2Reg == noreg, "");
 587     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 588   }
 589 
 590   // Possible cases that we'll encounter in fast_lock
 591   // ------------------------------------------------
 592   // * Inflated
 593   //    -- unlocked
 594   //    -- Locked
 595   //       = by self
 596   //       = by other
 597   // * neutral
 598   // * stack-locked
 599   //    -- by self
 600   //       = sp-proximity test hits
 601   //       = sp-proximity test generates false-negative
 602   //    -- by other
 603   //
 604 
 605   Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;
 606 
 607   if (DiagnoseSyncOnValueBasedClasses != 0) {
 608     load_klass(tmpReg, objReg, cx1Reg);
 609     movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
 610     testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
 611     jcc(Assembler::notZero, DONE_LABEL);
 612   }
 613 
 614 #if INCLUDE_RTM_OPT
 615   if (UseRTMForStackLocks && use_rtm) {
 616     assert(!UseHeavyMonitors, "+UseHeavyMonitors and +UseRTMForStackLocks are mutually exclusive");
 617     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
 618                       stack_rtm_counters, method_data, profile_rtm,
 619                       DONE_LABEL, IsInflated);
 620   }
 621 #endif // INCLUDE_RTM_OPT
 622 
 623   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 624   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 625   jccb(Assembler::notZero, IsInflated);
 626 
 627   if (!UseHeavyMonitors) {
 628     // Attempt stack-locking ...
 629     orptr (tmpReg, markWord::unlocked_value);
 630     if (EnableValhalla) {
 631       // Mask inline_type bit such that we go to the slow path if object is an inline type
 632       andptr(tmpReg, ~((int) markWord::inline_type_bit_in_place));
 633     }
 634     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 635     lock();
 636     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 637     jcc(Assembler::equal, COUNT);           // Success
 638 
 639     // Recursive locking.
 640     // The object is stack-locked: markword contains stack pointer to BasicLock.
 641     // Locked by current thread if difference with current SP is less than one page.
 642     subptr(tmpReg, rsp);
 643     // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
 644     andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
 645     movptr(Address(boxReg, 0), tmpReg);
 646   } else {
 647     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 648     testptr(objReg, objReg);
 649   }
 650   jmp(DONE_LABEL);
 651 
 652   bind(IsInflated);
 653   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
 654 
 655 #if INCLUDE_RTM_OPT
 656   // Use the same RTM locking code in 32- and 64-bit VM.
 657   if (use_rtm) {
 658     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
 659                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
 660   } else {
 661 #endif // INCLUDE_RTM_OPT
 662 
 663 #ifndef _LP64
 664   // The object is inflated.
 665 
 666   // boxReg refers to the on-stack BasicLock in the current frame.
 667   // We'd like to write:
 668   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
 669   // This is convenient but results in a ST-before-CAS penalty.  The following CAS suffers
 670   // additional latency as we have another ST in the store buffer that must drain.
 671 
 672   // avoid ST-before-CAS
 673   // register juggle because we need tmpReg for cmpxchgptr below
 674   movptr(scrReg, boxReg);
 675   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 676 
 677   // Optimistic form: consider XORL tmpReg,tmpReg
 678   movptr(tmpReg, NULL_WORD);
 679 
 680   // Appears unlocked - try to swing _owner from null to non-null.
 681   // Ideally, I'd manifest "Self" with get_thread and then attempt
 682   // to CAS the register containing Self into m->Owner.
 683   // But we don't have enough registers, so instead we can either try to CAS
 684   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 685   // we later store "Self" into m->Owner.  Transiently storing a stack address
 686   // (rsp or the address of the box) into  m->owner is harmless.
 687   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 688   lock();
 689   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 690   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 691   // If we weren't able to swing _owner from NULL to the BasicLock
 692   // then take the slow path.
 693   jccb  (Assembler::notZero, NO_COUNT);
 694   // update _owner from BasicLock to thread
 695   get_thread (scrReg);                    // beware: clobbers ICCs
 696   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
 697   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
 698 
 699   // If the CAS fails we can either retry or pass control to the slow path.
 700   // We use the latter tactic.
 701   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 702   // If the CAS was successful ...
 703   //   Self has acquired the lock
 704   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 705   // Intentional fall-through into DONE_LABEL ...
 706 #else // _LP64
 707   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 708   movq(scrReg, tmpReg);
 709   xorq(tmpReg, tmpReg);
 710   lock();
 711   cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 712   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 713   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 714   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 715   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 716   jccb(Assembler::equal, COUNT);          // CAS above succeeded; propagate ZF = 1 (success)
 717 
 718   cmpptr(r15_thread, rax);                // Check if we are already the owner (recursive lock)
 719   jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
 720   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 721   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 722 #endif // _LP64
 723 #if INCLUDE_RTM_OPT
 724   } // use_rtm()
 725 #endif
 726   bind(DONE_LABEL);
 727 
 728   // ZFlag == 1 count in fast path
 729   // ZFlag == 0 count in slow path
 730   jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0
 731 
 732   bind(COUNT);
 733   // Count monitors in fast path
 734 #ifndef _LP64
 735   get_thread(tmpReg);
 736   incrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
 737 #else // _LP64
 738   incrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 739 #endif
 740 
 741   xorl(tmpReg, tmpReg); // Set ZF == 1
 742 
 743   bind(NO_COUNT);
 744 
 745   // At NO_COUNT the icc ZFlag is set as follows ...
 746   // fast_unlock uses the same protocol.
 747   // ZFlag == 1 -> Success
 748   // ZFlag == 0 -> Failure - force control through the slow path
 749 }
 750 
 751 // obj: object to unlock
 752 // box: box address (displaced header location), killed.  Must be EAX.
 753 // tmp: killed, cannot be obj nor box.
 754 //
 755 // Some commentary on balanced locking:
 756 //
 757 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 758 // Methods that don't have provably balanced locking are forced to run in the
 759 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 760 // The interpreter provides two properties:
 761 // I1:  At return-time the interpreter automatically and quietly unlocks any
 762 //      objects acquired by the current activation (frame).  Recall that the
 763 //      interpreter maintains an on-stack list of locks currently held by
 764 //      a frame.
 765 // I2:  If a method attempts to unlock an object that is not held by the
 766 //      frame, the interpreter throws IMSX.
 767 //
 768 // Let's say A(), which has provably balanced locking, acquires O and then calls B().
 769 // B() doesn't have provably balanced locking so it runs in the interpreter.
 770 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 771 // is still locked by A().
 772 //
 773 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 774 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 775 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 776 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
 777 // Arguably given that the spec legislates the JNI case as undefined our implementation
 778 // could reasonably *avoid* checking owner in fast_unlock().
 779 // In the interest of performance we elide the m->Owner==Self check in unlock.
 780 // A perfectly viable alternative is to elide the owner check except when
 781 // Xcheck:jni is enabled.
 782 
 783 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
 784   assert(boxReg == rax, "");
 785   assert_different_registers(objReg, boxReg, tmpReg);
 786 
 787   Label DONE_LABEL, Stacked, COUNT, NO_COUNT;
 788 
 789 #if INCLUDE_RTM_OPT
 790   if (UseRTMForStackLocks && use_rtm) {
 791     assert(!UseHeavyMonitors, "+UseHeavyMonitors and +UseRTMForStackLocks are mutually exclusive");
 792     Label L_regular_unlock;
 793     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
 794     andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
 795     cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
 796     jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
 797     xend();                                                           // otherwise end...
 798     jmp(DONE_LABEL);                                                  // ... and we're done
 799     bind(L_regular_unlock);
 800   }
 801 #endif
 802 
 803   if (!UseHeavyMonitors) {
 804     cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
 805     jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
 806   }
 807   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
 808   if (!UseHeavyMonitors) {
 809     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 810     jccb   (Assembler::zero, Stacked);
 811   }
 812 
 813   // It's inflated.
 814 #if INCLUDE_RTM_OPT
 815   if (use_rtm) {
 816     Label L_regular_inflated_unlock;
 817     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 818     movptr(boxReg, Address(tmpReg, owner_offset));
 819     testptr(boxReg, boxReg);
 820     jccb(Assembler::notZero, L_regular_inflated_unlock);
 821     xend();
 822     jmpb(DONE_LABEL);
 823     bind(L_regular_inflated_unlock);
 824   }
 825 #endif
 826 
 827   // Despite our balanced locking property we still check that m->_owner == Self
 828   // as java routines or native JNI code called by this thread might
 829   // have released the lock.
 830   // Refer to the comments in synchronizer.cpp for how we might encode extra
 831   // state in _succ so we can avoid fetching EntryList|cxq.
 832   //
 833   // If there's no contention try a 1-0 exit.  That is, exit without
 834   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 835   // we detect and recover from the race that the 1-0 exit admits.
 836   //
 837   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 838   // before it STs null into _owner, releasing the lock.  Updates
 839   // to data protected by the critical section must be visible before
 840   // we drop the lock (and thus before any other thread could acquire
 841   // the lock and observe the fields protected by the lock).
 842   // IA32's memory-model is SPO, so STs are ordered with respect to
 843   // each other and there's no need for an explicit barrier (fence).
 844   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
 845 #ifndef _LP64
 846   // Note that we could employ various encoding schemes to reduce
 847   // the number of loads below (currently 4) to just 2 or 3.
 848   // Refer to the comments in synchronizer.cpp.
 849   // In practice the chain of fetches doesn't seem to impact performance, however.
 850   xorptr(boxReg, boxReg);
 851   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 852   jccb  (Assembler::notZero, DONE_LABEL);
 853   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 854   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 855   jccb  (Assembler::notZero, DONE_LABEL);
 856   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 857   jmpb  (DONE_LABEL);
 858 #else // _LP64
 859   // It's inflated
 860   Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath;
 861 
 862   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 863   jccb(Assembler::equal, LNotRecursive);
 864 
 865   // Recursive inflated unlock
 866   decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 867   jmpb(LSuccess);
 868 
 869   bind(LNotRecursive);
 870   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 871   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 872   jccb  (Assembler::notZero, CheckSucc);
 873   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 874   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 875   jmpb  (DONE_LABEL);
 876 
 877   // Try to avoid passing control into the slow_path ...
 878   bind  (CheckSucc);
 879 
 880   // The following optional optimization can be elided if necessary
 881   // Effectively: if (succ == null) goto slow path
 882   // The code reduces the window for a race, however,
 883   // and thus benefits performance.
 884   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 885   jccb  (Assembler::zero, LGoSlowPath);
 886 
 887   xorptr(boxReg, boxReg);
 888   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 889   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 890 
 891   // Memory barrier/fence
 892   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
 893   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
 894   // This is faster on Nehalem and AMD Shanghai/Barcelona.
 895   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
 896   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
 897   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
 898   lock(); addl(Address(rsp, 0), 0);
 899 
 900   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 901   jccb  (Assembler::notZero, LSuccess);
 902 
 903   // Rare inopportune interleaving - race.
 904   // The successor vanished in the small window above.
 905   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
 906   // We need to ensure progress and succession.
 907   // Try to reacquire the lock.
 908   // If that fails then the new owner is responsible for succession and this
 909   // thread needs to take no further action and can exit via the fast path (success).
 910   // If the re-acquire succeeds then pass control into the slow path.
 911   // As implemented, this latter mode is horrible because we generate more
 912   // coherence traffic on the lock *and* artificially extend the critical section
 913   // length by virtue of passing control into the slow path.
 914 
 915   // box is really RAX -- the following CMPXCHG depends on that binding
 916   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
 917   lock();
 918   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 919   // There's no successor so we tried to regrab the lock.
 920   // If that didn't work, then another thread grabbed the
 921   // lock so we're done (and exit was a success).
 922   jccb  (Assembler::notEqual, LSuccess);
 923   // Intentional fall-through into slow path
 924 
 925   bind  (LGoSlowPath);
 926   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 927   jmpb  (DONE_LABEL);
 928 
 929   bind  (LSuccess);
 930   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 931   jmpb  (DONE_LABEL);
 932 
 933 #endif
 934   if (!UseHeavyMonitors) {
 935     bind  (Stacked);
 936     movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 937     lock();
 938     cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 939     // Intentional fall-thru into DONE_LABEL
 940   }
 941   bind(DONE_LABEL);
 942 
 943   // ZFlag == 1 count in fast path
 944   // ZFlag == 0 count in slow path
 945   jccb(Assembler::notZero, NO_COUNT);
 946 
 947   bind(COUNT);
 948   // Count monitors in fast path
 949 #ifndef _LP64
 950   get_thread(tmpReg);
 951   decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
 952 #else // _LP64
 953   decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 954 #endif
 955 
 956   xorl(tmpReg, tmpReg); // Set ZF == 1
 957 
 958   bind(NO_COUNT);
 959 }
 960 
 961 //-------------------------------------------------------------------------------------------
 962 // Generic instructions support for use in .ad files C2 code generation
 963 
 964 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 965   if (dst != src) {
 966     movdqu(dst, src);
 967   }
 968   if (opcode == Op_AbsVD) {
 969     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 970   } else {
 971     assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
 972     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 973   }
 974 }
 975 
 976 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 977   if (opcode == Op_AbsVD) {
 978     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
 979   } else {
 980     assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
 981     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
 982   }
 983 }
 984 
 985 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
 986   if (dst != src) {
 987     movdqu(dst, src);
 988   }
 989   if (opcode == Op_AbsVF) {
 990     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
 991   } else {
 992     assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
 993     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
 994   }
 995 }
 996 
 997 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 998   if (opcode == Op_AbsVF) {
 999     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
1000   } else {
1001     assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
1002     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
1003   }
1004 }
1005 
1006 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
1007   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1008   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
1009 
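       // There is no packed 64-bit min/max instruction below AVX-512, so the T_LONG cases
       // below build a greater-than mask with pcmpgtq and select with blendvpd; the non-VEX
       // blendvpd implicitly uses xmm0 as its mask operand, hence the tmp == xmm0 requirement.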
1010   if (opcode == Op_MinV) {
1011     if (elem_bt == T_BYTE) {
1012       pminsb(dst, src);
1013     } else if (elem_bt == T_SHORT) {
1014       pminsw(dst, src);
1015     } else if (elem_bt == T_INT) {
1016       pminsd(dst, src);
1017     } else {
1018       assert(elem_bt == T_LONG, "required");
1019       assert(tmp == xmm0, "required");
1020       assert_different_registers(dst, src, tmp);
1021       movdqu(xmm0, dst);
1022       pcmpgtq(xmm0, src);
1023       blendvpd(dst, src);  // xmm0 as mask
1024     }
1025   } else { // opcode == Op_MaxV
1026     if (elem_bt == T_BYTE) {
1027       pmaxsb(dst, src);
1028     } else if (elem_bt == T_SHORT) {
1029       pmaxsw(dst, src);
1030     } else if (elem_bt == T_INT) {
1031       pmaxsd(dst, src);
1032     } else {
1033       assert(elem_bt == T_LONG, "required");
1034       assert(tmp == xmm0, "required");
1035       assert_different_registers(dst, src, tmp);
1036       movdqu(xmm0, src);
1037       pcmpgtq(xmm0, dst);
1038       blendvpd(dst, src);  // xmm0 as mask
1039     }
1040   }
1041 }
1042 
1043 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
1044                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
1045                                  int vlen_enc) {
1046   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1047 
1048   if (opcode == Op_MinV) {
1049     if (elem_bt == T_BYTE) {
1050       vpminsb(dst, src1, src2, vlen_enc);
1051     } else if (elem_bt == T_SHORT) {
1052       vpminsw(dst, src1, src2, vlen_enc);
1053     } else if (elem_bt == T_INT) {
1054       vpminsd(dst, src1, src2, vlen_enc);
1055     } else {
1056       assert(elem_bt == T_LONG, "required");
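           // vpminsq needs AVX-512 (AVX512F for 512-bit vectors, AVX512VL for shorter ones);
           // otherwise emulate the 64-bit min with a signed compare plus a blend.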
1057       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1058         vpminsq(dst, src1, src2, vlen_enc);
1059       } else {
1060         assert_different_registers(dst, src1, src2);
1061         vpcmpgtq(dst, src1, src2, vlen_enc);
1062         vblendvpd(dst, src1, src2, dst, vlen_enc);
1063       }
1064     }
1065   } else { // opcode == Op_MaxV
1066     if (elem_bt == T_BYTE) {
1067       vpmaxsb(dst, src1, src2, vlen_enc);
1068     } else if (elem_bt == T_SHORT) {
1069       vpmaxsw(dst, src1, src2, vlen_enc);
1070     } else if (elem_bt == T_INT) {
1071       vpmaxsd(dst, src1, src2, vlen_enc);
1072     } else {
1073       assert(elem_bt == T_LONG, "required");
1074       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1075         vpmaxsq(dst, src1, src2, vlen_enc);
1076       } else {
1077         assert_different_registers(dst, src1, src2);
1078         vpcmpgtq(dst, src1, src2, vlen_enc);
1079         vblendvpd(dst, src2, src1, dst, vlen_enc);
1080       }
1081     }
1082   }
1083 }
1084 
1085 // Float/Double min max
1086 
1087 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
1088                                    XMMRegister dst, XMMRegister a, XMMRegister b,
1089                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1090                                    int vlen_enc) {
1091   assert(UseAVX > 0, "required");
1092   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1093          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1094   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1095   assert_different_registers(a, b, tmp, atmp, btmp);
1096 
1097   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1098   bool is_double_word = is_double_word_type(elem_bt);
1099 
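       // Note: vminps/vmaxps and friends do not match Java semantics on their own: for
       // +0.0/-0.0 or NaN inputs they simply return the second operand. The blends below use
       // the sign bit of 'a' (for min) or 'b' (for max) to pre-order the operands so that
       // -0.0/+0.0 resolve correctly, and the final UNORD_Q compare re-injects a NaN from
       // atmp into dst.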
1100   if (!is_double_word && is_min) {
1101     vblendvps(atmp, a, b, a, vlen_enc);
1102     vblendvps(btmp, b, a, a, vlen_enc);
1103     vminps(tmp, atmp, btmp, vlen_enc);
1104     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1105     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
1106   } else if (!is_double_word && !is_min) {
1107     vblendvps(btmp, b, a, b, vlen_enc);
1108     vblendvps(atmp, a, b, b, vlen_enc);
1109     vmaxps(tmp, atmp, btmp, vlen_enc);
1110     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1111     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
1112   } else if (is_double_word && is_min) {
1113     vblendvpd(atmp, a, b, a, vlen_enc);
1114     vblendvpd(btmp, b, a, a, vlen_enc);
1115     vminpd(tmp, atmp, btmp, vlen_enc);
1116     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1117     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
1118   } else {
1119     assert(is_double_word && !is_min, "sanity");
1120     vblendvpd(btmp, b, a, b, vlen_enc);
1121     vblendvpd(atmp, a, b, b, vlen_enc);
1122     vmaxpd(tmp, atmp, btmp, vlen_enc);
1123     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1124     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
1125   }
1126 }
1127 
1128 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1129                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1130                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1131                                     int vlen_enc) {
1132   assert(UseAVX > 2, "required");
1133   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1134          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1135   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1136   assert_different_registers(dst, a, b, atmp, btmp);
1137 
1138   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1139   bool is_double_word = is_double_word_type(elem_bt);
1140   bool merge = true;
1141 
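       // Same -0.0/+0.0 and NaN handling as vminmax_fp above, expressed with AVX-512 opmasks:
       // evpmovd2m/evpmovq2m copies each element's sign bit into ktmp to pre-order the operands,
       // and the UNORD_Q compare plus the merging evmovdqu re-injects a NaN from atmp into dst.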
1142   if (!is_double_word && is_min) {
1143     evpmovd2m(ktmp, a, vlen_enc);
1144     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1145     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1146     vminps(dst, atmp, btmp, vlen_enc);
1147     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1148     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1149   } else if (!is_double_word && !is_min) {
1150     evpmovd2m(ktmp, b, vlen_enc);
1151     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1152     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1153     vmaxps(dst, atmp, btmp, vlen_enc);
1154     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1155     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1156   } else if (is_double_word && is_min) {
1157     evpmovq2m(ktmp, a, vlen_enc);
1158     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1159     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1160     vminpd(dst, atmp, btmp, vlen_enc);
1161     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1162     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1163   } else {
1164     assert(is_double_word && !is_min, "sanity");
1165     evpmovq2m(ktmp, b, vlen_enc);
1166     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1167     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1168     vmaxpd(dst, atmp, btmp, vlen_enc);
1169     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1170     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1171   }
1172 }
1173 
1174 // Float/Double signum
1175 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1176   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1177 
1178   Label DONE_LABEL;
1179 
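       // The ucomiss/ucomisd below sets the flags once and the following movflt/movdbl leaves
       // them intact: equal -> return the argument (+/-0.0), parity -> return the argument (NaN),
       // above -> return the 1.0 just loaded, otherwise flip its sign bit to produce -1.0.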
1180   if (opcode == Op_SignumF) {
1181     assert(UseSSE > 0, "required");
1182     ucomiss(dst, zero);
1183     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1184     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1185     movflt(dst, one);
1186     jcc(Assembler::above, DONE_LABEL);
1187     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1188   } else if (opcode == Op_SignumD) {
1189     assert(UseSSE > 1, "required");
1190     ucomisd(dst, zero);
1191     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1192     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1193     movdbl(dst, one);
1194     jcc(Assembler::above, DONE_LABEL);
1195     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1196   }
1197 
1198   bind(DONE_LABEL);
1199 }
1200 
1201 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1202   if (sign) {
1203     pmovsxbw(dst, src);
1204   } else {
1205     pmovzxbw(dst, src);
1206   }
1207 }
1208 
1209 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1210   if (sign) {
1211     vpmovsxbw(dst, src, vector_len);
1212   } else {
1213     vpmovzxbw(dst, src, vector_len);
1214   }
1215 }
1216 
1217 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1218   if (sign) {
1219     vpmovsxbd(dst, src, vector_len);
1220   } else {
1221     vpmovzxbd(dst, src, vector_len);
1222   }
1223 }
1224 
1225 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1226   if (sign) {
1227     vpmovsxwd(dst, src, vector_len);
1228   } else {
1229     vpmovzxwd(dst, src, vector_len);
1230   }
1231 }
1232 
1233 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1234                                      int shift, int vector_len) {
1235   if (opcode == Op_RotateLeftV) {
1236     if (etype == T_INT) {
1237       evprold(dst, src, shift, vector_len);
1238     } else {
1239       assert(etype == T_LONG, "expected type T_LONG");
1240       evprolq(dst, src, shift, vector_len);
1241     }
1242   } else {
1243     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1244     if (etype == T_INT) {
1245       evprord(dst, src, shift, vector_len);
1246     } else {
1247       assert(etype == T_LONG, "expected type T_LONG");
1248       evprorq(dst, src, shift, vector_len);
1249     }
1250   }
1251 }
1252 
1253 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1254                                      XMMRegister shift, int vector_len) {
1255   if (opcode == Op_RotateLeftV) {
1256     if (etype == T_INT) {
1257       evprolvd(dst, src, shift, vector_len);
1258     } else {
1259       assert(etype == T_LONG, "expected type T_LONG");
1260       evprolvq(dst, src, shift, vector_len);
1261     }
1262   } else {
1263     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1264     if (etype == T_INT) {
1265       evprorvd(dst, src, shift, vector_len);
1266     } else {
1267       assert(etype == T_LONG, "expected type T_LONG");
1268       evprorvq(dst, src, shift, vector_len);
1269     }
1270   }
1271 }
1272 
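     // Vector shift helpers: vshiftd operates on int lanes, vshiftw on byte/short lanes
     // and vshiftq on long lanes. The *_imm variants take an immediate shift count, the
     // others take the count in an XMM register; the varshift* variants further down
     // take a per-lane shift count.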
1273 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1274   if (opcode == Op_RShiftVI) {
1275     psrad(dst, shift);
1276   } else if (opcode == Op_LShiftVI) {
1277     pslld(dst, shift);
1278   } else {
1279     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1280     psrld(dst, shift);
1281   }
1282 }
1283 
1284 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1285   switch (opcode) {
1286     case Op_RShiftVI:  psrad(dst, shift); break;
1287     case Op_LShiftVI:  pslld(dst, shift); break;
1288     case Op_URShiftVI: psrld(dst, shift); break;
1289 
1290     default: assert(false, "%s", NodeClassNames[opcode]);
1291   }
1292 }
1293 
1294 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1295   if (opcode == Op_RShiftVI) {
1296     vpsrad(dst, nds, shift, vector_len);
1297   } else if (opcode == Op_LShiftVI) {
1298     vpslld(dst, nds, shift, vector_len);
1299   } else {
1300     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1301     vpsrld(dst, nds, shift, vector_len);
1302   }
1303 }
1304 
1305 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1306   switch (opcode) {
1307     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1308     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1309     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1310 
1311     default: assert(false, "%s", NodeClassNames[opcode]);
1312   }
1313 }
1314 
1315 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1316   switch (opcode) {
1317     case Op_RShiftVB:  // fall-through
1318     case Op_RShiftVS:  psraw(dst, shift); break;
1319 
1320     case Op_LShiftVB:  // fall-through
1321     case Op_LShiftVS:  psllw(dst, shift);   break;
1322 
1323     case Op_URShiftVS: // fall-through
1324     case Op_URShiftVB: psrlw(dst, shift);  break;
1325 
1326     default: assert(false, "%s", NodeClassNames[opcode]);
1327   }
1328 }
1329 
1330 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1331   switch (opcode) {
1332     case Op_RShiftVB:  // fall-through
1333     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1334 
1335     case Op_LShiftVB:  // fall-through
1336     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1337 
1338     case Op_URShiftVS: // fall-through
1339     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1340 
1341     default: assert(false, "%s", NodeClassNames[opcode]);
1342   }
1343 }
1344 
1345 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1346   switch (opcode) {
1347     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1348     case Op_LShiftVL:  psllq(dst, shift); break;
1349     case Op_URShiftVL: psrlq(dst, shift); break;
1350 
1351     default: assert(false, "%s", NodeClassNames[opcode]);
1352   }
1353 }
1354 
1355 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1356   if (opcode == Op_RShiftVL) {
1357     psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1358   } else if (opcode == Op_LShiftVL) {
1359     psllq(dst, shift);
1360   } else {
1361     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1362     psrlq(dst, shift);
1363   }
1364 }
1365 
1366 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1367   switch (opcode) {
1368     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1369     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1370     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1371 
1372     default: assert(false, "%s", NodeClassNames[opcode]);
1373   }
1374 }
1375 
1376 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1377   if (opcode == Op_RShiftVL) {
1378     evpsraq(dst, nds, shift, vector_len);
1379   } else if (opcode == Op_LShiftVL) {
1380     vpsllq(dst, nds, shift, vector_len);
1381   } else {
1382     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1383     vpsrlq(dst, nds, shift, vector_len);
1384   }
1385 }
1386 
1387 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1388   switch (opcode) {
1389     case Op_RShiftVB:  // fall-through
1390     case Op_RShiftVS:  // fall-through
1391     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1392 
1393     case Op_LShiftVB:  // fall-through
1394     case Op_LShiftVS:  // fall-through
1395     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1396 
1397     case Op_URShiftVB: // fall-through
1398     case Op_URShiftVS: // fall-through
1399     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1400 
1401     default: assert(false, "%s", NodeClassNames[opcode]);
1402   }
1403 }
1404 
1405 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1406   switch (opcode) {
1407     case Op_RShiftVB:  // fall-through
1408     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1409 
1410     case Op_LShiftVB:  // fall-through
1411     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1412 
1413     case Op_URShiftVB: // fall-through
1414     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1415 
1416     default: assert(false, "%s", NodeClassNames[opcode]);
1417   }
1418 }
1419 
1420 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1421   assert(UseAVX >= 2, "required");
1422   switch (opcode) {
1423     case Op_RShiftVL: {
1424       if (UseAVX > 2) {
1425         assert(tmp == xnoreg, "not used");
1426         if (!VM_Version::supports_avx512vl()) {
1427           vlen_enc = Assembler::AVX_512bit;
1428         }
1429         evpsravq(dst, src, shift, vlen_enc);
1430       } else {
1431         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1432         vpsrlvq(dst, src, shift, vlen_enc);
1433         vpsrlvq(tmp, tmp, shift, vlen_enc);
1434         vpxor(dst, dst, tmp, vlen_enc);
1435         vpsubq(dst, dst, tmp, vlen_enc);
1436       }
1437       break;
1438     }
1439     case Op_LShiftVL: {
1440       assert(tmp == xnoreg, "not used");
1441       vpsllvq(dst, src, shift, vlen_enc);
1442       break;
1443     }
1444     case Op_URShiftVL: {
1445       assert(tmp == xnoreg, "not used");
1446       vpsrlvq(dst, src, shift, vlen_enc);
1447       break;
1448     }
1449     default: assert(false, "%s", NodeClassNames[opcode]);
1450   }
1451 }
1452 
1453 // Variable shift src by shift using vtmp as a TEMP, giving a word result in dst
1454 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1455   assert(opcode == Op_LShiftVB ||
1456          opcode == Op_RShiftVB ||
1457          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1458   bool sign = (opcode != Op_URShiftVB);
1459   assert(vector_len == 0, "required");
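       // Widen the 8 byte lanes to dwords in a 256-bit register, do the per-lane dword
       // shift, mask each lane back to byte range and pack the dwords down to words in
       // the low 128 bits.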
1460   vextendbd(sign, dst, src, 1);
1461   vpmovzxbd(vtmp, shift, 1);
1462   varshiftd(opcode, dst, dst, vtmp, 1);
1463   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1464   vextracti128_high(vtmp, dst);
1465   vpackusdw(dst, dst, vtmp, 0);
1466 }
1467 
1468 // Variable shift src by shift using vtmp as a TEMP, giving a byte result in dst
1469 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1470   assert(opcode == Op_LShiftVB ||
1471          opcode == Op_RShiftVB ||
1472          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1473   bool sign = (opcode != Op_URShiftVB);
1474   int ext_vector_len = vector_len + 1;
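       // Widen the byte lanes to words at twice the vector width, do the per-lane word
       // shift, mask the results back to byte range and pack down to bytes again. For
       // results wider than 128 bits vpackuswb packs within 128-bit lanes, so
       // vpermq(0xD8) restores the original lane order.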
1475   vextendbw(sign, dst, src, ext_vector_len);
1476   vpmovzxbw(vtmp, shift, ext_vector_len);
1477   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1478   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1479   if (vector_len == 0) {
1480     vextracti128_high(vtmp, dst);
1481     vpackuswb(dst, dst, vtmp, vector_len);
1482   } else {
1483     vextracti64x4_high(vtmp, dst);
1484     vpackuswb(dst, dst, vtmp, vector_len);
1485     vpermq(dst, dst, 0xD8, vector_len);
1486   }
1487 }
1488 
1489 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1490   switch(typ) {
1491     case T_BYTE:
1492       pinsrb(dst, val, idx);
1493       break;
1494     case T_SHORT:
1495       pinsrw(dst, val, idx);
1496       break;
1497     case T_INT:
1498       pinsrd(dst, val, idx);
1499       break;
1500     case T_LONG:
1501       pinsrq(dst, val, idx);
1502       break;
1503     default:
1504       assert(false,"Should not reach here.");
1505       break;
1506   }
1507 }
1508 
1509 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1510   switch(typ) {
1511     case T_BYTE:
1512       vpinsrb(dst, src, val, idx);
1513       break;
1514     case T_SHORT:
1515       vpinsrw(dst, src, val, idx);
1516       break;
1517     case T_INT:
1518       vpinsrd(dst, src, val, idx);
1519       break;
1520     case T_LONG:
1521       vpinsrq(dst, src, val, idx);
1522       break;
1523     default:
1524       assert(false,"Should not reach here.");
1525       break;
1526   }
1527 }
1528 
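     // Gather/scatter helpers: 'idx' holds per-lane dword indices that are scaled by the
     // element size, and 'mask'/'kmask' supply the per-lane predicate required by the
     // gather/scatter instructions.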
1529 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1530   switch(typ) {
1531     case T_INT:
1532       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1533       break;
1534     case T_FLOAT:
1535       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1536       break;
1537     case T_LONG:
1538       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1539       break;
1540     case T_DOUBLE:
1541       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1542       break;
1543     default:
1544       assert(false,"Should not reach here.");
1545       break;
1546   }
1547 }
1548 
1549 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1550   switch(typ) {
1551     case T_INT:
1552       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1553       break;
1554     case T_FLOAT:
1555       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1556       break;
1557     case T_LONG:
1558       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1559       break;
1560     case T_DOUBLE:
1561       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1562       break;
1563     default:
1564       assert(false,"Should not reach here.");
1565       break;
1566   }
1567 }
1568 
1569 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1570   switch(typ) {
1571     case T_INT:
1572       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1573       break;
1574     case T_FLOAT:
1575       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1576       break;
1577     case T_LONG:
1578       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1579       break;
1580     case T_DOUBLE:
1581       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1582       break;
1583     default:
1584       assert(false,"Should not reach here.");
1585       break;
1586   }
1587 }
1588 
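     // Expand a boolean vector (one byte per lane, 0 or 1) into a lane-wide mask of
     // all-zeros/all-ones: negate the bytes (0 - 1 == 0xFF) and sign-extend the result
     // to the element size.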
1589 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1590   if (vlen_in_bytes <= 16) {
1591     pxor (dst, dst);
1592     psubb(dst, src);
1593     switch (elem_bt) {
1594       case T_BYTE:   /* nothing to do */ break;
1595       case T_SHORT:  pmovsxbw(dst, dst); break;
1596       case T_INT:    pmovsxbd(dst, dst); break;
1597       case T_FLOAT:  pmovsxbd(dst, dst); break;
1598       case T_LONG:   pmovsxbq(dst, dst); break;
1599       case T_DOUBLE: pmovsxbq(dst, dst); break;
1600 
1601       default: assert(false, "%s", type2name(elem_bt));
1602     }
1603   } else {
1604     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1605     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1606 
1607     vpxor (dst, dst, dst, vlen_enc);
1608     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1609 
1610     switch (elem_bt) {
1611       case T_BYTE:   /* nothing to do */            break;
1612       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1613       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1614       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1615       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1616       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1617 
1618       default: assert(false, "%s", type2name(elem_bt));
1619     }
1620   }
1621 }
1622 
1623 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1624   if (novlbwdq) {
1625     vpmovsxbd(xtmp, src, vlen_enc);
1626     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1627             Assembler::eq, true, vlen_enc, noreg);
1628   } else {
1629     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1630     vpsubb(xtmp, xtmp, src, vlen_enc);
1631     evpmovb2m(dst, xtmp, vlen_enc);
1632   }
1633 }
1634 
1635 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) {
1636   switch (vlen_in_bytes) {
1637     case 4:  movdl(dst, src);   break;
1638     case 8:  movq(dst, src);    break;
1639     case 16: movdqu(dst, src);  break;
1640     case 32: vmovdqu(dst, src); break;
1641     case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1642     default: ShouldNotReachHere();
1643   }
1644 }
1645 
1646 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1647   assert(rscratch != noreg || always_reachable(src), "missing");
1648 
1649   if (reachable(src)) {
1650     load_vector(dst, as_Address(src), vlen_in_bytes);
1651   } else {
1652     lea(rscratch, src);
1653     load_vector(dst, Address(rscratch, 0), vlen_in_bytes);
1654   }
1655 }
1656 
1657 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1658   int vlen_enc = vector_length_encoding(vlen);
1659   if (VM_Version::supports_avx()) {
1660     if (bt == T_LONG) {
1661       if (VM_Version::supports_avx2()) {
1662         vpbroadcastq(dst, src, vlen_enc);
1663       } else {
1664         vmovddup(dst, src, vlen_enc);
1665       }
1666     } else if (bt == T_DOUBLE) {
1667       if (vlen_enc != Assembler::AVX_128bit) {
1668         vbroadcastsd(dst, src, vlen_enc, noreg);
1669       } else {
1670         vmovddup(dst, src, vlen_enc);
1671       }
1672     } else {
1673       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1674         vpbroadcastd(dst, src, vlen_enc);
1675       } else {
1676         vbroadcastss(dst, src, vlen_enc);
1677       }
1678     }
1679   } else if (VM_Version::supports_sse3()) {
1680     movddup(dst, src);
1681   } else {
1682     movq(dst, src);
1683     if (vlen == 16) {
1684       punpcklqdq(dst, dst);
1685     }
1686   }
1687 }
1688 
1689 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1690   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
1691   int offset = exact_log2(type2aelembytes(bt)) << 6;
1692   if (is_floating_point_type(bt)) {
1693     offset += 128;
1694   }
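       // e.g. T_BYTE -> 0, T_SHORT -> 64, T_INT -> 128, T_LONG -> 192,
       //      T_FLOAT -> 128 + 128 = 256, T_DOUBLE -> 192 + 128 = 320,
       // matching the B/S/I/L/F/D layout described above.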
1695   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1696   load_vector(dst, addr, vlen_in_bytes);
1697 }
1698 
1699 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
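     //
     // The integral reductions repeatedly fold the upper half of the vector into the
     // lower half (e.g. reduce16I -> reduce8I -> ... -> reduce2I) and finally combine
     // the scalar accumulator src1 with the last remaining lane; reduce_operation_128/256
     // select the packed instruction matching the opcode and element type.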
1700 
1701 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1702   int vector_len = Assembler::AVX_128bit;
1703 
1704   switch (opcode) {
1705     case Op_AndReductionV:  pand(dst, src); break;
1706     case Op_OrReductionV:   por (dst, src); break;
1707     case Op_XorReductionV:  pxor(dst, src); break;
1708     case Op_MinReductionV:
1709       switch (typ) {
1710         case T_BYTE:        pminsb(dst, src); break;
1711         case T_SHORT:       pminsw(dst, src); break;
1712         case T_INT:         pminsd(dst, src); break;
1713         case T_LONG:        assert(UseAVX > 2, "required");
1714                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1715         default:            assert(false, "wrong type");
1716       }
1717       break;
1718     case Op_MaxReductionV:
1719       switch (typ) {
1720         case T_BYTE:        pmaxsb(dst, src); break;
1721         case T_SHORT:       pmaxsw(dst, src); break;
1722         case T_INT:         pmaxsd(dst, src); break;
1723         case T_LONG:        assert(UseAVX > 2, "required");
1724                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1725         default:            assert(false, "wrong type");
1726       }
1727       break;
1728     case Op_AddReductionVF: addss(dst, src); break;
1729     case Op_AddReductionVD: addsd(dst, src); break;
1730     case Op_AddReductionVI:
1731       switch (typ) {
1732         case T_BYTE:        paddb(dst, src); break;
1733         case T_SHORT:       paddw(dst, src); break;
1734         case T_INT:         paddd(dst, src); break;
1735         default:            assert(false, "wrong type");
1736       }
1737       break;
1738     case Op_AddReductionVL: paddq(dst, src); break;
1739     case Op_MulReductionVF: mulss(dst, src); break;
1740     case Op_MulReductionVD: mulsd(dst, src); break;
1741     case Op_MulReductionVI:
1742       switch (typ) {
1743         case T_SHORT:       pmullw(dst, src); break;
1744         case T_INT:         pmulld(dst, src); break;
1745         default:            assert(false, "wrong type");
1746       }
1747       break;
1748     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1749                             evpmullq(dst, dst, src, vector_len); break;
1750     default:                assert(false, "wrong opcode");
1751   }
1752 }
1753 
1754 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1755   int vector_len = Assembler::AVX_256bit;
1756 
1757   switch (opcode) {
1758     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1759     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1760     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1761     case Op_MinReductionV:
1762       switch (typ) {
1763         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1764         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1765         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1766         case T_LONG:        assert(UseAVX > 2, "required");
1767                             vpminsq(dst, src1, src2, vector_len); break;
1768         default:            assert(false, "wrong type");
1769       }
1770       break;
1771     case Op_MaxReductionV:
1772       switch (typ) {
1773         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1774         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1775         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1776         case T_LONG:        assert(UseAVX > 2, "required");
1777                             vpmaxsq(dst, src1, src2, vector_len); break;
1778         default:            assert(false, "wrong type");
1779       }
1780       break;
1781     case Op_AddReductionVI:
1782       switch (typ) {
1783         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1784         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1785         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1786         default:            assert(false, "wrong type");
1787       }
1788       break;
1789     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1790     case Op_MulReductionVI:
1791       switch (typ) {
1792         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1793         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1794         default:            assert(false, "wrong type");
1795       }
1796       break;
1797     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
1798     default:                assert(false, "wrong opcode");
1799   }
1800 }
1801 
1802 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1803                                   XMMRegister dst, XMMRegister src,
1804                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1805   switch (opcode) {
1806     case Op_AddReductionVF:
1807     case Op_MulReductionVF:
1808       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1809       break;
1810 
1811     case Op_AddReductionVD:
1812     case Op_MulReductionVD:
1813       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1814       break;
1815 
1816     default: assert(false, "wrong opcode");
1817   }
1818 }
1819 
1820 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1821                              Register dst, Register src1, XMMRegister src2,
1822                              XMMRegister vtmp1, XMMRegister vtmp2) {
1823   switch (vlen) {
1824     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1825     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1826     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1827     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1828 
1829     default: assert(false, "wrong vector length");
1830   }
1831 }
1832 
1833 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1834                              Register dst, Register src1, XMMRegister src2,
1835                              XMMRegister vtmp1, XMMRegister vtmp2) {
1836   switch (vlen) {
1837     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1838     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1839     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1840     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1841 
1842     default: assert(false, "wrong vector length");
1843   }
1844 }
1845 
1846 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1847                              Register dst, Register src1, XMMRegister src2,
1848                              XMMRegister vtmp1, XMMRegister vtmp2) {
1849   switch (vlen) {
1850     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1851     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1852     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1853     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1854 
1855     default: assert(false, "wrong vector length");
1856   }
1857 }
1858 
1859 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1860                              Register dst, Register src1, XMMRegister src2,
1861                              XMMRegister vtmp1, XMMRegister vtmp2) {
1862   switch (vlen) {
1863     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1864     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1865     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1866     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1867 
1868     default: assert(false, "wrong vector length");
1869   }
1870 }
1871 
1872 #ifdef _LP64
1873 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1874                              Register dst, Register src1, XMMRegister src2,
1875                              XMMRegister vtmp1, XMMRegister vtmp2) {
1876   switch (vlen) {
1877     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1878     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1879     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1880 
1881     default: assert(false, "wrong vector length");
1882   }
1883 }
1884 #endif // _LP64
1885 
1886 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1887   switch (vlen) {
1888     case 2:
1889       assert(vtmp2 == xnoreg, "");
1890       reduce2F(opcode, dst, src, vtmp1);
1891       break;
1892     case 4:
1893       assert(vtmp2 == xnoreg, "");
1894       reduce4F(opcode, dst, src, vtmp1);
1895       break;
1896     case 8:
1897       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1898       break;
1899     case 16:
1900       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1901       break;
1902     default: assert(false, "wrong vector length");
1903   }
1904 }
1905 
1906 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1907   switch (vlen) {
1908     case 2:
1909       assert(vtmp2 == xnoreg, "");
1910       reduce2D(opcode, dst, src, vtmp1);
1911       break;
1912     case 4:
1913       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1914       break;
1915     case 8:
1916       reduce8D(opcode, dst, src, vtmp1, vtmp2);
1917       break;
1918     default: assert(false, "wrong vector length");
1919   }
1920 }
1921 
1922 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1923   if (opcode == Op_AddReductionVI) {
1924     if (vtmp1 != src2) {
1925       movdqu(vtmp1, src2);
1926     }
1927     phaddd(vtmp1, vtmp1);
1928   } else {
1929     pshufd(vtmp1, src2, 0x1);
1930     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1931   }
1932   movdl(vtmp2, src1);
1933   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1934   movdl(dst, vtmp1);
1935 }
1936 
1937 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1938   if (opcode == Op_AddReductionVI) {
1939     if (vtmp1 != src2) {
1940       movdqu(vtmp1, src2);
1941     }
1942     phaddd(vtmp1, src2);
1943     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1944   } else {
1945     pshufd(vtmp2, src2, 0xE);
1946     reduce_operation_128(T_INT, opcode, vtmp2, src2);
1947     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1948   }
1949 }
1950 
1951 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1952   if (opcode == Op_AddReductionVI) {
1953     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
1954     vextracti128_high(vtmp2, vtmp1);
1955     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
1956     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1957   } else {
1958     vextracti128_high(vtmp1, src2);
1959     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1960     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1961   }
1962 }
1963 
1964 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1965   vextracti64x4_high(vtmp2, src2);
1966   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
1967   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1968 }
1969 
1970 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1971   pshufd(vtmp2, src2, 0x1);
1972   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1973   movdqu(vtmp1, vtmp2);
1974   psrldq(vtmp1, 2);
1975   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1976   movdqu(vtmp2, vtmp1);
1977   psrldq(vtmp2, 1);
1978   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1979   movdl(vtmp2, src1);
1980   pmovsxbd(vtmp1, vtmp1);
1981   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1982   pextrb(dst, vtmp1, 0x0);
1983   movsbl(dst, dst);
1984 }
1985 
1986 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1987   pshufd(vtmp1, src2, 0xE);
1988   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
1989   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1990 }
1991 
1992 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1993   vextracti128_high(vtmp2, src2);
1994   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1995   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1996 }
1997 
1998 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1999   vextracti64x4_high(vtmp1, src2);
2000   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2001   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2002 }
2003 
2004 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2005   pmovsxbw(vtmp2, src2);
2006   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2007 }
2008 
2009 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2010   if (UseAVX > 1) {
2011     int vector_len = Assembler::AVX_256bit;
2012     vpmovsxbw(vtmp1, src2, vector_len);
2013     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2014   } else {
2015     pmovsxbw(vtmp2, src2);
2016     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2017     pshufd(vtmp2, src2, 0xE);   // bring the upper 8 bytes of src2 down
2018     pmovsxbw(vtmp2, vtmp2);     // and sign-extend them to 8 words
2019     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2020   }
2021 }
2022 
2023 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2024   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2025     int vector_len = Assembler::AVX_512bit;
2026     vpmovsxbw(vtmp1, src2, vector_len);
2027     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2028   } else {
2029     assert(UseAVX >= 2,"Should not reach here.");
2030     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2031     vextracti128_high(vtmp2, src2);
2032     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2033   }
2034 }
2035 
2036 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2037   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2038   vextracti64x4_high(vtmp2, src2);
2039   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2040 }
2041 
2042 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2043   if (opcode == Op_AddReductionVI) {
2044     if (vtmp1 != src2) {
2045       movdqu(vtmp1, src2);
2046     }
2047     phaddw(vtmp1, vtmp1);
2048     phaddw(vtmp1, vtmp1);
2049   } else {
2050     pshufd(vtmp2, src2, 0x1);
2051     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2052     movdqu(vtmp1, vtmp2);
2053     psrldq(vtmp1, 2);
2054     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2055   }
2056   movdl(vtmp2, src1);
2057   pmovsxwd(vtmp1, vtmp1);
2058   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2059   pextrw(dst, vtmp1, 0x0);
2060   movswl(dst, dst);
2061 }
2062 
2063 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2064   if (opcode == Op_AddReductionVI) {
2065     if (vtmp1 != src2) {
2066       movdqu(vtmp1, src2);
2067     }
2068     phaddw(vtmp1, src2);
2069   } else {
2070     pshufd(vtmp1, src2, 0xE);
2071     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2072   }
2073   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2074 }
2075 
2076 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2077   if (opcode == Op_AddReductionVI) {
2078     int vector_len = Assembler::AVX_256bit;
2079     vphaddw(vtmp2, src2, src2, vector_len);
2080     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2081   } else {
2082     vextracti128_high(vtmp2, src2);
2083     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2084   }
2085   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2086 }
2087 
2088 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2089   int vector_len = Assembler::AVX_256bit;
2090   vextracti64x4_high(vtmp1, src2);
2091   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2092   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2093 }
2094 
2095 #ifdef _LP64
2096 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2097   pshufd(vtmp2, src2, 0xE);
2098   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2099   movdq(vtmp1, src1);
2100   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2101   movdq(dst, vtmp1);
2102 }
2103 
2104 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2105   vextracti128_high(vtmp1, src2);
2106   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2107   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2108 }
2109 
2110 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2111   vextracti64x4_high(vtmp2, src2);
2112   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2113   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2114 }
2115 
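     // Build a k-register mask with the low 'len' bits set: bzhi clears all bits of -1
     // from bit index 'len' upwards.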
2116 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
2117   mov64(temp, -1L);
2118   bzhiq(temp, temp, len);
2119   kmovql(dst, temp);
2120 }
2121 #endif // _LP64
2122 
2123 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2124   reduce_operation_128(T_FLOAT, opcode, dst, src);
2125   pshufd(vtmp, src, 0x1);
2126   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2127 }
2128 
2129 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2130   reduce2F(opcode, dst, src, vtmp);
2131   pshufd(vtmp, src, 0x2);
2132   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2133   pshufd(vtmp, src, 0x3);
2134   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2135 }
2136 
2137 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2138   reduce4F(opcode, dst, src, vtmp2);
2139   vextractf128_high(vtmp2, src);
2140   reduce4F(opcode, dst, vtmp2, vtmp1);
2141 }
2142 
2143 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2144   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2145   vextracti64x4_high(vtmp1, src);
2146   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2147 }
2148 
2149 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2150   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2151   pshufd(vtmp, src, 0xE);
2152   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2153 }
2154 
2155 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2156   reduce2D(opcode, dst, src, vtmp2);
2157   vextractf128_high(vtmp2, src);
2158   reduce2D(opcode, dst, vtmp2, vtmp1);
2159 }
2160 
2161 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2162   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2163   vextracti64x4_high(vtmp1, src);
2164   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2165 }
2166 
2167 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2168   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2169 }
2170 
2171 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2172   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2173 }
2174 
2175 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2176                                  int vec_enc) {
2177   switch(elem_bt) {
2178     case T_INT:
2179     case T_FLOAT:
2180       vmaskmovps(dst, src, mask, vec_enc);
2181       break;
2182     case T_LONG:
2183     case T_DOUBLE:
2184       vmaskmovpd(dst, src, mask, vec_enc);
2185       break;
2186     default:
2187       fatal("Unsupported type %s", type2name(elem_bt));
2188       break;
2189   }
2190 }
2191 
2192 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2193                                  int vec_enc) {
2194   switch(elem_bt) {
2195     case T_INT:
2196     case T_FLOAT:
2197       vmaskmovps(dst, src, mask, vec_enc);
2198       break;
2199     case T_LONG:
2200     case T_DOUBLE:
2201       vmaskmovpd(dst, src, mask, vec_enc);
2202       break;
2203     default:
2204       fatal("Unsupported type %s", type2name(elem_bt));
2205       break;
2206   }
2207 }
2208 
2209 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2210                                           XMMRegister dst, XMMRegister src,
2211                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2212                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2213   int permconst[] = {1, 14};
2214   XMMRegister wsrc = src;
2215   XMMRegister wdst = xmm_0;
2216   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2217 
2218   int vlen_enc = Assembler::AVX_128bit;
2219   if (vlen == 16) {
2220     vlen_enc = Assembler::AVX_256bit;
2221   }
2222 
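       // Each iteration folds the upper half of the working vector into the lower half:
       // i == 3/2 extract the upper 256/128 bits, i == 1/0 use vpermilps (permconst) to
       // bring the upper pair / the odd element of a 128-bit lane down before the
       // min/max combine.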
2223   for (int i = log2(vlen) - 1; i >=0; i--) {
2224     if (i == 0 && !is_dst_valid) {
2225       wdst = dst;
2226     }
2227     if (i == 3) {
2228       vextracti64x4_high(wtmp, wsrc);
2229     } else if (i == 2) {
2230       vextracti128_high(wtmp, wsrc);
2231     } else { // i = [0,1]
2232       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2233     }
2234     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2235     wsrc = wdst;
2236     vlen_enc = Assembler::AVX_128bit;
2237   }
2238   if (is_dst_valid) {
2239     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2240   }
2241 }
2242 
2243 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2244                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2245                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2246   XMMRegister wsrc = src;
2247   XMMRegister wdst = xmm_0;
2248   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2249   int vlen_enc = Assembler::AVX_128bit;
2250   if (vlen == 8) {
2251     vlen_enc = Assembler::AVX_256bit;
2252   }
2253   for (int i = log2(vlen) - 1; i >=0; i--) {
2254     if (i == 0 && !is_dst_valid) {
2255       wdst = dst;
2256     }
2257     if (i == 1) {
2258       vextracti128_high(wtmp, wsrc);
2259     } else if (i == 2) {
2260       vextracti64x4_high(wtmp, wsrc);
2261     } else {
2262       assert(i == 0, "%d", i);
2263       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2264     }
2265     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2266     wsrc = wdst;
2267     vlen_enc = Assembler::AVX_128bit;
2268   }
2269   if (is_dst_valid) {
2270     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2271   }
2272 }
2273 
2274 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2275   switch (bt) {
2276     case T_BYTE:  pextrb(dst, src, idx); break;
2277     case T_SHORT: pextrw(dst, src, idx); break;
2278     case T_INT:   pextrd(dst, src, idx); break;
2279     case T_LONG:  pextrq(dst, src, idx); break;
2280 
2281     default:
2282       assert(false,"Should not reach here.");
2283       break;
2284   }
2285 }
2286 
2287 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2288   int esize =  type2aelembytes(typ);
2289   int elem_per_lane = 16/esize;
2290   int lane = elemindex / elem_per_lane;
2291   int eindex = elemindex % elem_per_lane;
2292 
2293   if (lane >= 2) {
2294     assert(UseAVX > 2, "required");
2295     vextractf32x4(dst, src, lane & 3);
2296     return dst;
2297   } else if (lane > 0) {
2298     assert(UseAVX > 0, "required");
2299     vextractf128(dst, src, lane);
2300     return dst;
2301   } else {
2302     return src;
2303   }
2304 }
2305 
2306 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2307   int esize =  type2aelembytes(typ);
2308   int elem_per_lane = 16/esize;
2309   int eindex = elemindex % elem_per_lane;
2310   assert(is_integral_type(typ),"required");
2311 
2312   if (eindex == 0) {
2313     if (typ == T_LONG) {
2314       movq(dst, src);
2315     } else {
2316       movdl(dst, src);
2317       if (typ == T_BYTE)
2318         movsbl(dst, dst);
2319       else if (typ == T_SHORT)
2320         movswl(dst, dst);
2321     }
2322   } else {
2323     extract(typ, dst, src, eindex);
2324   }
2325 }
2326 
2327 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2328   int esize =  type2aelembytes(typ);
2329   int elem_per_lane = 16/esize;
2330   int eindex = elemindex % elem_per_lane;
2331   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2332 
2333   if (eindex == 0) {
2334     movq(dst, src);
2335   } else {
2336     if (typ == T_FLOAT) {
2337       if (UseAVX == 0) {
2338         movdqu(dst, src);
2339         shufps(dst, dst, eindex);
2340       } else {
2341         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2342       }
2343     } else {
2344       if (UseAVX == 0) {
2345         movdqu(dst, src);
2346         psrldq(dst, eindex*esize);
2347       } else {
2348         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2349       }
2350       movq(dst, dst);
2351     }
2352   }
2353   // Zero upper bits
2354   if (typ == T_FLOAT) {
2355     if (UseAVX == 0) {
2356       assert(vtmp != xnoreg, "required.");
2357       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2358       pand(dst, vtmp);
2359     } else {
2360       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2361     }
2362   }
2363 }
2364 
2365 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2366   switch(typ) {
2367     case T_BYTE:
2368     case T_BOOLEAN:
2369       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2370       break;
2371     case T_SHORT:
2372     case T_CHAR:
2373       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2374       break;
2375     case T_INT:
2376     case T_FLOAT:
2377       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2378       break;
2379     case T_LONG:
2380     case T_DOUBLE:
2381       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2382       break;
2383     default:
2384       assert(false,"Should not reach here.");
2385       break;
2386   }
2387 }
2388 
2389 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2390   assert(rscratch != noreg || always_reachable(src2), "missing");
2391 
2392   switch(typ) {
2393     case T_BOOLEAN:
2394     case T_BYTE:
2395       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2396       break;
2397     case T_CHAR:
2398     case T_SHORT:
2399       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2400       break;
2401     case T_INT:
2402     case T_FLOAT:
2403       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2404       break;
2405     case T_LONG:
2406     case T_DOUBLE:
2407       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2408       break;
2409     default:
2410       assert(false,"Should not reach here.");
2411       break;
2412   }
2413 }
2414 
2415 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2416   switch(typ) {
2417     case T_BYTE:
2418       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2419       break;
2420     case T_SHORT:
2421       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2422       break;
2423     case T_INT:
2424     case T_FLOAT:
2425       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2426       break;
2427     case T_LONG:
2428     case T_DOUBLE:
2429       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2430       break;
2431     default:
2432       assert(false,"Should not reach here.");
2433       break;
2434   }
2435 }
2436 
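     // Set the condition flags for a vector test (any-true/all-true) via ptest/vptest or
     // vtestps of src1 against src2. For vectors shorter than 16 bytes the valid low part
     // of src1 is duplicated across the register so that the unused upper lanes cannot
     // influence the flags.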
2437 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2438   assert(vlen_in_bytes <= 32, "");
2439   int esize = type2aelembytes(bt);
2440   if (vlen_in_bytes == 32) {
2441     assert(vtmp == xnoreg, "required.");
2442     if (esize >= 4) {
2443       vtestps(src1, src2, AVX_256bit);
2444     } else {
2445       vptest(src1, src2, AVX_256bit);
2446     }
2447     return;
2448   }
2449   if (vlen_in_bytes < 16) {
2450     // Duplicate the lower part to fill the whole register;
2451     // there is no need to do so for src2.
2452     assert(vtmp != xnoreg, "required");
2453     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2454     pshufd(vtmp, src1, shuffle_imm);
2455   } else {
2456     assert(vtmp == xnoreg, "required");
2457     vtmp = src1;
2458   }
2459   if (esize >= 4 && VM_Version::supports_avx()) {
2460     vtestps(vtmp, src2, AVX_128bit);
2461   } else {
2462     ptest(vtmp, src2);
2463   }
2464 }
2465 
2466 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2467   assert(UseAVX >= 2, "required");
2468 #ifdef ASSERT
2469   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2470   bool is_bw_supported = VM_Version::supports_avx512bw();
2471   if (is_bw && !is_bw_supported) {
2472     assert(vlen_enc != Assembler::AVX_512bit, "required");
2473     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2474            "XMM register should be 0-15");
2475   }
2476 #endif // ASSERT
2477   switch (elem_bt) {
2478     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2479     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2480     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2481     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2482     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2483     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2484     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2485   }
2486 }
2487 
2488 #ifdef _LP64
2489 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2490   assert(UseAVX >= 2, "required");
2491   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2492   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2493   if ((UseAVX > 2) &&
2494       (!is_bw || VM_Version::supports_avx512bw()) &&
2495       (!is_vl || VM_Version::supports_avx512vl())) {
2496     switch (elem_bt) {
2497       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2498       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2499       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2500       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2501       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2502     }
2503   } else {
2504     assert(vlen_enc != Assembler::AVX_512bit, "required");
2505     assert((dst->encoding() < 16),"XMM register should be 0-15");
2506     switch (elem_bt) {
2507       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2508       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2509       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2510       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2511       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2512       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2513       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2514     }
2515   }
2516 }
2517 #endif
2518 
2519 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2520   switch (to_elem_bt) {
2521     case T_SHORT:
2522       vpmovsxbw(dst, src, vlen_enc);
2523       break;
2524     case T_INT:
2525       vpmovsxbd(dst, src, vlen_enc);
2526       break;
2527     case T_FLOAT:
2528       vpmovsxbd(dst, src, vlen_enc);
2529       vcvtdq2ps(dst, dst, vlen_enc);
2530       break;
2531     case T_LONG:
2532       vpmovsxbq(dst, src, vlen_enc);
2533       break;
2534     case T_DOUBLE: {
2535       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2536       vpmovsxbd(dst, src, mid_vlen_enc);
2537       vcvtdq2pd(dst, dst, vlen_enc);
2538       break;
2539     }
2540     default:
2541       fatal("Unsupported type %s", type2name(to_elem_bt));
2542       break;
2543   }
2544 }
2545 
2546 //-------------------------------------------------------------------------------------------
2547 
2548 // IndexOf for constant substrings with size >= 8 chars
2549 // which don't need to be loaded through stack.
2550 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2551                                          Register cnt1, Register cnt2,
2552                                          int int_cnt2,  Register result,
2553                                          XMMRegister vec, Register tmp,
2554                                          int ae) {
2555   ShortBranchVerifier sbv(this);
2556   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2557   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2558 
2559   // This method uses the pcmpestri instruction with bound registers
2560   //   inputs:
2561   //     xmm - substring
2562   //     rax - substring length (elements count)
2563   //     mem - scanned string
2564   //     rdx - string length (elements count)
2565   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2566   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2567   //   outputs:
2568   //     rcx - matched index in string
2569   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2570   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2571   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2572   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2573   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
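       // scale1 is the element size of the scanned string and scale2 that of the
       // substring; they differ only for UL, where the substring is Latin-1 while the
       // string is UTF-16.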
2574 
2575   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2576         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2577         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2578 
2579   // Note, inline_string_indexOf() generates checks:
2580   // if (substr.count > string.count) return -1;
2581   // if (substr.count == 0) return 0;
2582   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2583 
2584   // Load substring.
2585   if (ae == StrIntrinsicNode::UL) {
2586     pmovzxbw(vec, Address(str2, 0));
2587   } else {
2588     movdqu(vec, Address(str2, 0));
2589   }
2590   movl(cnt2, int_cnt2);
2591   movptr(result, str1); // string addr
2592 
2593   if (int_cnt2 > stride) {
2594     jmpb(SCAN_TO_SUBSTR);
2595 
2596     // Reload substr for rescan; this code is
2597     // executed only for large substrings (> 8 chars).
2598     bind(RELOAD_SUBSTR);
2599     if (ae == StrIntrinsicNode::UL) {
2600       pmovzxbw(vec, Address(str2, 0));
2601     } else {
2602       movdqu(vec, Address(str2, 0));
2603     }
2604     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2605 
2606     bind(RELOAD_STR);
2607     // We came here after the beginning of the substring was
2608     // matched but the rest of it was not, so we need to search
2609     // again. Start from the next element after the previous match.
2610 
2611     // cnt2 is the number of remaining substring elements and
2612     // cnt1 is the number of remaining string elements when the compare failed.
2613     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2614     subl(cnt1, cnt2);
2615     addl(cnt1, int_cnt2);
2616     movl(cnt2, int_cnt2); // Now restore cnt2
2617 
2618     decrementl(cnt1);     // Shift to next element
2619     cmpl(cnt1, cnt2);
2620     jcc(Assembler::negative, RET_NOT_FOUND);  // Fewer string elements left than the substring
2621 
2622     addptr(result, (1<<scale1));
2623 
2624   } // (int_cnt2 > 8)
2625 
2626   // Scan string for start of substr in 16-byte vectors
2627   bind(SCAN_TO_SUBSTR);
2628   pcmpestri(vec, Address(result, 0), mode);
2629   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2630   subl(cnt1, stride);
2631   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2632   cmpl(cnt1, cnt2);
2633   jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer string elements left than the substring
2634   addptr(result, 16);
2635   jmpb(SCAN_TO_SUBSTR);
2636 
2637   // Found a potential substr
2638   bind(FOUND_CANDIDATE);
2639   // Matched whole vector if first element matched (tmp(rcx) == 0).
2640   if (int_cnt2 == stride) {
2641     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2642   } else { // int_cnt2 > 8
2643     jccb(Assembler::overflow, FOUND_SUBSTR);
2644   }
2645   // After pcmpestri tmp(rcx) contains matched element index
2646   // Compute start addr of substr
2647   lea(result, Address(result, tmp, scale1));
2648 
2649   // Make sure string is still long enough
2650   subl(cnt1, tmp);
2651   cmpl(cnt1, cnt2);
2652   if (int_cnt2 == stride) {
2653     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2654   } else { // int_cnt2 > 8
2655     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2656   }
2657   // Left less than substring.
2658 
2659   bind(RET_NOT_FOUND);
2660   movl(result, -1);
2661   jmp(EXIT);
2662 
2663   if (int_cnt2 > stride) {
2664     // This code is optimized for the case when the whole substring
2665     // is matched once its head is matched.
2666     bind(MATCH_SUBSTR_HEAD);
2667     pcmpestri(vec, Address(result, 0), mode);
2668     // Reload only the string if it does not match
2669     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2670 
2671     Label CONT_SCAN_SUBSTR;
2672     // Compare the rest of substring (> 8 chars).
2673     bind(FOUND_SUBSTR);
2674     // First 8 chars are already matched.
2675     negptr(cnt2);
2676     addptr(cnt2, stride);
2677 
2678     bind(SCAN_SUBSTR);
2679     subl(cnt1, stride);
2680     cmpl(cnt2, -stride); // Do not read beyond substring
2681     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2682     // Back-up strings to avoid reading beyond substring:
2683     // cnt1 = cnt1 - cnt2 + 8
2684     addl(cnt1, cnt2); // cnt2 is negative
2685     addl(cnt1, stride);
2686     movl(cnt2, stride); negptr(cnt2);
2687     bind(CONT_SCAN_SUBSTR);
2688     if (int_cnt2 < (int)G) {
2689       int tail_off1 = int_cnt2<<scale1;
2690       int tail_off2 = int_cnt2<<scale2;
2691       if (ae == StrIntrinsicNode::UL) {
2692         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2693       } else {
2694         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2695       }
2696       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2697     } else {
2698       // calculate index in register to avoid integer overflow (int_cnt2*2)
2699       movl(tmp, int_cnt2);
2700       addptr(tmp, cnt2);
2701       if (ae == StrIntrinsicNode::UL) {
2702         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2703       } else {
2704         movdqu(vec, Address(str2, tmp, scale2, 0));
2705       }
2706       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2707     }
2708     // Need to reload string pointers if the whole vector did not match
2709     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2710     addptr(cnt2, stride);
2711     jcc(Assembler::negative, SCAN_SUBSTR);
2712     // Fall through if found full substring
2713 
2714   } // (int_cnt2 > 8)
2715 
2716   bind(RET_FOUND);
2717   // Found result if we matched full small substring.
2718   // Compute substr offset
2719   subptr(result, str1);
2720   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2721     shrl(result, 1); // index
2722   }
2723   bind(EXIT);
2724 
2725 } // string_indexofC8
2726 
2727 // Small strings are loaded through the stack if they cross a page boundary.
2728 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2729                                        Register cnt1, Register cnt2,
2730                                        int int_cnt2,  Register result,
2731                                        XMMRegister vec, Register tmp,
2732                                        int ae) {
2733   ShortBranchVerifier sbv(this);
2734   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2735   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2736 
2737   //
2738   // int_cnt2 is length of small (< 8 chars) constant substring
2739   // or (-1) for non constant substring in which case its length
2740   // is in cnt2 register.
2741   //
2742   // Note, inline_string_indexOf() generates checks:
2743   // if (substr.count > string.count) return -1;
2744   // if (substr.count == 0) return 0;
2745   //
2746   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2747   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2748   // This method uses the pcmpestri instruction with bound registers
2749   //   inputs:
2750   //     xmm - substring
2751   //     rax - substring length (elements count)
2752   //     mem - scanned string
2753   //     rdx - string length (elements count)
2754   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2755   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2756   //   outputs:
2757   //     rcx - matched index in string
2758   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2759   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2760   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2761   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2762 
2763   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2764         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2765         FOUND_CANDIDATE;
2766 
2767   { //========================================================
2768     // We don't know where these strings are located
2769     // and we can't read beyond them. Load them through the stack.
2770     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2771 
2772     movptr(tmp, rsp); // save old SP
2773 
2774     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2775       if (int_cnt2 == (1>>scale2)) { // One byte
2776         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2777         load_unsigned_byte(result, Address(str2, 0));
2778         movdl(vec, result); // move 32 bits
2779       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2780         // Not enough header space in 32-bit VM: 12+3 = 15.
2781         movl(result, Address(str2, -1));
2782         shrl(result, 8);
2783         movdl(vec, result); // move 32 bits
2784       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2785         load_unsigned_short(result, Address(str2, 0));
2786         movdl(vec, result); // move 32 bits
2787       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2788         movdl(vec, Address(str2, 0)); // move 32 bits
2789       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2790         movq(vec, Address(str2, 0));  // move 64 bits
2791       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
2792         // Array header size is 12 bytes in 32-bit VM
2793         // + 6 bytes for 3 chars == 18 bytes,
2794         // enough space to load vec and shift.
2795         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2796         if (ae == StrIntrinsicNode::UL) {
2797           int tail_off = int_cnt2-8;
2798           pmovzxbw(vec, Address(str2, tail_off));
2799           psrldq(vec, -2*tail_off);
2800         }
2801         else {
2802           int tail_off = int_cnt2*(1<<scale2);
2803           movdqu(vec, Address(str2, tail_off-16));
2804           psrldq(vec, 16-tail_off);
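               // Example (LL, hypothetical int_cnt2 = 5): tail_off = 5, so the
               // movdqu above reads the 16 bytes ending at the end of the
               // substring (str2[-11..4], safe thanks to the array header), and
               // psrldq by 16-5 = 11 shifts the 5 substring bytes down into the
               // low lanes of vec.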
2805         }
2806       }
2807     } else { // not constant substring
2808       cmpl(cnt2, stride);
2809       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2810 
2811       // We can read beyond the string if str2+16 does not cross a page boundary
2812       // since heaps are aligned and mapped by pages.
2813       assert(os::vm_page_size() < (int)G, "default page should be small");
2814       movl(result, str2); // We need only low 32 bits
2815       andl(result, (os::vm_page_size()-1));
2816       cmpl(result, (os::vm_page_size()-16));
2817       jccb(Assembler::belowEqual, CHECK_STR);
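           // For example, with 4096-byte pages: if (str2 & 0xfff) > 0xff0, a
           // 16-byte load at str2 would span into the next page (e.g. offset
           // 0xff8 covers bytes 0xff8..0x1007), so the substring is copied to
           // the stack below instead.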
2818 
2819       // Move small strings to the stack to allow loading 16 bytes into vec.
2820       subptr(rsp, 16);
2821       int stk_offset = wordSize-(1<<scale2);
2822       push(cnt2);
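           // After the push above, rsp has dropped by wordSize, so the 16-byte
           // buffer reserved by subptr(rsp, 16) now starts at rsp + wordSize;
           // stk_offset = wordSize - element_size therefore makes
           // Address(rsp, cnt2, scale2, stk_offset) address element cnt2-1 of
           // that buffer, mirroring the source Address(str2, cnt2, scale2, -1/-2).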
2823 
2824       bind(COPY_SUBSTR);
2825       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2826         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2827         movb(Address(rsp, cnt2, scale2, stk_offset), result);
2828       } else if (ae == StrIntrinsicNode::UU) {
2829         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2830         movw(Address(rsp, cnt2, scale2, stk_offset), result);
2831       }
2832       decrement(cnt2);
2833       jccb(Assembler::notZero, COPY_SUBSTR);
2834 
2835       pop(cnt2);
2836       movptr(str2, rsp);  // New substring address
2837     } // non constant
2838 
2839     bind(CHECK_STR);
2840     cmpl(cnt1, stride);
2841     jccb(Assembler::aboveEqual, BIG_STRINGS);
2842 
2843     // Check whether a 16-byte load at str1 could cross a page boundary.
2844     movl(result, str1); // We need only low 32 bits
2845     andl(result, (os::vm_page_size()-1));
2846     cmpl(result, (os::vm_page_size()-16));
2847     jccb(Assembler::belowEqual, BIG_STRINGS);
2848 
2849     subptr(rsp, 16);
2850     int stk_offset = -(1<<scale1);
2851     if (int_cnt2 < 0) { // not constant
2852       push(cnt2);
2853       stk_offset += wordSize;
2854     }
2855     movl(cnt2, cnt1);
2856 
2857     bind(COPY_STR);
2858     if (ae == StrIntrinsicNode::LL) {
2859       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
2860       movb(Address(rsp, cnt2, scale1, stk_offset), result);
2861     } else {
2862       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
2863       movw(Address(rsp, cnt2, scale1, stk_offset), result);
2864     }
2865     decrement(cnt2);
2866     jccb(Assembler::notZero, COPY_STR);
2867 
2868     if (int_cnt2 < 0) { // not constant
2869       pop(cnt2);
2870     }
2871     movptr(str1, rsp);  // New string address
2872 
2873     bind(BIG_STRINGS);
2874     // Load substring.
2875     if (int_cnt2 < 0) { // -1
2876       if (ae == StrIntrinsicNode::UL) {
2877         pmovzxbw(vec, Address(str2, 0));
2878       } else {
2879         movdqu(vec, Address(str2, 0));
2880       }
2881       push(cnt2);       // substr count
2882       push(str2);       // substr addr
2883       push(str1);       // string addr
2884     } else {
2885       // Small (< 8 chars) constant substrings are loaded already.
2886       movl(cnt2, int_cnt2);
2887     }
2888     push(tmp);  // original SP
2889 
2890   } // Finished loading
2891 
2892   //========================================================
2893   // Start search
2894   //
2895 
2896   movptr(result, str1); // string addr
2897 
2898   if (int_cnt2  < 0) {  // Only for non constant substring
2899     jmpb(SCAN_TO_SUBSTR);
2900 
2901     // SP saved at sp+0
2902     // String saved at sp+1*wordSize
2903     // Substr saved at sp+2*wordSize
2904     // Substr count saved at sp+3*wordSize
2905 
2906     // Reload substr for rescan; this code
2907     // is executed only for large substrings (> 8 chars).
2908     bind(RELOAD_SUBSTR);
2909     movptr(str2, Address(rsp, 2*wordSize));
2910     movl(cnt2, Address(rsp, 3*wordSize));
2911     if (ae == StrIntrinsicNode::UL) {
2912       pmovzxbw(vec, Address(str2, 0));
2913     } else {
2914       movdqu(vec, Address(str2, 0));
2915     }
2916     // We came here after the beginning of the substring was
2917     // matched but the rest of it was not, so we need to search
2918     // again. Start from the next element after the previous match.
2919     subptr(str1, result); // Restore counter
2920     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2921       shrl(str1, 1);
2922     }
2923     addl(cnt1, str1);
2924     decrementl(cnt1);   // Shift to next element
2925     cmpl(cnt1, cnt2);
2926     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2927 
2928     addptr(result, (1<<scale1));
2929   } // non constant
2930 
2931   // Scan string for start of substr in 16-byte vectors
2932   bind(SCAN_TO_SUBSTR);
2933   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2934   pcmpestri(vec, Address(result, 0), mode);
2935   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2936   subl(cnt1, stride);
2937   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2938   cmpl(cnt1, cnt2);
2939   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2940   addptr(result, 16);
2941 
2942   bind(ADJUST_STR);
2943   cmpl(cnt1, stride); // Do not read beyond string
2944   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2945   // Back-up string to avoid reading beyond string.
2946   lea(result, Address(result, cnt1, scale1, -16));
2947   movl(cnt1, stride);
2948   jmpb(SCAN_TO_SUBSTR);
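       // The back-up above moves result to 16 bytes before the end of the
       // string (result + cnt1*element_size - 16) and sets cnt1 = stride, so
       // the final vector read covers exactly the last stride elements, all of
       // them known to be in bounds.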
2949 
2950   // Found a potential substr
2951   bind(FOUND_CANDIDATE);
2952   // After pcmpestri tmp(rcx) contains matched element index
2953 
2954   // Make sure string is still long enough
2955   subl(cnt1, tmp);
2956   cmpl(cnt1, cnt2);
2957   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
2958   // Left less than substring.
2959 
2960   bind(RET_NOT_FOUND);
2961   movl(result, -1);
2962   jmp(CLEANUP);
2963 
2964   bind(FOUND_SUBSTR);
2965   // Compute start addr of substr
2966   lea(result, Address(result, tmp, scale1));
2967   if (int_cnt2 > 0) { // Constant substring
2968     // Repeat search for small substring (< 8 chars)
2969     // from new point without reloading substring.
2970     // Have to check that we don't read beyond string.
2971     cmpl(tmp, stride-int_cnt2);
2972     jccb(Assembler::greater, ADJUST_STR);
2973     // Fall through if matched whole substring.
2974   } else { // non constant
2975     assert(int_cnt2 == -1, "should be != 0");
2976 
2977     addl(tmp, cnt2);
2978     // Found result if we matched whole substring.
2979     cmpl(tmp, stride);
2980     jcc(Assembler::lessEqual, RET_FOUND);
2981 
2982     // Repeat search for small substring (<= 8 chars)
2983     // from new point 'str1' without reloading substring.
2984     cmpl(cnt2, stride);
2985     // Have to check that we don't read beyond string.
2986     jccb(Assembler::lessEqual, ADJUST_STR);
2987 
2988     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
2989     // Compare the rest of substring (> 8 chars).
2990     movptr(str1, result);
2991 
2992     cmpl(tmp, cnt2);
2993     // First 8 chars are already matched.
2994     jccb(Assembler::equal, CHECK_NEXT);
2995 
2996     bind(SCAN_SUBSTR);
2997     pcmpestri(vec, Address(str1, 0), mode);
2998     // Need to reload string pointers if the whole vector did not match
2999     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3000 
3001     bind(CHECK_NEXT);
3002     subl(cnt2, stride);
3003     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3004     addptr(str1, 16);
3005     if (ae == StrIntrinsicNode::UL) {
3006       addptr(str2, 8);
3007     } else {
3008       addptr(str2, 16);
3009     }
3010     subl(cnt1, stride);
3011     cmpl(cnt2, stride); // Do not read beyond substring
3012     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3013     // Back-up strings to avoid reading beyond substring.
3014 
3015     if (ae == StrIntrinsicNode::UL) {
3016       lea(str2, Address(str2, cnt2, scale2, -8));
3017       lea(str1, Address(str1, cnt2, scale1, -16));
3018     } else {
3019       lea(str2, Address(str2, cnt2, scale2, -16));
3020       lea(str1, Address(str1, cnt2, scale1, -16));
3021     }
3022     subl(cnt1, cnt2);
3023     movl(cnt2, stride);
3024     addl(cnt1, stride);
3025     bind(CONT_SCAN_SUBSTR);
3026     if (ae == StrIntrinsicNode::UL) {
3027       pmovzxbw(vec, Address(str2, 0));
3028     } else {
3029       movdqu(vec, Address(str2, 0));
3030     }
3031     jmp(SCAN_SUBSTR);
3032 
3033     bind(RET_FOUND_LONG);
3034     movptr(str1, Address(rsp, wordSize));
3035   } // non constant
3036 
3037   bind(RET_FOUND);
3038   // Compute substr offset
3039   subptr(result, str1);
3040   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3041     shrl(result, 1); // index
3042   }
3043   bind(CLEANUP);
3044   pop(rsp); // restore SP
3045 
3046 } // string_indexof
3047 
3048 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3049                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3050   ShortBranchVerifier sbv(this);
3051   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3052 
3053   int stride = 8;
3054 
3055   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3056         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3057         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3058         FOUND_SEQ_CHAR, DONE_LABEL;
3059 
3060   movptr(result, str1);
3061   if (UseAVX >= 2) {
3062     cmpl(cnt1, stride);
3063     jcc(Assembler::less, SCAN_TO_CHAR);
3064     cmpl(cnt1, 2*stride);
3065     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3066     movdl(vec1, ch);
3067     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3068     vpxor(vec2, vec2);
3069     movl(tmp, cnt1);
3070     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3071     andl(cnt1,0x0000000F);  //tail count (in chars)
3072 
3073     bind(SCAN_TO_16_CHAR_LOOP);
3074     vmovdqu(vec3, Address(result, 0));
3075     vpcmpeqw(vec3, vec3, vec1, 1);
3076     vptest(vec2, vec3);
3077     jcc(Assembler::carryClear, FOUND_CHAR);
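         // vec2 is all zeros, so vptest leaves CF set only if vec3 (the
         // vpcmpeqw result) is entirely zero; a clear CF therefore means at
         // least one lane compared equal and the char is in this 32-byte block.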
3078     addptr(result, 32);
3079     subl(tmp, 2*stride);
3080     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3081     jmp(SCAN_TO_8_CHAR);
3082     bind(SCAN_TO_8_CHAR_INIT);
3083     movdl(vec1, ch);
3084     pshuflw(vec1, vec1, 0x00);
3085     pshufd(vec1, vec1, 0);
3086     pxor(vec2, vec2);
3087   }
3088   bind(SCAN_TO_8_CHAR);
3089   cmpl(cnt1, stride);
3090   jcc(Assembler::less, SCAN_TO_CHAR);
3091   if (UseAVX < 2) {
3092     movdl(vec1, ch);
3093     pshuflw(vec1, vec1, 0x00);
3094     pshufd(vec1, vec1, 0);
3095     pxor(vec2, vec2);
3096   }
3097   movl(tmp, cnt1);
3098   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3099   andl(cnt1,0x00000007);  //tail count (in chars)
3100 
3101   bind(SCAN_TO_8_CHAR_LOOP);
3102   movdqu(vec3, Address(result, 0));
3103   pcmpeqw(vec3, vec1);
3104   ptest(vec2, vec3);
3105   jcc(Assembler::carryClear, FOUND_CHAR);
3106   addptr(result, 16);
3107   subl(tmp, stride);
3108   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3109   bind(SCAN_TO_CHAR);
3110   testl(cnt1, cnt1);
3111   jcc(Assembler::zero, RET_NOT_FOUND);
3112   bind(SCAN_TO_CHAR_LOOP);
3113   load_unsigned_short(tmp, Address(result, 0));
3114   cmpl(ch, tmp);
3115   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3116   addptr(result, 2);
3117   subl(cnt1, 1);
3118   jccb(Assembler::zero, RET_NOT_FOUND);
3119   jmp(SCAN_TO_CHAR_LOOP);
3120 
3121   bind(RET_NOT_FOUND);
3122   movl(result, -1);
3123   jmpb(DONE_LABEL);
3124 
3125   bind(FOUND_CHAR);
3126   if (UseAVX >= 2) {
3127     vpmovmskb(tmp, vec3);
3128   } else {
3129     pmovmskb(tmp, vec3);
3130   }
3131   bsfl(ch, tmp);
3132   addptr(result, ch);
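       // vpmovmskb/pmovmskb gathers the sign bit of every byte of the compare
       // result into tmp; bsf then gives the byte offset of the first matching
       // char within the window, which is added to result before the final
       // subtraction and shift turn the address back into a char index.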
3133 
3134   bind(FOUND_SEQ_CHAR);
3135   subptr(result, str1);
3136   shrl(result, 1);
3137 
3138   bind(DONE_LABEL);
3139 } // string_indexof_char
3140 
3141 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3142                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3143   ShortBranchVerifier sbv(this);
3144   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3145 
3146   int stride = 16;
3147 
3148   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3149         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3150         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3151         FOUND_SEQ_CHAR, DONE_LABEL;
3152 
3153   movptr(result, str1);
3154   if (UseAVX >= 2) {
3155     cmpl(cnt1, stride);
3156     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3157     cmpl(cnt1, stride*2);
3158     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3159     movdl(vec1, ch);
3160     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3161     vpxor(vec2, vec2);
3162     movl(tmp, cnt1);
3163     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3164     andl(cnt1,0x0000001F);  //tail count (in chars)
3165 
3166     bind(SCAN_TO_32_CHAR_LOOP);
3167     vmovdqu(vec3, Address(result, 0));
3168     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3169     vptest(vec2, vec3);
3170     jcc(Assembler::carryClear, FOUND_CHAR);
3171     addptr(result, 32);
3172     subl(tmp, stride*2);
3173     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3174     jmp(SCAN_TO_16_CHAR);
3175 
3176     bind(SCAN_TO_16_CHAR_INIT);
3177     movdl(vec1, ch);
3178     pxor(vec2, vec2);
3179     pshufb(vec1, vec2);
3180   }
3181 
3182   bind(SCAN_TO_16_CHAR);
3183   cmpl(cnt1, stride);
3184   jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
3185   if (UseAVX < 2) {
3186     movdl(vec1, ch);
3187     pxor(vec2, vec2);
3188     pshufb(vec1, vec2);
3189   }
3190   movl(tmp, cnt1);
3191   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3192   andl(cnt1,0x0000000F);  //tail count (in bytes)
3193 
3194   bind(SCAN_TO_16_CHAR_LOOP);
3195   movdqu(vec3, Address(result, 0));
3196   pcmpeqb(vec3, vec1);
3197   ptest(vec2, vec3);
3198   jcc(Assembler::carryClear, FOUND_CHAR);
3199   addptr(result, 16);
3200   subl(tmp, stride);
3201   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); // last 16 items...
3202 
3203   bind(SCAN_TO_CHAR_INIT);
3204   testl(cnt1, cnt1);
3205   jcc(Assembler::zero, RET_NOT_FOUND);
3206   bind(SCAN_TO_CHAR_LOOP);
3207   load_unsigned_byte(tmp, Address(result, 0));
3208   cmpl(ch, tmp);
3209   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3210   addptr(result, 1);
3211   subl(cnt1, 1);
3212   jccb(Assembler::zero, RET_NOT_FOUND);
3213   jmp(SCAN_TO_CHAR_LOOP);
3214 
3215   bind(RET_NOT_FOUND);
3216   movl(result, -1);
3217   jmpb(DONE_LABEL);
3218 
3219   bind(FOUND_CHAR);
3220   if (UseAVX >= 2) {
3221     vpmovmskb(tmp, vec3);
3222   } else {
3223     pmovmskb(tmp, vec3);
3224   }
3225   bsfl(ch, tmp);
3226   addptr(result, ch);
3227 
3228   bind(FOUND_SEQ_CHAR);
3229   subptr(result, str1);
3230 
3231   bind(DONE_LABEL);
3232 } // stringL_indexof_char
3233 
3234 // helper function for string_compare
3235 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3236                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3237                                            Address::ScaleFactor scale2, Register index, int ae) {
3238   if (ae == StrIntrinsicNode::LL) {
3239     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3240     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3241   } else if (ae == StrIntrinsicNode::UU) {
3242     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3243     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3244   } else {
3245     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3246     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3247   }
3248 }
3249 
3250 // Compare strings, used for char[] and byte[].
3251 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3252                                        Register cnt1, Register cnt2, Register result,
3253                                        XMMRegister vec1, int ae, KRegister mask) {
3254   ShortBranchVerifier sbv(this);
3255   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3256   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3257   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3258   int stride2x2 = 0x40;
3259   Address::ScaleFactor scale = Address::no_scale;
3260   Address::ScaleFactor scale1 = Address::no_scale;
3261   Address::ScaleFactor scale2 = Address::no_scale;
3262 
3263   if (ae != StrIntrinsicNode::LL) {
3264     stride2x2 = 0x20;
3265   }
3266 
3267   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3268     shrl(cnt2, 1);
3269   }
3270   // Compute the minimum of the string lengths, and push the
3271   // difference of the string lengths onto the stack.
3272   // The minimum is selected with a conditional move.
3273   movl(result, cnt1);
3274   subl(cnt1, cnt2);
3275   push(cnt1);
3276   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3277 
3278   // Is the minimum length zero?
3279   testl(cnt2, cnt2);
3280   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3281   if (ae == StrIntrinsicNode::LL) {
3282     // Load first bytes
3283     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3284     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3285   } else if (ae == StrIntrinsicNode::UU) {
3286     // Load first characters
3287     load_unsigned_short(result, Address(str1, 0));
3288     load_unsigned_short(cnt1, Address(str2, 0));
3289   } else {
3290     load_unsigned_byte(result, Address(str1, 0));
3291     load_unsigned_short(cnt1, Address(str2, 0));
3292   }
3293   subl(result, cnt1);
3294   jcc(Assembler::notZero,  POP_LABEL);
3295 
3296   if (ae == StrIntrinsicNode::UU) {
3297     // Divide length by 2 to get number of chars
3298     shrl(cnt2, 1);
3299   }
3300   cmpl(cnt2, 1);
3301   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3302 
3303   // Check if the strings start at the same location and set up scale and stride
3304   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3305     cmpptr(str1, str2);
3306     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3307     if (ae == StrIntrinsicNode::LL) {
3308       scale = Address::times_1;
3309       stride = 16;
3310     } else {
3311       scale = Address::times_2;
3312       stride = 8;
3313     }
3314   } else {
3315     scale1 = Address::times_1;
3316     scale2 = Address::times_2;
3317     // scale not used
3318     stride = 8;
3319   }
3320 
3321   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3322     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3323     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3324     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3325     Label COMPARE_TAIL_LONG;
3326     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3327 
3328     int pcmpmask = 0x19;
3329     if (ae == StrIntrinsicNode::LL) {
3330       pcmpmask &= ~0x01;
3331     }
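         // 0x19 decodes (per the SSE4.2 PCMPESTRI definition) as: unsigned
         // words, "equal each" aggregation (element-wise compare) and negated
         // polarity, so rcx receives the index of the first mismatch and CF is
         // set when any mismatch exists; clearing bit 0 (0x18) switches the
         // element format to unsigned bytes for the LL case.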
3332 
3333     // Set up to compare 16-char (32-byte) vectors,
3334     // starting from the first character again because it has an aligned address.
3335     if (ae == StrIntrinsicNode::LL) {
3336       stride2 = 32;
3337     } else {
3338       stride2 = 16;
3339     }
3340     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3341       adr_stride = stride << scale;
3342     } else {
3343       adr_stride1 = 8;  //stride << scale1;
3344       adr_stride2 = 16; //stride << scale2;
3345     }
3346 
3347     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3348     // rax and rdx are used by pcmpestri as element counters
3349     movl(result, cnt2);
3350     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3351     jcc(Assembler::zero, COMPARE_TAIL_LONG);
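         // For example (UU, stride2 = 16): cnt2 = 37 chars becomes 32 for the
         // wide loop, while result keeps the full 37 so the leftover chars can
         // be handled by the tail paths (COMPARE_WIDE_TAIL / COMPARE_TAIL_LONG).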
3352 
3353     // fast path : compare first 2 8-char vectors.
3354     bind(COMPARE_16_CHARS);
3355     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3356       movdqu(vec1, Address(str1, 0));
3357     } else {
3358       pmovzxbw(vec1, Address(str1, 0));
3359     }
3360     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3361     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3362 
3363     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3364       movdqu(vec1, Address(str1, adr_stride));
3365       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3366     } else {
3367       pmovzxbw(vec1, Address(str1, adr_stride1));
3368       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3369     }
3370     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3371     addl(cnt1, stride);
3372 
3373     // Compare the characters at index in cnt1
3374     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3375     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3376     subl(result, cnt2);
3377     jmp(POP_LABEL);
3378 
3379     // Setup the registers to start vector comparison loop
3380     bind(COMPARE_WIDE_VECTORS);
3381     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3382       lea(str1, Address(str1, result, scale));
3383       lea(str2, Address(str2, result, scale));
3384     } else {
3385       lea(str1, Address(str1, result, scale1));
3386       lea(str2, Address(str2, result, scale2));
3387     }
3388     subl(result, stride2);
3389     subl(cnt2, stride2);
3390     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3391     negptr(result);
3392 
3393     //  In a loop, compare 16 chars (32 bytes) at once using (vpxor+vptest)
3394     bind(COMPARE_WIDE_VECTORS_LOOP);
3395 
3396 #ifdef _LP64
3397     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3398       cmpl(cnt2, stride2x2);
3399       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3400       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3401       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3402 
3403       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3404       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3405         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3406         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3407       } else {
3408         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3409         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3410       }
3411       kortestql(mask, mask);
3412       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3413       addptr(result, stride2x2);  // update since we already compared at this addr
3414       subl(cnt2, stride2x2);      // and sub the size too
3415       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3416 
3417       vpxor(vec1, vec1);
3418       jmpb(COMPARE_WIDE_TAIL);
3419     }//if (VM_Version::supports_avx512vlbw())
3420 #endif // _LP64
3421 
3422 
3423     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3424     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3425       vmovdqu(vec1, Address(str1, result, scale));
3426       vpxor(vec1, Address(str2, result, scale));
3427     } else {
3428       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3429       vpxor(vec1, Address(str2, result, scale2));
3430     }
3431     vptest(vec1, vec1);
3432     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3433     addptr(result, stride2);
3434     subl(cnt2, stride2);
3435     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3436     // clean upper bits of YMM registers
3437     vpxor(vec1, vec1);
3438 
3439     // compare wide vectors tail
3440     bind(COMPARE_WIDE_TAIL);
3441     testptr(result, result);
3442     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3443 
3444     movl(result, stride2);
3445     movl(cnt2, result);
3446     negptr(result);
3447     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3448 
3449     // Identify the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3450     bind(VECTOR_NOT_EQUAL);
3451     // clean upper bits of YMM registers
3452     vpxor(vec1, vec1);
3453     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3454       lea(str1, Address(str1, result, scale));
3455       lea(str2, Address(str2, result, scale));
3456     } else {
3457       lea(str1, Address(str1, result, scale1));
3458       lea(str2, Address(str2, result, scale2));
3459     }
3460     jmp(COMPARE_16_CHARS);
3461 
3462     // Compare tail chars, length between 1 and 15 chars
3463     bind(COMPARE_TAIL_LONG);
3464     movl(cnt2, result);
3465     cmpl(cnt2, stride);
3466     jcc(Assembler::less, COMPARE_SMALL_STR);
3467 
3468     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3469       movdqu(vec1, Address(str1, 0));
3470     } else {
3471       pmovzxbw(vec1, Address(str1, 0));
3472     }
3473     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3474     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3475     subptr(cnt2, stride);
3476     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3477     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3478       lea(str1, Address(str1, result, scale));
3479       lea(str2, Address(str2, result, scale));
3480     } else {
3481       lea(str1, Address(str1, result, scale1));
3482       lea(str2, Address(str2, result, scale2));
3483     }
3484     negptr(cnt2);
3485     jmpb(WHILE_HEAD_LABEL);
3486 
3487     bind(COMPARE_SMALL_STR);
3488   } else if (UseSSE42Intrinsics) {
3489     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3490     int pcmpmask = 0x19;
3491     // Set up to compare 8-char (16-byte) vectors,
3492     // starting from the first character again because it has an aligned address.
3493     movl(result, cnt2);
3494     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3495     if (ae == StrIntrinsicNode::LL) {
3496       pcmpmask &= ~0x01;
3497     }
3498     jcc(Assembler::zero, COMPARE_TAIL);
3499     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3500       lea(str1, Address(str1, result, scale));
3501       lea(str2, Address(str2, result, scale));
3502     } else {
3503       lea(str1, Address(str1, result, scale1));
3504       lea(str2, Address(str2, result, scale2));
3505     }
3506     negptr(result);
3507 
3508     // pcmpestri
3509     //   inputs:
3510     //     vec1 - substring
3511     //     rax - negative string length (elements count)
3512     //     mem - scanned string
3513     //     rdx - string length (elements count)
3514     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3515     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3516     //   outputs:
3517     //     rcx - first mismatched element index
3518     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3519 
3520     bind(COMPARE_WIDE_VECTORS);
3521     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3522       movdqu(vec1, Address(str1, result, scale));
3523       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3524     } else {
3525       pmovzxbw(vec1, Address(str1, result, scale1));
3526       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3527     }
3528     // After pcmpestri cnt1(rcx) contains mismatched element index
3529 
3530     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3531     addptr(result, stride);
3532     subptr(cnt2, stride);
3533     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3534 
3535     // compare wide vectors tail
3536     testptr(result, result);
3537     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3538 
3539     movl(cnt2, stride);
3540     movl(result, stride);
3541     negptr(result);
3542     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3543       movdqu(vec1, Address(str1, result, scale));
3544       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3545     } else {
3546       pmovzxbw(vec1, Address(str1, result, scale1));
3547       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3548     }
3549     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3550 
3551     // Mismatched characters in the vectors
3552     bind(VECTOR_NOT_EQUAL);
3553     addptr(cnt1, result);
3554     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3555     subl(result, cnt2);
3556     jmpb(POP_LABEL);
3557 
3558     bind(COMPARE_TAIL); // limit is zero
3559     movl(cnt2, result);
3560     // Fallthru to tail compare
3561   }
3562   // Shift str2 and str1 to the end of the arrays, negate min
3563   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3564     lea(str1, Address(str1, cnt2, scale));
3565     lea(str2, Address(str2, cnt2, scale));
3566   } else {
3567     lea(str1, Address(str1, cnt2, scale1));
3568     lea(str2, Address(str2, cnt2, scale2));
3569   }
3570   decrementl(cnt2);  // first character was compared already
3571   negptr(cnt2);
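       // str1/str2 now point just past the cnt2 elements to compare, and cnt2
       // holds the negated count (minus the element already compared above),
       // so the loop below addresses elements relative to the end pointers
       // with a negative index that counts up toward zero.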
3572 
3573   // Compare the rest of the elements
3574   bind(WHILE_HEAD_LABEL);
3575   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3576   subl(result, cnt1);
3577   jccb(Assembler::notZero, POP_LABEL);
3578   increment(cnt2);
3579   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3580 
3581   // Strings are equal up to min length.  Return the length difference.
3582   bind(LENGTH_DIFF_LABEL);
3583   pop(result);
3584   if (ae == StrIntrinsicNode::UU) {
3585     // Divide diff by 2 to get number of chars
3586     sarl(result, 1);
3587   }
3588   jmpb(DONE_LABEL);
3589 
3590 #ifdef _LP64
3591   if (VM_Version::supports_avx512vlbw()) {
3592 
3593     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3594 
3595     kmovql(cnt1, mask);
3596     notq(cnt1);
3597     bsfq(cnt2, cnt1);
3598     if (ae != StrIntrinsicNode::LL) {
3599       // Divide diff by 2 to get number of chars
3600       sarl(cnt2, 1);
3601     }
3602     addq(result, cnt2);
3603     if (ae == StrIntrinsicNode::LL) {
3604       load_unsigned_byte(cnt1, Address(str2, result));
3605       load_unsigned_byte(result, Address(str1, result));
3606     } else if (ae == StrIntrinsicNode::UU) {
3607       load_unsigned_short(cnt1, Address(str2, result, scale));
3608       load_unsigned_short(result, Address(str1, result, scale));
3609     } else {
3610       load_unsigned_short(cnt1, Address(str2, result, scale2));
3611       load_unsigned_byte(result, Address(str1, result, scale1));
3612     }
3613     subl(result, cnt1);
3614     jmpb(POP_LABEL);
3615   }//if (VM_Version::supports_avx512vlbw())
3616 #endif // _LP64
3617 
3618   // Discard the stored length difference
3619   bind(POP_LABEL);
3620   pop(cnt1);
3621 
3622   // That's it
3623   bind(DONE_LABEL);
3624   if(ae == StrIntrinsicNode::UL) {
3625     negl(result);
3626   }
3627 
3628 }
3629 
3630 // Search for a non-ASCII character (negative byte value) in a byte array,
3631 // returning the index of the first such character, otherwise the length
3632 // of the array segment searched.
3633 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3634 //   @IntrinsicCandidate
3635 //   public static int countPositives(byte[] ba, int off, int len) {
3636 //     for (int i = off; i < off + len; i++) {
3637 //       if (ba[i] < 0) {
3638 //         return i - off;
3639 //       }
3640 //     }
3641 //     return len;
3642 //   }
3643 void C2_MacroAssembler::count_positives(Register ary1, Register len,
3644   Register result, Register tmp1,
3645   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
3646   // rsi: byte array
3647   // rcx: len
3648   // rax: result
3649   ShortBranchVerifier sbv(this);
3650   assert_different_registers(ary1, len, result, tmp1);
3651   assert_different_registers(vec1, vec2);
3652   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
3653 
3654   movl(result, len); // copy
3655   // len == 0
3656   testl(len, len);
3657   jcc(Assembler::zero, DONE);
3658 
3659   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
3660     VM_Version::supports_avx512vlbw() &&
3661     VM_Version::supports_bmi2()) {
3662 
3663     Label test_64_loop, test_tail, BREAK_LOOP;
3664     Register tmp3_aliased = len;
3665 
3666     movl(tmp1, len);
3667     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
3668 
3669     andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
3670     andl(len, ~(64 - 1));    // vector count (in chars)
3671     jccb(Assembler::zero, test_tail);
3672 
3673     lea(ary1, Address(ary1, len, Address::times_1));
3674     negptr(len);
3675 
3676     bind(test_64_loop);
3677     // Check whether our 64 byte-sized elements contain negatives
3678     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
3679     kortestql(mask1, mask1);
3680     jcc(Assembler::notZero, BREAK_LOOP);
3681 
3682     addptr(len, 64);
3683     jccb(Assembler::notZero, test_64_loop);
3684 
3685     bind(test_tail);
3686     // bail out when there is nothing to be done
3687     testl(tmp1, -1);
3688     jcc(Assembler::zero, DONE);
3689 
3690     // ~(~0 << len) applied up to two times (for 32-bit scenario)
3691 #ifdef _LP64
3692     mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
3693     shlxq(tmp3_aliased, tmp3_aliased, tmp1);
3694     notq(tmp3_aliased);
3695     kmovql(mask2, tmp3_aliased);
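         // For example, a tail count of tmp1 = 5 yields ~(~0 << 5) = 0x1f, so
         // mask2 enables only the 5 low byte lanes of the masked compare below
         // and bytes past the tail are ignored.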
3696 #else
3697     Label k_init;
3698     jmp(k_init);
3699 
3700     // We cannot read 64 bits from a general-purpose register, thus we move
3701     // the data required to compose 64 1's into the instruction stream.
3702     // We emit a 64-byte-wide series of elements 0..63 which is later used
3703     // as a compare target together with the tail count contained in the
3704     // tmp1 register. The result is a k register holding tmp1 consecutive
3705     // 1's, counting from the least significant bit.
3706     address tmp = pc();
3707     emit_int64(0x0706050403020100);
3708     emit_int64(0x0F0E0D0C0B0A0908);
3709     emit_int64(0x1716151413121110);
3710     emit_int64(0x1F1E1D1C1B1A1918);
3711     emit_int64(0x2726252423222120);
3712     emit_int64(0x2F2E2D2C2B2A2928);
3713     emit_int64(0x3736353433323130);
3714     emit_int64(0x3F3E3D3C3B3A3938);
3715 
3716     bind(k_init);
3717     lea(len, InternalAddress(tmp));
3718     // create mask to test for negative byte inside a vector
3719     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
3720     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
3721 
3722 #endif
3723     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
3724     ktestq(mask1, mask2);
3725     jcc(Assembler::zero, DONE);
3726 
3727     bind(BREAK_LOOP);
3728     // At least one byte in the last 64 bytes is negative.
3729     // Set up to look at the last 64 bytes as if they were a tail
3730     lea(ary1, Address(ary1, len, Address::times_1));
3731     addptr(result, len);
3732     // Ignore the very last byte: if all others are positive,
3733     // it must be negative, so we can skip right to the 2+1 byte
3734     // end comparison at this point
3735     orl(result, 63);
3736     movl(len, 63);
3737     // Fallthru to tail compare
3738   } else {
3739 
3740     if (UseAVX >= 2 && UseSSE >= 2) {
3741       // With AVX2, use 32-byte vector compare
3742       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
3743 
3744       // Compare 32-byte vectors
3745       testl(len, 0xffffffe0);   // vector count (in bytes)
3746       jccb(Assembler::zero, TAIL_START);
3747 
3748       andl(len, 0xffffffe0);
3749       lea(ary1, Address(ary1, len, Address::times_1));
3750       negptr(len);
3751 
3752       movl(tmp1, 0x80808080);   // create mask to test for negative bytes in vector
3753       movdl(vec2, tmp1);
3754       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
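           // The broadcast 0x80808080 mask makes vptest report ZF = 1 only when
           // no loaded byte has its sign bit set; e.g. with len = 100, 96 bytes
           // go through this 32-byte loop and the remaining 4 are covered by
           // the quick re-check of the last 32 bytes and the tail code below.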
3755 
3756       bind(COMPARE_WIDE_VECTORS);
3757       vmovdqu(vec1, Address(ary1, len, Address::times_1));
3758       vptest(vec1, vec2);
3759       jccb(Assembler::notZero, BREAK_LOOP);
3760       addptr(len, 32);
3761       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3762 
3763       testl(result, 0x0000001f);   // any bytes remaining?
3764       jcc(Assembler::zero, DONE);
3765 
3766       // Quick test using the already prepared vector mask
3767       movl(len, result);
3768       andl(len, 0x0000001f);
3769       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
3770       vptest(vec1, vec2);
3771       jcc(Assembler::zero, DONE);
3772       // There are negative bytes, jump to the tail to determine exactly where
3773       jmpb(TAIL_START);
3774 
3775       bind(BREAK_LOOP);
3776       // At least one byte in the last 32-byte vector is negative.
3777       // Set up to look at the last 32 bytes as if they were a tail
3778       lea(ary1, Address(ary1, len, Address::times_1));
3779       addptr(result, len);
3780       // Ignore the very last byte: if all others are positive,
3781       // it must be negative, so we can skip right to the 2+1 byte
3782       // end comparison at this point
3783       orl(result, 31);
3784       movl(len, 31);
3785       // Fallthru to tail compare
3786     } else if (UseSSE42Intrinsics) {
3787       // With SSE4.2, use double quad vector compare
3788       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
3789 
3790       // Compare 16-byte vectors
3791       testl(len, 0xfffffff0);   // vector count (in bytes)
3792       jcc(Assembler::zero, TAIL_START);
3793 
3794       andl(len, 0xfffffff0);
3795       lea(ary1, Address(ary1, len, Address::times_1));
3796       negptr(len);
3797 
3798       movl(tmp1, 0x80808080);
3799       movdl(vec2, tmp1);
3800       pshufd(vec2, vec2, 0);
3801 
3802       bind(COMPARE_WIDE_VECTORS);
3803       movdqu(vec1, Address(ary1, len, Address::times_1));
3804       ptest(vec1, vec2);
3805       jccb(Assembler::notZero, BREAK_LOOP);
3806       addptr(len, 16);
3807       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3808 
3809       testl(result, 0x0000000f); // len is zero, any bytes remaining?
3810       jcc(Assembler::zero, DONE);
3811 
3812       // Quick test using the already prepared vector mask
3813       movl(len, result);
3814       andl(len, 0x0000000f);   // tail count (in bytes)
3815       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
3816       ptest(vec1, vec2);
3817       jcc(Assembler::zero, DONE);
3818       jmpb(TAIL_START);
3819 
3820       bind(BREAK_LOOP);
3821       // At least one byte in the last 16-byte vector is negative.
3822       // Set up and look at the last 16 bytes as if they were a tail
3823       lea(ary1, Address(ary1, len, Address::times_1));
3824       addptr(result, len);
3825       // Ignore the very last byte: if all others are positive,
3826       // it must be negative, so we can skip right to the 2+1 byte
3827       // end comparison at this point
3828       orl(result, 15);
3829       movl(len, 15);
3830       // Fallthru to tail compare
3831     }
3832   }
3833 
3834   bind(TAIL_START);
3835   // Compare 4-byte vectors
3836   andl(len, 0xfffffffc); // vector count (in bytes)
3837   jccb(Assembler::zero, COMPARE_CHAR);
3838 
3839   lea(ary1, Address(ary1, len, Address::times_1));
3840   negptr(len);
3841 
3842   bind(COMPARE_VECTORS);
3843   movl(tmp1, Address(ary1, len, Address::times_1));
3844   andl(tmp1, 0x80808080);
3845   jccb(Assembler::notZero, TAIL_ADJUST);
3846   addptr(len, 4);
3847   jccb(Assembler::notZero, COMPARE_VECTORS);
3848 
3849   // Compare trailing char (final 2-3 bytes), if any
3850   bind(COMPARE_CHAR);
3851 
3852   testl(result, 0x2);   // tail  char
3853   jccb(Assembler::zero, COMPARE_BYTE);
3854   load_unsigned_short(tmp1, Address(ary1, 0));
3855   andl(tmp1, 0x00008080);
3856   jccb(Assembler::notZero, CHAR_ADJUST);
3857   lea(ary1, Address(ary1, 2));
3858 
3859   bind(COMPARE_BYTE);
3860   testl(result, 0x1);   // tail  byte
3861   jccb(Assembler::zero, DONE);
3862   load_unsigned_byte(tmp1, Address(ary1, 0));
3863   testl(tmp1, 0x00000080);
3864   jccb(Assembler::zero, DONE);
3865   subptr(result, 1);
3866   jmpb(DONE);
3867 
3868   bind(TAIL_ADJUST);
3869   // There are negative bytes in the last 4-byte block.
3870   // Adjust result and check the next three bytes
3871   addptr(result, len);
3872   orl(result, 3);
3873   lea(ary1, Address(ary1, len, Address::times_1));
3874   jmpb(COMPARE_CHAR);
3875 
3876   bind(CHAR_ADJUST);
3877   // We are looking at a char + optional byte tail, and found that one
3878   // of the bytes in the char is negative. Adjust the result, check the
3879   // first byte and readjust if needed.
3880   andl(result, 0xfffffffc);
3881   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
3882   jccb(Assembler::notZero, DONE);
3883   addptr(result, 1);
3884 
3885   // That's it
3886   bind(DONE);
3887   if (UseAVX >= 2 && UseSSE >= 2) {
3888     // clean upper bits of YMM registers
3889     vpxor(vec1, vec1);
3890     vpxor(vec2, vec2);
3891   }
3892 }
3893 
3894 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
3895 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
3896                                       Register limit, Register result, Register chr,
3897                                       XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
3898   ShortBranchVerifier sbv(this);
3899   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
3900 
3901   int length_offset  = arrayOopDesc::length_offset_in_bytes();
3902   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
3903 
3904   if (is_array_equ) {
3905     // Check the input args
3906     cmpoop(ary1, ary2);
3907     jcc(Assembler::equal, TRUE_LABEL);
3908 
3909     // Need additional checks for arrays_equals.
3910     testptr(ary1, ary1);
3911     jcc(Assembler::zero, FALSE_LABEL);
3912     testptr(ary2, ary2);
3913     jcc(Assembler::zero, FALSE_LABEL);
3914 
3915     // Check the lengths
3916     movl(limit, Address(ary1, length_offset));
3917     cmpl(limit, Address(ary2, length_offset));
3918     jcc(Assembler::notEqual, FALSE_LABEL);
3919   }
3920 
3921   // count == 0
3922   testl(limit, limit);
3923   jcc(Assembler::zero, TRUE_LABEL);
3924 
3925   if (is_array_equ) {
3926     // Load array address
3927     lea(ary1, Address(ary1, base_offset));
3928     lea(ary2, Address(ary2, base_offset));
3929   }
3930 
3931   if (is_array_equ && is_char) {
3932     // arrays_equals when used for char[].
3933     shll(limit, 1);      // byte count != 0
3934   }
3935   movl(result, limit); // copy
3936 
3937   if (UseAVX >= 2) {
3938     // With AVX2, use 32-byte vector compare
3939     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3940 
3941     // Compare 32-byte vectors
3942     andl(result, 0x0000001f);  //   tail count (in bytes)
3943     andl(limit, 0xffffffe0);   // vector count (in bytes)
3944     jcc(Assembler::zero, COMPARE_TAIL);
3945 
3946     lea(ary1, Address(ary1, limit, Address::times_1));
3947     lea(ary2, Address(ary2, limit, Address::times_1));
3948     negptr(limit);
3949 
3950 #ifdef _LP64
3951     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3952       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
3953 
3954       cmpl(limit, -64);
3955       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3956 
3957       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3958 
3959       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
3960       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
3961       kortestql(mask, mask);
3962       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
3963       addptr(limit, 64);  // update since we already compared at this addr
3964       cmpl(limit, -64);
3965       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3966 
3967       // At this point we may still need to compare -limit+result bytes.
3968       // We could execute the next two instructions and just continue via the non-wide path:
3969       //  cmpl(limit, 0);
3970       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
3971       // But since we stopped at the points ary{1,2}+limit which are
3972       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
3973       // (|limit| <= 32 and result < 32),
3974       // we may just compare the last 64 bytes.
3975       //
3976       addptr(result, -64);   // it is safe, bc we just came from this area
3977       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
3978       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
3979       kortestql(mask, mask);
3980       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
3981 
3982       jmp(TRUE_LABEL);
3983 
3984       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3985 
3986     }//if (VM_Version::supports_avx512vlbw())
3987 #endif //_LP64
3988     bind(COMPARE_WIDE_VECTORS);
3989     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
3990     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
3991     vpxor(vec1, vec2);
3992 
3993     vptest(vec1, vec1);
3994     jcc(Assembler::notZero, FALSE_LABEL);
3995     addptr(limit, 32);
3996     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3997 
3998     testl(result, result);
3999     jcc(Assembler::zero, TRUE_LABEL);
4000 
4001     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
4002     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4003     vpxor(vec1, vec2);
4004 
4005     vptest(vec1, vec1);
4006     jccb(Assembler::notZero, FALSE_LABEL);
4007     jmpb(TRUE_LABEL);
4008 
4009     bind(COMPARE_TAIL); // limit is zero
4010     movl(limit, result);
4011     // Fallthru to tail compare
4012   } else if (UseSSE42Intrinsics) {
4013     // With SSE4.2, use double quad vector compare
4014     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4015 
4016     // Compare 16-byte vectors
4017     andl(result, 0x0000000f);  //   tail count (in bytes)
4018     andl(limit, 0xfffffff0);   // vector count (in bytes)
4019     jcc(Assembler::zero, COMPARE_TAIL);
4020 
4021     lea(ary1, Address(ary1, limit, Address::times_1));
4022     lea(ary2, Address(ary2, limit, Address::times_1));
4023     negptr(limit);
4024 
4025     bind(COMPARE_WIDE_VECTORS);
4026     movdqu(vec1, Address(ary1, limit, Address::times_1));
4027     movdqu(vec2, Address(ary2, limit, Address::times_1));
4028     pxor(vec1, vec2);
4029 
4030     ptest(vec1, vec1);
4031     jcc(Assembler::notZero, FALSE_LABEL);
4032     addptr(limit, 16);
4033     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4034 
4035     testl(result, result);
4036     jcc(Assembler::zero, TRUE_LABEL);
4037 
4038     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4039     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4040     pxor(vec1, vec2);
4041 
4042     ptest(vec1, vec1);
4043     jccb(Assembler::notZero, FALSE_LABEL);
4044     jmpb(TRUE_LABEL);
4045 
4046     bind(COMPARE_TAIL); // limit is zero
4047     movl(limit, result);
4048     // Fallthru to tail compare
4049   }
4050 
4051   // Compare 4-byte vectors
4052   andl(limit, 0xfffffffc); // vector count (in bytes)
4053   jccb(Assembler::zero, COMPARE_CHAR);
4054 
4055   lea(ary1, Address(ary1, limit, Address::times_1));
4056   lea(ary2, Address(ary2, limit, Address::times_1));
4057   negptr(limit);
4058 
4059   bind(COMPARE_VECTORS);
4060   movl(chr, Address(ary1, limit, Address::times_1));
4061   cmpl(chr, Address(ary2, limit, Address::times_1));
4062   jccb(Assembler::notEqual, FALSE_LABEL);
4063   addptr(limit, 4);
4064   jcc(Assembler::notZero, COMPARE_VECTORS);
4065 
4066   // Compare trailing char (final 2 bytes), if any
4067   bind(COMPARE_CHAR);
4068   testl(result, 0x2);   // tail  char
4069   jccb(Assembler::zero, COMPARE_BYTE);
4070   load_unsigned_short(chr, Address(ary1, 0));
4071   load_unsigned_short(limit, Address(ary2, 0));
4072   cmpl(chr, limit);
4073   jccb(Assembler::notEqual, FALSE_LABEL);
4074 
4075   if (is_array_equ && is_char) {
4076     bind(COMPARE_BYTE);
4077   } else {
4078     lea(ary1, Address(ary1, 2));
4079     lea(ary2, Address(ary2, 2));
4080 
4081     bind(COMPARE_BYTE);
4082     testl(result, 0x1);   // tail  byte
4083     jccb(Assembler::zero, TRUE_LABEL);
4084     load_unsigned_byte(chr, Address(ary1, 0));
4085     load_unsigned_byte(limit, Address(ary2, 0));
4086     cmpl(chr, limit);
4087     jccb(Assembler::notEqual, FALSE_LABEL);
4088   }
4089   bind(TRUE_LABEL);
4090   movl(result, 1);   // return true
4091   jmpb(DONE);
4092 
4093   bind(FALSE_LABEL);
4094   xorl(result, result); // return false
4095 
4096   // That's it
4097   bind(DONE);
4098   if (UseAVX >= 2) {
4099     // clean upper bits of YMM registers
4100     vpxor(vec1, vec1);
4101     vpxor(vec2, vec2);
4102   }
4103 }
4104 
4105 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4106                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4107   switch(ideal_opc) {
4108     case Op_LShiftVS:
4109       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4110     case Op_LShiftVI:
4111       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4112     case Op_LShiftVL:
4113       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4114     case Op_RShiftVS:
4115       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4116     case Op_RShiftVI:
4117       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4118     case Op_RShiftVL:
4119       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4120     case Op_URShiftVS:
4121       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4122     case Op_URShiftVI:
4123       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4124     case Op_URShiftVL:
4125       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4126     case Op_RotateRightV:
4127       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4128     case Op_RotateLeftV:
4129       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4130     default:
4131       fatal("Unsupported masked operation"); break;
4132   }
4133 }
4134 
4135 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4136                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4137                                     bool is_varshift) {
4138   switch (ideal_opc) {
4139     case Op_AddVB:
4140       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4141     case Op_AddVS:
4142       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4143     case Op_AddVI:
4144       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4145     case Op_AddVL:
4146       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4147     case Op_AddVF:
4148       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4149     case Op_AddVD:
4150       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4151     case Op_SubVB:
4152       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4153     case Op_SubVS:
4154       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4155     case Op_SubVI:
4156       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4157     case Op_SubVL:
4158       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4159     case Op_SubVF:
4160       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4161     case Op_SubVD:
4162       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4163     case Op_MulVS:
4164       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4165     case Op_MulVI:
4166       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4167     case Op_MulVL:
4168       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4169     case Op_MulVF:
4170       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4171     case Op_MulVD:
4172       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4173     case Op_DivVF:
4174       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4175     case Op_DivVD:
4176       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4177     case Op_SqrtVF:
4178       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4179     case Op_SqrtVD:
4180       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4181     case Op_AbsVB:
4182       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4183     case Op_AbsVS:
4184       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4185     case Op_AbsVI:
4186       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4187     case Op_AbsVL:
4188       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4189     case Op_FmaVF:
4190       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4191     case Op_FmaVD:
4192       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4193     case Op_VectorRearrange:
4194       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4195     case Op_LShiftVS:
4196       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4197     case Op_LShiftVI:
4198       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4199     case Op_LShiftVL:
4200       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4201     case Op_RShiftVS:
4202       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4203     case Op_RShiftVI:
4204       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4205     case Op_RShiftVL:
4206       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4207     case Op_URShiftVS:
4208       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4209     case Op_URShiftVI:
4210       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4211     case Op_URShiftVL:
4212       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4213     case Op_RotateLeftV:
4214       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4215     case Op_RotateRightV:
4216       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4217     case Op_MaxV:
4218       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4219     case Op_MinV:
4220       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4221     case Op_XorV:
4222       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4223     case Op_OrV:
4224       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4225     case Op_AndV:
4226       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4227     default:
4228       fatal("Unsupported masked operation"); break;
4229   }
4230 }
4231 
4232 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4233                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4234   switch (ideal_opc) {
4235     case Op_AddVB:
4236       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4237     case Op_AddVS:
4238       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4239     case Op_AddVI:
4240       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4241     case Op_AddVL:
4242       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4243     case Op_AddVF:
4244       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4245     case Op_AddVD:
4246       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4247     case Op_SubVB:
4248       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4249     case Op_SubVS:
4250       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4251     case Op_SubVI:
4252       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4253     case Op_SubVL:
4254       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4255     case Op_SubVF:
4256       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4257     case Op_SubVD:
4258       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4259     case Op_MulVS:
4260       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4261     case Op_MulVI:
4262       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4263     case Op_MulVL:
4264       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4265     case Op_MulVF:
4266       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4267     case Op_MulVD:
4268       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4269     case Op_DivVF:
4270       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4271     case Op_DivVD:
4272       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4273     case Op_FmaVF:
4274       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4275     case Op_FmaVD:
4276       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4277     case Op_MaxV:
4278       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4279     case Op_MinV:
4280       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4281     case Op_XorV:
4282       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4283     case Op_OrV:
4284       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4285     case Op_AndV:
4286       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4287     default:
4288       fatal("Unsupported masked operation"); break;
4289   }
4290 }
4291 
4292 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4293                                   KRegister src1, KRegister src2) {
4294   BasicType etype = T_ILLEGAL;
4295   switch(mask_len) {
4296     case 2:
4297     case 4:
4298     case 8:  etype = T_BYTE; break;
4299     case 16: etype = T_SHORT; break;
4300     case 32: etype = T_INT; break;
4301     case 64: etype = T_LONG; break;
4302     default: fatal("Unsupported type"); break;
4303   }
4304   assert(etype != T_ILLEGAL, "");
4305   switch(ideal_opc) {
4306     case Op_AndVMask:
4307       kand(etype, dst, src1, src2); break;
4308     case Op_OrVMask:
4309       kor(etype, dst, src1, src2); break;
4310     case Op_XorVMask:
4311       kxor(etype, dst, src1, src2); break;
4312     default:
4313       fatal("Unsupported masked operation"); break;
4314   }
4315 }
4316 
4317 /*
4318  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the cast operation.
4319  * If src is NaN, the result is 0.
4320  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4321  * the result is equal to the value of Integer.MIN_VALUE.
4322  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4323  * the result is equal to the value of Integer.MAX_VALUE.
4324  */
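     // For illustration only (no code is generated from this): a scalar C++ sketch of the
     // per-lane semantics described above, assuming 32-bit int lanes and ignoring the
     // float rounding of the boundary constants.
     //
     //   int32_t cast_float_to_int(float src) {
     //     if (src != src)        return 0;          // NaN
     //     if (src <= INT32_MIN)  return INT32_MIN;  // -Inf and values <= Integer.MIN_VALUE
     //     if (src >= INT32_MAX)  return INT32_MAX;  // +Inf and values >= Integer.MAX_VALUE
     //     return (int32_t) src;                     // in-range: truncate toward zero
     //   }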
4325 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4326                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4327                                                                    Register rscratch, AddressLiteral float_sign_flip,
4328                                                                    int vec_enc) {
4329   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4330   Label done;
4331   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4332   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4333   vptest(xtmp2, xtmp2, vec_enc);
4334   jccb(Assembler::equal, done);
4335 
4336   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4337   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4338 
4339   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4340   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4341   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4342 
4343   // Recompute the mask for the remaining special values.
4344   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4345   // Extract SRC values corresponding to TRUE mask lanes.
4346   vpand(xtmp4, xtmp2, src, vec_enc);
4347   // Flip mask bits so that the MSB of mask lanes corresponding to positive special
4348   // values is set.
4349   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4350 
4351   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4352   bind(done);
4353 }
4354 
4355 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4356                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4357                                                                     Register rscratch, AddressLiteral float_sign_flip,
4358                                                                     int vec_enc) {
4359   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4360   Label done;
4361   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4362   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4363   kortestwl(ktmp1, ktmp1);
4364   jccb(Assembler::equal, done);
4365 
4366   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4367   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4368   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4369 
4370   kxorwl(ktmp1, ktmp1, ktmp2);
4371   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4372   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4373   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4374   bind(done);
4375 }
4376 
4377 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4378                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4379                                                                      Register rscratch, AddressLiteral double_sign_flip,
4380                                                                      int vec_enc) {
4381   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4382 
4383   Label done;
4384   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4385   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
4386   kortestwl(ktmp1, ktmp1);
4387   jccb(Assembler::equal, done);
4388 
4389   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4390   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4391   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4392 
4393   kxorwl(ktmp1, ktmp1, ktmp2);
4394   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4395   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4396   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4397   bind(done);
4398 }
4399 
4400 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4401                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4402                                                                      Register rscratch, AddressLiteral float_sign_flip,
4403                                                                      int vec_enc) {
4404   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4405   Label done;
4406   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4407   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4408   kortestwl(ktmp1, ktmp1);
4409   jccb(Assembler::equal, done);
4410 
4411   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4412   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4413   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4414 
4415   kxorwl(ktmp1, ktmp1, ktmp2);
4416   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4417   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4418   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4419   bind(done);
4420 }
4421 
4422 /*
4423  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the cast operation.
4424  * If src is NaN, the result is 0.
4425  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
4426  * the result is equal to the value of Long.MIN_VALUE.
4427  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
4428  * the result is equal to the value of Long.MAX_VALUE.
4429  */
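     // For illustration only: the per-lane semantics match the scalar sketch given for the
     // float -> int case above, with Long.MIN_VALUE/Long.MAX_VALUE as the saturation bounds
     // and a 64-bit truncating cast in between.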
4430 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4431                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4432                                                                       Register rscratch, AddressLiteral double_sign_flip,
4433                                                                       int vec_enc) {
4434   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4435 
4436   Label done;
4437   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4438   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
4439   kortestwl(ktmp1, ktmp1);
4440   jccb(Assembler::equal, done);
4441 
4442   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4443   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4444   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4445 
4446   kxorwl(ktmp1, ktmp1, ktmp2);
4447   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4448   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4449   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4450   bind(done);
4451 }
4452 
4453 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
4454                                                              XMMRegister xtmp, int index, int vec_enc) {
4455   assert(vec_enc < Assembler::AVX_512bit, "");
4456   if (vec_enc == Assembler::AVX_256bit) {
4457     vextractf128_high(xtmp, src);
4458     vshufps(dst, src, xtmp, index, vec_enc);
4459   } else {
4460     vshufps(dst, src, zero, index, vec_enc);
4461   }
4462 }
4463 
4464 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4465                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
4466                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
4467   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4468 
4469   Label done;
4470   // Compare the destination lanes with float_sign_flip
4471   // value to get mask for all special values.
4472   movdqu(xtmp1, float_sign_flip, rscratch);
4473   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
4474   ptest(xtmp2, xtmp2);
4475   jccb(Assembler::equal, done);
4476 
4477   // Flip float_sign_flip to get max integer value.
4478   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
4479   pxor(xtmp1, xtmp4);
4480 
4481   // Set destination lanes corresponding to unordered source lanes to zero.
4482   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
4483   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
4484 
4485   // Shuffle the mask vector and pack the lower double word from each quadword lane.
4486   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
4487   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
4488 
4489   // Recompute the mask for the remaining special values.
4490   pxor(xtmp2, xtmp3);
4491   // Extract mask corresponding to non-negative source lanes.
4492   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
4493 
4494   // Shuffle the mask vector and pack the lower double word from each quadword lane.
4495   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
4496   pand(xtmp3, xtmp2);
4497 
4498   // Replace destination lanes holding the special value (0x80000000) with max int
4499   // if the corresponding source lane holds a positive value.
4500   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
4501   bind(done);
4502 }
4503 
4504 
4505 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
4506                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
4507   switch(to_elem_bt) {
4508     case T_SHORT:
4509       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
4510       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
4511       vpackusdw(dst, dst, zero, vec_enc);
4512       if (vec_enc == Assembler::AVX_256bit) {
4513         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
4514       }
4515       break;
4516     case  T_BYTE:
4517       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
4518       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
4519       vpackusdw(dst, dst, zero, vec_enc);
4520       if (vec_enc == Assembler::AVX_256bit) {
4521         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
4522       }
4523       vpackuswb(dst, dst, zero, vec_enc);
4524       break;
4525     default: assert(false, "%s", type2name(to_elem_bt));
4526   }
4527 }
4528 
4529 /*
4530  * Algorithm for vector D2L and F2I conversions:
4531  * a) Perform vector D2L/F2I cast.
4532  * b) Choose the fast path if none of the result vector lanes contains the 0x80000000 value.
4533  *    A lane holding 0x80000000 signifies that the source value could be any of the
4534  *    special floating point values (NaN, -Inf, Inf, Max, Min).
4535  * c) Set the destination to zero if the source is NaN.
4536  * d) Replace 0x80000000 with MaxInt if the source lane contains a positive value.
4537  */
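     //
     // A scalar sketch of the fix-up in steps b)-d), for illustration only: after the raw
     // vector cast, only lanes holding 0x80000000 (the integer indefinite value) can need
     // patching.
     //
     //   int32_t fixup_lane(float src, int32_t raw) {
     //     if (raw != INT32_MIN) return raw;       // fast path: cast result already exact
     //     if (src != src)       return 0;         // NaN source lane
     //     if (src > 0.0f)       return INT32_MAX; // positive overflow
     //     return INT32_MIN;                       // negative overflow or exactly MIN_VALUE
     //   }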
4538 
4539 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4540                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4541                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
4542   int to_elem_sz = type2aelembytes(to_elem_bt);
4543   assert(to_elem_sz <= 4, "");
4544   vcvttps2dq(dst, src, vec_enc);
4545   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
4546   if (to_elem_sz < 4) {
4547     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4548     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
4549   }
4550 }
4551 
4552 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4553                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
4554                                             Register rscratch, int vec_enc) {
4555   int to_elem_sz = type2aelembytes(to_elem_bt);
4556   assert(to_elem_sz <= 4, "");
4557   vcvttps2dq(dst, src, vec_enc);
4558   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
4559   switch(to_elem_bt) {
4560     case T_INT:
4561       break;
4562     case T_SHORT:
4563       evpmovdw(dst, dst, vec_enc);
4564       break;
4565     case T_BYTE:
4566       evpmovdb(dst, dst, vec_enc);
4567       break;
4568     default: assert(false, "%s", type2name(to_elem_bt));
4569   }
4570 }
4571 
4572 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4573                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
4574                                             Register rscratch, int vec_enc) {
4575   evcvttps2qq(dst, src, vec_enc);
4576   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
4577 }
4578 
4579 // Handling for downcasting from double to integer or sub-word types on AVX2.
4580 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4581                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
4582                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
4583   int to_elem_sz = type2aelembytes(to_elem_bt);
4584   assert(to_elem_sz < 8, "");
4585   vcvttpd2dq(dst, src, vec_enc);
4586   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
4587                                               float_sign_flip, vec_enc);
4588   if (to_elem_sz < 4) {
4589     // xtmp4 holds all zero lanes.
4590     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
4591   }
4592 }
4593 
4594 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
4595                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
4596                                             KRegister ktmp2, AddressLiteral sign_flip,
4597                                             Register rscratch, int vec_enc) {
4598   if (VM_Version::supports_avx512dq()) {
4599     evcvttpd2qq(dst, src, vec_enc);
4600     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
4601     switch(to_elem_bt) {
4602       case T_LONG:
4603         break;
4604       case T_INT:
4605         evpmovsqd(dst, dst, vec_enc);
4606         break;
4607       case T_SHORT:
4608         evpmovsqd(dst, dst, vec_enc);
4609         evpmovdw(dst, dst, vec_enc);
4610         break;
4611       case T_BYTE:
4612         evpmovsqd(dst, dst, vec_enc);
4613         evpmovdb(dst, dst, vec_enc);
4614         break;
4615       default: assert(false, "%s", type2name(to_elem_bt));
4616     }
4617   } else {
4618     assert(type2aelembytes(to_elem_bt) <= 4, "");
4619     vcvttpd2dq(dst, src, vec_enc);
4620     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
4621     switch(to_elem_bt) {
4622       case T_INT:
4623         break;
4624       case T_SHORT:
4625         evpmovdw(dst, dst, vec_enc);
4626         break;
4627       case T_BYTE:
4628         evpmovdb(dst, dst, vec_enc);
4629         break;
4630       default: assert(false, "%s", type2name(to_elem_bt));
4631     }
4632   }
4633 }
4634 
4635 #ifdef _LP64
4636 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
4637                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4638                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
4639   // Perform the floor(val + 0.5) operation with MXCSR.RC set to round towards -inf,
4640   // and restore the standard MXCSR.RC mode afterwards.
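       // For example, with RC = toward -inf (Math.round-style round-half-up):
       //   val =  2.5 : cvt( 2.5 + 0.5) = cvt( 3.0) ->  3
       //   val = -2.5 : cvt(-2.5 + 0.5) = cvt(-2.0) -> -2
       //   val =  2.3 : cvt( 2.8) rounds down       ->  2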
4641   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4642 
4643   mov64(tmp, julong_cast(0.5L));
4644   evpbroadcastq(xtmp1, tmp, vec_enc);
4645   vaddpd(xtmp1, src , xtmp1, vec_enc);
4646   evcvtpd2qq(dst, xtmp1, vec_enc);
4647   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
4648                                                 double_sign_flip, vec_enc);
4649 
4650   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4651 }
4652 
4653 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
4654                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4655                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
4656   // Perform the floor(val + 0.5) operation with MXCSR.RC set to round towards -inf,
4657   // and restore the standard MXCSR.RC mode afterwards.
4658   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4659 
4660   movl(tmp, jint_cast(0.5));
4661   movq(xtmp1, tmp);
4662   vbroadcastss(xtmp1, xtmp1, vec_enc);
4663   vaddps(xtmp1, src , xtmp1, vec_enc);
4664   vcvtps2dq(dst, xtmp1, vec_enc);
4665   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
4666                                               float_sign_flip, vec_enc);
4667 
4668   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4669 }
4670 
4671 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
4672                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4673                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
4674   // Perform the floor(val + 0.5) operation with MXCSR.RC set to round towards -inf,
4675   // and restore the standard MXCSR.RC mode afterwards.
4676   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4677 
4678   movl(tmp, jint_cast(0.5));
4679   movq(xtmp1, tmp);
4680   vbroadcastss(xtmp1, xtmp1, vec_enc);
4681   vaddps(xtmp1, src , xtmp1, vec_enc);
4682   vcvtps2dq(dst, xtmp1, vec_enc);
4683   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
4684 
4685   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4686 }
4687 #endif // _LP64
4688 
4689 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
4690                                              BasicType from_elem_bt, BasicType to_elem_bt) {
4691   switch (from_elem_bt) {
4692     case T_BYTE:
4693       switch (to_elem_bt) {
4694         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
4695         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
4696         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
4697         default: ShouldNotReachHere();
4698       }
4699       break;
4700     case T_SHORT:
4701       switch (to_elem_bt) {
4702         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
4703         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
4704         default: ShouldNotReachHere();
4705       }
4706       break;
4707     case T_INT:
4708       assert(to_elem_bt == T_LONG, "");
4709       vpmovzxdq(dst, src, vlen_enc);
4710       break;
4711     default:
4712       ShouldNotReachHere();
4713   }
4714 }
4715 
4716 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
4717                                          BasicType dst_bt, BasicType src_bt, int vlen) {
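       // Mask lanes hold 0 or -1, so a mask cast is purely a lane-width change:
       // widening sign-extends each lane and narrowing packs lanes with signed
       // saturation (both preserve 0 and -1); for 256-bit vectors an extra cross-lane
       // shuffle gathers the packed result into the low 128 bits.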
4718   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
4719   assert(vlen_enc != AVX_512bit, "");
4720 
4721   int dst_bt_size = type2aelembytes(dst_bt);
4722   int src_bt_size = type2aelembytes(src_bt);
4723   if (dst_bt_size > src_bt_size) {
4724     switch (dst_bt_size / src_bt_size) {
4725       case 2: vpmovsxbw(dst, src, vlen_enc); break;
4726       case 4: vpmovsxbd(dst, src, vlen_enc); break;
4727       case 8: vpmovsxbq(dst, src, vlen_enc); break;
4728       default: ShouldNotReachHere();
4729     }
4730   } else {
4731     assert(dst_bt_size < src_bt_size, "");
4732     switch (src_bt_size / dst_bt_size) {
4733       case 2: {
4734         if (vlen_enc == AVX_128bit) {
4735           vpacksswb(dst, src, src, vlen_enc);
4736         } else {
4737           vpacksswb(dst, src, src, vlen_enc);
4738           vpermq(dst, dst, 0x08, vlen_enc);
4739         }
4740         break;
4741       }
4742       case 4: {
4743         if (vlen_enc == AVX_128bit) {
4744           vpackssdw(dst, src, src, vlen_enc);
4745           vpacksswb(dst, dst, dst, vlen_enc);
4746         } else {
4747           vpackssdw(dst, src, src, vlen_enc);
4748           vpermq(dst, dst, 0x08, vlen_enc);
4749           vpacksswb(dst, dst, dst, AVX_128bit);
4750         }
4751         break;
4752       }
4753       case 8: {
4754         if (vlen_enc == AVX_128bit) {
4755           vpshufd(dst, src, 0x08, vlen_enc);
4756           vpackssdw(dst, dst, dst, vlen_enc);
4757           vpacksswb(dst, dst, dst, vlen_enc);
4758         } else {
4759           vpshufd(dst, src, 0x08, vlen_enc);
4760           vpermq(dst, dst, 0x08, vlen_enc);
4761           vpackssdw(dst, dst, dst, AVX_128bit);
4762           vpacksswb(dst, dst, dst, AVX_128bit);
4763         }
4764         break;
4765       }
4766       default: ShouldNotReachHere();
4767     }
4768   }
4769 }
4770 
4771 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
4772                                    bool merge, BasicType bt, int vlen_enc) {
4773   if (bt == T_INT) {
4774     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
4775   } else {
4776     assert(bt == T_LONG, "");
4777     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
4778   }
4779 }
4780 
4781 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
4782                                    bool merge, BasicType bt, int vlen_enc) {
4783   if (bt == T_INT) {
4784     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
4785   } else {
4786     assert(bt == T_LONG, "");
4787     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
4788   }
4789 }
4790 
4791 #ifdef _LP64
4792 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
4793                                                Register rtmp2, XMMRegister xtmp, int mask_len,
4794                                                int vec_enc) {
4795   int index = 0;
4796   int vindex = 0;
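       // PDEP with the 0x0101010101010101 selector deposits mask bit i into bit 0 of
       // byte i, turning 8 mask bits into 8 byte lanes of 0x00/0x01. For example:
       //   src = 0b10110010  ->  bytes (lane 0..7) = 00 01 00 00 01 01 00 01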
4797   mov64(rtmp1, 0x0101010101010101L);
4798   pdepq(rtmp1, src, rtmp1);
4799   if (mask_len > 8) {
4800     movq(rtmp2, src);
4801     vpxor(xtmp, xtmp, xtmp, vec_enc);
4802     movq(xtmp, rtmp1);
4803   }
4804   movq(dst, rtmp1);
4805 
4806   mask_len -= 8;
4807   while (mask_len > 0) {
4808     assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
4809     index++;
4810     if ((index % 2) == 0) {
4811       pxor(xtmp, xtmp);
4812     }
4813     mov64(rtmp1, 0x0101010101010101L);
4814     shrq(rtmp2, 8);
4815     pdepq(rtmp1, rtmp2, rtmp1);
4816     pinsrq(xtmp, rtmp1, index % 2);
4817     vindex = index / 2;
4818     if (vindex) {
4819       // Write the entire 16-byte vector once both 64-bit
4820       // lanes are updated, to avoid redundant instructions.
4821       if (index % 2) {
4822         vinsertf128(dst, dst, xtmp, vindex);
4823       }
4824     } else {
4825       vmovdqu(dst, xtmp);
4826     }
4827     mask_len -= 8;
4828   }
4829 }
4830 
4831 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
4832   switch(opc) {
4833     case Op_VectorMaskTrueCount:
4834       popcntq(dst, tmp);
4835       break;
4836     case Op_VectorMaskLastTrue:
4837       if (VM_Version::supports_lzcnt()) {
4838         lzcntq(tmp, tmp);
4839         movl(dst, 63);
4840         subl(dst, tmp);
4841       } else {
4842         movl(dst, -1);
4843         bsrq(tmp, tmp);
4844         cmov32(Assembler::notZero, dst, tmp);
4845       }
4846       break;
4847     case Op_VectorMaskFirstTrue:
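           // When no lane is true, FirstTrue must return masklen; for masklen < 32 this is
           // achieved by OR-ing in a sentinel bit at position masklen before the tzcnt/bsf.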
4848       if (VM_Version::supports_bmi1()) {
4849         if (masklen < 32) {
4850           orl(tmp, 1 << masklen);
4851           tzcntl(dst, tmp);
4852         } else if (masklen == 32) {
4853           tzcntl(dst, tmp);
4854         } else {
4855           assert(masklen == 64, "");
4856           tzcntq(dst, tmp);
4857         }
4858       } else {
4859         if (masklen < 32) {
4860           orl(tmp, 1 << masklen);
4861           bsfl(dst, tmp);
4862         } else {
4863           assert(masklen == 32 || masklen == 64, "");
4864           movl(dst, masklen);
4865           if (masklen == 32)  {
4866             bsfl(tmp, tmp);
4867           } else {
4868             bsfq(tmp, tmp);
4869           }
4870           cmov32(Assembler::notZero, dst, tmp);
4871         }
4872       }
4873       break;
4874     case Op_VectorMaskToLong:
4875       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
4876       break;
4877     default: assert(false, "Unhandled mask operation");
4878   }
4879 }
4880 
4881 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
4882                                               int masklen, int masksize, int vec_enc) {
4883   assert(VM_Version::supports_popcnt(), "");
4884 
4885   if(VM_Version::supports_avx512bw()) {
4886     kmovql(tmp, mask);
4887   } else {
4888     assert(masklen <= 16, "");
4889     kmovwl(tmp, mask);
4890   }
4891 
4892   // A mask generated by partial vector comparison/replicate/mask manipulation
4893   // operations needs to be clipped.
4894   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
4895     andq(tmp, (1 << masklen) - 1);
4896   }
4897 
4898   vector_mask_operation_helper(opc, dst, tmp, masklen);
4899 }
4900 
4901 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
4902                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
4903   assert(vec_enc == AVX_128bit && VM_Version::supports_avx() ||
4904          vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4), "");
4905   assert(VM_Version::supports_popcnt(), "");
4906 
4907   bool need_clip = false;
4908   switch(bt) {
4909     case T_BOOLEAN:
4910       // Masks of other types contain lane values 0 and -1, while boolean masks contain
           // 0 and 1, so negate the boolean lanes (0 - x) to get 0/-1 before vpmovmskb.
4911       vpxor(xtmp, xtmp, xtmp, vec_enc);
4912       vpsubb(xtmp, xtmp, mask, vec_enc);
4913       vpmovmskb(tmp, xtmp, vec_enc);
4914       need_clip = masklen < 16;
4915       break;
4916     case T_BYTE:
4917       vpmovmskb(tmp, mask, vec_enc);
4918       need_clip = masklen < 16;
4919       break;
4920     case T_SHORT:
4921       vpacksswb(xtmp, mask, mask, vec_enc);
4922       if (masklen >= 16) {
4923         vpermpd(xtmp, xtmp, 8, vec_enc);
4924       }
4925       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
4926       need_clip = masklen < 16;
4927       break;
4928     case T_INT:
4929     case T_FLOAT:
4930       vmovmskps(tmp, mask, vec_enc);
4931       need_clip = masklen < 4;
4932       break;
4933     case T_LONG:
4934     case T_DOUBLE:
4935       vmovmskpd(tmp, mask, vec_enc);
4936       need_clip = masklen < 2;
4937       break;
4938     default: assert(false, "Unhandled type, %s", type2name(bt));
4939   }
4940 
4941   // A mask generated by partial vector comparison/replicate/mask manipulation
4942   // operations needs to be clipped.
4943   if (need_clip && opc != Op_VectorMaskFirstTrue) {
4944     // need_clip implies masklen < 32
4945     andq(tmp, (1 << masklen) - 1);
4946   }
4947 
4948   vector_mask_operation_helper(opc, dst, tmp, masklen);
4949 }
4950 
4951 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
4952                                              Register rtmp2, int mask_len) {
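       // A scalar sketch of the trick, for illustration: PEXT of an all-ones source under
       // the (clipped) mask gathers one bit per set mask bit into the low end, so the
       // result is (1 << popcount(mask)) - 1, e.g. mask 0b1010 -> 0b0011.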
4953   kmov(rtmp1, src);
4954   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
4955   mov64(rtmp2, -1L);
4956   pextq(rtmp2, rtmp2, rtmp1);
4957   kmov(dst, rtmp2);
4958 }
4959 
4960 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
4961                                                bool merge, BasicType bt, int vec_enc) {
4962   if (opcode == Op_CompressV) {
4963     switch(bt) {
4964     case T_BYTE:
4965       evpcompressb(dst, mask, src, merge, vec_enc);
4966       break;
4967     case T_CHAR:
4968     case T_SHORT:
4969       evpcompressw(dst, mask, src, merge, vec_enc);
4970       break;
4971     case T_INT:
4972       evpcompressd(dst, mask, src, merge, vec_enc);
4973       break;
4974     case T_FLOAT:
4975       evcompressps(dst, mask, src, merge, vec_enc);
4976       break;
4977     case T_LONG:
4978       evpcompressq(dst, mask, src, merge, vec_enc);
4979       break;
4980     case T_DOUBLE:
4981       evcompresspd(dst, mask, src, merge, vec_enc);
4982       break;
4983     default:
4984       fatal("Unsupported type %s", type2name(bt));
4985       break;
4986     }
4987   } else {
4988     assert(opcode == Op_ExpandV, "");
4989     switch(bt) {
4990     case T_BYTE:
4991       evpexpandb(dst, mask, src, merge, vec_enc);
4992       break;
4993     case T_CHAR:
4994     case T_SHORT:
4995       evpexpandw(dst, mask, src, merge, vec_enc);
4996       break;
4997     case T_INT:
4998       evpexpandd(dst, mask, src, merge, vec_enc);
4999       break;
5000     case T_FLOAT:
5001       evexpandps(dst, mask, src, merge, vec_enc);
5002       break;
5003     case T_LONG:
5004       evpexpandq(dst, mask, src, merge, vec_enc);
5005       break;
5006     case T_DOUBLE:
5007       evexpandpd(dst, mask, src, merge, vec_enc);
5008       break;
5009     default:
5010       fatal("Unsupported type %s", type2name(bt));
5011       break;
5012     }
5013   }
5014 }
5015 #endif
5016 
5017 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5018                                            KRegister ktmp1, int vec_enc) {
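       // Per-lane scalar equivalent of the blend sequence below, for illustration:
       //   dst = (src < 0.0) ? -1.0 : 1.0;
       //   if (src == 0.0 || src != src) dst = src;   // +/-0.0 and NaN pass through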
5019   if (opcode == Op_SignumVD) {
5020     vsubpd(dst, zero, one, vec_enc);
5021     // if src < 0 ? -1 : 1
5022     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5023     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
5024     // if src == NaN, -0.0 or 0.0 return src.
5025     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5026     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5027   } else {
5028     assert(opcode == Op_SignumVF, "");
5029     vsubps(dst, zero, one, vec_enc);
5030     // if src < 0 ? -1 : 1
5031     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5032     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
5033     // if src == NaN, -0.0 or 0.0 return src.
5034     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5035     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5036   }
5037 }
5038 
5039 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5040                                           XMMRegister xtmp1, int vec_enc) {
5041   if (opcode == Op_SignumVD) {
5042     vsubpd(dst, zero, one, vec_enc);
5043     // if src < 0 ? -1 : 1
5044     vblendvpd(dst, one, dst, src, vec_enc);
5045     // if src == NaN, -0.0 or 0.0 return src.
5046     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5047     vblendvpd(dst, dst, src, xtmp1, vec_enc);
5048   } else {
5049     assert(opcode == Op_SignumVF, "");
5050     vsubps(dst, zero, one, vec_enc);
5051     // if src < 0 ? -1 : 1
5052     vblendvps(dst, one, dst, src, vec_enc);
5053     // if src == NaN, -0.0 or 0.0 return src.
5054     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5055     vblendvps(dst, dst, src, xtmp1, vec_enc);
5056   }
5057 }
5058 
5059 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5060   if (VM_Version::supports_avx512bw()) {
5061     if (mask_len > 32) {
5062       kmovql(dst, src);
5063     } else {
5064       kmovdl(dst, src);
5065       if (mask_len != 32) {
5066         kshiftrdl(dst, dst, 32 - mask_len);
5067       }
5068     }
5069   } else {
5070     assert(mask_len <= 16, "");
5071     kmovwl(dst, src);
5072     if (mask_len != 16) {
5073       kshiftrwl(dst, dst, 16 - mask_len);
5074     }
5075   }
5076 }
5077 
5078 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5079   int lane_size = type2aelembytes(bt);
5080   bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
5081   if ((is_LP64 || lane_size < 8) &&
5082       ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5083        (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
5084     movptr(rtmp, imm32);
5085     switch(lane_size) {
5086       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5087       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5088       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5089       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5090       default: fatal("Unsupported lane size %d", lane_size); break;
5092     }
5093   } else {
5094     movptr(rtmp, imm32);
5095     LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
5096     switch(lane_size) {
5097       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5098       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5099       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5100       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5101       default: fatal("Unsupported lane size %d", lane_size); break;
5103     }
5104   }
5105 }
5106 
5107 //
5108 // The following is a lookup-table-based popcount computation algorithm:
5109 //       Index   Bit set count
5110 //     [ 0000 ->   0,
5111 //       0001 ->   1,
5112 //       0010 ->   1,
5113 //       0011 ->   2,
5114 //       0100 ->   1,
5115 //       0101 ->   2,
5116 //       0110 ->   2,
5117 //       0111 ->   3,
5118 //       1000 ->   1,
5119 //       1001 ->   2,
5120 //       1010 ->   2,
5121 //       1011 ->   3,
5122 //       1100 ->   2,
5123 //       1101 ->   3,
     //       1110 ->   3,
5124 //       1111 ->   4 ]
5125 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5126 //     shuffle indices for lookup table access.
5127 //  b. Right shift each byte of vector lane by 4 positions.
5128 //  c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
5129 //     shuffle indices for lookup table access.
5130 //  d. Add the bitset count of upper and lower 4 bits of each byte.
5131 //  e. Unpack double words to quad words and compute the sum of absolute differences
5132 //     (against zero), which adds up the bitset counts of all the bytes of a quadword.
5133 //  f. Perform step e. for upper 128bit vector lane.
5134 //  g. Pack the bitset count of quadwords back to double word.
5135 //  h. Unpacking and packing operations are not needed for 64-bit vector lanes.
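     //
     //  A scalar per-byte sketch of steps a.-d., for illustration only:
     //
     //    uint8_t popcount_byte(uint8_t b) {
     //      static const uint8_t lut[16] = {0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4};
     //      return lut[b & 0x0F] + lut[b >> 4];
     //    }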
5136 
5137 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5138                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5139   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5140   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5141   vpsrlw(dst, src, 4, vec_enc);
5142   vpand(dst, dst, xtmp1, vec_enc);
5143   vpand(xtmp1, src, xtmp1, vec_enc);
5144   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5145   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5146   vpshufb(dst, xtmp2, dst, vec_enc);
5147   vpaddb(dst, dst, xtmp1, vec_enc);
5148 }
5149 
5150 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5151                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5152   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5153   // The following code corresponds to steps e, f, g and h of the algorithm above.
5154   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5155   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5156   vpsadbw(dst, dst, xtmp2, vec_enc);
5157   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5158   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5159   vpackuswb(dst, xtmp1, dst, vec_enc);
5160 }
5161 
5162 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5163                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5164   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5165   // Add the popcounts of the upper and lower bytes of each word.
5166   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5167   vpsrlw(dst, xtmp1, 8, vec_enc);
5168   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5169   vpaddw(dst, dst, xtmp1, vec_enc);
5170 }
5171 
5172 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5173                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5174   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5175   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5176   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5177 }
5178 
5179 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5180                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5181   switch(bt) {
5182     case T_LONG:
5183       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5184       break;
5185     case T_INT:
5186       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5187       break;
5188     case T_CHAR:
5189     case T_SHORT:
5190       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5191       break;
5192     case T_BYTE:
5193     case T_BOOLEAN:
5194       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5195       break;
5196     default:
5197       fatal("Unsupported type %s", type2name(bt));
5198       break;
5199   }
5200 }
5201 
5202 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5203                                                       KRegister mask, bool merge, int vec_enc) {
5204   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5205   switch(bt) {
5206     case T_LONG:
5207       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5208       evpopcntq(dst, mask, src, merge, vec_enc);
5209       break;
5210     case T_INT:
5211       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5212       evpopcntd(dst, mask, src, merge, vec_enc);
5213       break;
5214     case T_CHAR:
5215     case T_SHORT:
5216       assert(VM_Version::supports_avx512_bitalg(), "");
5217       evpopcntw(dst, mask, src, merge, vec_enc);
5218       break;
5219     case T_BYTE:
5220     case T_BOOLEAN:
5221       assert(VM_Version::supports_avx512_bitalg(), "");
5222       evpopcntb(dst, mask, src, merge, vec_enc);
5223       break;
5224     default:
5225       fatal("Unsupported type %s", type2name(bt));
5226       break;
5227   }
5228 }
5229 
5230 #ifndef _LP64
5231 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
5232   assert(VM_Version::supports_avx512bw(), "");
5233   kmovdl(tmp, src);
5234   kunpckdql(dst, tmp, tmp);
5235 }
5236 #endif
5237 
5238 // The bit reversal algorithm first reverses the bits of each byte, followed by a
5239 // byte-level reversal for multi-byte primitive types (short/int/long).
5240 // The algorithm uses a lookup table to get the reverse bit sequence corresponding
5241 // to a 4-bit value. The reverse bit sequence of a byte is then obtained by
5242 // swapping the reverse bit sequences of its upper and lower nibbles.
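     // A scalar per-byte sketch of the lookup step, for illustration only (the vector code
     // uses vpshufb against StubRoutines::x86::vector_reverse_bit_lut() instead):
     //
     //   uint8_t reverse_bits_in_byte(uint8_t b) {
     //     static const uint8_t rev_lut[16] = {0x0, 0x8, 0x4, 0xC, 0x2, 0xA, 0x6, 0xE,
     //                                         0x1, 0x9, 0x5, 0xD, 0x3, 0xB, 0x7, 0xF};
     //     return (uint8_t)((rev_lut[b & 0x0F] << 4) | rev_lut[b >> 4]);
     //   }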
5244 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5245                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
5246   if (VM_Version::supports_avx512vlbw()) {
5247 
5248     // Get the reverse bit sequence of lower nibble of each byte.
5249     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5250     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5251     evpandq(dst, xtmp2, src, vec_enc);
5252     vpshufb(dst, xtmp1, dst, vec_enc);
5253     vpsllq(dst, dst, 4, vec_enc);
5254 
5255     // Get the reverse bit sequence of upper nibble of each byte.
5256     vpandn(xtmp2, xtmp2, src, vec_enc);
5257     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5258     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5259 
5260     // OR the left-shifted reversed lower nibbles with the right-shifted reversed upper
5261     // nibbles to obtain the reversed bit sequence of each byte.
5262     evporq(xtmp2, dst, xtmp2, vec_enc);
5263     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5264 
5265   } else if(vec_enc == Assembler::AVX_512bit) {
5266     // Shift based bit reversal.
5267     assert(bt == T_LONG || bt == T_INT, "");
5268 
5269     // Swap lower and upper nibble of each byte.
5270     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
5271 
5272     // Swap two least and most significant bits of each nibble.
5273     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
5274 
5275     // Swap adjacent pair of bits.
5276     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5277     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
5278 
5279     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5280     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
5281   } else {
5282     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
5283     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5284 
5285     // Get the reverse bit sequence of lower nibble of each byte.
5286     vpand(dst, xtmp2, src, vec_enc);
5287     vpshufb(dst, xtmp1, dst, vec_enc);
5288     vpsllq(dst, dst, 4, vec_enc);
5289 
5290     // Get the reverse bit sequence of upper nibble of each byte.
5291     vpandn(xtmp2, xtmp2, src, vec_enc);
5292     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5293     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5294 
5295     // OR the left-shifted reversed lower nibbles with the right-shifted reversed upper
5296     // nibbles to obtain the reversed bit sequence of each byte.
5297     vpor(xtmp2, dst, xtmp2, vec_enc);
5298     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5299   }
5300 }
5301 
5302 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
5303                                                 XMMRegister xtmp, Register rscratch) {
5304   assert(VM_Version::supports_gfni(), "");
5305   assert(rscratch != noreg || always_reachable(mask), "missing");
5306 
5307   // Galois field instruction based bit reversal based on following algorithm.
5308   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5309   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
5310   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
5311   vector_reverse_byte(bt, dst, xtmp, vec_enc);
5312 }
5313 
5314 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
5315                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
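       // Per 64-bit lane this computes, for illustration:
       //   dst = ((src & bitmask) << nbits) | ((src & ~bitmask) >> nbits)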
5316   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
5317   evpandq(dst, xtmp1, src, vec_enc);
5318   vpsllq(dst, dst, nbits, vec_enc);
5319   vpandn(xtmp1, xtmp1, src, vec_enc);
5320   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
5321   evporq(dst, dst, xtmp1, vec_enc);
5322 }
5323 
5324 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5325                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5326   // Shift based bit reversal.
5327   assert(VM_Version::supports_evex(), "");
5328   switch(bt) {
5329     case T_LONG:
5330       // Swap upper and lower double word of each quad word.
5331       evprorq(xtmp1, k0, src, 32, true, vec_enc);
5332       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
5333       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5334       break;
5335     case T_INT:
5336       // Swap upper and lower word of each double word.
5337       evprord(xtmp1, k0, src, 16, true, vec_enc);
5338       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5339       break;
5340     case T_CHAR:
5341     case T_SHORT:
5342       // Swap upper and lower byte of each word.
5343       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
5344       break;
5345     case T_BYTE:
5346       evmovdquq(dst, k0, src, true, vec_enc);
5347       break;
5348     default:
5349       fatal("Unsupported type %s", type2name(bt));
5350       break;
5351   }
5352 }
5353 
5354 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5355   if (bt == T_BYTE) {
5356     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
5357       evmovdquq(dst, k0, src, true, vec_enc);
5358     } else {
5359       vmovdqu(dst, src);
5360     }
5361     return;
5362   }
5363   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
5364   // pre-computed shuffle indices.
5365   switch(bt) {
5366     case T_LONG:
5367       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
5368       break;
5369     case T_INT:
5370       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
5371       break;
5372     case T_CHAR:
5373     case T_SHORT:
5374       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
5375       break;
5376     default:
5377       fatal("Unsupported type %s", type2name(bt));
5378       break;
5379   }
5380   vpshufb(dst, src, dst, vec_enc);
5381 }
5382 
5383 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5384                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5385                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
5386   assert(is_integral_type(bt), "");
5387   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5388   assert(VM_Version::supports_avx512cd(), "");
5389   switch(bt) {
5390     case T_LONG:
5391       evplzcntq(dst, ktmp, src, merge, vec_enc);
5392       break;
5393     case T_INT:
5394       evplzcntd(dst, ktmp, src, merge, vec_enc);
5395       break;
5396     case T_SHORT:
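      // Interleave each source word with 0xFFFF to form a double word whose
      // leading zero count equals the word's leading zero count (the all-ones
      // low half caps the count at 16), then pack the double word counts back
      // into words.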
5397       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
5398       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
5399       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
5400       vpunpckhwd(dst, xtmp1, src, vec_enc);
5401       evplzcntd(dst, ktmp, dst, merge, vec_enc);
5402       vpackusdw(dst, xtmp2, dst, vec_enc);
5403       break;
5404     case T_BYTE:
5405       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
5406       // accessing the lookup table.
5407       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
5408       // accessing the lookup table.
5409       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
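      // Illustrative scalar sketch of the lookup, not part of the generated code:
      //   clz8(b) = (b >> 4) != 0 ? LUT[b >> 4] : 4 + LUT[b & 0xF]
      // where LUT[n] is the leading zero count of the 4-bit value n (so LUT[0] = 4).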
5410       assert(VM_Version::supports_avx512bw(), "");
5411       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
5412       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
5413       vpand(xtmp2, dst, src, vec_enc);
5414       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5415       vpsrlw(xtmp3, src, 4, vec_enc);
5416       vpand(xtmp3, dst, xtmp3, vec_enc);
5417       vpshufb(dst, xtmp1, xtmp3, vec_enc);
5418       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
5419       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
5420       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
5421       break;
5422     default:
5423       fatal("Unsupported type %s", type2name(bt));
5424       break;
5425   }
5426 }
5427 
5428 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5429                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5430   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
5431   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5432   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
5433   // accessing the lookup table.
5434   vpand(dst, xtmp2, src, vec_enc);
5435   vpshufb(dst, xtmp1, dst, vec_enc);
5436   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
5437   // accessing the lookup table.
5438   vpsrlw(xtmp3, src, 4, vec_enc);
5439   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
5440   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
5441   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
5442   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
5443   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
5444   vpaddb(dst, dst, xtmp2, vec_enc);
5445   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
5446 }
5447 
5448 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5449                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5450   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5451   // Add zero counts of lower byte and upper byte of a word if
5452   // upper byte holds a zero value.
5453   vpsrlw(xtmp3, src, 8, vec_enc);
5454   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
5455   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
5456   vpsllw(xtmp2, dst, 8, vec_enc);
5457   vpaddw(xtmp2, xtmp2, dst, vec_enc);
5458   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5459   vpsrlw(dst, dst, 8, vec_enc);
5460 }
5461 
5462 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5463                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // Since the IEEE 754 floating point format represents the mantissa in
  // normalized 1.x form, the biased exponent of the float conversion can be
  // used to compute the leading zero count:
  //     LZCNT = 31 - (biased_exp - 127)
  // The code below computes (biased_exp - 127) + 1 and subtracts it from 32,
  // which is equivalent. Special handling is needed for zero, MAX_INT and
  // negative source values.
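  //
  // Illustrative scalar C++ sketch of the same idea, ignoring the special
  // cases handled below (not part of the generated code):
  //
  //   #include <cstring>
  //   static inline int clz_via_float(int32_t x) {
  //     // assumes 0 < x and x small enough not to round up to 2^31
  //     float f = (float) x;
  //     uint32_t bits;
  //     memcpy(&bits, &f, sizeof(bits));
  //     uint32_t biased_exp = (bits >> 23) & 0xFF;
  //     return 31 - (int)(biased_exp - 127);
  //   }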
5469 
5470   // Broadcast 0xFF
5471   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
5472   vpsrld(xtmp1, xtmp1, 24, vec_enc);
5473 
5474   // Extract biased exponent.
5475   vcvtdq2ps(dst, src, vec_enc);
5476   vpsrld(dst, dst, 23, vec_enc);
5477   vpand(dst, dst, xtmp1, vec_enc);
5478 
5479   // Broadcast 127.
5480   vpsrld(xtmp1, xtmp1, 1, vec_enc);
5481   // Exponent = biased_exp - 127
5482   vpsubd(dst, dst, xtmp1, vec_enc);
5483 
  // Exponent = Exponent + 1 (127 >> 6 rematerializes the constant 1).
5485   vpsrld(xtmp3, xtmp1, 6, vec_enc);
5486   vpaddd(dst, dst, xtmp3, vec_enc);
5487 
  // Replace a negative exponent with zero; the exponent is negative when the
  // corresponding source lane holds a zero value.
5490   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5491   vblendvps(dst, dst, xtmp2, dst, vec_enc);
5492 
5493   // Rematerialize broadcast 32.
5494   vpslld(xtmp1, xtmp3, 5, vec_enc);
5495   // Exponent is 32 if corresponding source lane contains max_int value.
5496   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
5497   // LZCNT = 32 - exponent
5498   vpsubd(dst, xtmp1, dst, vec_enc);
5499 
5500   // Replace LZCNT with a value 1 if corresponding source lane
5501   // contains max_int value.
5502   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
5503 
  // Replace the count with 0 if the source lane value is less than zero,
  // since a negative value has no leading zeros.
5505   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5506   vblendvps(dst, dst, xtmp2, src, vec_enc);
5507 }
5508 
5509 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5510                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5511   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5512   // Add zero counts of lower word and upper word of a double word if
5513   // upper word holds a zero value.
5514   vpsrld(xtmp3, src, 16, vec_enc);
5515   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
5516   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
5517   vpslld(xtmp2, dst, 16, vec_enc);
5518   vpaddd(xtmp2, xtmp2, dst, vec_enc);
5519   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5520   vpsrld(dst, dst, 16, vec_enc);
5521   // Add zero counts of lower doubleword and upper doubleword of a
5522   // quadword if upper doubleword holds a zero value.
5523   vpsrlq(xtmp3, src, 32, vec_enc);
5524   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
5525   vpsllq(xtmp2, dst, 32, vec_enc);
5526   vpaddq(xtmp2, xtmp2, dst, vec_enc);
5527   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5528   vpsrlq(dst, dst, 32, vec_enc);
5529 }
5530 
5531 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
5532                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5533                                                        Register rtmp, int vec_enc) {
5534   assert(is_integral_type(bt), "unexpected type");
5535   assert(vec_enc < Assembler::AVX_512bit, "");
5536   switch(bt) {
5537     case T_LONG:
5538       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5539       break;
5540     case T_INT:
5541       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
5542       break;
5543     case T_SHORT:
5544       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5545       break;
5546     case T_BYTE:
5547       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5548       break;
5549     default:
5550       fatal("Unsupported type %s", type2name(bt));
5551       break;
5552   }
5553 }
5554 
5555 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
5556   switch(bt) {
5557     case T_BYTE:
5558       vpsubb(dst, src1, src2, vec_enc);
5559       break;
5560     case T_SHORT:
5561       vpsubw(dst, src1, src2, vec_enc);
5562       break;
5563     case T_INT:
5564       vpsubd(dst, src1, src2, vec_enc);
5565       break;
5566     case T_LONG:
5567       vpsubq(dst, src1, src2, vec_enc);
5568       break;
5569     default:
5570       fatal("Unsupported type %s", type2name(bt));
5571       break;
5572   }
5573 }
5574 
// Trailing zero count computation is based on the leading zero count operation,
// as per the following equation. All AVX3 targets support the AVX512CD feature,
// which offers a direct vector instruction to compute the leading zero count.
//      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
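//
// Illustrative worked example of the identity, not part of the generated code:
// (x - 1) & ~x isolates exactly the trailing zero bits of x, so its leading
// zero count is WIDTH - CTZ(x). With an 8-bit x = 0b00101000:
//   x - 1      = 0b00100111
//   ~x         = 0b11010111
//   (x-1) & ~x = 0b00000111   -> CLZ = 5, CTZ = 8 - 5 = 3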
5579 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5580                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5581                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
5582   assert(is_integral_type(bt), "");
  // xtmp4 = -1
  vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
  // xtmp4 = xtmp4 + src = src - 1
  vpadd(bt, xtmp4, xtmp4, src, vec_enc);
  // xtmp4 = xtmp4 & ~src = (src - 1) & ~src
  vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
5589   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
5590   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
5591   vpsub(bt, dst, xtmp4, dst, vec_enc);
5592 }
5593 
// Trailing zero count computation for AVX2 targets is based on the popcount
// operation, as per the following equation:
//      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
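//
// Illustrative worked example, not part of the generated code: x | -x sets
// every bit from the lowest set bit of x up to the most significant bit, so
// its popcount is WIDTH - CTZ(x). With an 8-bit x = 0b00101000:
//   -x     = 0b11011000
//   x | -x = 0b11111000   -> POPC = 5, CTZ = 8 - 5 = 3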
5596 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5597                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5598   assert(is_integral_type(bt), "");
  // xtmp3 = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
  // xtmp3 = 0 - src
  vpsub(bt, xtmp3, xtmp3, src, vec_enc);
  // xtmp3 = xtmp3 | src = -src | src
  vpor(xtmp3, xtmp3, src, vec_enc);
5605   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
5606   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
5607   vpsub(bt, dst, xtmp1, dst, vec_enc);
5608 }
5609 
5610 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
5611   Label done;
5612   Label neg_divisor_fastpath;
5613   cmpl(divisor, 0);
5614   jccb(Assembler::less, neg_divisor_fastpath);
5615   xorl(rdx, rdx);
5616   divl(divisor);
5617   jmpb(done);
5618   bind(neg_divisor_fastpath);
5619   // Fastpath for divisor < 0:
5620   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
5621   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
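  //
  // Illustrative note, not part of the generated code: when the divisor has its
  // sign bit set its unsigned value is >= 2^31, so the unsigned quotient can
  // only be 0 or 1, and it is 1 exactly when dividend >= divisor (unsigned).
  // Scalar C++ sketch of the expression above:
  //
  //   static inline uint32_t udiv_by_msb_set(uint32_t dividend, uint32_t divisor) {
  //     // assumes divisor >= 0x80000000u
  //     return (dividend & ~(dividend - divisor)) >> 31;
  //   }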
5622   movl(rdx, rax);
5623   subl(rdx, divisor);
5624   if (VM_Version::supports_bmi1()) {
5625     andnl(rax, rdx, rax);
5626   } else {
5627     notl(rdx);
5628     andl(rax, rdx);
5629   }
5630   shrl(rax, 31);
5631   bind(done);
5632 }
5633 
5634 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
5635   Label done;
5636   Label neg_divisor_fastpath;
5637   cmpl(divisor, 0);
5638   jccb(Assembler::less, neg_divisor_fastpath);
5639   xorl(rdx, rdx);
5640   divl(divisor);
5641   jmpb(done);
5642   bind(neg_divisor_fastpath);
5643   // Fastpath when divisor < 0:
5644   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
5645   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
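  //
  // Illustrative note, not part of the generated code: the unsigned quotient is
  // 0 or 1 here (see udivI above), so remainder = dividend - (quotient ? divisor : 0).
  // The arithmetic shift by 31 below turns the quotient bit into an all-ones or
  // all-zeros mask that conditionally selects the divisor.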
5646   movl(rdx, rax);
5647   subl(rax, divisor);
5648   if (VM_Version::supports_bmi1()) {
5649     andnl(rax, rax, rdx);
5650   } else {
5651     notl(rax);
5652     andl(rax, rdx);
5653   }
5654   sarl(rax, 31);
5655   andl(rax, divisor);
5656   subl(rdx, rax);
5657   bind(done);
5658 }
5659 
5660 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
5661   Label done;
5662   Label neg_divisor_fastpath;
5663 
5664   cmpl(divisor, 0);
5665   jccb(Assembler::less, neg_divisor_fastpath);
5666   xorl(rdx, rdx);
5667   divl(divisor);
5668   jmpb(done);
5669   bind(neg_divisor_fastpath);
5670   // Fastpath for divisor < 0:
5671   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
5672   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
5673   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
5674   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
5675   movl(rdx, rax);
5676   subl(rax, divisor);
5677   if (VM_Version::supports_bmi1()) {
5678     andnl(rax, rax, rdx);
5679   } else {
5680     notl(rax);
5681     andl(rax, rdx);
5682   }
5683   movl(tmp, rax);
5684   shrl(rax, 31); // quotient
5685   sarl(tmp, 31);
5686   andl(tmp, divisor);
5687   subl(rdx, tmp); // remainder
5688   bind(done);
5689 }
5690 
5691 #ifdef _LP64
5692 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
5693                                  XMMRegister xtmp2, Register rtmp) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, based on the following algorithm:
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5697     mov64(rtmp, 0x8040201008040201L);
5698     movq(xtmp1, src);
5699     movq(xtmp2, rtmp);
5700     gf2p8affineqb(xtmp1, xtmp2, 0);
5701     movq(dst, xtmp1);
5702   } else {
5703     // Swap even and odd numbered bits.
5704     movl(rtmp, src);
5705     andl(rtmp, 0x55555555);
5706     shll(rtmp, 1);
5707     movl(dst, src);
5708     andl(dst, 0xAAAAAAAA);
5709     shrl(dst, 1);
5710     orl(dst, rtmp);
5711 
5712     // Swap LSB and MSB 2 bits of each nibble.
5713     movl(rtmp, dst);
5714     andl(rtmp, 0x33333333);
5715     shll(rtmp, 2);
5716     andl(dst, 0xCCCCCCCC);
5717     shrl(dst, 2);
5718     orl(dst, rtmp);
5719 
5720     // Swap LSB and MSB 4 bits of each byte.
5721     movl(rtmp, dst);
5722     andl(rtmp, 0x0F0F0F0F);
5723     shll(rtmp, 4);
5724     andl(dst, 0xF0F0F0F0);
5725     shrl(dst, 4);
5726     orl(dst, rtmp);
5727   }
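  // The steps above reverse the bit order within each byte; the byte swap below
  // then reverses the byte order, completing the 32-bit bit reversal.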
5728   bswapl(dst);
5729 }
5730 
5731 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
5732                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, based on the following algorithm:
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5736     mov64(rtmp1, 0x8040201008040201L);
5737     movq(xtmp1, src);
5738     movq(xtmp2, rtmp1);
5739     gf2p8affineqb(xtmp1, xtmp2, 0);
5740     movq(dst, xtmp1);
5741   } else {
5742     // Swap even and odd numbered bits.
5743     movq(rtmp1, src);
5744     mov64(rtmp2, 0x5555555555555555L);
5745     andq(rtmp1, rtmp2);
5746     shlq(rtmp1, 1);
5747     movq(dst, src);
5748     notq(rtmp2);
5749     andq(dst, rtmp2);
5750     shrq(dst, 1);
5751     orq(dst, rtmp1);
5752 
5753     // Swap LSB and MSB 2 bits of each nibble.
5754     movq(rtmp1, dst);
5755     mov64(rtmp2, 0x3333333333333333L);
5756     andq(rtmp1, rtmp2);
5757     shlq(rtmp1, 2);
5758     notq(rtmp2);
5759     andq(dst, rtmp2);
5760     shrq(dst, 2);
5761     orq(dst, rtmp1);
5762 
5763     // Swap LSB and MSB 4 bits of each byte.
5764     movq(rtmp1, dst);
5765     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
5766     andq(rtmp1, rtmp2);
5767     shlq(rtmp1, 4);
5768     notq(rtmp2);
5769     andq(dst, rtmp2);
5770     shrq(dst, 4);
5771     orq(dst, rtmp1);
5772   }
5773   bswapq(dst);
5774 }
5775 
5776 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
5777   Label done;
5778   Label neg_divisor_fastpath;
5779   cmpq(divisor, 0);
5780   jccb(Assembler::less, neg_divisor_fastpath);
5781   xorl(rdx, rdx);
5782   divq(divisor);
5783   jmpb(done);
5784   bind(neg_divisor_fastpath);
5785   // Fastpath for divisor < 0:
5786   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
5787   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
5788   movq(rdx, rax);
5789   subq(rdx, divisor);
5790   if (VM_Version::supports_bmi1()) {
5791     andnq(rax, rdx, rax);
5792   } else {
5793     notq(rdx);
5794     andq(rax, rdx);
5795   }
5796   shrq(rax, 63);
5797   bind(done);
5798 }
5799 
5800 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
5801   Label done;
5802   Label neg_divisor_fastpath;
5803   cmpq(divisor, 0);
5804   jccb(Assembler::less, neg_divisor_fastpath);
5805   xorq(rdx, rdx);
5806   divq(divisor);
5807   jmp(done);
5808   bind(neg_divisor_fastpath);
5809   // Fastpath when divisor < 0:
5810   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
5811   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
5812   movq(rdx, rax);
5813   subq(rax, divisor);
5814   if (VM_Version::supports_bmi1()) {
5815     andnq(rax, rax, rdx);
5816   } else {
5817     notq(rax);
5818     andq(rax, rdx);
5819   }
5820   sarq(rax, 63);
5821   andq(rax, divisor);
5822   subq(rdx, rax);
5823   bind(done);
5824 }
5825 
5826 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
5827   Label done;
5828   Label neg_divisor_fastpath;
5829   cmpq(divisor, 0);
5830   jccb(Assembler::less, neg_divisor_fastpath);
5831   xorq(rdx, rdx);
5832   divq(divisor);
5833   jmp(done);
5834   bind(neg_divisor_fastpath);
5835   // Fastpath for divisor < 0:
5836   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
5837   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
5838   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
5839   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
5840   movq(rdx, rax);
5841   subq(rax, divisor);
5842   if (VM_Version::supports_bmi1()) {
5843     andnq(rax, rax, rdx);
5844   } else {
5845     notq(rax);
5846     andq(rax, rdx);
5847   }
5848   movq(tmp, rax);
5849   shrq(rax, 63); // quotient
5850   sarq(tmp, 63);
5851   andq(tmp, divisor);
5852   subq(rdx, tmp); // remainder
5853   bind(done);
5854 }
#endif // _LP64
5856 
5857 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
5858                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
5859                                         int vlen_enc) {
5860   assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and the shuffle index is taken from
  // the lower 4 bits of each shuffle lane, so all shuffle indices are
  // effectively normalized to the range 0-15. As a result, shuffle indices that
  // differ only by a multiple of 16 (e.g. 0, 16, 32 and 48) select the same
  // relative byte position within a 128 bit lane; the masked passes below
  // disambiguate them by broadcasting one source 128 bit lane at a time.
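  //
  // Net effect, as an illustrative scalar sketch (not part of the generated
  // code), assuming every shuffle index is already in the range 0..63 for a
  // 64 byte vector:
  //
  //   for (int j = 0; j < 64; j++) {
  //     dst[j] = src[shuffle[j]];
  //   }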
5867   movl(rtmp, 16);
5868   evpbroadcastb(xtmp1, rtmp, vlen_enc);
5869 
  // Compute a mask for the shuffle vector by comparing indices with the
  // expression INDEX < 16. Broadcast the first 128 bit lane across the entire
  // vector, shuffle the broadcast lanes using the original shuffle indices,
  // and move the shuffled bytes corresponding to a true mask into the
  // destination vector.
5874   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
5875   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
5876   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
5877 
  // Perform the above steps with the lane comparison expression
  // INDEX >= 16 && INDEX < 32, broadcasting the second 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
5881   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
5882   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
5883   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
5884   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
5885 
  // Perform the above steps with the lane comparison expression
  // INDEX >= 32 && INDEX < 48, broadcasting the third 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
  vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
5891   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
5892   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
5893 
  // Perform the above steps with the lane comparison expression
  // INDEX >= 48 && INDEX < 64, broadcasting the fourth 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
5899   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
5900   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
5901 }
5902