1 /*
   2  * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "gc/shared/barrierSet.hpp"
  29 #include "gc/shared/barrierSetAssembler.hpp"
  30 #include "oops/methodData.hpp"
  31 #include "opto/c2_MacroAssembler.hpp"
  32 #include "opto/intrinsicnode.hpp"
  33 #include "opto/output.hpp"
  34 #include "opto/opcodes.hpp"
  35 #include "opto/subnode.hpp"
  36 #include "runtime/objectMonitor.hpp"
  37 #include "runtime/stubRoutines.hpp"
  38 
  39 #ifdef PRODUCT
  40 #define BLOCK_COMMENT(str) /* nothing */
  41 #define STOP(error) stop(error)
  42 #else
  43 #define BLOCK_COMMENT(str) block_comment(str)
  44 #define STOP(error) block_comment(error); stop(error)
  45 #endif
  46 
  47 // C2 compiled method's prolog code.
  48 void C2_MacroAssembler::verified_entry(Compile* C, int sp_inc) {
  49   if (C->clinit_barrier_on_entry()) {
  50     assert(VM_Version::supports_fast_class_init_checks(), "sanity");
  51     assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started");
  52 
  53     Label L_skip_barrier;
  54     Register klass = rscratch1;
  55 
  56     mov_metadata(klass, C->method()->holder()->constant_encoding());
  57     clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
  58 
  59     jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
  60 
  61     bind(L_skip_barrier);
  62   }
  63 
  64   int framesize = C->output()->frame_size_in_bytes();
  65   int bangsize = C->output()->bang_size_in_bytes();
  66   bool fp_mode_24b = false;
  67   int stack_bang_size = C->output()->need_stack_bang(bangsize) ? bangsize : 0;
  68 
  69   // WARNING: Initial instruction MUST be 5 bytes or longer so that
  70   // NativeJump::patch_verified_entry will be able to patch out the entry
  71   // code safely. The push to verify stack depth is ok at 5 bytes;
  72   // the frame allocation can be either 3 or 6 bytes. So if we don't do a
  73   // stack bang then we must use the 6 byte frame allocation even if
  74   // we have no frame. :-(
  75   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  76 
  77   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  78   // Remove word for return addr
  79   framesize -= wordSize;
  80   stack_bang_size -= wordSize;
  81 
  82   // Calls to C2R adapters often do not accept exceptional returns.
  83   // We require that their callers bang for them.  Be careful, though: some
  84   // VM calls (such as call site linkage) can use several kilobytes of
  85   // stack, but the stack safety zone should account for that.
  86   // See bugs 4446381, 4468289, 4497237.
  87   if (stack_bang_size > 0) {
  88     generate_stack_overflow_check(stack_bang_size);
  89 
  90     // We always push rbp so that on return to the interpreter rbp will be
  91     // restored correctly and we can correct the stack.
  92     push(rbp);
  93     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  94     if (PreserveFramePointer) {
  95       mov(rbp, rsp);
  96     }
  97     // Remove word for ebp
  98     framesize -= wordSize;
  99 
 100     // Create frame
 101     if (framesize) {
 102       subptr(rsp, framesize);
 103     }
 104   } else {
 105     // Create frame (force generation of a 4 byte immediate value)
 106     subptr_imm32(rsp, framesize);
 107 
 108     // Save RBP register now.
 109     framesize -= wordSize;
 110     movptr(Address(rsp, framesize), rbp);
 111     // Save caller's stack pointer into RBP if the frame pointer is preserved.
 112     if (PreserveFramePointer) {
 113       movptr(rbp, rsp);
 114       if (framesize > 0) {
 115         addptr(rbp, framesize);
 116       }
 117     }
 118   }
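       // In either case the caller's rbp is now saved immediately below the
       // return address and rsp points at the bottom of the newly allocated frame.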
 119 
 120   if (C->needs_stack_repair()) {
 121     // Save stack increment just below the saved rbp (also account for fixed framesize and rbp)
 122     assert((sp_inc & (StackAlignmentInBytes-1)) == 0, "stack increment not aligned");
 123     movptr(Address(rsp, framesize - wordSize), sp_inc + framesize + wordSize);
 124   }
 125 
 126   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
 127     framesize -= wordSize;
 128     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 129   }
 130 
 131 #ifndef _LP64
 132   // If method sets FPU control word do it now
 133   if (fp_mode_24b) {
 134     fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
 135   }
 136   if (UseSSE >= 2 && VerifyFPU) {
 137     verify_FPU(0, "FPU stack must be clean on entry");
 138   }
 139 #endif
 140 
 141 #ifdef ASSERT
 142   if (VerifyStackAtCalls) {
 143     Label L;
 144     push(rax);
 145     mov(rax, rsp);
 146     andptr(rax, StackAlignmentInBytes-1);
 147     cmpptr(rax, StackAlignmentInBytes-wordSize);
 148     pop(rax);
 149     jcc(Assembler::equal, L);
 150     STOP("Stack is not properly aligned!");
 151     bind(L);
 152   }
 153 #endif
 154 }
 155 
 156 void C2_MacroAssembler::entry_barrier() {
 157   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 158 #ifdef _LP64
 159   if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
 160     // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 161     Label dummy_slow_path;
 162     Label dummy_continuation;
 163     Label* slow_path = &dummy_slow_path;
 164     Label* continuation = &dummy_continuation;
 165     if (!Compile::current()->output()->in_scratch_emit_size()) {
 166       // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 167       C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 168       Compile::current()->output()->add_stub(stub);
 169       slow_path = &stub->entry();
 170       continuation = &stub->continuation();
 171     }
 172     bs->nmethod_entry_barrier(this, slow_path, continuation);
 173   }
 174 #else
 175   // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
 176   bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
 177 #endif
 178 }
 179 
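     // Map a vector length in bytes to its AVX encoding. Vectors shorter than
     // 16 bytes still use the 128-bit encoding, since x86 has no narrower SIMD
     // encoding.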
 180 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 181   switch (vlen_in_bytes) {
 182     case  4: // fall-through
 183     case  8: // fall-through
 184     case 16: return Assembler::AVX_128bit;
 185     case 32: return Assembler::AVX_256bit;
 186     case 64: return Assembler::AVX_512bit;
 187 
 188     default: {
 189       ShouldNotReachHere();
 190       return Assembler::AVX_NoVec;
 191     }
 192   }
 193 }
 194 
 195 #if INCLUDE_RTM_OPT
 196 
 197 // Update rtm_counters based on abort status
 198 // input: abort_status
 199 //        rtm_counters (RTMLockingCounters*)
 200 // flags are killed
 201 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
 202 
 203   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
 204   if (PrintPreciseRTMLockingStatistics) {
 205     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
 206       Label check_abort;
 207       testl(abort_status, (1<<i));
 208       jccb(Assembler::equal, check_abort);
 209       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
 210       bind(check_abort);
 211     }
 212   }
 213 }
 214 
 215 // Branch if (random & (count-1) != 0), count is 2^n
 216 // tmp, scr and flags are killed
 217 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
 218   assert(tmp == rax, "");
 219   assert(scr == rdx, "");
 220   rdtsc(); // modifies EDX:EAX
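       // The low-order TSC bits serve as a cheap pseudo-random value: the branch
       // below is taken roughly (count-1)/count of the time, so the fall-through
       // (the counter update at the call site) runs about once every count calls.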
 221   andptr(tmp, count-1);
 222   jccb(Assembler::notZero, brLabel);
 223 }
 224 
 225 // Perform abort ratio calculation, set no_rtm bit if high ratio
 226 // input:  rtm_counters_Reg (RTMLockingCounters* address)
 227 // tmpReg, rtm_counters_Reg and flags are killed
 228 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
 229                                                     Register rtm_counters_Reg,
 230                                                     RTMLockingCounters* rtm_counters,
 231                                                     Metadata* method_data) {
 232   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
 233 
 234   if (RTMLockingCalculationDelay > 0) {
 235     // Delay calculation
 236     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()));
 237     testptr(tmpReg, tmpReg);
 238     jccb(Assembler::equal, L_done);
 239   }
 240   // Abort ratio calculation only if abort_count > RTMAbortThreshold
 241   //   Aborted transactions = abort_count * 100
 242   //   All transactions = total_count *  RTMTotalCountIncrRate
 243   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
 244 
 245   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
 246   cmpptr(tmpReg, RTMAbortThreshold);
 247   jccb(Assembler::below, L_check_always_rtm2);
 248   imulptr(tmpReg, tmpReg, 100);
 249 
 250   Register scrReg = rtm_counters_Reg;
 251   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 252   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
 253   imulptr(scrReg, scrReg, RTMAbortRatio);
 254   cmpptr(tmpReg, scrReg);
 255   jccb(Assembler::below, L_check_always_rtm1);
 256   if (method_data != nullptr) {
 257     // set rtm_state to "no rtm" in MDO
 258     mov_metadata(tmpReg, method_data);
 259     lock();
 260     orl(Address(tmpReg, MethodData::rtm_state_offset()), NoRTM);
 261   }
 262   jmpb(L_done);
 263   bind(L_check_always_rtm1);
 264   // Reload RTMLockingCounters* address
 265   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 266   bind(L_check_always_rtm2);
 267   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 268   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
 269   jccb(Assembler::below, L_done);
 270   if (method_data != nullptr) {
 271     // set rtm_state to "always rtm" in MDO
 272     mov_metadata(tmpReg, method_data);
 273     lock();
 274     orl(Address(tmpReg, MethodData::rtm_state_offset()), UseRTM);
 275   }
 276   bind(L_done);
 277 }
 278 
 279 // Update counters and perform abort ratio calculation
 280 // input:  abort_status_Reg
 281 // rtm_counters_Reg, flags are killed
 282 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
 283                                       Register rtm_counters_Reg,
 284                                       RTMLockingCounters* rtm_counters,
 285                                       Metadata* method_data,
 286                                       bool profile_rtm) {
 287 
 288   assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 289   // update rtm counters based on rax value at abort
 290   // reads abort_status_Reg, updates flags
 291   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 292   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
 293   if (profile_rtm) {
 294     // Save abort status because abort_status_Reg is used by following code.
 295     if (RTMRetryCount > 0) {
 296       push(abort_status_Reg);
 297     }
 298     assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 299     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
 300     // restore abort status
 301     if (RTMRetryCount > 0) {
 302       pop(abort_status_Reg);
 303     }
 304   }
 305 }
 306 
 307 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
 308 // inputs: retry_count_Reg
 309 //       : abort_status_Reg
 310 // output: retry_count_Reg decremented by 1
 311 // flags are killed
 312 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
 313   Label doneRetry;
 314   assert(abort_status_Reg == rax, "");
 315   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
 316   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
 317   // if reason is in 0x6 and retry count != 0 then retry
 318   andptr(abort_status_Reg, 0x6);
 319   jccb(Assembler::zero, doneRetry);
 320   testl(retry_count_Reg, retry_count_Reg);
 321   jccb(Assembler::zero, doneRetry);
 322   pause();
 323   decrementl(retry_count_Reg);
 324   jmp(retryLabel);
 325   bind(doneRetry);
 326 }
 327 
 328 // Spin and retry if the lock is busy.
 329 // inputs: box_Reg (monitor address)
 330 //       : retry_count_Reg
 331 // output: retry_count_Reg decremented by 1
 332 //       : clear z flag if retry count exceeded
 333 // tmp_Reg, scr_Reg, flags are killed
 334 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
 335                                                Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
 336   Label SpinLoop, SpinExit, doneRetry;
 337   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 338 
 339   testl(retry_count_Reg, retry_count_Reg);
 340   jccb(Assembler::zero, doneRetry);
 341   decrementl(retry_count_Reg);
 342   movptr(scr_Reg, RTMSpinLoopCount);
 343 
 344   bind(SpinLoop);
 345   pause();
 346   decrementl(scr_Reg);
 347   jccb(Assembler::lessEqual, SpinExit);
 348   movptr(tmp_Reg, Address(box_Reg, owner_offset));
 349   testptr(tmp_Reg, tmp_Reg);
 350   jccb(Assembler::notZero, SpinLoop);
 351 
 352   bind(SpinExit);
 353   jmp(retryLabel);
 354   bind(doneRetry);
 355   incrementl(retry_count_Reg); // clear z flag
 356 }
 357 
 358 // Use RTM for normal stack locks
 359 // Input: objReg (object to lock)
 360 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
 361                                          Register retry_on_abort_count_Reg,
 362                                          RTMLockingCounters* stack_rtm_counters,
 363                                          Metadata* method_data, bool profile_rtm,
 364                                          Label& DONE_LABEL, Label& IsInflated) {
 365   assert(UseRTMForStackLocks, "why call this otherwise?");
 366   assert(tmpReg == rax, "");
 367   assert(scrReg == rdx, "");
 368   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 369 
 370   if (RTMRetryCount > 0) {
 371     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 372     bind(L_rtm_retry);
 373   }
 374   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 375   testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral
 376   jcc(Assembler::notZero, IsInflated);
 377 
 378   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 379     Label L_noincrement;
 380     if (RTMTotalCountIncrRate > 1) {
 381       // tmpReg, scrReg and flags are killed
 382       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 383     }
 384     assert(stack_rtm_counters != nullptr, "should not be null when profiling RTM");
 385     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
 386     bind(L_noincrement);
 387   }
 388   xbegin(L_on_abort);
 389   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
 390   andptr(tmpReg, markWord::lock_mask_in_place);     // look at 2 lock bits
 391   cmpptr(tmpReg, markWord::unlocked_value);         // bits = 01 unlocked
 392   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
 393 
 394   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 395   if (UseRTMXendForLockBusy) {
 396     xend();
 397     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
 398     jmp(L_decrement_retry);
 399   }
 400   else {
 401     xabort(0);
 402   }
 403   bind(L_on_abort);
 404   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 405     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
 406   }
 407   bind(L_decrement_retry);
 408   if (RTMRetryCount > 0) {
 409     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 410     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 411   }
 412 }
 413 
 414 // Use RTM for inflated locks
 415 // inputs: objReg (object to lock)
 416 //         boxReg (on-stack box address (displaced header location) - KILLED)
 417 //         tmpReg (ObjectMonitor address + markWord::monitor_value)
 418 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
 419                                             Register scrReg, Register retry_on_busy_count_Reg,
 420                                             Register retry_on_abort_count_Reg,
 421                                             RTMLockingCounters* rtm_counters,
 422                                             Metadata* method_data, bool profile_rtm,
 423                                             Label& DONE_LABEL) {
 424   assert(UseRTMLocking, "why call this otherwise?");
 425   assert(tmpReg == rax, "");
 426   assert(scrReg == rdx, "");
 427   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 428   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 429 
 430   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 431   movptr(boxReg, tmpReg); // Save ObjectMonitor address
 432 
 433   if (RTMRetryCount > 0) {
 434     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
 435     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 436     bind(L_rtm_retry);
 437   }
 438   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 439     Label L_noincrement;
 440     if (RTMTotalCountIncrRate > 1) {
 441       // tmpReg, scrReg and flags are killed
 442       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 443     }
 444     assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 445     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
 446     bind(L_noincrement);
 447   }
 448   xbegin(L_on_abort);
 449   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 450   movptr(tmpReg, Address(tmpReg, owner_offset));
 451   testptr(tmpReg, tmpReg);
 452   jcc(Assembler::zero, DONE_LABEL);
 453   if (UseRTMXendForLockBusy) {
 454     xend();
 455     jmp(L_decrement_retry);
 456   }
 457   else {
 458     xabort(0);
 459   }
 460   bind(L_on_abort);
 461   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 462   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 463     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
 464   }
 465   if (RTMRetryCount > 0) {
 466     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 467     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 468   }
 469 
 470   movptr(tmpReg, Address(boxReg, owner_offset));
 471   testptr(tmpReg, tmpReg);
 472   jccb(Assembler::notZero, L_decrement_retry);
 473 
 474   // Appears unlocked - try to swing _owner from null to non-null.
 475   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 476 #ifdef _LP64
 477   Register threadReg = r15_thread;
 478 #else
 479   get_thread(scrReg);
 480   Register threadReg = scrReg;
 481 #endif
 482   lock();
 483   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
 484 
 485   if (RTMRetryCount > 0) {
 486     // Success: done.  Otherwise retry.
 487     jccb(Assembler::equal, DONE_LABEL);
 488     bind(L_decrement_retry);
 489     // Spin and retry if lock is busy.
 490     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
 491   }
 492   else {
 493     bind(L_decrement_retry);
 494   }
 495 }
 496 
 497 #endif //  INCLUDE_RTM_OPT
 498 
 499 // fast_lock and fast_unlock used by C2
 500 
 501 // Because the transitions from emitted code to the runtime
 502 // monitorenter/exit helper stubs are so slow it's critical that
 503 // we inline both the stack-locking fast path and the inflated fast path.
 504 //
 505 // See also: cmpFastLock and cmpFastUnlock.
 506 //
 507 // What follows is a specialized inline transliteration of the code
 508 // in enter() and exit(). If we're concerned about I$ bloat another
 509 // option would be to emit TrySlowEnter and TrySlowExit methods
 510 // at startup-time.  These methods would accept arguments as
 511 // (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 512 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 513 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 514 // In practice, however, the # of lock sites is bounded and is usually small.
 515 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
 516 // if the processor uses simple bimodal branch predictors keyed by EIP,
 517 // since the helper routines would be called from multiple synchronization
 518 // sites.
 519 //
 520 // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
 521 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 522 // to those specialized methods.  That'd give us a mostly platform-independent
 523 // implementation that the JITs could optimize and inline at their pleasure.
 524 // Done correctly, the only time we'd need to cross into native code would be
 525 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 526 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 527 // (b) issue explicit barriers or fence operations.
 528 //
 529 // TODO:
 530 //
 531 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 532 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 533 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 534 //    the lock operators would typically be faster than reifying Self.
 535 //
 536 // *  Ideally I'd define the primitives as:
 537 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 538 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 539 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
 540 //    Instead, we're stuck with the rather awkward and brittle register assignments below.
 541 //    Furthermore the register assignments are overconstrained, possibly resulting in
 542 //    sub-optimal code near the synchronization site.
 543 //
 544 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 545 //    Alternately, use a better sp-proximity test.
 546 //
 547 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 548 //    Either one is sufficient to uniquely identify a thread.
 549 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 550 //
 551 // *  Intrinsify notify() and notifyAll() for the common cases where the
 552 //    object is locked by the calling thread but the waitlist is empty.
 553 //    This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 554 //
 555 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 556 //    But beware of excessive branch density on AMD Opterons.
 557 //
 558 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 559 //    or failure of the fast path.  If the fast path fails then we pass
 560 //    control to the slow path, typically in C.  In fast_lock and
 561 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 562 //    will emit a conditional branch immediately after the node.
 563 //    So we have branches to branches and lots of ICC.ZF games.
 564 //    Instead, it might be better to have C2 pass a "FailureLabel"
 565 //    into fast_lock and fast_unlock.  In the case of success, control
 566 //    will drop through the node.  ICC.ZF is undefined at exit.
 567 //    In the case of failure, the node will branch directly to the
 568 //    FailureLabel.
 569 
 570 
 571 // obj: object to lock
 572 // box: on-stack box address (displaced header location) - KILLED
 573 // rax: tmp -- KILLED
 574 // scr: tmp -- KILLED
 575 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 576                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
 577                                  RTMLockingCounters* rtm_counters,
 578                                  RTMLockingCounters* stack_rtm_counters,
 579                                  Metadata* method_data,
 580                                  bool use_rtm, bool profile_rtm) {
 581   // Ensure the register assignments are disjoint
 582   assert(tmpReg == rax, "");
 583 
 584   if (use_rtm) {
 585     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
 586   } else {
 587     assert(cx1Reg == noreg, "");
 588     assert(cx2Reg == noreg, "");
 589     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 590   }
 591 
 592   // Possible cases that we'll encounter in fast_lock
 593   // ------------------------------------------------
 594   // * Inflated
 595   //    -- unlocked
 596   //    -- Locked
 597   //       = by self
 598   //       = by other
 599   // * neutral
 600   // * stack-locked
 601   //    -- by self
 602   //       = sp-proximity test hits
 603   //       = sp-proximity test generates false-negative
 604   //    -- by other
 605   //
 606 
 607   Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;
 608 
 609   if (DiagnoseSyncOnValueBasedClasses != 0) {
 610     load_klass(tmpReg, objReg, scrReg);
 611     movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
 612     testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
 613     jcc(Assembler::notZero, DONE_LABEL);
 614   }
 615 
 616 #if INCLUDE_RTM_OPT
 617   if (UseRTMForStackLocks && use_rtm) {
 618     assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
 619     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
 620                       stack_rtm_counters, method_data, profile_rtm,
 621                       DONE_LABEL, IsInflated);
 622   }
 623 #endif // INCLUDE_RTM_OPT
 624 
 625   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 626   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 627   jcc(Assembler::notZero, IsInflated);
 628 
 629   if (LockingMode == LM_MONITOR) {
 630     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 631     testptr(objReg, objReg);
 632   } else if (LockingMode == LM_LEGACY) {
 633     // Attempt stack-locking ...
 634     orptr (tmpReg, markWord::unlocked_value);
 635     if (EnableValhalla) {
 636       // Mask inline_type bit such that we go to the slow path if object is an inline type
 637       andptr(tmpReg, ~((int) markWord::inline_type_bit_in_place));
 638     }
 639     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 640     lock();
 641     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 642     jcc(Assembler::equal, COUNT);           // Success
 643 
 644     // Recursive locking.
 645     // The object is stack-locked: markword contains stack pointer to BasicLock.
 646     // Locked by current thread if difference with current SP is less than one page.
 647     subptr(tmpReg, rsp);
 648     // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
 649     andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
 650     movptr(Address(boxReg, 0), tmpReg);
 651   } else {
 652     assert(LockingMode == LM_LIGHTWEIGHT, "");
 653     fast_lock_impl(objReg, tmpReg, thread, scrReg, NO_COUNT);
 654     jmp(COUNT);
 655   }
 656   jmp(DONE_LABEL);
 657 
 658   bind(IsInflated);
 659   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
 660 
 661 #if INCLUDE_RTM_OPT
 662   // Use the same RTM locking code in 32- and 64-bit VM.
 663   if (use_rtm) {
 664     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
 665                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
 666   } else {
 667 #endif // INCLUDE_RTM_OPT
 668 
 669 #ifndef _LP64
 670   // The object is inflated.
 671 
 672   // boxReg refers to the on-stack BasicLock in the current frame.
 673   // We'd like to write:
 674   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
 675   // This is convenient but results in an ST-before-CAS penalty.  The following CAS suffers
 676   // additional latency as we have another ST in the store buffer that must drain.
 677 
 678   // avoid ST-before-CAS
 679   // register juggle because we need tmpReg for cmpxchgptr below
 680   movptr(scrReg, boxReg);
 681   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 682 
 683   // Optimistic form: consider XORL tmpReg,tmpReg
 684   movptr(tmpReg, NULL_WORD);
 685 
 686   // Appears unlocked - try to swing _owner from null to non-null.
 687   // Ideally, I'd manifest "Self" with get_thread and then attempt
 688   // to CAS the register containing Self into m->Owner.
 689   // But we don't have enough registers, so instead we can either try to CAS
 690   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 691   // we later store "Self" into m->Owner.  Transiently storing a stack address
 692   // (rsp or the address of the box) into m->owner is harmless.
 693   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 694   lock();
 695   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 696   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 697   // If we weren't able to swing _owner from null to the BasicLock
 698   // then take the slow path.
 699   jccb  (Assembler::notZero, NO_COUNT);
 700   // update _owner from BasicLock to thread
 701   get_thread (scrReg);                    // beware: clobbers ICCs
 702   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
 703   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
 704 
 705   // If the CAS fails we can either retry or pass control to the slow path.
 706   // We use the latter tactic.
 707   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 708   // If the CAS was successful ...
 709   //   Self has acquired the lock
 710   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 711   // Intentional fall-through into DONE_LABEL ...
 712 #else // _LP64
 713   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 714   movq(scrReg, tmpReg);
 715   xorq(tmpReg, tmpReg);
 716   lock();
 717   cmpxchgptr(thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 718   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 719   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 720   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 721   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 722   jccb(Assembler::equal, COUNT);          // CAS above succeeded; propagate ZF = 1 (success)
 723 
 724   cmpptr(thread, rax);                // Check if we are already the owner (recursive lock)
 725   jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
 726   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 727   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 728 #endif // _LP64
 729 #if INCLUDE_RTM_OPT
 730   } // use_rtm()
 731 #endif
 732   bind(DONE_LABEL);
 733 
 734   // ZFlag == 1 count in fast path
 735   // ZFlag == 0 count in slow path
 736   jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0
 737 
 738   bind(COUNT);
 739   // Count monitors in fast path
 740   increment(Address(thread, JavaThread::held_monitor_count_offset()));
 741 
 742   xorl(tmpReg, tmpReg); // Set ZF == 1
 743 
 744   bind(NO_COUNT);
 745 
 746   // At NO_COUNT the icc ZFlag is set as follows ...
 747   // fast_unlock uses the same protocol.
 748   // ZFlag == 1 -> Success
 749   // ZFlag == 0 -> Failure - force control through the slow path
 750 }
 751 
 752 // obj: object to unlock
 753 // box: box address (displaced header location), killed.  Must be EAX.
 754 // tmp: killed, cannot be obj nor box.
 755 //
 756 // Some commentary on balanced locking:
 757 //
 758 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 759 // Methods that don't have provably balanced locking are forced to run in the
 760 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 761 // The interpreter provides two properties:
 762 // I1:  At return-time the interpreter automatically and quietly unlocks any
 763 //      objects acquired by the current activation (frame).  Recall that the
 764 //      interpreter maintains an on-stack list of locks currently held by
 765 //      a frame.
 766 // I2:  If a method attempts to unlock an object that is not held by the
 767 //      frame, the interpreter throws IMSX.
 768 //
 769 // Let's say A(), which has provably balanced locking, acquires O and then calls B().
 770 // B() doesn't have provably balanced locking so it runs in the interpreter.
 771 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 772 // is still locked by A().
 773 //
 774 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 775 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 776 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 777 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
 778 // Arguably given that the spec legislates the JNI case as undefined our implementation
 779 // could reasonably *avoid* checking owner in fast_unlock().
 780 // In the interest of performance we elide the m->Owner==Self check in unlock.
 781 // A perfectly viable alternative is to elide the owner check except when
 782 // Xcheck:jni is enabled.
 783 
 784 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
 785   assert(boxReg == rax, "");
 786   assert_different_registers(objReg, boxReg, tmpReg);
 787 
 788   Label DONE_LABEL, Stacked, COUNT, NO_COUNT;
 789 
 790 #if INCLUDE_RTM_OPT
 791   if (UseRTMForStackLocks && use_rtm) {
 792     assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
 793     Label L_regular_unlock;
 794     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
 795     andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
 796     cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
 797     jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
 798     xend();                                                           // otherwise end...
 799     jmp(DONE_LABEL);                                                  // ... and we're done
 800     bind(L_regular_unlock);
 801   }
 802 #endif
 803 
 804   if (LockingMode == LM_LEGACY) {
 805     cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
 806     jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
 807   }
 808   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
 809   if (LockingMode != LM_MONITOR) {
 810     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 811     jcc(Assembler::zero, Stacked);
 812   }
 813 
 814   // It's inflated.
 815   if (LockingMode == LM_LIGHTWEIGHT) {
 816     // If the owner is ANONYMOUS, we need to fix it - in an out-of-line stub.
 817     testb(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t) ObjectMonitor::ANONYMOUS_OWNER);
 818 #ifdef _LP64
 819     if (!Compile::current()->output()->in_scratch_emit_size()) {
 820       C2HandleAnonOMOwnerStub* stub = new (Compile::current()->comp_arena()) C2HandleAnonOMOwnerStub(tmpReg, boxReg);
 821       Compile::current()->output()->add_stub(stub);
 822       jcc(Assembler::notEqual, stub->entry());
 823       bind(stub->continuation());
 824     } else
 825 #endif
 826     {
 827       // We can't easily implement this optimization on 32 bit because we don't have a thread register.
 828       // Call the slow-path instead.
 829       jcc(Assembler::notEqual, NO_COUNT);
 830     }
 831   }
 832 
 833 #if INCLUDE_RTM_OPT
 834   if (use_rtm) {
 835     Label L_regular_inflated_unlock;
 836     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 837     movptr(boxReg, Address(tmpReg, owner_offset));
 838     testptr(boxReg, boxReg);
 839     jccb(Assembler::notZero, L_regular_inflated_unlock);
 840     xend();
 841     jmp(DONE_LABEL);
 842     bind(L_regular_inflated_unlock);
 843   }
 844 #endif
 845 
 846   // Despite our balanced locking property we still check that m->_owner == Self
 847   // as java routines or native JNI code called by this thread might
 848   // have released the lock.
 849   // Refer to the comments in synchronizer.cpp for how we might encode extra
 850   // state in _succ so we can avoid fetching EntryList|cxq.
 851   //
 852   // If there's no contention try a 1-0 exit.  That is, exit without
 853   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 854   // we detect and recover from the race that the 1-0 exit admits.
 855   //
 856   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 857   // before it STs null into _owner, releasing the lock.  Updates
 858   // to data protected by the critical section must be visible before
 859   // we drop the lock (and thus before any other thread could acquire
 860   // the lock and observe the fields protected by the lock).
 861   // IA32's memory model is TSO, so STs are ordered with respect to
 862   // each other and there's no need for an explicit barrier (fence).
 863   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
 864 #ifndef _LP64
 865   // Note that we could employ various encoding schemes to reduce
 866   // the number of loads below (currently 4) to just 2 or 3.
 867   // Refer to the comments in synchronizer.cpp.
 868   // In practice the chain of fetches doesn't seem to impact performance, however.
 869   xorptr(boxReg, boxReg);
 870   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 871   jccb  (Assembler::notZero, DONE_LABEL);
 872   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 873   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 874   jccb  (Assembler::notZero, DONE_LABEL);
 875   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 876   jmpb  (DONE_LABEL);
 877 #else // _LP64
 878   // It's inflated
 879   Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath;
 880 
 881   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 882   jccb(Assembler::equal, LNotRecursive);
 883 
 884   // Recursive inflated unlock
 885   decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 886   jmpb(LSuccess);
 887 
 888   bind(LNotRecursive);
 889   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 890   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 891   jccb  (Assembler::notZero, CheckSucc);
 892   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 893   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 894   jmpb  (DONE_LABEL);
 895 
 896   // Try to avoid passing control into the slow_path ...
 897   bind  (CheckSucc);
 898 
 899   // The following optional optimization can be elided if necessary
 900   // Effectively: if (succ == null) goto slow path
 901   // The code reduces the window for a race, however,
 902   // and thus benefits performance.
 903   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 904   jccb  (Assembler::zero, LGoSlowPath);
 905 
 906   xorptr(boxReg, boxReg);
 907   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 908   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 909 
 910   // Memory barrier/fence
 911   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
 912   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
 913   // This is faster on Nehalem and AMD Shanghai/Barcelona.
 914   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
 915   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
 916   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
 917   lock(); addl(Address(rsp, 0), 0);
 918 
 919   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 920   jccb  (Assembler::notZero, LSuccess);
 921 
 922   // Rare inopportune interleaving - race.
 923   // The successor vanished in the small window above.
 924   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
 925   // We need to ensure progress and succession.
 926   // Try to reacquire the lock.
 927   // If that fails then the new owner is responsible for succession and this
 928   // thread needs to take no further action and can exit via the fast path (success).
 929   // If the re-acquire succeeds then pass control into the slow path.
 930   // As implemented, this latter mode is horrible because we generate more
 931   // coherence traffic on the lock *and* artificially extend the critical section
 932   // length by virtue of passing control into the slow path.
 933 
 934   // box is really RAX -- the following CMPXCHG depends on that binding
 935   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
 936   lock();
 937   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 938   // There's no successor so we tried to regrab the lock.
 939   // If that didn't work, then another thread grabbed the
 940   // lock so we're done (and exit was a success).
 941   jccb  (Assembler::notEqual, LSuccess);
 942   // Intentional fall-through into slow path
 943 
 944   bind  (LGoSlowPath);
 945   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 946   jmpb  (DONE_LABEL);
 947 
 948   bind  (LSuccess);
 949   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 950   jmpb  (DONE_LABEL);
 951 
 952 #endif
 953   if (LockingMode != LM_MONITOR) {
 954     bind  (Stacked);
 955     if (LockingMode == LM_LIGHTWEIGHT) {
 956       mov(boxReg, tmpReg);
 957       fast_unlock_impl(objReg, boxReg, tmpReg, NO_COUNT);
 958       jmp(COUNT);
 959     } else if (LockingMode == LM_LEGACY) {
 960       movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 961       lock();
 962       cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 963     }
 964     // Intentional fall-thru into DONE_LABEL
 965   }
 966   bind(DONE_LABEL);
 967 
 968   // ZFlag == 1 count in fast path
 969   // ZFlag == 0 count in slow path
 970   jccb(Assembler::notZero, NO_COUNT);
 971 
 972   bind(COUNT);
 973   // Count monitors in fast path
 974 #ifndef _LP64
 975   get_thread(tmpReg);
 976   decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
 977 #else // _LP64
 978   decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 979 #endif
 980 
 981   xorl(tmpReg, tmpReg); // Set ZF == 1
 982 
 983   bind(NO_COUNT);
 984 }
 985 
 986 //-------------------------------------------------------------------------------------------
 987 // Generic instruction support for C2 code generation from .ad files
 988 
 989 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 990   if (dst != src) {
 991     movdqu(dst, src);
 992   }
 993   if (opcode == Op_AbsVD) {
 994     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 995   } else {
 996     assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
 997     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 998   }
 999 }
1000 
1001 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
1002   if (opcode == Op_AbsVD) {
1003     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
1004   } else {
1005     assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
1006     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
1007   }
1008 }
1009 
1010 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
1011   if (dst != src) {
1012     movdqu(dst, src);
1013   }
1014   if (opcode == Op_AbsVF) {
1015     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
1016   } else {
1017     assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
1018     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1019   }
1020 }
1021 
1022 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
1023   if (opcode == Op_AbsVF) {
1024     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
1025   } else {
1026     assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
1027     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
1028   }
1029 }
1030 
1031 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
1032   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1033   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
1034 
1035   if (opcode == Op_MinV) {
1036     if (elem_bt == T_BYTE) {
1037       pminsb(dst, src);
1038     } else if (elem_bt == T_SHORT) {
1039       pminsw(dst, src);
1040     } else if (elem_bt == T_INT) {
1041       pminsd(dst, src);
1042     } else {
1043       assert(elem_bt == T_LONG, "required");
1044       assert(tmp == xmm0, "required");
1045       assert_different_registers(dst, src, tmp);
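           // SSE has no pminsq; synthesize the 64-bit min with a signed compare
           // (pcmpgtq) and blendvpd, which uses xmm0 as its implicit mask.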
1046       movdqu(xmm0, dst);
1047       pcmpgtq(xmm0, src);
1048       blendvpd(dst, src);  // xmm0 as mask
1049     }
1050   } else { // opcode == Op_MaxV
1051     if (elem_bt == T_BYTE) {
1052       pmaxsb(dst, src);
1053     } else if (elem_bt == T_SHORT) {
1054       pmaxsw(dst, src);
1055     } else if (elem_bt == T_INT) {
1056       pmaxsd(dst, src);
1057     } else {
1058       assert(elem_bt == T_LONG, "required");
1059       assert(tmp == xmm0, "required");
1060       assert_different_registers(dst, src, tmp);
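           // Same compare-and-blend idiom as the min case, with the operands of
           // the compare swapped so the larger element is selected.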
1061       movdqu(xmm0, src);
1062       pcmpgtq(xmm0, dst);
1063       blendvpd(dst, src);  // xmm0 as mask
1064     }
1065   }
1066 }
1067 
1068 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
1069                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
1070                                  int vlen_enc) {
1071   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1072 
1073   if (opcode == Op_MinV) {
1074     if (elem_bt == T_BYTE) {
1075       vpminsb(dst, src1, src2, vlen_enc);
1076     } else if (elem_bt == T_SHORT) {
1077       vpminsw(dst, src1, src2, vlen_enc);
1078     } else if (elem_bt == T_INT) {
1079       vpminsd(dst, src1, src2, vlen_enc);
1080     } else {
1081       assert(elem_bt == T_LONG, "required");
1082       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1083         vpminsq(dst, src1, src2, vlen_enc);
1084       } else {
1085         assert_different_registers(dst, src1, src2);
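             // vpminsq needs AVX-512 (AVX-512VL for sub-512-bit vectors); otherwise
             // emulate the 64-bit min with a signed compare whose result (left in
             // dst) drives the blend.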
1086         vpcmpgtq(dst, src1, src2, vlen_enc);
1087         vblendvpd(dst, src1, src2, dst, vlen_enc);
1088       }
1089     }
1090   } else { // opcode == Op_MaxV
1091     if (elem_bt == T_BYTE) {
1092       vpmaxsb(dst, src1, src2, vlen_enc);
1093     } else if (elem_bt == T_SHORT) {
1094       vpmaxsw(dst, src1, src2, vlen_enc);
1095     } else if (elem_bt == T_INT) {
1096       vpmaxsd(dst, src1, src2, vlen_enc);
1097     } else {
1098       assert(elem_bt == T_LONG, "required");
1099       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1100         vpmaxsq(dst, src1, src2, vlen_enc);
1101       } else {
1102         assert_different_registers(dst, src1, src2);
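             // Same emulation as the min case; the blend operands are reversed so
             // the larger element is selected.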
1103         vpcmpgtq(dst, src1, src2, vlen_enc);
1104         vblendvpd(dst, src2, src1, dst, vlen_enc);
1105       }
1106     }
1107   }
1108 }
1109 
1110 // Float/Double min max
1111 
1112 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
1113                                    XMMRegister dst, XMMRegister a, XMMRegister b,
1114                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1115                                    int vlen_enc) {
1116   assert(UseAVX > 0, "required");
1117   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1118          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1119   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1120   assert_different_registers(a, b, tmp, atmp, btmp);
1121 
1122   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1123   bool is_double_word = is_double_word_type(elem_bt);
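       // The blend/min(or max)/blend sequences below implement Java Math.min/max
       // semantics: the first two blends order the inputs by sign bit so that
       // -0.0 compares below +0.0, and the final unordered compare propagates a
       // NaN input into the result.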
1124 
1125   if (!is_double_word && is_min) {
1126     vblendvps(atmp, a, b, a, vlen_enc);
1127     vblendvps(btmp, b, a, a, vlen_enc);
1128     vminps(tmp, atmp, btmp, vlen_enc);
1129     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1130     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
1131   } else if (!is_double_word && !is_min) {
1132     vblendvps(btmp, b, a, b, vlen_enc);
1133     vblendvps(atmp, a, b, b, vlen_enc);
1134     vmaxps(tmp, atmp, btmp, vlen_enc);
1135     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1136     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
1137   } else if (is_double_word && is_min) {
1138     vblendvpd(atmp, a, b, a, vlen_enc);
1139     vblendvpd(btmp, b, a, a, vlen_enc);
1140     vminpd(tmp, atmp, btmp, vlen_enc);
1141     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1142     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
1143   } else {
1144     assert(is_double_word && !is_min, "sanity");
1145     vblendvpd(btmp, b, a, b, vlen_enc);
1146     vblendvpd(atmp, a, b, b, vlen_enc);
1147     vmaxpd(tmp, atmp, btmp, vlen_enc);
1148     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1149     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
1150   }
1151 }
1152 
1153 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1154                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1155                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1156                                     int vlen_enc) {
1157   assert(UseAVX > 2, "required");
1158   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1159          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1160   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1161   assert_different_registers(dst, a, b, atmp, btmp);
1162 
1163   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1164   bool is_double_word = is_double_word_type(elem_bt);
1165   bool merge = true;
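       // AVX-512 variant of the same idea: evpmovd2m/evpmovq2m extracts the sign
       // bits into a mask so that -0.0 sorts below +0.0, and the final unordered
       // compare merges any NaN operand into the result.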
1166 
1167   if (!is_double_word && is_min) {
1168     evpmovd2m(ktmp, a, vlen_enc);
1169     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1170     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1171     vminps(dst, atmp, btmp, vlen_enc);
1172     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1173     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1174   } else if (!is_double_word && !is_min) {
1175     evpmovd2m(ktmp, b, vlen_enc);
1176     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1177     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1178     vmaxps(dst, atmp, btmp, vlen_enc);
1179     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1180     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1181   } else if (is_double_word && is_min) {
1182     evpmovq2m(ktmp, a, vlen_enc);
1183     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1184     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1185     vminpd(dst, atmp, btmp, vlen_enc);
1186     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1187     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1188   } else {
1189     assert(is_double_word && !is_min, "sanity");
1190     evpmovq2m(ktmp, b, vlen_enc);
1191     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1192     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1193     vmaxpd(dst, atmp, btmp, vlen_enc);
1194     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1195     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1196   }
1197 }
1198 
1199 // Float/Double signum
1200 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1201   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1202 
1203   Label DONE_LABEL;
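       // dst holds the argument and receives the result: return the argument
       // unchanged for +/-0.0 and NaN, otherwise load 1.0 and flip its sign if
       // the argument compared below zero.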
1204 
1205   if (opcode == Op_SignumF) {
1206     assert(UseSSE > 0, "required");
1207     ucomiss(dst, zero);
1208     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1209     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1210     movflt(dst, one);
1211     jcc(Assembler::above, DONE_LABEL);
1212     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1213   } else if (opcode == Op_SignumD) {
1214     assert(UseSSE > 1, "required");
1215     ucomisd(dst, zero);
1216     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1217     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1218     movdbl(dst, one);
1219     jcc(Assembler::above, DONE_LABEL);
1220     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1221   }
1222 
1223   bind(DONE_LABEL);
1224 }
1225 
1226 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1227   if (sign) {
1228     pmovsxbw(dst, src);
1229   } else {
1230     pmovzxbw(dst, src);
1231   }
1232 }
1233 
1234 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1235   if (sign) {
1236     vpmovsxbw(dst, src, vector_len);
1237   } else {
1238     vpmovzxbw(dst, src, vector_len);
1239   }
1240 }
1241 
1242 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1243   if (sign) {
1244     vpmovsxbd(dst, src, vector_len);
1245   } else {
1246     vpmovzxbd(dst, src, vector_len);
1247   }
1248 }
1249 
1250 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1251   if (sign) {
1252     vpmovsxwd(dst, src, vector_len);
1253   } else {
1254     vpmovzxwd(dst, src, vector_len);
1255   }
1256 }
1257 
1258 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1259                                      int shift, int vector_len) {
1260   if (opcode == Op_RotateLeftV) {
1261     if (etype == T_INT) {
1262       evprold(dst, src, shift, vector_len);
1263     } else {
1264       assert(etype == T_LONG, "expected type T_LONG");
1265       evprolq(dst, src, shift, vector_len);
1266     }
1267   } else {
1268     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1269     if (etype == T_INT) {
1270       evprord(dst, src, shift, vector_len);
1271     } else {
1272       assert(etype == T_LONG, "expected type T_LONG");
1273       evprorq(dst, src, shift, vector_len);
1274     }
1275   }
1276 }
1277 
1278 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1279                                      XMMRegister shift, int vector_len) {
1280   if (opcode == Op_RotateLeftV) {
1281     if (etype == T_INT) {
1282       evprolvd(dst, src, shift, vector_len);
1283     } else {
1284       assert(etype == T_LONG, "expected type T_LONG");
1285       evprolvq(dst, src, shift, vector_len);
1286     }
1287   } else {
1288     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1289     if (etype == T_INT) {
1290       evprorvd(dst, src, shift, vector_len);
1291     } else {
1292       assert(etype == T_LONG, "expected type T_LONG");
1293       evprorvq(dst, src, shift, vector_len);
1294     }
1295   }
1296 }
1297 
1298 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1299   if (opcode == Op_RShiftVI) {
1300     psrad(dst, shift);
1301   } else if (opcode == Op_LShiftVI) {
1302     pslld(dst, shift);
1303   } else {
1304     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1305     psrld(dst, shift);
1306   }
1307 }
1308 
1309 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1310   switch (opcode) {
1311     case Op_RShiftVI:  psrad(dst, shift); break;
1312     case Op_LShiftVI:  pslld(dst, shift); break;
1313     case Op_URShiftVI: psrld(dst, shift); break;
1314 
1315     default: assert(false, "%s", NodeClassNames[opcode]);
1316   }
1317 }
1318 
1319 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1320   if (opcode == Op_RShiftVI) {
1321     vpsrad(dst, nds, shift, vector_len);
1322   } else if (opcode == Op_LShiftVI) {
1323     vpslld(dst, nds, shift, vector_len);
1324   } else {
1325     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1326     vpsrld(dst, nds, shift, vector_len);
1327   }
1328 }
1329 
1330 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1331   switch (opcode) {
1332     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1333     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1334     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1335 
1336     default: assert(false, "%s", NodeClassNames[opcode]);
1337   }
1338 }
1339 
1340 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1341   switch (opcode) {
1342     case Op_RShiftVB:  // fall-through
1343     case Op_RShiftVS:  psraw(dst, shift); break;
1344 
1345     case Op_LShiftVB:  // fall-through
1346     case Op_LShiftVS:  psllw(dst, shift);   break;
1347 
1348     case Op_URShiftVS: // fall-through
1349     case Op_URShiftVB: psrlw(dst, shift);  break;
1350 
1351     default: assert(false, "%s", NodeClassNames[opcode]);
1352   }
1353 }
1354 
1355 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1356   switch (opcode) {
1357     case Op_RShiftVB:  // fall-through
1358     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1359 
1360     case Op_LShiftVB:  // fall-through
1361     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1362 
1363     case Op_URShiftVS: // fall-through
1364     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1365 
1366     default: assert(false, "%s", NodeClassNames[opcode]);
1367   }
1368 }
1369 
1370 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1371   switch (opcode) {
1372     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1373     case Op_LShiftVL:  psllq(dst, shift); break;
1374     case Op_URShiftVL: psrlq(dst, shift); break;
1375 
1376     default: assert(false, "%s", NodeClassNames[opcode]);
1377   }
1378 }
1379 
1380 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1381   if (opcode == Op_RShiftVL) {
1382     psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1383   } else if (opcode == Op_LShiftVL) {
1384     psllq(dst, shift);
1385   } else {
1386     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1387     psrlq(dst, shift);
1388   }
1389 }
1390 
1391 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1392   switch (opcode) {
1393     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1394     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1395     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1396 
1397     default: assert(false, "%s", NodeClassNames[opcode]);
1398   }
1399 }
1400 
1401 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1402   if (opcode == Op_RShiftVL) {
1403     evpsraq(dst, nds, shift, vector_len);
1404   } else if (opcode == Op_LShiftVL) {
1405     vpsllq(dst, nds, shift, vector_len);
1406   } else {
1407     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1408     vpsrlq(dst, nds, shift, vector_len);
1409   }
1410 }
1411 
1412 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1413   switch (opcode) {
1414     case Op_RShiftVB:  // fall-through
1415     case Op_RShiftVS:  // fall-through
1416     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1417 
1418     case Op_LShiftVB:  // fall-through
1419     case Op_LShiftVS:  // fall-through
1420     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1421 
1422     case Op_URShiftVB: // fall-through
1423     case Op_URShiftVS: // fall-through
1424     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1425 
1426     default: assert(false, "%s", NodeClassNames[opcode]);
1427   }
1428 }
1429 
1430 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1431   switch (opcode) {
1432     case Op_RShiftVB:  // fall-through
1433     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1434 
1435     case Op_LShiftVB:  // fall-through
1436     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1437 
1438     case Op_URShiftVB: // fall-through
1439     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1440 
1441     default: assert(false, "%s", NodeClassNames[opcode]);
1442   }
1443 }
1444 
1445 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1446   assert(UseAVX >= 2, "required");
1447   switch (opcode) {
1448     case Op_RShiftVL: {
1449       if (UseAVX > 2) {
1450         assert(tmp == xnoreg, "not used");
1451         if (!VM_Version::supports_avx512vl()) {
1452           vlen_enc = Assembler::AVX_512bit;
1453         }
1454         evpsravq(dst, src, shift, vlen_enc);
1455       } else {
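             // No variable 64-bit arithmetic right shift before AVX-512:
             // emulate it as ((x >>> s) ^ m) - m, where m is the sign-bit
             // mask shifted right by the same per-lane amounts.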
1456         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1457         vpsrlvq(dst, src, shift, vlen_enc);
1458         vpsrlvq(tmp, tmp, shift, vlen_enc);
1459         vpxor(dst, dst, tmp, vlen_enc);
1460         vpsubq(dst, dst, tmp, vlen_enc);
1461       }
1462       break;
1463     }
1464     case Op_LShiftVL: {
1465       assert(tmp == xnoreg, "not used");
1466       vpsllvq(dst, src, shift, vlen_enc);
1467       break;
1468     }
1469     case Op_URShiftVL: {
1470       assert(tmp == xnoreg, "not used");
1471       vpsrlvq(dst, src, shift, vlen_enc);
1472       break;
1473     }
1474     default: assert(false, "%s", NodeClassNames[opcode]);
1475   }
1476 }
1477 
1478 // Variable shift src by shift using vtmp as a TEMP, giving a word result in dst
1479 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1480   assert(opcode == Op_LShiftVB ||
1481          opcode == Op_RShiftVB ||
1482          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1483   bool sign = (opcode != Op_URShiftVB);
1484   assert(vector_len == 0, "required");
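       // Widen the byte elements and their shift counts to dwords, do the
       // variable dword shift, mask the results back to byte range, and pack
       // the dwords down to words.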
1485   vextendbd(sign, dst, src, 1);
1486   vpmovzxbd(vtmp, shift, 1);
1487   varshiftd(opcode, dst, dst, vtmp, 1);
1488   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1489   vextracti128_high(vtmp, dst);
1490   vpackusdw(dst, dst, vtmp, 0);
1491 }
1492 
1493 // Variable shift src by shift using vtmp as a TEMP, giving a byte result in dst
1494 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1495   assert(opcode == Op_LShiftVB ||
1496          opcode == Op_RShiftVB ||
1497          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1498   bool sign = (opcode != Op_URShiftVB);
1499   int ext_vector_len = vector_len + 1;
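       // Widen bytes to words at twice the vector length, do the variable word
       // shift, mask back to byte range, and pack down to bytes. The 256-bit
       // pack works per 128-bit lane, so the lane order is fixed up with vpermq.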
1500   vextendbw(sign, dst, src, ext_vector_len);
1501   vpmovzxbw(vtmp, shift, ext_vector_len);
1502   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1503   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1504   if (vector_len == 0) {
1505     vextracti128_high(vtmp, dst);
1506     vpackuswb(dst, dst, vtmp, vector_len);
1507   } else {
1508     vextracti64x4_high(vtmp, dst);
1509     vpackuswb(dst, dst, vtmp, vector_len);
1510     vpermq(dst, dst, 0xD8, vector_len);
1511   }
1512 }
1513 
1514 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1515   switch(typ) {
1516     case T_BYTE:
1517       pinsrb(dst, val, idx);
1518       break;
1519     case T_SHORT:
1520       pinsrw(dst, val, idx);
1521       break;
1522     case T_INT:
1523       pinsrd(dst, val, idx);
1524       break;
1525     case T_LONG:
1526       pinsrq(dst, val, idx);
1527       break;
1528     default:
1529       assert(false,"Should not reach here.");
1530       break;
1531   }
1532 }
1533 
1534 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1535   switch(typ) {
1536     case T_BYTE:
1537       vpinsrb(dst, src, val, idx);
1538       break;
1539     case T_SHORT:
1540       vpinsrw(dst, src, val, idx);
1541       break;
1542     case T_INT:
1543       vpinsrd(dst, src, val, idx);
1544       break;
1545     case T_LONG:
1546       vpinsrq(dst, src, val, idx);
1547       break;
1548     default:
1549       assert(false,"Should not reach here.");
1550       break;
1551   }
1552 }
1553 
1554 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1555   switch(typ) {
1556     case T_INT:
1557       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1558       break;
1559     case T_FLOAT:
1560       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1561       break;
1562     case T_LONG:
1563       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1564       break;
1565     case T_DOUBLE:
1566       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1567       break;
1568     default:
1569       assert(false,"Should not reach here.");
1570       break;
1571   }
1572 }
1573 
1574 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1575   switch(typ) {
1576     case T_INT:
1577       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1578       break;
1579     case T_FLOAT:
1580       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1581       break;
1582     case T_LONG:
1583       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1584       break;
1585     case T_DOUBLE:
1586       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1587       break;
1588     default:
1589       assert(false,"Should not reach here.");
1590       break;
1591   }
1592 }
1593 
1594 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1595   switch(typ) {
1596     case T_INT:
1597       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1598       break;
1599     case T_FLOAT:
1600       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1601       break;
1602     case T_LONG:
1603       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1604       break;
1605     case T_DOUBLE:
1606       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1607       break;
1608     default:
1609       assert(false,"Should not reach here.");
1610       break;
1611   }
1612 }
1613 
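     // Convert a vector of booleans (0/1 per byte) into a vector mask: negate
     // to turn each 1 into -1 (all bits set) and sign-extend every lane to the
     // element width.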
1614 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1615   if (vlen_in_bytes <= 16) {
1616     pxor (dst, dst);
1617     psubb(dst, src);
1618     switch (elem_bt) {
1619       case T_BYTE:   /* nothing to do */ break;
1620       case T_SHORT:  pmovsxbw(dst, dst); break;
1621       case T_INT:    pmovsxbd(dst, dst); break;
1622       case T_FLOAT:  pmovsxbd(dst, dst); break;
1623       case T_LONG:   pmovsxbq(dst, dst); break;
1624       case T_DOUBLE: pmovsxbq(dst, dst); break;
1625 
1626       default: assert(false, "%s", type2name(elem_bt));
1627     }
1628   } else {
1629     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1630     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1631 
1632     vpxor (dst, dst, dst, vlen_enc);
1633     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1634 
1635     switch (elem_bt) {
1636       case T_BYTE:   /* nothing to do */            break;
1637       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1638       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1639       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1640       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1641       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1642 
1643       default: assert(false, "%s", type2name(elem_bt));
1644     }
1645   }
1646 }
1647 
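     // Same conversion into an opmask register. Without AVX512 BW/DQ+VL
     // (novlbwdq) the booleans are widened to dwords and the mask is built by
     // comparing against the vector_int_mask_cmp_bits constant; otherwise the
     // bytes are negated to 0/-1 and evpmovb2m collects the sign bits.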
1648 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1649   if (novlbwdq) {
1650     vpmovsxbd(xtmp, src, vlen_enc);
1651     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1652             Assembler::eq, true, vlen_enc, noreg);
1653   } else {
1654     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1655     vpsubb(xtmp, xtmp, src, vlen_enc);
1656     evpmovb2m(dst, xtmp, vlen_enc);
1657   }
1658 }
1659 
1660 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) {
1661   switch (vlen_in_bytes) {
1662     case 4:  movdl(dst, src);   break;
1663     case 8:  movq(dst, src);    break;
1664     case 16: movdqu(dst, src);  break;
1665     case 32: vmovdqu(dst, src); break;
1666     case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1667     default: ShouldNotReachHere();
1668   }
1669 }
1670 
1671 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1672   assert(rscratch != noreg || always_reachable(src), "missing");
1673 
1674   if (reachable(src)) {
1675     load_vector(dst, as_Address(src), vlen_in_bytes);
1676   } else {
1677     lea(rscratch, src);
1678     load_vector(dst, Address(rscratch, 0), vlen_in_bytes);
1679   }
1680 }
1681 
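     // Replicate a constant loaded from memory across the vector, using the
     // widest broadcast form the current ISA level provides; before AVX fall
     // back to movddup, or movq plus punpcklqdq when SSE3 is unavailable.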
1682 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1683   int vlen_enc = vector_length_encoding(vlen);
1684   if (VM_Version::supports_avx()) {
1685     if (bt == T_LONG) {
1686       if (VM_Version::supports_avx2()) {
1687         vpbroadcastq(dst, src, vlen_enc);
1688       } else {
1689         vmovddup(dst, src, vlen_enc);
1690       }
1691     } else if (bt == T_DOUBLE) {
1692       if (vlen_enc != Assembler::AVX_128bit) {
1693         vbroadcastsd(dst, src, vlen_enc, noreg);
1694       } else {
1695         vmovddup(dst, src, vlen_enc);
1696       }
1697     } else {
1698       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1699         vpbroadcastd(dst, src, vlen_enc);
1700       } else {
1701         vbroadcastss(dst, src, vlen_enc);
1702       }
1703     }
1704   } else if (VM_Version::supports_sse3()) {
1705     movddup(dst, src);
1706   } else {
1707     movq(dst, src);
1708     if (vlen == 16) {
1709       punpcklqdq(dst, dst);
1710     }
1711   }
1712 }
1713 
1714 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1715   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
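       // E.g. T_INT: exact_log2(4) << 6 = 128; T_FLOAT: 128 + 128 = 256.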
1716   int offset = exact_log2(type2aelembytes(bt)) << 6;
1717   if (is_floating_point_type(bt)) {
1718     offset += 128;
1719   }
1720   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1721   load_vector(dst, addr, vlen_in_bytes);
1722 }
1723 
1724 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1725 
1726 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1727   int vector_len = Assembler::AVX_128bit;
1728 
1729   switch (opcode) {
1730     case Op_AndReductionV:  pand(dst, src); break;
1731     case Op_OrReductionV:   por (dst, src); break;
1732     case Op_XorReductionV:  pxor(dst, src); break;
1733     case Op_MinReductionV:
1734       switch (typ) {
1735         case T_BYTE:        pminsb(dst, src); break;
1736         case T_SHORT:       pminsw(dst, src); break;
1737         case T_INT:         pminsd(dst, src); break;
1738         case T_LONG:        assert(UseAVX > 2, "required");
1739                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1740         default:            assert(false, "wrong type");
1741       }
1742       break;
1743     case Op_MaxReductionV:
1744       switch (typ) {
1745         case T_BYTE:        pmaxsb(dst, src); break;
1746         case T_SHORT:       pmaxsw(dst, src); break;
1747         case T_INT:         pmaxsd(dst, src); break;
1748         case T_LONG:        assert(UseAVX > 2, "required");
1749                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1750         default:            assert(false, "wrong type");
1751       }
1752       break;
1753     case Op_AddReductionVF: addss(dst, src); break;
1754     case Op_AddReductionVD: addsd(dst, src); break;
1755     case Op_AddReductionVI:
1756       switch (typ) {
1757         case T_BYTE:        paddb(dst, src); break;
1758         case T_SHORT:       paddw(dst, src); break;
1759         case T_INT:         paddd(dst, src); break;
1760         default:            assert(false, "wrong type");
1761       }
1762       break;
1763     case Op_AddReductionVL: paddq(dst, src); break;
1764     case Op_MulReductionVF: mulss(dst, src); break;
1765     case Op_MulReductionVD: mulsd(dst, src); break;
1766     case Op_MulReductionVI:
1767       switch (typ) {
1768         case T_SHORT:       pmullw(dst, src); break;
1769         case T_INT:         pmulld(dst, src); break;
1770         default:            assert(false, "wrong type");
1771       }
1772       break;
1773     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1774                             evpmullq(dst, dst, src, vector_len); break;
1775     default:                assert(false, "wrong opcode");
1776   }
1777 }
1778 
1779 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1780   int vector_len = Assembler::AVX_256bit;
1781 
1782   switch (opcode) {
1783     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1784     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1785     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1786     case Op_MinReductionV:
1787       switch (typ) {
1788         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1789         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1790         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1791         case T_LONG:        assert(UseAVX > 2, "required");
1792                             vpminsq(dst, src1, src2, vector_len); break;
1793         default:            assert(false, "wrong type");
1794       }
1795       break;
1796     case Op_MaxReductionV:
1797       switch (typ) {
1798         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1799         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1800         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1801         case T_LONG:        assert(UseAVX > 2, "required");
1802                             vpmaxsq(dst, src1, src2, vector_len); break;
1803         default:            assert(false, "wrong type");
1804       }
1805       break;
1806     case Op_AddReductionVI:
1807       switch (typ) {
1808         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1809         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1810         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1811         default:            assert(false, "wrong type");
1812       }
1813       break;
1814     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1815     case Op_MulReductionVI:
1816       switch (typ) {
1817         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1818         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1819         default:            assert(false, "wrong type");
1820       }
1821       break;
1822     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
1823     default:                assert(false, "wrong opcode");
1824   }
1825 }
1826 
1827 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1828                                   XMMRegister dst, XMMRegister src,
1829                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1830   switch (opcode) {
1831     case Op_AddReductionVF:
1832     case Op_MulReductionVF:
1833       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1834       break;
1835 
1836     case Op_AddReductionVD:
1837     case Op_MulReductionVD:
1838       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1839       break;
1840 
1841     default: assert(false, "wrong opcode");
1842   }
1843 }
1844 
1845 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1846                              Register dst, Register src1, XMMRegister src2,
1847                              XMMRegister vtmp1, XMMRegister vtmp2) {
1848   switch (vlen) {
1849     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1850     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1851     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1852     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1853 
1854     default: assert(false, "wrong vector length");
1855   }
1856 }
1857 
1858 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1859                              Register dst, Register src1, XMMRegister src2,
1860                              XMMRegister vtmp1, XMMRegister vtmp2) {
1861   switch (vlen) {
1862     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1863     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1864     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1865     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1866 
1867     default: assert(false, "wrong vector length");
1868   }
1869 }
1870 
1871 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1872                              Register dst, Register src1, XMMRegister src2,
1873                              XMMRegister vtmp1, XMMRegister vtmp2) {
1874   switch (vlen) {
1875     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1876     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1877     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1878     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1879 
1880     default: assert(false, "wrong vector length");
1881   }
1882 }
1883 
1884 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1885                              Register dst, Register src1, XMMRegister src2,
1886                              XMMRegister vtmp1, XMMRegister vtmp2) {
1887   switch (vlen) {
1888     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1889     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1890     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1891     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1892 
1893     default: assert(false, "wrong vector length");
1894   }
1895 }
1896 
1897 #ifdef _LP64
1898 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1899                              Register dst, Register src1, XMMRegister src2,
1900                              XMMRegister vtmp1, XMMRegister vtmp2) {
1901   switch (vlen) {
1902     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1903     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1904     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1905 
1906     default: assert(false, "wrong vector length");
1907   }
1908 }
1909 #endif // _LP64
1910 
1911 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1912   switch (vlen) {
1913     case 2:
1914       assert(vtmp2 == xnoreg, "");
1915       reduce2F(opcode, dst, src, vtmp1);
1916       break;
1917     case 4:
1918       assert(vtmp2 == xnoreg, "");
1919       reduce4F(opcode, dst, src, vtmp1);
1920       break;
1921     case 8:
1922       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1923       break;
1924     case 16:
1925       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1926       break;
1927     default: assert(false, "wrong vector length");
1928   }
1929 }
1930 
1931 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1932   switch (vlen) {
1933     case 2:
1934       assert(vtmp2 == xnoreg, "");
1935       reduce2D(opcode, dst, src, vtmp1);
1936       break;
1937     case 4:
1938       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1939       break;
1940     case 8:
1941       reduce8D(opcode, dst, src, vtmp1, vtmp2);
1942       break;
1943     default: assert(false, "wrong vector length");
1944   }
1945 }
1946 
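     // The integer reduce helpers below repeatedly fold the vector down to a
     // single element (horizontal adds for Op_AddReductionVI, shuffle + op via
     // reduce_operation_128/256 otherwise), then combine that element with the
     // incoming scalar src1 and move the result to the GPR dst, sign-extending
     // sub-word types.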
1947 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1948   if (opcode == Op_AddReductionVI) {
1949     if (vtmp1 != src2) {
1950       movdqu(vtmp1, src2);
1951     }
1952     phaddd(vtmp1, vtmp1);
1953   } else {
1954     pshufd(vtmp1, src2, 0x1);
1955     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1956   }
1957   movdl(vtmp2, src1);
1958   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1959   movdl(dst, vtmp1);
1960 }
1961 
1962 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1963   if (opcode == Op_AddReductionVI) {
1964     if (vtmp1 != src2) {
1965       movdqu(vtmp1, src2);
1966     }
1967     phaddd(vtmp1, src2);
1968     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1969   } else {
1970     pshufd(vtmp2, src2, 0xE);
1971     reduce_operation_128(T_INT, opcode, vtmp2, src2);
1972     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1973   }
1974 }
1975 
1976 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1977   if (opcode == Op_AddReductionVI) {
1978     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
1979     vextracti128_high(vtmp2, vtmp1);
1980     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
1981     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1982   } else {
1983     vextracti128_high(vtmp1, src2);
1984     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1985     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1986   }
1987 }
1988 
1989 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1990   vextracti64x4_high(vtmp2, src2);
1991   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
1992   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1993 }
1994 
1995 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1996   pshufd(vtmp2, src2, 0x1);
1997   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1998   movdqu(vtmp1, vtmp2);
1999   psrldq(vtmp1, 2);
2000   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2001   movdqu(vtmp2, vtmp1);
2002   psrldq(vtmp2, 1);
2003   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2004   movdl(vtmp2, src1);
2005   pmovsxbd(vtmp1, vtmp1);
2006   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2007   pextrb(dst, vtmp1, 0x0);
2008   movsbl(dst, dst);
2009 }
2010 
2011 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2012   pshufd(vtmp1, src2, 0xE);
2013   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2014   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2015 }
2016 
2017 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2018   vextracti128_high(vtmp2, src2);
2019   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2020   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2021 }
2022 
2023 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2024   vextracti64x4_high(vtmp1, src2);
2025   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2026   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2027 }
2028 
2029 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2030   pmovsxbw(vtmp2, src2);
2031   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2032 }
2033 
2034 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2035   if (UseAVX > 1) {
2036     int vector_len = Assembler::AVX_256bit;
2037     vpmovsxbw(vtmp1, src2, vector_len);
2038     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2039   } else {
2040     pmovsxbw(vtmp2, src2);
2041     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2042     pshufd(vtmp2, src2, 0x1);
2043     pmovsxbw(vtmp2, src2);
2044     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2045   }
2046 }
2047 
2048 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2049   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2050     int vector_len = Assembler::AVX_512bit;
2051     vpmovsxbw(vtmp1, src2, vector_len);
2052     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2053   } else {
2054     assert(UseAVX >= 2,"Should not reach here.");
2055     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2056     vextracti128_high(vtmp2, src2);
2057     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2058   }
2059 }
2060 
2061 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2062   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2063   vextracti64x4_high(vtmp2, src2);
2064   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2065 }
2066 
2067 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2068   if (opcode == Op_AddReductionVI) {
2069     if (vtmp1 != src2) {
2070       movdqu(vtmp1, src2);
2071     }
2072     phaddw(vtmp1, vtmp1);
2073     phaddw(vtmp1, vtmp1);
2074   } else {
2075     pshufd(vtmp2, src2, 0x1);
2076     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2077     movdqu(vtmp1, vtmp2);
2078     psrldq(vtmp1, 2);
2079     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2080   }
2081   movdl(vtmp2, src1);
2082   pmovsxwd(vtmp1, vtmp1);
2083   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2084   pextrw(dst, vtmp1, 0x0);
2085   movswl(dst, dst);
2086 }
2087 
2088 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2089   if (opcode == Op_AddReductionVI) {
2090     if (vtmp1 != src2) {
2091       movdqu(vtmp1, src2);
2092     }
2093     phaddw(vtmp1, src2);
2094   } else {
2095     pshufd(vtmp1, src2, 0xE);
2096     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2097   }
2098   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2099 }
2100 
2101 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2102   if (opcode == Op_AddReductionVI) {
2103     int vector_len = Assembler::AVX_256bit;
2104     vphaddw(vtmp2, src2, src2, vector_len);
2105     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2106   } else {
2107     vextracti128_high(vtmp2, src2);
2108     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2109   }
2110   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2111 }
2112 
2113 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2114   int vector_len = Assembler::AVX_256bit;
2115   vextracti64x4_high(vtmp1, src2);
2116   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2117   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2118 }
2119 
2120 #ifdef _LP64
2121 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2122   pshufd(vtmp2, src2, 0xE);
2123   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2124   movdq(vtmp1, src1);
2125   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2126   movdq(dst, vtmp1);
2127 }
2128 
2129 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2130   vextracti128_high(vtmp1, src2);
2131   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2132   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2133 }
2134 
2135 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2136   vextracti64x4_high(vtmp2, src2);
2137   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2138   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2139 }
2140 
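     // Build an opmask with the low 'len' bits set: start from all-ones and
     // let bzhi clear every bit at index >= len.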
2141 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
2142   mov64(temp, -1L);
2143   bzhiq(temp, temp, len);
2144   kmovql(dst, temp);
2145 }
2146 #endif // _LP64
2147 
2148 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2149   reduce_operation_128(T_FLOAT, opcode, dst, src);
2150   pshufd(vtmp, src, 0x1);
2151   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2152 }
2153 
2154 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2155   reduce2F(opcode, dst, src, vtmp);
2156   pshufd(vtmp, src, 0x2);
2157   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2158   pshufd(vtmp, src, 0x3);
2159   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2160 }
2161 
2162 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2163   reduce4F(opcode, dst, src, vtmp2);
2164   vextractf128_high(vtmp2, src);
2165   reduce4F(opcode, dst, vtmp2, vtmp1);
2166 }
2167 
2168 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2169   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2170   vextracti64x4_high(vtmp1, src);
2171   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2172 }
2173 
2174 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2175   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2176   pshufd(vtmp, src, 0xE);
2177   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2178 }
2179 
2180 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2181   reduce2D(opcode, dst, src, vtmp2);
2182   vextractf128_high(vtmp2, src);
2183   reduce2D(opcode, dst, vtmp2, vtmp1);
2184 }
2185 
2186 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2187   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2188   vextracti64x4_high(vtmp1, src);
2189   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2190 }
2191 
2192 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2193   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2194 }
2195 
2196 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2197   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2198 }
2199 
2200 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2201                                  int vec_enc) {
2202   switch(elem_bt) {
2203     case T_INT:
2204     case T_FLOAT:
2205       vmaskmovps(dst, src, mask, vec_enc);
2206       break;
2207     case T_LONG:
2208     case T_DOUBLE:
2209       vmaskmovpd(dst, src, mask, vec_enc);
2210       break;
2211     default:
2212       fatal("Unsupported type %s", type2name(elem_bt));
2213       break;
2214   }
2215 }
2216 
2217 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2218                                  int vec_enc) {
2219   switch(elem_bt) {
2220     case T_INT:
2221     case T_FLOAT:
2222       vmaskmovps(dst, src, mask, vec_enc);
2223       break;
2224     case T_LONG:
2225     case T_DOUBLE:
2226       vmaskmovpd(dst, src, mask, vec_enc);
2227       break;
2228     default:
2229       fatal("Unsupported type %s", type2name(elem_bt));
2230       break;
2231   }
2232 }
2233 
2234 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2235                                           XMMRegister dst, XMMRegister src,
2236                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2237                                           XMMRegister xmm_0, XMMRegister xmm_1) {
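       // Successively halve the vector: extract the upper 256-bit half (i == 3)
       // or the upper 128-bit half (i == 2), or permute within a 128-bit lane
       // (i == 1, 0, using permconst), combining halves with vminmax_fp at each
       // step. If dst already holds a live accumulator (is_dst_valid), fold it
       // in with a final vminmax_fp.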
2238   const int permconst[] = {1, 14};
2239   XMMRegister wsrc = src;
2240   XMMRegister wdst = xmm_0;
2241   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2242 
2243   int vlen_enc = Assembler::AVX_128bit;
2244   if (vlen == 16) {
2245     vlen_enc = Assembler::AVX_256bit;
2246   }
2247 
2248   for (int i = log2(vlen) - 1; i >=0; i--) {
2249     if (i == 0 && !is_dst_valid) {
2250       wdst = dst;
2251     }
2252     if (i == 3) {
2253       vextracti64x4_high(wtmp, wsrc);
2254     } else if (i == 2) {
2255       vextracti128_high(wtmp, wsrc);
2256     } else { // i = [0,1]
2257       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2258     }
2259     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2260     wsrc = wdst;
2261     vlen_enc = Assembler::AVX_128bit;
2262   }
2263   if (is_dst_valid) {
2264     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2265   }
2266 }
2267 
2268 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2269                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2270                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2271   XMMRegister wsrc = src;
2272   XMMRegister wdst = xmm_0;
2273   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2274   int vlen_enc = Assembler::AVX_128bit;
2275   if (vlen == 8) {
2276     vlen_enc = Assembler::AVX_256bit;
2277   }
2278   for (int i = log2(vlen) - 1; i >=0; i--) {
2279     if (i == 0 && !is_dst_valid) {
2280       wdst = dst;
2281     }
2282     if (i == 1) {
2283       vextracti128_high(wtmp, wsrc);
2284     } else if (i == 2) {
2285       vextracti64x4_high(wtmp, wsrc);
2286     } else {
2287       assert(i == 0, "%d", i);
2288       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2289     }
2290     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2291     wsrc = wdst;
2292     vlen_enc = Assembler::AVX_128bit;
2293   }
2294   if (is_dst_valid) {
2295     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2296   }
2297 }
2298 
2299 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2300   switch (bt) {
2301     case T_BYTE:  pextrb(dst, src, idx); break;
2302     case T_SHORT: pextrw(dst, src, idx); break;
2303     case T_INT:   pextrd(dst, src, idx); break;
2304     case T_LONG:  pextrq(dst, src, idx); break;
2305 
2306     default:
2307       assert(false,"Should not reach here.");
2308       break;
2309   }
2310 }
2311 
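     // Return the register holding the 128-bit lane that contains 'elemindex',
     // extracting that lane into dst when it is not the lowest one.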
2312 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2313   int esize =  type2aelembytes(typ);
2314   int elem_per_lane = 16/esize;
2315   int lane = elemindex / elem_per_lane;
2316   int eindex = elemindex % elem_per_lane;
2317 
2318   if (lane >= 2) {
2319     assert(UseAVX > 2, "required");
2320     vextractf32x4(dst, src, lane & 3);
2321     return dst;
2322   } else if (lane > 0) {
2323     assert(UseAVX > 0, "required");
2324     vextractf128(dst, src, lane);
2325     return dst;
2326   } else {
2327     return src;
2328   }
2329 }
2330 
2331 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2332   if (typ == T_BYTE) {
2333     movsbl(dst, dst);
2334   } else if (typ == T_SHORT) {
2335     movswl(dst, dst);
2336   }
2337 }
2338 
2339 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2340   int esize =  type2aelembytes(typ);
2341   int elem_per_lane = 16/esize;
2342   int eindex = elemindex % elem_per_lane;
2343   assert(is_integral_type(typ),"required");
2344 
2345   if (eindex == 0) {
2346     if (typ == T_LONG) {
2347       movq(dst, src);
2348     } else {
2349       movdl(dst, src);
2350       movsxl(typ, dst);
2351     }
2352   } else {
2353     extract(typ, dst, src, eindex);
2354     movsxl(typ, dst);
2355   }
2356 }
2357 
2358 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2359   int esize =  type2aelembytes(typ);
2360   int elem_per_lane = 16/esize;
2361   int eindex = elemindex % elem_per_lane;
2362   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2363 
2364   if (eindex == 0) {
2365     movq(dst, src);
2366   } else {
2367     if (typ == T_FLOAT) {
2368       if (UseAVX == 0) {
2369         movdqu(dst, src);
2370         shufps(dst, dst, eindex);
2371       } else {
2372         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2373       }
2374     } else {
2375       if (UseAVX == 0) {
2376         movdqu(dst, src);
2377         psrldq(dst, eindex*esize);
2378       } else {
2379         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2380       }
2381       movq(dst, dst);
2382     }
2383   }
2384   // Zero upper bits
2385   if (typ == T_FLOAT) {
2386     if (UseAVX == 0) {
2387       assert(vtmp != xnoreg, "required.");
2388       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2389       pand(dst, vtmp);
2390     } else {
2391       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2392     }
2393   }
2394 }
2395 
2396 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2397   switch(typ) {
2398     case T_BYTE:
2399     case T_BOOLEAN:
2400       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2401       break;
2402     case T_SHORT:
2403     case T_CHAR:
2404       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2405       break;
2406     case T_INT:
2407     case T_FLOAT:
2408       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2409       break;
2410     case T_LONG:
2411     case T_DOUBLE:
2412       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2413       break;
2414     default:
2415       assert(false,"Should not reach here.");
2416       break;
2417   }
2418 }
2419 
2420 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2421   assert(rscratch != noreg || always_reachable(src2), "missing");
2422 
2423   switch(typ) {
2424     case T_BOOLEAN:
2425     case T_BYTE:
2426       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2427       break;
2428     case T_CHAR:
2429     case T_SHORT:
2430       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2431       break;
2432     case T_INT:
2433     case T_FLOAT:
2434       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2435       break;
2436     case T_LONG:
2437     case T_DOUBLE:
2438       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2439       break;
2440     default:
2441       assert(false,"Should not reach here.");
2442       break;
2443   }
2444 }
2445 
2446 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2447   switch(typ) {
2448     case T_BYTE:
2449       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2450       break;
2451     case T_SHORT:
2452       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2453       break;
2454     case T_INT:
2455     case T_FLOAT:
2456       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2457       break;
2458     case T_LONG:
2459     case T_DOUBLE:
2460       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2461       break;
2462     default:
2463       assert(false,"Should not reach here.");
2464       break;
2465   }
2466 }
2467 
2468 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2469   assert(vlen_in_bytes <= 32, "");
2470   int esize = type2aelembytes(bt);
2471   if (vlen_in_bytes == 32) {
2472     assert(vtmp == xnoreg, "required.");
2473     if (esize >= 4) {
2474       vtestps(src1, src2, AVX_256bit);
2475     } else {
2476       vptest(src1, src2, AVX_256bit);
2477     }
2478     return;
2479   }
2480   if (vlen_in_bytes < 16) {
2481     // Duplicate the lower part to fill the whole register;
2482     // no need to do so for src2.
2483     assert(vtmp != xnoreg, "required");
2484     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2485     pshufd(vtmp, src1, shuffle_imm);
2486   } else {
2487     assert(vtmp == xnoreg, "required");
2488     vtmp = src1;
2489   }
2490   if (esize >= 4 && VM_Version::supports_avx()) {
2491     vtestps(vtmp, src2, AVX_128bit);
2492   } else {
2493     ptest(vtmp, src2);
2494   }
2495 }
2496 
2497 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2498   assert(UseAVX >= 2, "required");
2499 #ifdef ASSERT
2500   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2501   bool is_bw_supported = VM_Version::supports_avx512bw();
2502   if (is_bw && !is_bw_supported) {
2503     assert(vlen_enc != Assembler::AVX_512bit, "required");
2504     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2505            "XMM register should be 0-15");
2506   }
2507 #endif // ASSERT
2508   switch (elem_bt) {
2509     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2510     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2511     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2512     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2513     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2514     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2515     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2516   }
2517 }
2518 
2519 #ifdef _LP64
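     // Broadcast a GPR value across a vector. With AVX-512 (plus BW/VL where
     // the element type and vector length require them) the evpbroadcast*
     // forms take the GPR directly; otherwise move the value into an XMM
     // register first and use the AVX/AVX2 broadcast forms, which are limited
     // to registers 0-15.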
2520 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2521   assert(UseAVX >= 2, "required");
2522   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2523   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2524   if ((UseAVX > 2) &&
2525       (!is_bw || VM_Version::supports_avx512bw()) &&
2526       (!is_vl || VM_Version::supports_avx512vl())) {
2527     switch (elem_bt) {
2528       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2529       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2530       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2531       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2532       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2533     }
2534   } else {
2535     assert(vlen_enc != Assembler::AVX_512bit, "required");
2536     assert((dst->encoding() < 16),"XMM register should be 0-15");
2537     switch (elem_bt) {
2538       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2539       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2540       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2541       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2542       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2543       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2544       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2545     }
2546   }
2547 }
2548 #endif
2549 
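     // Sign-extend byte lanes to the requested element type. The FP targets
     // widen to dwords first and then convert; for double the dword widen is
     // done at half the vector width, since vcvtdq2pd doubles the element size.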
2550 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2551   switch (to_elem_bt) {
2552     case T_SHORT:
2553       vpmovsxbw(dst, src, vlen_enc);
2554       break;
2555     case T_INT:
2556       vpmovsxbd(dst, src, vlen_enc);
2557       break;
2558     case T_FLOAT:
2559       vpmovsxbd(dst, src, vlen_enc);
2560       vcvtdq2ps(dst, dst, vlen_enc);
2561       break;
2562     case T_LONG:
2563       vpmovsxbq(dst, src, vlen_enc);
2564       break;
2565     case T_DOUBLE: {
2566       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2567       vpmovsxbd(dst, src, mid_vlen_enc);
2568       vcvtdq2pd(dst, dst, vlen_enc);
2569       break;
2570     }
2571     default:
2572       fatal("Unsupported type %s", type2name(to_elem_bt));
2573       break;
2574   }
2575 }
2576 
2577 //-------------------------------------------------------------------------------------------
2578 
2579 // IndexOf for constant substrings with size >= 8 chars
2580 // which don't need to be loaded through stack.
2581 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2582                                          Register cnt1, Register cnt2,
2583                                          int int_cnt2,  Register result,
2584                                          XMMRegister vec, Register tmp,
2585                                          int ae) {
2586   ShortBranchVerifier sbv(this);
2587   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2588   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2589 
2590   // This method uses the pcmpestri instruction with bound registers
2591   //   inputs:
2592   //     xmm - substring
2593   //     rax - substring length (elements count)
2594   //     mem - scanned string
2595   //     rdx - string length (elements count)
2596   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2597   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2598   //   outputs:
2599   //     rcx - matched index in string
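       //     flags - CF == 1 if a match candidate was found,
       //             OF == 1 if the match starts at the first element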
2600   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2601   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2602   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2603   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2604   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2605 
2606   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2607         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2608         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2609 
2610   // Note, inline_string_indexOf() generates checks:
2611   // if (substr.count > string.count) return -1;
2612   // if (substr.count == 0) return 0;
2613   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2614 
2615   // Load substring.
2616   if (ae == StrIntrinsicNode::UL) {
2617     pmovzxbw(vec, Address(str2, 0));
2618   } else {
2619     movdqu(vec, Address(str2, 0));
2620   }
2621   movl(cnt2, int_cnt2);
2622   movptr(result, str1); // string addr
2623 
2624   if (int_cnt2 > stride) {
2625     jmpb(SCAN_TO_SUBSTR);
2626 
2627     // Reload substr for rescan; this code
2628     // is executed only for large substrings (> 8 chars).
2629     bind(RELOAD_SUBSTR);
2630     if (ae == StrIntrinsicNode::UL) {
2631       pmovzxbw(vec, Address(str2, 0));
2632     } else {
2633       movdqu(vec, Address(str2, 0));
2634     }
2635     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2636 
2637     bind(RELOAD_STR);
2638     // We came here after the beginning of the substring was
2639     // matched but the rest of it was not, so we need to search
2640     // again. Start from the next element after the previous match.
2641 
2642     // cnt2 is the number of remaining substring elements and
2643     // cnt1 is the number of remaining string elements when the compare failed.
2644     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2645     subl(cnt1, cnt2);
2646     addl(cnt1, int_cnt2);
2647     movl(cnt2, int_cnt2); // Now restore cnt2
2648 
2649     decrementl(cnt1);     // Shift to next element
2650     cmpl(cnt1, cnt2);
2651     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2652 
2653     addptr(result, (1<<scale1));
2654 
2655   } // (int_cnt2 > 8)
2656 
2657   // Scan string for start of substr in 16-byte vectors
2658   bind(SCAN_TO_SUBSTR);
2659   pcmpestri(vec, Address(result, 0), mode);
2660   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2661   subl(cnt1, stride);
2662   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2663   cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2665   addptr(result, 16);
2666   jmpb(SCAN_TO_SUBSTR);
2667 
2668   // Found a potential substr
2669   bind(FOUND_CANDIDATE);
2670   // Matched whole vector if first element matched (tmp(rcx) == 0).
2671   if (int_cnt2 == stride) {
2672     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2673   } else { // int_cnt2 > 8
2674     jccb(Assembler::overflow, FOUND_SUBSTR);
2675   }
2676   // After pcmpestri tmp(rcx) contains matched element index
2677   // Compute start addr of substr
2678   lea(result, Address(result, tmp, scale1));
2679 
2680   // Make sure string is still long enough
2681   subl(cnt1, tmp);
2682   cmpl(cnt1, cnt2);
2683   if (int_cnt2 == stride) {
2684     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2685   } else { // int_cnt2 > 8
2686     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2687   }
  // Left less than substring.
2689 
2690   bind(RET_NOT_FOUND);
2691   movl(result, -1);
2692   jmp(EXIT);
2693 
2694   if (int_cnt2 > stride) {
2695     // This code is optimized for the case when whole substring
2696     // is matched if its head is matched.
2697     bind(MATCH_SUBSTR_HEAD);
2698     pcmpestri(vec, Address(result, 0), mode);
    // Reload only the string if it does not match
2700     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2701 
2702     Label CONT_SCAN_SUBSTR;
2703     // Compare the rest of substring (> 8 chars).
2704     bind(FOUND_SUBSTR);
2705     // First 8 chars are already matched.
2706     negptr(cnt2);
2707     addptr(cnt2, stride);
2708 
2709     bind(SCAN_SUBSTR);
2710     subl(cnt1, stride);
2711     cmpl(cnt2, -stride); // Do not read beyond substring
2712     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2713     // Back-up strings to avoid reading beyond substring:
2714     // cnt1 = cnt1 - cnt2 + 8
2715     addl(cnt1, cnt2); // cnt2 is negative
2716     addl(cnt1, stride);
2717     movl(cnt2, stride); negptr(cnt2);
2718     bind(CONT_SCAN_SUBSTR);
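    // cnt2 is negative here: minus the number of substring elements still to
    // compare, so the effective address str2 + (int_cnt2 + cnt2)*elem (formed
    // either via the displacement or via tmp below) points at the first
    // element that has not been compared yet; likewise for the string via result.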
2719     if (int_cnt2 < (int)G) {
2720       int tail_off1 = int_cnt2<<scale1;
2721       int tail_off2 = int_cnt2<<scale2;
2722       if (ae == StrIntrinsicNode::UL) {
2723         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2724       } else {
2725         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2726       }
2727       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2728     } else {
2729       // calculate index in register to avoid integer overflow (int_cnt2*2)
2730       movl(tmp, int_cnt2);
2731       addptr(tmp, cnt2);
2732       if (ae == StrIntrinsicNode::UL) {
2733         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2734       } else {
2735         movdqu(vec, Address(str2, tmp, scale2, 0));
2736       }
2737       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2738     }
    // Need to reload string pointers if we did not match the whole vector
2740     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2741     addptr(cnt2, stride);
2742     jcc(Assembler::negative, SCAN_SUBSTR);
2743     // Fall through if found full substring
2744 
2745   } // (int_cnt2 > 8)
2746 
2747   bind(RET_FOUND);
2748   // Found result if we matched full small substring.
2749   // Compute substr offset
2750   subptr(result, str1);
2751   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2752     shrl(result, 1); // index
2753   }
2754   bind(EXIT);
2755 
2756 } // string_indexofC8
2757 
// Small strings are loaded through the stack if they cross a page boundary.
2759 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2760                                        Register cnt1, Register cnt2,
2761                                        int int_cnt2,  Register result,
2762                                        XMMRegister vec, Register tmp,
2763                                        int ae) {
2764   ShortBranchVerifier sbv(this);
2765   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2766   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2767 
2768   //
  // int_cnt2 is the length of a small (< 8 chars) constant substring
  // or -1 for a non-constant substring, in which case its length
  // is in the cnt2 register.
2772   //
2773   // Note, inline_string_indexOf() generates checks:
2774   // if (substr.count > string.count) return -1;
2775   // if (substr.count == 0) return 0;
2776   //
2777   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2778   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2779   // This method uses the pcmpestri instruction with bound registers
2780   //   inputs:
2781   //     xmm - substring
2782   //     rax - substring length (elements count)
2783   //     mem - scanned string
2784   //     rdx - string length (elements count)
2785   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2786   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2787   //   outputs:
2788   //     rcx - matched index in string
2789   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2790   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2791   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2792   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2793 
2794   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2795         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2796         FOUND_CANDIDATE;
2797 
2798   { //========================================================
2799     // We don't know where these strings are located
2800     // and we can't read beyond them. Load them through stack.
2801     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2802 
2803     movptr(tmp, rsp); // save old SP
2804 
2805     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2806       if (int_cnt2 == (1>>scale2)) { // One byte
2807         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2808         load_unsigned_byte(result, Address(str2, 0));
2809         movdl(vec, result); // move 32 bits
2810       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2811         // Not enough header space in 32-bit VM: 12+3 = 15.
2812         movl(result, Address(str2, -1));
2813         shrl(result, 8);
2814         movdl(vec, result); // move 32 bits
2815       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2816         load_unsigned_short(result, Address(str2, 0));
2817         movdl(vec, result); // move 32 bits
2818       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2819         movdl(vec, Address(str2, 0)); // move 32 bits
2820       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2821         movq(vec, Address(str2, 0));  // move 64 bits
2822       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
2823         // Array header size is 12 bytes in 32-bit VM
2824         // + 6 bytes for 3 chars == 18 bytes,
2825         // enough space to load vec and shift.
2826         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
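        // Both branches below load the 16 bytes that end exactly at the end of
        // the substring (reading a few header bytes before its start, which
        // the assert above guarantees exist) and then shift the leading
        // garbage out with psrldq, leaving the int_cnt2 elements at the low
        // end of vec.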
2827         if (ae == StrIntrinsicNode::UL) {
2828           int tail_off = int_cnt2-8;
2829           pmovzxbw(vec, Address(str2, tail_off));
2830           psrldq(vec, -2*tail_off);
2831         }
2832         else {
2833           int tail_off = int_cnt2*(1<<scale2);
2834           movdqu(vec, Address(str2, tail_off-16));
2835           psrldq(vec, 16-tail_off);
2836         }
2837       }
2838     } else { // not constant substring
2839       cmpl(cnt2, stride);
2840       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2841 
      // We can read beyond the string if str+16 does not cross a page boundary
      // since heaps are aligned and mapped by pages.
2844       assert(os::vm_page_size() < (int)G, "default page should be small");
2845       movl(result, str2); // We need only low 32 bits
2846       andl(result, ((int)os::vm_page_size()-1));
2847       cmpl(result, ((int)os::vm_page_size()-16));
2848       jccb(Assembler::belowEqual, CHECK_STR);
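      // e.g. with a 4K page: (str2 & 0xfff) <= 0xff0 means str2+15 is still on
      // the same page, so a 16-byte load from str2 cannot fault and no copy is
      // needed.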
2849 
      // Move small strings to the stack to allow loading 16 bytes into vec.
2851       subptr(rsp, 16);
2852       int stk_offset = wordSize-(1<<scale2);
2853       push(cnt2);
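      // The 16-byte scratch buffer now starts at rsp + wordSize (above the
      // pushed cnt2); stk_offset == wordSize - element_size, so element index
      // cnt2 (counting down from the length to 1) is stored at buffer[cnt2-1].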
2854 
2855       bind(COPY_SUBSTR);
2856       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2857         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2858         movb(Address(rsp, cnt2, scale2, stk_offset), result);
2859       } else if (ae == StrIntrinsicNode::UU) {
2860         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2861         movw(Address(rsp, cnt2, scale2, stk_offset), result);
2862       }
2863       decrement(cnt2);
2864       jccb(Assembler::notZero, COPY_SUBSTR);
2865 
2866       pop(cnt2);
2867       movptr(str2, rsp);  // New substring address
2868     } // non constant
2869 
2870     bind(CHECK_STR);
2871     cmpl(cnt1, stride);
2872     jccb(Assembler::aboveEqual, BIG_STRINGS);
2873 
2874     // Check cross page boundary.
2875     movl(result, str1); // We need only low 32 bits
2876     andl(result, ((int)os::vm_page_size()-1));
2877     cmpl(result, ((int)os::vm_page_size()-16));
2878     jccb(Assembler::belowEqual, BIG_STRINGS);
2879 
2880     subptr(rsp, 16);
2881     int stk_offset = -(1<<scale1);
2882     if (int_cnt2 < 0) { // not constant
2883       push(cnt2);
2884       stk_offset += wordSize;
2885     }
2886     movl(cnt2, cnt1);
2887 
2888     bind(COPY_STR);
2889     if (ae == StrIntrinsicNode::LL) {
2890       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
2891       movb(Address(rsp, cnt2, scale1, stk_offset), result);
2892     } else {
2893       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
2894       movw(Address(rsp, cnt2, scale1, stk_offset), result);
2895     }
2896     decrement(cnt2);
2897     jccb(Assembler::notZero, COPY_STR);
2898 
2899     if (int_cnt2 < 0) { // not constant
2900       pop(cnt2);
2901     }
2902     movptr(str1, rsp);  // New string address
2903 
2904     bind(BIG_STRINGS);
2905     // Load substring.
2906     if (int_cnt2 < 0) { // -1
2907       if (ae == StrIntrinsicNode::UL) {
2908         pmovzxbw(vec, Address(str2, 0));
2909       } else {
2910         movdqu(vec, Address(str2, 0));
2911       }
2912       push(cnt2);       // substr count
2913       push(str2);       // substr addr
2914       push(str1);       // string addr
2915     } else {
2916       // Small (< 8 chars) constant substrings are loaded already.
2917       movl(cnt2, int_cnt2);
2918     }
2919     push(tmp);  // original SP
2920 
2921   } // Finished loading
2922 
2923   //========================================================
2924   // Start search
2925   //
2926 
2927   movptr(result, str1); // string addr
2928 
  if (int_cnt2 < 0) {  // Only for a non-constant substring
2930     jmpb(SCAN_TO_SUBSTR);
2931 
2932     // SP saved at sp+0
2933     // String saved at sp+1*wordSize
2934     // Substr saved at sp+2*wordSize
2935     // Substr count saved at sp+3*wordSize
2936 
    // Reload substr for rescan; this code is executed
    // only for non-constant substrings (> 8 chars)
2939     bind(RELOAD_SUBSTR);
2940     movptr(str2, Address(rsp, 2*wordSize));
2941     movl(cnt2, Address(rsp, 3*wordSize));
2942     if (ae == StrIntrinsicNode::UL) {
2943       pmovzxbw(vec, Address(str2, 0));
2944     } else {
2945       movdqu(vec, Address(str2, 0));
2946     }
    // We came here after the beginning of the substring was
    // matched but the rest of it was not, so we need to search
    // again. Start from the next element after the previous match.
2950     subptr(str1, result); // Restore counter
2951     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2952       shrl(str1, 1);
2953     }
2954     addl(cnt1, str1);
2955     decrementl(cnt1);   // Shift to next element
2956     cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2958 
2959     addptr(result, (1<<scale1));
2960   } // non constant
2961 
2962   // Scan string for start of substr in 16-byte vectors
2963   bind(SCAN_TO_SUBSTR);
2964   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2965   pcmpestri(vec, Address(result, 0), mode);
2966   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2967   subl(cnt1, stride);
2968   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2969   cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2971   addptr(result, 16);
2972 
2973   bind(ADJUST_STR);
2974   cmpl(cnt1, stride); // Do not read beyond string
2975   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2976   // Back-up string to avoid reading beyond string.
2977   lea(result, Address(result, cnt1, scale1, -16));
2978   movl(cnt1, stride);
2979   jmpb(SCAN_TO_SUBSTR);
2980 
2981   // Found a potential substr
2982   bind(FOUND_CANDIDATE);
2983   // After pcmpestri tmp(rcx) contains matched element index
2984 
2985   // Make sure string is still long enough
2986   subl(cnt1, tmp);
2987   cmpl(cnt1, cnt2);
2988   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Left less than substring.
2990 
2991   bind(RET_NOT_FOUND);
2992   movl(result, -1);
2993   jmp(CLEANUP);
2994 
2995   bind(FOUND_SUBSTR);
2996   // Compute start addr of substr
2997   lea(result, Address(result, tmp, scale1));
2998   if (int_cnt2 > 0) { // Constant substring
2999     // Repeat search for small substring (< 8 chars)
3000     // from new point without reloading substring.
3001     // Have to check that we don't read beyond string.
3002     cmpl(tmp, stride-int_cnt2);
3003     jccb(Assembler::greater, ADJUST_STR);
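    // The candidate is a full match only if all int_cnt2 elements fit in the
    // 16-byte window (tmp <= stride - int_cnt2); otherwise pcmpestri compared
    // just a prefix and we must re-scan from the adjusted position.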
3004     // Fall through if matched whole substring.
3005   } else { // non constant
3006     assert(int_cnt2 == -1, "should be != 0");
3007 
3008     addl(tmp, cnt2);
3009     // Found result if we matched whole substring.
3010     cmpl(tmp, stride);
3011     jcc(Assembler::lessEqual, RET_FOUND);
3012 
3013     // Repeat search for small substring (<= 8 chars)
3014     // from new point 'str1' without reloading substring.
3015     cmpl(cnt2, stride);
3016     // Have to check that we don't read beyond string.
3017     jccb(Assembler::lessEqual, ADJUST_STR);
3018 
3019     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3020     // Compare the rest of substring (> 8 chars).
3021     movptr(str1, result);
3022 
3023     cmpl(tmp, cnt2);
3024     // First 8 chars are already matched.
3025     jccb(Assembler::equal, CHECK_NEXT);
3026 
3027     bind(SCAN_SUBSTR);
3028     pcmpestri(vec, Address(str1, 0), mode);
    // Need to reload string pointers if we did not match the whole vector
3030     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3031 
3032     bind(CHECK_NEXT);
3033     subl(cnt2, stride);
3034     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3035     addptr(str1, 16);
3036     if (ae == StrIntrinsicNode::UL) {
3037       addptr(str2, 8);
3038     } else {
3039       addptr(str2, 16);
3040     }
3041     subl(cnt1, stride);
3042     cmpl(cnt2, stride); // Do not read beyond substring
3043     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3044     // Back-up strings to avoid reading beyond substring.
3045 
3046     if (ae == StrIntrinsicNode::UL) {
3047       lea(str2, Address(str2, cnt2, scale2, -8));
3048       lea(str1, Address(str1, cnt2, scale1, -16));
3049     } else {
3050       lea(str2, Address(str2, cnt2, scale2, -16));
3051       lea(str1, Address(str1, cnt2, scale1, -16));
3052     }
3053     subl(cnt1, cnt2);
3054     movl(cnt2, stride);
3055     addl(cnt1, stride);
3056     bind(CONT_SCAN_SUBSTR);
3057     if (ae == StrIntrinsicNode::UL) {
3058       pmovzxbw(vec, Address(str2, 0));
3059     } else {
3060       movdqu(vec, Address(str2, 0));
3061     }
3062     jmp(SCAN_SUBSTR);
3063 
3064     bind(RET_FOUND_LONG);
3065     movptr(str1, Address(rsp, wordSize));
3066   } // non constant
3067 
3068   bind(RET_FOUND);
3069   // Compute substr offset
3070   subptr(result, str1);
3071   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3072     shrl(result, 1); // index
3073   }
3074   bind(CLEANUP);
3075   pop(rsp); // restore SP
3076 
3077 } // string_indexof
3078 
3079 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3080                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3081   ShortBranchVerifier sbv(this);
3082   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3083 
3084   int stride = 8;
3085 
3086   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3087         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3088         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3089         FOUND_SEQ_CHAR, DONE_LABEL;
3090 
3091   movptr(result, str1);
3092   if (UseAVX >= 2) {
3093     cmpl(cnt1, stride);
3094     jcc(Assembler::less, SCAN_TO_CHAR);
3095     cmpl(cnt1, 2*stride);
3096     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3097     movdl(vec1, ch);
3098     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3099     vpxor(vec2, vec2);
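    // vec2 stays all-zero: ptest/vptest(vec2, vec3) sets CF iff vec3 is zero,
    // so 'carryClear' below means at least one lane of the compare result hit.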
3100     movl(tmp, cnt1);
3101     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3102     andl(cnt1,0x0000000F);  //tail count (in chars)
3103 
3104     bind(SCAN_TO_16_CHAR_LOOP);
3105     vmovdqu(vec3, Address(result, 0));
3106     vpcmpeqw(vec3, vec3, vec1, 1);
3107     vptest(vec2, vec3);
3108     jcc(Assembler::carryClear, FOUND_CHAR);
3109     addptr(result, 32);
3110     subl(tmp, 2*stride);
3111     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3112     jmp(SCAN_TO_8_CHAR);
3113     bind(SCAN_TO_8_CHAR_INIT);
3114     movdl(vec1, ch);
3115     pshuflw(vec1, vec1, 0x00);
3116     pshufd(vec1, vec1, 0);
3117     pxor(vec2, vec2);
3118   }
3119   bind(SCAN_TO_8_CHAR);
3120   cmpl(cnt1, stride);
3121   jcc(Assembler::less, SCAN_TO_CHAR);
3122   if (UseAVX < 2) {
3123     movdl(vec1, ch);
3124     pshuflw(vec1, vec1, 0x00);
3125     pshufd(vec1, vec1, 0);
3126     pxor(vec2, vec2);
3127   }
3128   movl(tmp, cnt1);
3129   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3130   andl(cnt1,0x00000007);  //tail count (in chars)
3131 
3132   bind(SCAN_TO_8_CHAR_LOOP);
3133   movdqu(vec3, Address(result, 0));
3134   pcmpeqw(vec3, vec1);
3135   ptest(vec2, vec3);
3136   jcc(Assembler::carryClear, FOUND_CHAR);
3137   addptr(result, 16);
3138   subl(tmp, stride);
3139   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3140   bind(SCAN_TO_CHAR);
3141   testl(cnt1, cnt1);
3142   jcc(Assembler::zero, RET_NOT_FOUND);
3143   bind(SCAN_TO_CHAR_LOOP);
3144   load_unsigned_short(tmp, Address(result, 0));
3145   cmpl(ch, tmp);
3146   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3147   addptr(result, 2);
3148   subl(cnt1, 1);
3149   jccb(Assembler::zero, RET_NOT_FOUND);
3150   jmp(SCAN_TO_CHAR_LOOP);
3151 
3152   bind(RET_NOT_FOUND);
3153   movl(result, -1);
3154   jmpb(DONE_LABEL);
3155 
3156   bind(FOUND_CHAR);
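  // pmovmskb collects the top bit of each byte of the compare result; since
  // pcmpeqw sets whole 16-bit lanes, bsf yields an even byte offset of the
  // first match, which is the byte distance to add to result.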
3157   if (UseAVX >= 2) {
3158     vpmovmskb(tmp, vec3);
3159   } else {
3160     pmovmskb(tmp, vec3);
3161   }
3162   bsfl(ch, tmp);
3163   addptr(result, ch);
3164 
3165   bind(FOUND_SEQ_CHAR);
3166   subptr(result, str1);
3167   shrl(result, 1);
3168 
3169   bind(DONE_LABEL);
3170 } // string_indexof_char
3171 
3172 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3173                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3174   ShortBranchVerifier sbv(this);
3175   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3176 
3177   int stride = 16;
3178 
3179   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3180         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3181         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3182         FOUND_SEQ_CHAR, DONE_LABEL;
3183 
3184   movptr(result, str1);
3185   if (UseAVX >= 2) {
3186     cmpl(cnt1, stride);
3187     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3188     cmpl(cnt1, stride*2);
3189     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3190     movdl(vec1, ch);
3191     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3192     vpxor(vec2, vec2);
3193     movl(tmp, cnt1);
3194     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3195     andl(cnt1,0x0000001F);  //tail count (in chars)
3196 
3197     bind(SCAN_TO_32_CHAR_LOOP);
3198     vmovdqu(vec3, Address(result, 0));
3199     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3200     vptest(vec2, vec3);
3201     jcc(Assembler::carryClear, FOUND_CHAR);
3202     addptr(result, 32);
3203     subl(tmp, stride*2);
3204     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3205     jmp(SCAN_TO_16_CHAR);
3206 
3207     bind(SCAN_TO_16_CHAR_INIT);
3208     movdl(vec1, ch);
3209     pxor(vec2, vec2);
3210     pshufb(vec1, vec2);
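    // pshufb with an all-zero shuffle mask (vec2) replicates the low byte of
    // vec1 (the search character) into every byte lane.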
3211   }
3212 
3213   bind(SCAN_TO_16_CHAR);
3214   cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
3216   if (UseAVX < 2) {
3217     movdl(vec1, ch);
3218     pxor(vec2, vec2);
3219     pshufb(vec1, vec2);
3220   }
3221   movl(tmp, cnt1);
3222   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3223   andl(cnt1,0x0000000F);  //tail count (in bytes)
3224 
3225   bind(SCAN_TO_16_CHAR_LOOP);
3226   movdqu(vec3, Address(result, 0));
3227   pcmpeqb(vec3, vec1);
3228   ptest(vec2, vec3);
3229   jcc(Assembler::carryClear, FOUND_CHAR);
3230   addptr(result, 16);
3231   subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); // last 16 items...
3233 
3234   bind(SCAN_TO_CHAR_INIT);
3235   testl(cnt1, cnt1);
3236   jcc(Assembler::zero, RET_NOT_FOUND);
3237   bind(SCAN_TO_CHAR_LOOP);
3238   load_unsigned_byte(tmp, Address(result, 0));
3239   cmpl(ch, tmp);
3240   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3241   addptr(result, 1);
3242   subl(cnt1, 1);
3243   jccb(Assembler::zero, RET_NOT_FOUND);
3244   jmp(SCAN_TO_CHAR_LOOP);
3245 
3246   bind(RET_NOT_FOUND);
3247   movl(result, -1);
3248   jmpb(DONE_LABEL);
3249 
3250   bind(FOUND_CHAR);
3251   if (UseAVX >= 2) {
3252     vpmovmskb(tmp, vec3);
3253   } else {
3254     pmovmskb(tmp, vec3);
3255   }
3256   bsfl(ch, tmp);
3257   addptr(result, ch);
3258 
3259   bind(FOUND_SEQ_CHAR);
3260   subptr(result, str1);
3261 
3262   bind(DONE_LABEL);
3263 } // stringL_indexof_char
3264 
3265 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3266   switch (eltype) {
3267   case T_BOOLEAN: return sizeof(jboolean);
3268   case T_BYTE:  return sizeof(jbyte);
3269   case T_SHORT: return sizeof(jshort);
3270   case T_CHAR:  return sizeof(jchar);
3271   case T_INT:   return sizeof(jint);
3272   default:
3273     ShouldNotReachHere();
3274     return -1;
3275   }
3276 }
3277 
3278 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3279   switch (eltype) {
3280   // T_BOOLEAN used as surrogate for unsigned byte
3281   case T_BOOLEAN: movzbl(dst, src);   break;
3282   case T_BYTE:    movsbl(dst, src);   break;
3283   case T_SHORT:   movswl(dst, src);   break;
3284   case T_CHAR:    movzwl(dst, src);   break;
3285   case T_INT:     movl(dst, src);     break;
3286   default:
3287     ShouldNotReachHere();
3288   }
3289 }
3290 
3291 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3292   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3293 }
3294 
3295 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3296   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3297 }
3298 
3299 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3300   const int vlen = Assembler::AVX_256bit;
3301   switch (eltype) {
3302   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3303   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3304   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3305   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3306   case T_INT:
3307     // do nothing
3308     break;
3309   default:
3310     ShouldNotReachHere();
3311   }
3312 }
3313 
3314 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3315                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3316                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3317                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3318                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3319                                         BasicType eltype) {
3320   ShortBranchVerifier sbv(this);
3321   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3322   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3323   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3324 
3325   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3326         SHORT_UNROLLED_LOOP_EXIT,
3327         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3328         UNROLLED_VECTOR_LOOP_BEGIN,
3329         END;
3330   switch (eltype) {
3331   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3332   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3333   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3334   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3335   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3336   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3337   }
3338 
  // "Rename" the registers for readability of the code
3340   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3341                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3342                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3343 
3344   const int elsize = arrays_hashcode_elsize(eltype);
3345 
3346   /*
3347     if (cnt1 >= 2) {
3348       if (cnt1 >= 32) {
3349         UNROLLED VECTOR LOOP
3350       }
3351       UNROLLED SCALAR LOOP
3352     }
3353     SINGLE SCALAR
3354    */
3355 
3356   cmpl(cnt1, 32);
3357   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3358 
3359   // cnt1 >= 32 && generate_vectorized_loop
3360   xorl(index, index);
3361 
3362   // vresult = IntVector.zero(I256);
3363   for (int idx = 0; idx < 4; idx++) {
3364     vpxor(vresult[idx], vresult[idx]);
3365   }
3366   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3367   Register bound = tmp2;
3368   Register next = tmp3;
3369   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3370   movl(next, Address(tmp2, 0));
3371   movdl(vnext, next);
3372   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3373 
3374   // index = 0;
3375   // bound = cnt1 & ~(32 - 1);
3376   movl(bound, cnt1);
3377   andl(bound, ~(32 - 1));
3378   // for (; index < bound; index += 32) {
3379   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3380   // result *= next;
3381   imull(result, next);
  // Loop fission to front-load the cost of fetching from memory; OOO execution
  // can then hopefully do a better job of prefetching
3384   for (int idx = 0; idx < 4; idx++) {
3385     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3386   }
3387   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3388   for (int idx = 0; idx < 4; idx++) {
3389     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3390     arrays_hashcode_elvcast(vtmp[idx], eltype);
3391     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3392   }
3393   // index += 32;
3394   addl(index, 32);
3395   // index < bound;
3396   cmpl(index, bound);
3397   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3398   // }
3399 
3400   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3401   subl(cnt1, bound);
3402   // release bound
3403 
3404   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3405   for (int idx = 0; idx < 4; idx++) {
3406     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3407     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3408     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3409   }
3410   // result += vresult.reduceLanes(ADD);
3411   for (int idx = 0; idx < 4; idx++) {
3412     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3413   }
3414 
3415   // } else if (cnt1 < 32) {
3416 
3417   bind(SHORT_UNROLLED_BEGIN);
3418   // int i = 1;
3419   movl(index, 1);
3420   cmpl(index, cnt1);
3421   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3422 
3423   // for (; i < cnt1 ; i += 2) {
3424   bind(SHORT_UNROLLED_LOOP_BEGIN);
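  // Two unrolled iterations of h = 31*h + a[i]:
  // 961 == 31*31, and (x << 5) - x == 31*x.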
3425   movl(tmp3, 961);
3426   imull(result, tmp3);
3427   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3428   movl(tmp3, tmp2);
3429   shll(tmp3, 5);
3430   subl(tmp3, tmp2);
3431   addl(result, tmp3);
3432   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3433   addl(result, tmp3);
3434   addl(index, 2);
3435   cmpl(index, cnt1);
3436   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3437 
3438   // }
3439   // if (i >= cnt1) {
3440   bind(SHORT_UNROLLED_LOOP_EXIT);
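  // The flags are still those of the last cmpl(index, cnt1): 'greater' means
  // the loop consumed all elements, 'equal' means one trailing element is left.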
3441   jccb(Assembler::greater, END);
3442   movl(tmp2, result);
3443   shll(result, 5);
3444   subl(result, tmp2);
3445   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3446   addl(result, tmp3);
3447   // }
3448   bind(END);
3449 
3450   BLOCK_COMMENT("} // arrays_hashcode");
3451 
3452 } // arrays_hashcode
3453 
3454 // helper function for string_compare
3455 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3456                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3457                                            Address::ScaleFactor scale2, Register index, int ae) {
3458   if (ae == StrIntrinsicNode::LL) {
3459     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3460     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3461   } else if (ae == StrIntrinsicNode::UU) {
3462     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3463     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3464   } else {
3465     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3466     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3467   }
3468 }
3469 
3470 // Compare strings, used for char[] and byte[].
3471 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3472                                        Register cnt1, Register cnt2, Register result,
3473                                        XMMRegister vec1, int ae, KRegister mask) {
3474   ShortBranchVerifier sbv(this);
3475   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3476   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3477   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3478   int stride2x2 = 0x40;
3479   Address::ScaleFactor scale = Address::no_scale;
3480   Address::ScaleFactor scale1 = Address::no_scale;
3481   Address::ScaleFactor scale2 = Address::no_scale;
3482 
3483   if (ae != StrIntrinsicNode::LL) {
3484     stride2x2 = 0x20;
3485   }
3486 
3487   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3488     shrl(cnt2, 1);
3489   }
  // Compute the minimum of the string lengths and the
  // difference of the string lengths (saved on the stack).
  // The minimum is computed with the conditional move below.
3493   movl(result, cnt1);
3494   subl(cnt1, cnt2);
3495   push(cnt1);
3496   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3497 
3498   // Is the minimum length zero?
3499   testl(cnt2, cnt2);
3500   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3501   if (ae == StrIntrinsicNode::LL) {
3502     // Load first bytes
3503     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3504     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3505   } else if (ae == StrIntrinsicNode::UU) {
3506     // Load first characters
3507     load_unsigned_short(result, Address(str1, 0));
3508     load_unsigned_short(cnt1, Address(str2, 0));
3509   } else {
3510     load_unsigned_byte(result, Address(str1, 0));
3511     load_unsigned_short(cnt1, Address(str2, 0));
3512   }
3513   subl(result, cnt1);
3514   jcc(Assembler::notZero,  POP_LABEL);
3515 
3516   if (ae == StrIntrinsicNode::UU) {
3517     // Divide length by 2 to get number of chars
3518     shrl(cnt2, 1);
3519   }
3520   cmpl(cnt2, 1);
3521   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3522 
3523   // Check if the strings start at the same location and setup scale and stride
3524   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3525     cmpptr(str1, str2);
3526     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3527     if (ae == StrIntrinsicNode::LL) {
3528       scale = Address::times_1;
3529       stride = 16;
3530     } else {
3531       scale = Address::times_2;
3532       stride = 8;
3533     }
3534   } else {
3535     scale1 = Address::times_1;
3536     scale2 = Address::times_2;
3537     // scale not used
3538     stride = 8;
3539   }
3540 
3541   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3542     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3543     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3544     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3545     Label COMPARE_TAIL_LONG;
3546     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3547 
3548     int pcmpmask = 0x19;
3549     if (ae == StrIntrinsicNode::LL) {
3550       pcmpmask &= ~0x01;
3551     }
3552 
    // Set up to compare 16-char (32-byte) vectors;
    // start from the first character again because it has an aligned address.
3555     if (ae == StrIntrinsicNode::LL) {
3556       stride2 = 32;
3557     } else {
3558       stride2 = 16;
3559     }
3560     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3561       adr_stride = stride << scale;
3562     } else {
3563       adr_stride1 = 8;  //stride << scale1;
3564       adr_stride2 = 16; //stride << scale2;
3565     }
3566 
3567     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
    // rax and rdx are used by pcmpestri as element counters
3569     movl(result, cnt2);
3570     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3571     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3572 
3573     // fast path : compare first 2 8-char vectors.
3574     bind(COMPARE_16_CHARS);
3575     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3576       movdqu(vec1, Address(str1, 0));
3577     } else {
3578       pmovzxbw(vec1, Address(str1, 0));
3579     }
3580     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3581     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3582 
3583     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3584       movdqu(vec1, Address(str1, adr_stride));
3585       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3586     } else {
3587       pmovzxbw(vec1, Address(str1, adr_stride1));
3588       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3589     }
3590     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3591     addl(cnt1, stride);
3592 
3593     // Compare the characters at index in cnt1
3594     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3595     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3596     subl(result, cnt2);
3597     jmp(POP_LABEL);
3598 
3599     // Setup the registers to start vector comparison loop
3600     bind(COMPARE_WIDE_VECTORS);
3601     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3602       lea(str1, Address(str1, result, scale));
3603       lea(str2, Address(str2, result, scale));
3604     } else {
3605       lea(str1, Address(str1, result, scale1));
3606       lea(str2, Address(str2, result, scale2));
3607     }
3608     subl(result, stride2);
3609     subl(cnt2, stride2);
3610     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3611     negptr(result);
3612 
3613     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
3614     bind(COMPARE_WIDE_VECTORS_LOOP);
3615 
3616 #ifdef _LP64
3617     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3618       cmpl(cnt2, stride2x2);
3619       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3620       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3621       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3622 
3623       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3624       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3625         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3626         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3627       } else {
3628         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3629         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3630       }
3631       kortestql(mask, mask);
3632       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3633       addptr(result, stride2x2);  // update since we already compared at this addr
3634       subl(cnt2, stride2x2);      // and sub the size too
3635       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3636 
3637       vpxor(vec1, vec1);
3638       jmpb(COMPARE_WIDE_TAIL);
3639     }//if (VM_Version::supports_avx512vlbw())
3640 #endif // _LP64
3641 
3642 
3643     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3644     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3645       vmovdqu(vec1, Address(str1, result, scale));
3646       vpxor(vec1, Address(str2, result, scale));
3647     } else {
3648       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3649       vpxor(vec1, Address(str2, result, scale2));
3650     }
3651     vptest(vec1, vec1);
3652     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3653     addptr(result, stride2);
3654     subl(cnt2, stride2);
3655     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3656     // clean upper bits of YMM registers
3657     vpxor(vec1, vec1);
3658 
3659     // compare wide vectors tail
3660     bind(COMPARE_WIDE_TAIL);
3661     testptr(result, result);
3662     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3663 
3664     movl(result, stride2);
3665     movl(cnt2, result);
3666     negptr(result);
3667     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3668 
    // Identify the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3670     bind(VECTOR_NOT_EQUAL);
3671     // clean upper bits of YMM registers
3672     vpxor(vec1, vec1);
3673     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3674       lea(str1, Address(str1, result, scale));
3675       lea(str2, Address(str2, result, scale));
3676     } else {
3677       lea(str1, Address(str1, result, scale1));
3678       lea(str2, Address(str2, result, scale2));
3679     }
3680     jmp(COMPARE_16_CHARS);
3681 
    // Compare tail chars, length between 1 and 15 chars
3683     bind(COMPARE_TAIL_LONG);
3684     movl(cnt2, result);
3685     cmpl(cnt2, stride);
3686     jcc(Assembler::less, COMPARE_SMALL_STR);
3687 
3688     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3689       movdqu(vec1, Address(str1, 0));
3690     } else {
3691       pmovzxbw(vec1, Address(str1, 0));
3692     }
3693     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3694     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3695     subptr(cnt2, stride);
3696     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3697     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3698       lea(str1, Address(str1, result, scale));
3699       lea(str2, Address(str2, result, scale));
3700     } else {
3701       lea(str1, Address(str1, result, scale1));
3702       lea(str2, Address(str2, result, scale2));
3703     }
3704     negptr(cnt2);
3705     jmpb(WHILE_HEAD_LABEL);
3706 
3707     bind(COMPARE_SMALL_STR);
3708   } else if (UseSSE42Intrinsics) {
3709     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3710     int pcmpmask = 0x19;
    // Set up to compare 8-char (16-byte) vectors;
    // start from the first character again because it has an aligned address.
3713     movl(result, cnt2);
3714     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3715     if (ae == StrIntrinsicNode::LL) {
3716       pcmpmask &= ~0x01;
3717     }
3718     jcc(Assembler::zero, COMPARE_TAIL);
3719     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3720       lea(str1, Address(str1, result, scale));
3721       lea(str2, Address(str2, result, scale));
3722     } else {
3723       lea(str1, Address(str1, result, scale1));
3724       lea(str2, Address(str2, result, scale2));
3725     }
3726     negptr(result);
3727 
3728     // pcmpestri
3729     //   inputs:
3730     //     vec1- substring
3731     //     rax - negative string length (elements count)
3732     //     mem - scanned string
3733     //     rdx - string length (elements count)
3734     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3735     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3736     //   outputs:
3737     //     rcx - first mismatched element index
3738     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3739 
3740     bind(COMPARE_WIDE_VECTORS);
3741     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3742       movdqu(vec1, Address(str1, result, scale));
3743       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3744     } else {
3745       pmovzxbw(vec1, Address(str1, result, scale1));
3746       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3747     }
3748     // After pcmpestri cnt1(rcx) contains mismatched element index
3749 
3750     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3751     addptr(result, stride);
3752     subptr(cnt2, stride);
3753     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3754 
3755     // compare wide vectors tail
3756     testptr(result, result);
3757     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3758 
3759     movl(cnt2, stride);
3760     movl(result, stride);
3761     negptr(result);
3762     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3763       movdqu(vec1, Address(str1, result, scale));
3764       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3765     } else {
3766       pmovzxbw(vec1, Address(str1, result, scale1));
3767       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3768     }
3769     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3770 
3771     // Mismatched characters in the vectors
3772     bind(VECTOR_NOT_EQUAL);
3773     addptr(cnt1, result);
3774     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3775     subl(result, cnt2);
3776     jmpb(POP_LABEL);
3777 
3778     bind(COMPARE_TAIL); // limit is zero
3779     movl(cnt2, result);
3780     // Fallthru to tail compare
3781   }
3782   // Shift str2 and str1 to the end of the arrays, negate min
3783   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3784     lea(str1, Address(str1, cnt2, scale));
3785     lea(str2, Address(str2, cnt2, scale));
3786   } else {
3787     lea(str1, Address(str1, cnt2, scale1));
3788     lea(str2, Address(str2, cnt2, scale2));
3789   }
3790   decrementl(cnt2);  // first character was compared already
3791   negptr(cnt2);
3792 
3793   // Compare the rest of the elements
3794   bind(WHILE_HEAD_LABEL);
3795   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3796   subl(result, cnt1);
3797   jccb(Assembler::notZero, POP_LABEL);
3798   increment(cnt2);
3799   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3800 
3801   // Strings are equal up to min length.  Return the length difference.
3802   bind(LENGTH_DIFF_LABEL);
3803   pop(result);
3804   if (ae == StrIntrinsicNode::UU) {
3805     // Divide diff by 2 to get number of chars
3806     sarl(result, 1);
3807   }
3808   jmpb(DONE_LABEL);
3809 
3810 #ifdef _LP64
3811   if (VM_Version::supports_avx512vlbw()) {
3812 
3813     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3814 
3815     kmovql(cnt1, mask);
3816     notq(cnt1);
3817     bsfq(cnt2, cnt1);
3818     if (ae != StrIntrinsicNode::LL) {
3819       // Divide diff by 2 to get number of chars
3820       sarl(cnt2, 1);
3821     }
3822     addq(result, cnt2);
3823     if (ae == StrIntrinsicNode::LL) {
3824       load_unsigned_byte(cnt1, Address(str2, result));
3825       load_unsigned_byte(result, Address(str1, result));
3826     } else if (ae == StrIntrinsicNode::UU) {
3827       load_unsigned_short(cnt1, Address(str2, result, scale));
3828       load_unsigned_short(result, Address(str1, result, scale));
3829     } else {
3830       load_unsigned_short(cnt1, Address(str2, result, scale2));
3831       load_unsigned_byte(result, Address(str1, result, scale1));
3832     }
3833     subl(result, cnt1);
3834     jmpb(POP_LABEL);
3835   }//if (VM_Version::supports_avx512vlbw())
3836 #endif // _LP64
3837 
3838   // Discard the stored length difference
3839   bind(POP_LABEL);
3840   pop(cnt1);
3841 
3842   // That's it
3843   bind(DONE_LABEL);
3844   if(ae == StrIntrinsicNode::UL) {
3845     negl(result);
3846   }
3847 
3848 }
3849 
// Search for a non-ASCII character (negative byte value) in a byte array;
// return the index of the first such character, otherwise the length
// of the array segment searched.
3853 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3854 //   @IntrinsicCandidate
3855 //   public static int countPositives(byte[] ba, int off, int len) {
3856 //     for (int i = off; i < off + len; i++) {
3857 //       if (ba[i] < 0) {
3858 //         return i - off;
3859 //       }
3860 //     }
3861 //     return len;
3862 //   }
3863 void C2_MacroAssembler::count_positives(Register ary1, Register len,
3864   Register result, Register tmp1,
3865   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
3866   // rsi: byte array
3867   // rcx: len
3868   // rax: result
3869   ShortBranchVerifier sbv(this);
3870   assert_different_registers(ary1, len, result, tmp1);
3871   assert_different_registers(vec1, vec2);
3872   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
3873 
3874   movl(result, len); // copy
3875   // len == 0
3876   testl(len, len);
3877   jcc(Assembler::zero, DONE);
3878 
3879   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
3880     VM_Version::supports_avx512vlbw() &&
3881     VM_Version::supports_bmi2()) {
3882 
3883     Label test_64_loop, test_tail, BREAK_LOOP;
3884     Register tmp3_aliased = len;
3885 
3886     movl(tmp1, len);
3887     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
3888 
3889     andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
3890     andl(len, ~(64 - 1));    // vector count (in chars)
3891     jccb(Assembler::zero, test_tail);
3892 
3893     lea(ary1, Address(ary1, len, Address::times_1));
3894     negptr(len);
3895 
3896     bind(test_64_loop);
3897     // Check whether our 64 elements of size byte contain negatives
3898     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
3899     kortestql(mask1, mask1);
3900     jcc(Assembler::notZero, BREAK_LOOP);
3901 
3902     addptr(len, 64);
3903     jccb(Assembler::notZero, test_64_loop);
3904 
3905     bind(test_tail);
3906     // bail out when there is nothing to be done
3907     testl(tmp1, -1);
3908     jcc(Assembler::zero, DONE);
3909 
3910     // ~(~0 << len) applied up to two times (for 32-bit scenario)
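    // e.g. tmp1 == 5: ~(~0 << 5) == 0b11111, a mask with exactly the low tmp1
    // bits set, covering the tmp1 tail bytes.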
3911 #ifdef _LP64
3912     mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
3913     shlxq(tmp3_aliased, tmp3_aliased, tmp1);
3914     notq(tmp3_aliased);
3915     kmovql(mask2, tmp3_aliased);
3916 #else
3917     Label k_init;
3918     jmp(k_init);
3919 
    // We cannot read 64 bits from a general-purpose register, so we move the
    // data required to compose 64 1's into the instruction stream.
    // We emit a 64-byte-wide series of elements 0..63 which is later used
    // as a compare target against the tail count contained in the tmp1 register.
    // The result is a k register with tmp1 consecutive 1's, counting from the
    // least significant bit.
3926     address tmp = pc();
3927     emit_int64(0x0706050403020100);
3928     emit_int64(0x0F0E0D0C0B0A0908);
3929     emit_int64(0x1716151413121110);
3930     emit_int64(0x1F1E1D1C1B1A1918);
3931     emit_int64(0x2726252423222120);
3932     emit_int64(0x2F2E2D2C2B2A2928);
3933     emit_int64(0x3736353433323130);
3934     emit_int64(0x3F3E3D3C3B3A3938);
3935 
3936     bind(k_init);
3937     lea(len, InternalAddress(tmp));
3938     // create mask to test for negative byte inside a vector
3939     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
3940     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
3941 
3942 #endif
3943     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
3944     ktestq(mask1, mask2);
3945     jcc(Assembler::zero, DONE);
3946 
3947     bind(BREAK_LOOP);
3948     // At least one byte in the last 64 bytes is negative.
3949     // Set up to look at the last 64 bytes as if they were a tail
3950     lea(ary1, Address(ary1, len, Address::times_1));
3951     addptr(result, len);
3952     // Ignore the very last byte: if all others are positive,
3953     // it must be negative, so we can skip right to the 2+1 byte
3954     // end comparison at this point
3955     orl(result, 63);
3956     movl(len, 63);
3957     // Fallthru to tail compare
3958   } else {
3959 
3960     if (UseAVX >= 2 && UseSSE >= 2) {
3961       // With AVX2, use 32-byte vector compare
3962       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
3963 
3964       // Compare 32-byte vectors
3965       testl(len, 0xffffffe0);   // vector count (in bytes)
3966       jccb(Assembler::zero, TAIL_START);
3967 
3968       andl(len, 0xffffffe0);
3969       lea(ary1, Address(ary1, len, Address::times_1));
3970       negptr(len);
3971 
      movl(tmp1, 0x80808080);   // create mask to test for negative (non-ASCII) bytes in vector
3973       movdl(vec2, tmp1);
3974       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
3975 
3976       bind(COMPARE_WIDE_VECTORS);
3977       vmovdqu(vec1, Address(ary1, len, Address::times_1));
3978       vptest(vec1, vec2);
3979       jccb(Assembler::notZero, BREAK_LOOP);
3980       addptr(len, 32);
3981       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3982 
3983       testl(result, 0x0000001f);   // any bytes remaining?
3984       jcc(Assembler::zero, DONE);
3985 
3986       // Quick test using the already prepared vector mask
3987       movl(len, result);
3988       andl(len, 0x0000001f);
3989       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
3990       vptest(vec1, vec2);
3991       jcc(Assembler::zero, DONE);
3992       // There are zeros, jump to the tail to determine exactly where
3993       jmpb(TAIL_START);
3994 
3995       bind(BREAK_LOOP);
3996       // At least one byte in the last 32-byte vector is negative.
3997       // Set up to look at the last 32 bytes as if they were a tail
3998       lea(ary1, Address(ary1, len, Address::times_1));
3999       addptr(result, len);
4000       // Ignore the very last byte: if all others are positive,
4001       // it must be negative, so we can skip right to the 2+1 byte
4002       // end comparison at this point
4003       orl(result, 31);
4004       movl(len, 31);
4005       // Fallthru to tail compare
4006     } else if (UseSSE42Intrinsics) {
4007       // With SSE4.2, use double quad vector compare
4008       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4009 
4010       // Compare 16-byte vectors
4011       testl(len, 0xfffffff0);   // vector count (in bytes)
4012       jcc(Assembler::zero, TAIL_START);
4013 
4014       andl(len, 0xfffffff0);
4015       lea(ary1, Address(ary1, len, Address::times_1));
4016       negptr(len);
4017 
4018       movl(tmp1, 0x80808080);
4019       movdl(vec2, tmp1);
4020       pshufd(vec2, vec2, 0);
4021 
4022       bind(COMPARE_WIDE_VECTORS);
4023       movdqu(vec1, Address(ary1, len, Address::times_1));
4024       ptest(vec1, vec2);
4025       jccb(Assembler::notZero, BREAK_LOOP);
4026       addptr(len, 16);
4027       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4028 
4029       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4030       jcc(Assembler::zero, DONE);
4031 
4032       // Quick test using the already prepared vector mask
4033       movl(len, result);
4034       andl(len, 0x0000000f);   // tail count (in bytes)
4035       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4036       ptest(vec1, vec2);
4037       jcc(Assembler::zero, DONE);
4038       jmpb(TAIL_START);
4039 
4040       bind(BREAK_LOOP);
4041       // At least one byte in the last 16-byte vector is negative.
4042       // Set up and look at the last 16 bytes as if they were a tail
4043       lea(ary1, Address(ary1, len, Address::times_1));
4044       addptr(result, len);
4045       // Ignore the very last byte: if all others are positive,
4046       // it must be negative, so we can skip right to the 2+1 byte
4047       // end comparison at this point
4048       orl(result, 15);
4049       movl(len, 15);
4050       // Fallthru to tail compare
4051     }
4052   }
4053 
4054   bind(TAIL_START);
4055   // Compare 4-byte vectors
4056   andl(len, 0xfffffffc); // vector count (in bytes)
4057   jccb(Assembler::zero, COMPARE_CHAR);
4058 
4059   lea(ary1, Address(ary1, len, Address::times_1));
4060   negptr(len);
4061 
4062   bind(COMPARE_VECTORS);
4063   movl(tmp1, Address(ary1, len, Address::times_1));
4064   andl(tmp1, 0x80808080);
4065   jccb(Assembler::notZero, TAIL_ADJUST);
4066   addptr(len, 4);
4067   jccb(Assembler::notZero, COMPARE_VECTORS);
4068 
4069   // Compare trailing char (final 2-3 bytes), if any
4070   bind(COMPARE_CHAR);
4071 
4072   testl(result, 0x2);   // tail  char
4073   jccb(Assembler::zero, COMPARE_BYTE);
4074   load_unsigned_short(tmp1, Address(ary1, 0));
4075   andl(tmp1, 0x00008080);
4076   jccb(Assembler::notZero, CHAR_ADJUST);
4077   lea(ary1, Address(ary1, 2));
4078 
4079   bind(COMPARE_BYTE);
4080   testl(result, 0x1);   // tail  byte
4081   jccb(Assembler::zero, DONE);
4082   load_unsigned_byte(tmp1, Address(ary1, 0));
4083   testl(tmp1, 0x00000080);
4084   jccb(Assembler::zero, DONE);
4085   subptr(result, 1);
4086   jmpb(DONE);
4087 
4088   bind(TAIL_ADJUST);
  // There is a negative byte in the last 4-byte block.
  // Adjust result and check the next three bytes
4091   addptr(result, len);
4092   orl(result, 3);
4093   lea(ary1, Address(ary1, len, Address::times_1));
4094   jmpb(COMPARE_CHAR);
4095 
4096   bind(CHAR_ADJUST);
4097   // We are looking at a char + optional byte tail, and found that one
4098   // of the bytes in the char is negative. Adjust the result, check the
4099   // first byte and readjust if needed.
4100   andl(result, 0xfffffffc);
4101   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4102   jccb(Assembler::notZero, DONE);
4103   addptr(result, 1);
4104 
4105   // That's it
4106   bind(DONE);
4107   if (UseAVX >= 2 && UseSSE >= 2) {
4108     // clean upper bits of YMM registers
4109     vpxor(vec1, vec1);
4110     vpxor(vec2, vec2);
4111   }
4112 }
4113 
4114 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
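//
// Illustrative only: roughly the scalar semantics this vectorized routine
// implements (a sketch, not the generated code). When is_array_equ is true the
// oop identity / null / length checks run first; otherwise ary1, ary2 and limit
// already describe the substrings to compare.
//
//   bool bytes_equal(const uint8_t* a, const uint8_t* b, int byte_limit) {
//     for (int i = 0; i < byte_limit; i++) {
//       if (a[i] != b[i]) return false;
//     }
//     return true;
//   }
//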
4115 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4116                                       Register limit, Register result, Register chr,
4117                                       XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
4118   ShortBranchVerifier sbv(this);
4119   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4120 
4121   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4122   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4123 
4124   if (is_array_equ) {
4125     // Check the input args
4126     cmpoop(ary1, ary2);
4127     jcc(Assembler::equal, TRUE_LABEL);
4128 
4129     // Need additional checks for arrays_equals.
4130     testptr(ary1, ary1);
4131     jcc(Assembler::zero, FALSE_LABEL);
4132     testptr(ary2, ary2);
4133     jcc(Assembler::zero, FALSE_LABEL);
4134 
4135     // Check the lengths
4136     movl(limit, Address(ary1, length_offset));
4137     cmpl(limit, Address(ary2, length_offset));
4138     jcc(Assembler::notEqual, FALSE_LABEL);
4139   }
4140 
4141   // count == 0
4142   testl(limit, limit);
4143   jcc(Assembler::zero, TRUE_LABEL);
4144 
4145   if (is_array_equ) {
4146     // Load array address
4147     lea(ary1, Address(ary1, base_offset));
4148     lea(ary2, Address(ary2, base_offset));
4149   }
4150 
4151   if (is_array_equ && is_char) {
4152     // arrays_equals when used for char[].
4153     shll(limit, 1);      // byte count != 0
4154   }
4155   movl(result, limit); // copy
4156 
4157   if (UseAVX >= 2) {
4158     // With AVX2, use 32-byte vector compare
4159     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4160 
4161     // Compare 32-byte vectors
4162     andl(result, 0x0000001f);  //   tail count (in bytes)
4163     andl(limit, 0xffffffe0);   // vector count (in bytes)
4164     jcc(Assembler::zero, COMPARE_TAIL);
4165 
4166     lea(ary1, Address(ary1, limit, Address::times_1));
4167     lea(ary2, Address(ary2, limit, Address::times_1));
4168     negptr(limit);
4169 
4170 #ifdef _LP64
    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // try a 64-byte fast loop
4172       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4173 
4174       cmpl(limit, -64);
4175       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4176 
4177       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4178 
4179       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4180       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4181       kortestql(mask, mask);
4182       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4183       addptr(limit, 64);  // update since we already compared at this addr
4184       cmpl(limit, -64);
4185       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4186 
4187       // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instructions and just continue via the non-wide path:
4189       //  cmpl(limit, 0);
4190       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
4191       // But since we stopped at the points ary{1,2}+limit which are
4192       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4193       // (|limit| <= 32 and result < 32),
4194       // we may just compare the last 64 bytes.
4195       //
      addptr(result, -64);   // this is safe because we just came from this area
4197       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4198       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4199       kortestql(mask, mask);
4200       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4201 
4202       jmp(TRUE_LABEL);
4203 
4204       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4205 
4206     }//if (VM_Version::supports_avx512vlbw())
4207 #endif //_LP64
4208     bind(COMPARE_WIDE_VECTORS);
4209     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
4210     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4211     vpxor(vec1, vec2);
4212 
4213     vptest(vec1, vec1);
4214     jcc(Assembler::notZero, FALSE_LABEL);
4215     addptr(limit, 32);
4216     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4217 
4218     testl(result, result);
4219     jcc(Assembler::zero, TRUE_LABEL);
4220 
4221     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
4222     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4223     vpxor(vec1, vec2);
4224 
4225     vptest(vec1, vec1);
4226     jccb(Assembler::notZero, FALSE_LABEL);
4227     jmpb(TRUE_LABEL);
4228 
4229     bind(COMPARE_TAIL); // limit is zero
4230     movl(limit, result);
4231     // Fallthru to tail compare
4232   } else if (UseSSE42Intrinsics) {
4233     // With SSE4.2, use double quad vector compare
4234     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4235 
4236     // Compare 16-byte vectors
4237     andl(result, 0x0000000f);  //   tail count (in bytes)
4238     andl(limit, 0xfffffff0);   // vector count (in bytes)
4239     jcc(Assembler::zero, COMPARE_TAIL);
4240 
4241     lea(ary1, Address(ary1, limit, Address::times_1));
4242     lea(ary2, Address(ary2, limit, Address::times_1));
4243     negptr(limit);
4244 
4245     bind(COMPARE_WIDE_VECTORS);
4246     movdqu(vec1, Address(ary1, limit, Address::times_1));
4247     movdqu(vec2, Address(ary2, limit, Address::times_1));
4248     pxor(vec1, vec2);
4249 
4250     ptest(vec1, vec1);
4251     jcc(Assembler::notZero, FALSE_LABEL);
4252     addptr(limit, 16);
4253     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4254 
4255     testl(result, result);
4256     jcc(Assembler::zero, TRUE_LABEL);
4257 
4258     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4259     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4260     pxor(vec1, vec2);
4261 
4262     ptest(vec1, vec1);
4263     jccb(Assembler::notZero, FALSE_LABEL);
4264     jmpb(TRUE_LABEL);
4265 
4266     bind(COMPARE_TAIL); // limit is zero
4267     movl(limit, result);
4268     // Fallthru to tail compare
4269   }
4270 
4271   // Compare 4-byte vectors
4272   andl(limit, 0xfffffffc); // vector count (in bytes)
4273   jccb(Assembler::zero, COMPARE_CHAR);
4274 
4275   lea(ary1, Address(ary1, limit, Address::times_1));
4276   lea(ary2, Address(ary2, limit, Address::times_1));
4277   negptr(limit);
4278 
4279   bind(COMPARE_VECTORS);
4280   movl(chr, Address(ary1, limit, Address::times_1));
4281   cmpl(chr, Address(ary2, limit, Address::times_1));
4282   jccb(Assembler::notEqual, FALSE_LABEL);
4283   addptr(limit, 4);
4284   jcc(Assembler::notZero, COMPARE_VECTORS);
4285 
4286   // Compare trailing char (final 2 bytes), if any
4287   bind(COMPARE_CHAR);
4288   testl(result, 0x2);   // tail  char
4289   jccb(Assembler::zero, COMPARE_BYTE);
4290   load_unsigned_short(chr, Address(ary1, 0));
4291   load_unsigned_short(limit, Address(ary2, 0));
4292   cmpl(chr, limit);
4293   jccb(Assembler::notEqual, FALSE_LABEL);
4294 
4295   if (is_array_equ && is_char) {
4296     bind(COMPARE_BYTE);
4297   } else {
4298     lea(ary1, Address(ary1, 2));
4299     lea(ary2, Address(ary2, 2));
4300 
4301     bind(COMPARE_BYTE);
4302     testl(result, 0x1);   // tail  byte
4303     jccb(Assembler::zero, TRUE_LABEL);
4304     load_unsigned_byte(chr, Address(ary1, 0));
4305     load_unsigned_byte(limit, Address(ary2, 0));
4306     cmpl(chr, limit);
4307     jccb(Assembler::notEqual, FALSE_LABEL);
4308   }
4309   bind(TRUE_LABEL);
4310   movl(result, 1);   // return true
4311   jmpb(DONE);
4312 
4313   bind(FALSE_LABEL);
4314   xorl(result, result); // return false
4315 
4316   // That's it
4317   bind(DONE);
4318   if (UseAVX >= 2) {
4319     // clean upper bits of YMM registers
4320     vpxor(vec1, vec1);
4321     vpxor(vec2, vec2);
4322   }
4323 }
4324 
4325 #ifdef _LP64
4326 
4327 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4328 #define __ masm.
4329   Register dst = stub.data<0>();
4330   XMMRegister src = stub.data<1>();
4331   address target = stub.data<2>();
4332   __ bind(stub.entry());
4333   __ subptr(rsp, 8);
4334   __ movdbl(Address(rsp), src);
4335   __ call(RuntimeAddress(target));
4336   __ pop(dst);
4337   __ jmp(stub.continuation());
4338 #undef __
4339 }
4340 
4341 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4342   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4343   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
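
  // Illustrative sketch (not the generated code): cvttss2si / cvttsd2si return the
  // "integer indefinite" value (0x80000000 for 32-bit results, 0x8000000000000000
  // for 64-bit results) when the source is NaN or out of range. Only in that case
  // is the out-of-line stub taken; assuming the fixup stubs implement Java's
  // saturating cast semantics, the f2i case behaves roughly like:
  //
  //   jint f2i_slow(jfloat x) {
  //     if (x != x)                  return 0;           // NaN -> 0
  //     if (x >= 2147483648.0f)      return 0x7fffffff;  // clamp to Integer.MAX_VALUE
  //     return (jint)0x80000000;     // only reached for x <= Integer.MIN_VALUE
  //   }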
4344 
4345   address slowpath_target;
4346   if (dst_bt == T_INT) {
4347     if (src_bt == T_FLOAT) {
4348       cvttss2sil(dst, src);
4349       cmpl(dst, 0x80000000);
4350       slowpath_target = StubRoutines::x86::f2i_fixup();
4351     } else {
4352       cvttsd2sil(dst, src);
4353       cmpl(dst, 0x80000000);
4354       slowpath_target = StubRoutines::x86::d2i_fixup();
4355     }
4356   } else {
4357     if (src_bt == T_FLOAT) {
4358       cvttss2siq(dst, src);
4359       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4360       slowpath_target = StubRoutines::x86::f2l_fixup();
4361     } else {
4362       cvttsd2siq(dst, src);
4363       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4364       slowpath_target = StubRoutines::x86::d2l_fixup();
4365     }
4366   }
4367 
4368   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath);
4369   jcc(Assembler::equal, stub->entry());
4370   bind(stub->continuation());
4371 }
4372 
4373 #endif // _LP64
4374 
4375 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4376                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4377   switch(ideal_opc) {
4378     case Op_LShiftVS:
4379       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4380     case Op_LShiftVI:
4381       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4382     case Op_LShiftVL:
4383       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4384     case Op_RShiftVS:
4385       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4386     case Op_RShiftVI:
4387       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4388     case Op_RShiftVL:
4389       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4390     case Op_URShiftVS:
4391       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4392     case Op_URShiftVI:
4393       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4394     case Op_URShiftVL:
4395       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4396     case Op_RotateRightV:
4397       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4398     case Op_RotateLeftV:
4399       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4400     default:
4401       fatal("Unsupported masked operation"); break;
4402   }
4403 }
4404 
4405 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4406                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4407                                     bool is_varshift) {
4408   switch (ideal_opc) {
4409     case Op_AddVB:
4410       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4411     case Op_AddVS:
4412       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4413     case Op_AddVI:
4414       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4415     case Op_AddVL:
4416       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4417     case Op_AddVF:
4418       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4419     case Op_AddVD:
4420       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4421     case Op_SubVB:
4422       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4423     case Op_SubVS:
4424       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4425     case Op_SubVI:
4426       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4427     case Op_SubVL:
4428       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4429     case Op_SubVF:
4430       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4431     case Op_SubVD:
4432       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4433     case Op_MulVS:
4434       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4435     case Op_MulVI:
4436       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4437     case Op_MulVL:
4438       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4439     case Op_MulVF:
4440       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4441     case Op_MulVD:
4442       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4443     case Op_DivVF:
4444       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4445     case Op_DivVD:
4446       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4447     case Op_SqrtVF:
4448       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4449     case Op_SqrtVD:
4450       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4451     case Op_AbsVB:
4452       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4453     case Op_AbsVS:
4454       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4455     case Op_AbsVI:
4456       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4457     case Op_AbsVL:
4458       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4459     case Op_FmaVF:
4460       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4461     case Op_FmaVD:
4462       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4463     case Op_VectorRearrange:
4464       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4465     case Op_LShiftVS:
4466       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4467     case Op_LShiftVI:
4468       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4469     case Op_LShiftVL:
4470       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4471     case Op_RShiftVS:
4472       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4473     case Op_RShiftVI:
4474       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4475     case Op_RShiftVL:
4476       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4477     case Op_URShiftVS:
4478       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4479     case Op_URShiftVI:
4480       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4481     case Op_URShiftVL:
4482       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4483     case Op_RotateLeftV:
4484       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4485     case Op_RotateRightV:
4486       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4487     case Op_MaxV:
4488       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4489     case Op_MinV:
4490       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4491     case Op_XorV:
4492       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4493     case Op_OrV:
4494       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4495     case Op_AndV:
4496       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4497     default:
4498       fatal("Unsupported masked operation"); break;
4499   }
4500 }
4501 
4502 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4503                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4504   switch (ideal_opc) {
4505     case Op_AddVB:
4506       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4507     case Op_AddVS:
4508       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4509     case Op_AddVI:
4510       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4511     case Op_AddVL:
4512       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4513     case Op_AddVF:
4514       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4515     case Op_AddVD:
4516       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4517     case Op_SubVB:
4518       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4519     case Op_SubVS:
4520       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4521     case Op_SubVI:
4522       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4523     case Op_SubVL:
4524       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4525     case Op_SubVF:
4526       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4527     case Op_SubVD:
4528       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4529     case Op_MulVS:
4530       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4531     case Op_MulVI:
4532       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4533     case Op_MulVL:
4534       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4535     case Op_MulVF:
4536       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4537     case Op_MulVD:
4538       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4539     case Op_DivVF:
4540       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4541     case Op_DivVD:
4542       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4543     case Op_FmaVF:
4544       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4545     case Op_FmaVD:
4546       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4547     case Op_MaxV:
4548       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4549     case Op_MinV:
4550       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4551     case Op_XorV:
4552       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4553     case Op_OrV:
4554       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4555     case Op_AndV:
4556       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4557     default:
4558       fatal("Unsupported masked operation"); break;
4559   }
4560 }
4561 
4562 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4563                                   KRegister src1, KRegister src2) {
4564   BasicType etype = T_ILLEGAL;
4565   switch(mask_len) {
4566     case 2:
4567     case 4:
4568     case 8:  etype = T_BYTE; break;
4569     case 16: etype = T_SHORT; break;
4570     case 32: etype = T_INT; break;
4571     case 64: etype = T_LONG; break;
4572     default: fatal("Unsupported type"); break;
4573   }
4574   assert(etype != T_ILLEGAL, "");
4575   switch(ideal_opc) {
4576     case Op_AndVMask:
4577       kand(etype, dst, src1, src2); break;
4578     case Op_OrVMask:
4579       kor(etype, dst, src1, src2); break;
4580     case Op_XorVMask:
4581       kxor(etype, dst, src1, src2); break;
4582     default:
4583       fatal("Unsupported masked operation"); break;
4584   }
4585 }
4586 
4587 /*
 * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the cast operation.
4589  * If src is NaN, the result is 0.
4590  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4591  * the result is equal to the value of Integer.MIN_VALUE.
4592  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4593  * the result is equal to the value of Integer.MAX_VALUE.
4594  */
4595 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4596                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4597                                                                    Register rscratch, AddressLiteral float_sign_flip,
4598                                                                    int vec_enc) {
4599   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4600   Label done;
4601   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4602   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4603   vptest(xtmp2, xtmp2, vec_enc);
4604   jccb(Assembler::equal, done);
4605 
4606   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4607   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4608 
4609   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4610   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4611   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4612 
4613   // Recompute the mask for remaining special value.
4614   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4615   // Extract SRC values corresponding to TRUE mask lanes.
4616   vpand(xtmp4, xtmp2, src, vec_enc);
  // Flip the mask bits so that the MSB of the mask lanes corresponding to positive
  // special values is set.
4619   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4620 
4621   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4622   bind(done);
4623 }
4624 
4625 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4626                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4627                                                                     Register rscratch, AddressLiteral float_sign_flip,
4628                                                                     int vec_enc) {
4629   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4630   Label done;
4631   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4632   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4633   kortestwl(ktmp1, ktmp1);
4634   jccb(Assembler::equal, done);
4635 
4636   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4637   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4638   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4639 
4640   kxorwl(ktmp1, ktmp1, ktmp2);
4641   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4642   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4643   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4644   bind(done);
4645 }
4646 
4647 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4648                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4649                                                                      Register rscratch, AddressLiteral double_sign_flip,
4650                                                                      int vec_enc) {
4651   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4652 
4653   Label done;
4654   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4655   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
4656   kortestwl(ktmp1, ktmp1);
4657   jccb(Assembler::equal, done);
4658 
4659   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4660   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4661   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4662 
4663   kxorwl(ktmp1, ktmp1, ktmp2);
4664   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4665   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4666   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4667   bind(done);
4668 }
4669 
4670 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4671                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4672                                                                      Register rscratch, AddressLiteral float_sign_flip,
4673                                                                      int vec_enc) {
4674   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4675   Label done;
4676   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4677   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4678   kortestwl(ktmp1, ktmp1);
4679   jccb(Assembler::equal, done);
4680 
4681   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4682   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4683   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4684 
4685   kxorwl(ktmp1, ktmp1, ktmp2);
4686   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4687   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4688   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4689   bind(done);
4690 }
4691 
4692 /*
 * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the cast operation.
4694  * If src is NaN, the result is 0.
4695  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
4696  * the result is equal to the value of Long.MIN_VALUE.
4697  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
4698  * the result is equal to the value of Long.MAX_VALUE.
4699  */
4700 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4701                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4702                                                                       Register rscratch, AddressLiteral double_sign_flip,
4703                                                                       int vec_enc) {
4704   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4705 
4706   Label done;
4707   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4708   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
4709   kortestwl(ktmp1, ktmp1);
4710   jccb(Assembler::equal, done);
4711 
4712   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4713   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4714   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4715 
4716   kxorwl(ktmp1, ktmp1, ktmp2);
4717   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4718   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4719   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4720   bind(done);
4721 }
4722 
4723 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
4724                                                              XMMRegister xtmp, int index, int vec_enc) {
4725    assert(vec_enc < Assembler::AVX_512bit, "");
4726    if (vec_enc == Assembler::AVX_256bit) {
4727      vextractf128_high(xtmp, src);
4728      vshufps(dst, src, xtmp, index, vec_enc);
4729    } else {
4730      vshufps(dst, src, zero, index, vec_enc);
4731    }
4732 }
4733 
4734 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4735                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
4736                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
4737   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4738 
4739   Label done;
4740   // Compare the destination lanes with float_sign_flip
4741   // value to get mask for all special values.
4742   movdqu(xtmp1, float_sign_flip, rscratch);
4743   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
4744   ptest(xtmp2, xtmp2);
4745   jccb(Assembler::equal, done);
4746 
4747   // Flip float_sign_flip to get max integer value.
4748   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
4749   pxor(xtmp1, xtmp4);
4750 
  // Set destination lanes corresponding to unordered source lanes to zero.
4752   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
4753   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
4754 
  // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
4756   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
4757   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
4758 
4759   // Recompute the mask for remaining special value.
4760   pxor(xtmp2, xtmp3);
4761   // Extract mask corresponding to non-negative source lanes.
4762   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
4763 
  // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
4765   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
4766   pand(xtmp3, xtmp2);
4767 
  // Replace destination lanes holding the special value (0x80000000) with max int
  // if the corresponding source lane holds a positive value.
4770   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
4771   bind(done);
4772 }
4773 
4774 
4775 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
4776                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
4777   switch(to_elem_bt) {
4778     case T_SHORT:
4779       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
4780       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
4781       vpackusdw(dst, dst, zero, vec_enc);
4782       if (vec_enc == Assembler::AVX_256bit) {
4783         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
4784       }
4785       break;
4786     case  T_BYTE:
4787       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
4788       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
4789       vpackusdw(dst, dst, zero, vec_enc);
4790       if (vec_enc == Assembler::AVX_256bit) {
4791         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
4792       }
4793       vpackuswb(dst, dst, zero, vec_enc);
4794       break;
4795     default: assert(false, "%s", type2name(to_elem_bt));
4796   }
4797 }
4798 
4799 /*
 * Algorithm for vector D2L and F2I conversions:
 * a) Perform the vector D2L/F2I cast.
 * b) Choose the fast path if none of the result vector lanes contains the value 0x80000000.
 *    A lane holding 0x80000000 signifies that the source value could be any of the special
 *    floating point values (NaN, -Inf, Inf, Max, -Min).
 * c) Set the destination lane to zero if the source lane is NaN.
 * d) Replace 0x80000000 with MaxInt if the source lane contains a positive value.
4807  */
4808 
4809 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4810                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4811                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
4812   int to_elem_sz = type2aelembytes(to_elem_bt);
4813   assert(to_elem_sz <= 4, "");
4814   vcvttps2dq(dst, src, vec_enc);
4815   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
4816   if (to_elem_sz < 4) {
4817     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4818     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
4819   }
4820 }
4821 
4822 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4823                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
4824                                             Register rscratch, int vec_enc) {
4825   int to_elem_sz = type2aelembytes(to_elem_bt);
4826   assert(to_elem_sz <= 4, "");
4827   vcvttps2dq(dst, src, vec_enc);
4828   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
4829   switch(to_elem_bt) {
4830     case T_INT:
4831       break;
4832     case T_SHORT:
4833       evpmovdw(dst, dst, vec_enc);
4834       break;
4835     case T_BYTE:
4836       evpmovdb(dst, dst, vec_enc);
4837       break;
4838     default: assert(false, "%s", type2name(to_elem_bt));
4839   }
4840 }
4841 
4842 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4843                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
4844                                             Register rscratch, int vec_enc) {
4845   evcvttps2qq(dst, src, vec_enc);
4846   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
4847 }
4848 
4849 // Handling for downcasting from double to integer or sub-word types on AVX2.
4850 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4851                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
4852                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
4853   int to_elem_sz = type2aelembytes(to_elem_bt);
4854   assert(to_elem_sz < 8, "");
4855   vcvttpd2dq(dst, src, vec_enc);
4856   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
4857                                               float_sign_flip, vec_enc);
4858   if (to_elem_sz < 4) {
4859     // xtmp4 holds all zero lanes.
4860     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
4861   }
4862 }
4863 
4864 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
4865                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
4866                                             KRegister ktmp2, AddressLiteral sign_flip,
4867                                             Register rscratch, int vec_enc) {
4868   if (VM_Version::supports_avx512dq()) {
4869     evcvttpd2qq(dst, src, vec_enc);
4870     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
4871     switch(to_elem_bt) {
4872       case T_LONG:
4873         break;
4874       case T_INT:
4875         evpmovsqd(dst, dst, vec_enc);
4876         break;
4877       case T_SHORT:
4878         evpmovsqd(dst, dst, vec_enc);
4879         evpmovdw(dst, dst, vec_enc);
4880         break;
4881       case T_BYTE:
4882         evpmovsqd(dst, dst, vec_enc);
4883         evpmovdb(dst, dst, vec_enc);
4884         break;
4885       default: assert(false, "%s", type2name(to_elem_bt));
4886     }
4887   } else {
4888     assert(type2aelembytes(to_elem_bt) <= 4, "");
4889     vcvttpd2dq(dst, src, vec_enc);
4890     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
4891     switch(to_elem_bt) {
4892       case T_INT:
4893         break;
4894       case T_SHORT:
4895         evpmovdw(dst, dst, vec_enc);
4896         break;
4897       case T_BYTE:
4898         evpmovdb(dst, dst, vec_enc);
4899         break;
4900       default: assert(false, "%s", type2name(to_elem_bt));
4901     }
4902   }
4903 }
4904 
4905 #ifdef _LP64
4906 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
4907                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4908                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val + 0.5) operation with MXCSR.RC set to round-towards -inf,
  // then restore the standard MXCSR.RC mode afterwards.
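  // Why this works (illustrative): with MXCSR.RC = round-towards -inf, the
  // conversion below rounds towards negative infinity, so converting (val + 0.5)
  // computes floor(val + 0.5), i.e. round-to-nearest with ties going towards
  // positive infinity. For example:
  //    2.5 + 0.5 =  3.0 ->  3        -2.5 + 0.5 = -2.0 -> -2
  //   -0.4 + 0.5 =  0.1 ->  0        -0.6 + 0.5 = -0.1 -> -1
  // Out-of-range and NaN lanes are patched up by the special-cases handler.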
4911   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4912 
4913   mov64(tmp, julong_cast(0.5L));
4914   evpbroadcastq(xtmp1, tmp, vec_enc);
4915   vaddpd(xtmp1, src , xtmp1, vec_enc);
4916   evcvtpd2qq(dst, xtmp1, vec_enc);
4917   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
                                                double_sign_flip, vec_enc);
4919 
4920   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4921 }
4922 
4923 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
4924                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4925                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val + 0.5) operation with MXCSR.RC set to round-towards -inf,
  // then restore the standard MXCSR.RC mode afterwards.
4928   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4929 
4930   movl(tmp, jint_cast(0.5));
4931   movq(xtmp1, tmp);
4932   vbroadcastss(xtmp1, xtmp1, vec_enc);
4933   vaddps(xtmp1, src , xtmp1, vec_enc);
4934   vcvtps2dq(dst, xtmp1, vec_enc);
4935   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
4936                                               float_sign_flip, vec_enc);
4937 
4938   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4939 }
4940 
4941 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
4942                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4943                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
  // Perform the floor(val + 0.5) operation with MXCSR.RC set to round-towards -inf,
  // then restore the standard MXCSR.RC mode afterwards.
4946   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4947 
4948   movl(tmp, jint_cast(0.5));
4949   movq(xtmp1, tmp);
4950   vbroadcastss(xtmp1, xtmp1, vec_enc);
4951   vaddps(xtmp1, src , xtmp1, vec_enc);
4952   vcvtps2dq(dst, xtmp1, vec_enc);
4953   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
4954 
4955   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4956 }
4957 #endif // _LP64
4958 
4959 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
4960                                              BasicType from_elem_bt, BasicType to_elem_bt) {
4961   switch (from_elem_bt) {
4962     case T_BYTE:
4963       switch (to_elem_bt) {
4964         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
4965         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
4966         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
4967         default: ShouldNotReachHere();
4968       }
4969       break;
4970     case T_SHORT:
4971       switch (to_elem_bt) {
4972         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
4973         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
4974         default: ShouldNotReachHere();
4975       }
4976       break;
4977     case T_INT:
4978       assert(to_elem_bt == T_LONG, "");
4979       vpmovzxdq(dst, src, vlen_enc);
4980       break;
4981     default:
4982       ShouldNotReachHere();
4983   }
4984 }
4985 
4986 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
4987                                            BasicType from_elem_bt, BasicType to_elem_bt) {
4988   switch (from_elem_bt) {
4989     case T_BYTE:
4990       switch (to_elem_bt) {
4991         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
4992         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
4993         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
4994         default: ShouldNotReachHere();
4995       }
4996       break;
4997     case T_SHORT:
4998       switch (to_elem_bt) {
4999         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5000         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5001         default: ShouldNotReachHere();
5002       }
5003       break;
5004     case T_INT:
5005       assert(to_elem_bt == T_LONG, "");
5006       vpmovsxdq(dst, src, vlen_enc);
5007       break;
5008     default:
5009       ShouldNotReachHere();
5010   }
5011 }
5012 
5013 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5014                                          BasicType dst_bt, BasicType src_bt, int vlen) {
5015   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5016   assert(vlen_enc != AVX_512bit, "");
5017 
5018   int dst_bt_size = type2aelembytes(dst_bt);
5019   int src_bt_size = type2aelembytes(src_bt);
5020   if (dst_bt_size > src_bt_size) {
5021     switch (dst_bt_size / src_bt_size) {
5022       case 2: vpmovsxbw(dst, src, vlen_enc); break;
5023       case 4: vpmovsxbd(dst, src, vlen_enc); break;
5024       case 8: vpmovsxbq(dst, src, vlen_enc); break;
5025       default: ShouldNotReachHere();
5026     }
5027   } else {
5028     assert(dst_bt_size < src_bt_size, "");
5029     switch (src_bt_size / dst_bt_size) {
5030       case 2: {
5031         if (vlen_enc == AVX_128bit) {
5032           vpacksswb(dst, src, src, vlen_enc);
5033         } else {
5034           vpacksswb(dst, src, src, vlen_enc);
5035           vpermq(dst, dst, 0x08, vlen_enc);
5036         }
5037         break;
5038       }
5039       case 4: {
5040         if (vlen_enc == AVX_128bit) {
5041           vpackssdw(dst, src, src, vlen_enc);
5042           vpacksswb(dst, dst, dst, vlen_enc);
5043         } else {
5044           vpackssdw(dst, src, src, vlen_enc);
5045           vpermq(dst, dst, 0x08, vlen_enc);
5046           vpacksswb(dst, dst, dst, AVX_128bit);
5047         }
5048         break;
5049       }
5050       case 8: {
5051         if (vlen_enc == AVX_128bit) {
5052           vpshufd(dst, src, 0x08, vlen_enc);
5053           vpackssdw(dst, dst, dst, vlen_enc);
5054           vpacksswb(dst, dst, dst, vlen_enc);
5055         } else {
5056           vpshufd(dst, src, 0x08, vlen_enc);
5057           vpermq(dst, dst, 0x08, vlen_enc);
5058           vpackssdw(dst, dst, dst, AVX_128bit);
5059           vpacksswb(dst, dst, dst, AVX_128bit);
5060         }
5061         break;
5062       }
5063       default: ShouldNotReachHere();
5064     }
5065   }
5066 }
5067 
5068 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5069                                    bool merge, BasicType bt, int vlen_enc) {
5070   if (bt == T_INT) {
5071     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5072   } else {
5073     assert(bt == T_LONG, "");
5074     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5075   }
5076 }
5077 
5078 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5079                                    bool merge, BasicType bt, int vlen_enc) {
5080   if (bt == T_INT) {
5081     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5082   } else {
5083     assert(bt == T_LONG, "");
5084     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5085   }
5086 }
5087 
5088 #ifdef _LP64
5089 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5090                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5091                                                int vec_enc) {
5092   int index = 0;
5093   int vindex = 0;
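  // pdepq below deposits bit i of src into bit 0 of byte i of rtmp1 (selector mask
  // 0x0101010101010101), expanding a packed bit mask into one 0x00/0x01 byte per
  // lane. Illustrative example for the low 8 mask bits:
  //   src low byte = 0b10110010  ->  rtmp1 = 0x0100010101000100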
5094   mov64(rtmp1, 0x0101010101010101L);
5095   pdepq(rtmp1, src, rtmp1);
5096   if (mask_len > 8) {
5097     movq(rtmp2, src);
5098     vpxor(xtmp, xtmp, xtmp, vec_enc);
5099     movq(xtmp, rtmp1);
5100   }
5101   movq(dst, rtmp1);
5102 
5103   mask_len -= 8;
5104   while (mask_len > 0) {
5105     assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
5106     index++;
5107     if ((index % 2) == 0) {
5108       pxor(xtmp, xtmp);
5109     }
5110     mov64(rtmp1, 0x0101010101010101L);
5111     shrq(rtmp2, 8);
5112     pdepq(rtmp1, rtmp2, rtmp1);
5113     pinsrq(xtmp, rtmp1, index % 2);
5114     vindex = index / 2;
5115     if (vindex) {
      // Write the entire 16-byte vector only when both 64-bit
      // lanes have been updated, to save redundant instructions.
5118       if (index % 2) {
5119         vinsertf128(dst, dst, xtmp, vindex);
5120       }
5121     } else {
5122       vmovdqu(dst, xtmp);
5123     }
5124     mask_len -= 8;
5125   }
5126 }
5127 
5128 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5129   switch(opc) {
5130     case Op_VectorMaskTrueCount:
5131       popcntq(dst, tmp);
5132       break;
5133     case Op_VectorMaskLastTrue:
5134       if (VM_Version::supports_lzcnt()) {
5135         lzcntq(tmp, tmp);
5136         movl(dst, 63);
5137         subl(dst, tmp);
5138       } else {
5139         movl(dst, -1);
5140         bsrq(tmp, tmp);
5141         cmov32(Assembler::notZero, dst, tmp);
5142       }
5143       break;
5144     case Op_VectorMaskFirstTrue:
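      // For masklen < 32, a sentinel bit is first set at position masklen so that an
      // all-zero mask yields masklen ("no lane set"), matching the FirstTrue contract.
      // Illustrative: masklen = 8, tmp = 0  ->  orl sets bit 8  ->  tzcnt/bsf = 8.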
5145       if (VM_Version::supports_bmi1()) {
5146         if (masklen < 32) {
5147           orl(tmp, 1 << masklen);
5148           tzcntl(dst, tmp);
5149         } else if (masklen == 32) {
5150           tzcntl(dst, tmp);
5151         } else {
5152           assert(masklen == 64, "");
5153           tzcntq(dst, tmp);
5154         }
5155       } else {
5156         if (masklen < 32) {
5157           orl(tmp, 1 << masklen);
5158           bsfl(dst, tmp);
5159         } else {
5160           assert(masklen == 32 || masklen == 64, "");
5161           movl(dst, masklen);
5162           if (masklen == 32)  {
5163             bsfl(tmp, tmp);
5164           } else {
5165             bsfq(tmp, tmp);
5166           }
5167           cmov32(Assembler::notZero, dst, tmp);
5168         }
5169       }
5170       break;
5171     case Op_VectorMaskToLong:
5172       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5173       break;
5174     default: assert(false, "Unhandled mask operation");
5175   }
5176 }
5177 
5178 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5179                                               int masklen, int masksize, int vec_enc) {
5180   assert(VM_Version::supports_popcnt(), "");
5181 
5182   if(VM_Version::supports_avx512bw()) {
5183     kmovql(tmp, mask);
5184   } else {
5185     assert(masklen <= 16, "");
5186     kmovwl(tmp, mask);
5187   }
5188 
  // A mask generated by partial vector comparison/replicate/mask manipulation
  // operations needs to be clipped.
5191   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5192     andq(tmp, (1 << masklen) - 1);
5193   }
5194 
5195   vector_mask_operation_helper(opc, dst, tmp, masklen);
5196 }
5197 
5198 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5199                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5200   assert(vec_enc == AVX_128bit && VM_Version::supports_avx() ||
5201          vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4), "");
5202   assert(VM_Version::supports_popcnt(), "");
5203 
5204   bool need_clip = false;
5205   switch(bt) {
5206     case T_BOOLEAN:
      // While masks of other types contain 0 or -1, boolean masks contain lane values of 0 or 1.
5208       vpxor(xtmp, xtmp, xtmp, vec_enc);
5209       vpsubb(xtmp, xtmp, mask, vec_enc);
5210       vpmovmskb(tmp, xtmp, vec_enc);
5211       need_clip = masklen < 16;
5212       break;
5213     case T_BYTE:
5214       vpmovmskb(tmp, mask, vec_enc);
5215       need_clip = masklen < 16;
5216       break;
5217     case T_SHORT:
5218       vpacksswb(xtmp, mask, mask, vec_enc);
5219       if (masklen >= 16) {
5220         vpermpd(xtmp, xtmp, 8, vec_enc);
5221       }
5222       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5223       need_clip = masklen < 16;
5224       break;
5225     case T_INT:
5226     case T_FLOAT:
5227       vmovmskps(tmp, mask, vec_enc);
5228       need_clip = masklen < 4;
5229       break;
5230     case T_LONG:
5231     case T_DOUBLE:
5232       vmovmskpd(tmp, mask, vec_enc);
5233       need_clip = masklen < 2;
5234       break;
5235     default: assert(false, "Unhandled type, %s", type2name(bt));
5236   }
5237 
  // A mask generated by partial vector comparison/replicate/mask manipulation
  // operations needs to be clipped.
5240   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5241     // need_clip implies masklen < 32
5242     andq(tmp, (1 << masklen) - 1);
5243   }
5244 
5245   vector_mask_operation_helper(opc, dst, tmp, masklen);
5246 }
5247 
5248 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5249                                              Register rtmp2, int mask_len) {
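  // Sketch of the idea (illustrative): pextq gathers the bits of an all-ones value
  // at the positions selected by the clipped source mask and packs them into the low
  // bits, so dst becomes a contiguous run of popcount(src) ones, e.g.
  //   src = 0b10100110 (mask_len = 8)  ->  dst = 0b00001111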
5250   kmov(rtmp1, src);
5251   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5252   mov64(rtmp2, -1L);
5253   pextq(rtmp2, rtmp2, rtmp1);
5254   kmov(dst, rtmp2);
5255 }
5256 
5257 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5258                                                bool merge, BasicType bt, int vec_enc) {
5259   if (opcode == Op_CompressV) {
5260     switch(bt) {
5261     case T_BYTE:
5262       evpcompressb(dst, mask, src, merge, vec_enc);
5263       break;
5264     case T_CHAR:
5265     case T_SHORT:
5266       evpcompressw(dst, mask, src, merge, vec_enc);
5267       break;
5268     case T_INT:
5269       evpcompressd(dst, mask, src, merge, vec_enc);
5270       break;
5271     case T_FLOAT:
5272       evcompressps(dst, mask, src, merge, vec_enc);
5273       break;
5274     case T_LONG:
5275       evpcompressq(dst, mask, src, merge, vec_enc);
5276       break;
5277     case T_DOUBLE:
5278       evcompresspd(dst, mask, src, merge, vec_enc);
5279       break;
5280     default:
5281       fatal("Unsupported type %s", type2name(bt));
5282       break;
5283     }
5284   } else {
5285     assert(opcode == Op_ExpandV, "");
5286     switch(bt) {
5287     case T_BYTE:
5288       evpexpandb(dst, mask, src, merge, vec_enc);
5289       break;
5290     case T_CHAR:
5291     case T_SHORT:
5292       evpexpandw(dst, mask, src, merge, vec_enc);
5293       break;
5294     case T_INT:
5295       evpexpandd(dst, mask, src, merge, vec_enc);
5296       break;
5297     case T_FLOAT:
5298       evexpandps(dst, mask, src, merge, vec_enc);
5299       break;
5300     case T_LONG:
5301       evpexpandq(dst, mask, src, merge, vec_enc);
5302       break;
5303     case T_DOUBLE:
5304       evexpandpd(dst, mask, src, merge, vec_enc);
5305       break;
5306     default:
5307       fatal("Unsupported type %s", type2name(bt));
5308       break;
5309     }
5310   }
5311 }
5312 #endif
5313 
5314 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5315                                            KRegister ktmp1, int vec_enc) {
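  // Semantics implemented below (illustrative, matching Math.signum):
  //   signum(x) = NaN   if x is NaN
  //             = x     if x is +0.0 or -0.0
  //             =  1.0  if x > 0
  //             = -1.0  if x < 0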
5316   if (opcode == Op_SignumVD) {
5317     vsubpd(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
    evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
    evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
    // if src is NaN, -0.0 or 0.0, return src.
5322     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5323     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5324   } else {
5325     assert(opcode == Op_SignumVF, "");
5326     vsubps(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
    evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
    evblendmps(dst, ktmp1, one, dst, true, vec_enc);
    // if src is NaN, -0.0 or 0.0, return src.
5331     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5332     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5333   }
5334 }
5335 
5336 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5337                                           XMMRegister xtmp1, int vec_enc) {
5338   if (opcode == Op_SignumVD) {
5339     vsubpd(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
    vblendvpd(dst, one, dst, src, vec_enc);
    // if src is NaN, -0.0 or 0.0, return src.
5343     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5344     vblendvpd(dst, dst, src, xtmp1, vec_enc);
5345   } else {
5346     assert(opcode == Op_SignumVF, "");
5347     vsubps(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
    vblendvps(dst, one, dst, src, vec_enc);
    // if src is NaN, -0.0 or 0.0, return src.
5351     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5352     vblendvps(dst, dst, src, xtmp1, vec_enc);
5353   }
5354 }
5355 
5356 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5357   if (VM_Version::supports_avx512bw()) {
5358     if (mask_len > 32) {
5359       kmovql(dst, src);
5360     } else {
5361       kmovdl(dst, src);
5362       if (mask_len != 32) {
5363         kshiftrdl(dst, dst, 32 - mask_len);
5364       }
5365     }
5366   } else {
5367     assert(mask_len <= 16, "");
5368     kmovwl(dst, src);
5369     if (mask_len != 16) {
5370       kshiftrwl(dst, dst, 16 - mask_len);
5371     }
5372   }
5373 }
5374 
5375 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5376   int lane_size = type2aelembytes(bt);
5377   bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
5378   if ((is_LP64 || lane_size < 8) &&
5379       ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5380        (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
5381     movptr(rtmp, imm32);
5382     switch(lane_size) {
5383       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5384       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5385       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5386       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5387       fatal("Unsupported lane size %d", lane_size);
5388       break;
5389     }
5390   } else {
5391     movptr(rtmp, imm32);
5392     LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
5393     switch(lane_size) {
5394       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5395       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5396       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5397       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5398       fatal("Unsupported lane size %d", lane_size);
5399       break;
5400     }
5401   }
5402 }
5403 
5404 //
// The following is a lookup-table-based popcount computation algorithm:
5406 //       Index   Bit set count
5407 //     [ 0000 ->   0,
5408 //       0001 ->   1,
5409 //       0010 ->   1,
5410 //       0011 ->   2,
5411 //       0100 ->   1,
5412 //       0101 ->   2,
5413 //       0110 ->   2,
5414 //       0111 ->   3,
5415 //       1000 ->   1,
5416 //       1001 ->   2,
5417 //       1010 ->   3,
5418 //       1011 ->   3,
5419 //       1100 ->   2,
5420 //       1101 ->   3,
5421 //       1111 ->   4 ]
5422 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5423 //     shuffle indices for lookup table access.
5424 //  b. Right shift each byte of vector lane by 4 positions.
5425 //  c. Count the number of 1s in 4 MSB bits each byte. These bits are used as
5426 //     shuffle indices for lookup table access.
5427 //  d. Add the bitset count of upper and lower 4 bits of each byte.
//  e. Unpack double words to quad words and compute the sum of the per-byte bitset
//     counts of each quadword using the sum-of-absolute-differences (vpsadbw) idiom.
5430 //  f. Perform step e. for upper 128bit vector lane.
5431 //  g. Pack the bitset count of quadwords back to double word.
5432 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
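//
// A scalar sketch of steps a. to d. for a single byte (for explanation only,
// not the code emitted below):
//
//   static const uint8_t popcount_lut[16] = {0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4};
//   uint8_t popcount8(uint8_t b) {
//     return popcount_lut[b & 0x0F] + popcount_lut[b >> 4];
//   }
//
// The widening steps e. to h. are implemented in the int/long variants below
// using vpsadbw against a zero vector.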
5433 
5434 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5435                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5436   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5437   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5438   vpsrlw(dst, src, 4, vec_enc);
5439   vpand(dst, dst, xtmp1, vec_enc);
5440   vpand(xtmp1, src, xtmp1, vec_enc);
5441   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5442   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5443   vpshufb(dst, xtmp2, dst, vec_enc);
5444   vpaddb(dst, dst, xtmp1, vec_enc);
5445 }
5446 
5447 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5448                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5449   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5450   // Following code is as per steps e,f,g and h of above algorithm.
5451   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5452   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5453   vpsadbw(dst, dst, xtmp2, vec_enc);
5454   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5455   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5456   vpackuswb(dst, xtmp1, dst, vec_enc);
5457 }
5458 
5459 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5460                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5461   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5462   // Add the popcount of upper and lower bytes of word.
5463   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5464   vpsrlw(dst, xtmp1, 8, vec_enc);
5465   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5466   vpaddw(dst, dst, xtmp1, vec_enc);
5467 }
5468 
5469 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5470                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5471   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5472   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5473   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5474 }
5475 
5476 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5477                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5478   switch(bt) {
5479     case T_LONG:
5480       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5481       break;
5482     case T_INT:
5483       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5484       break;
5485     case T_CHAR:
5486     case T_SHORT:
5487       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5488       break;
5489     case T_BYTE:
5490     case T_BOOLEAN:
5491       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5492       break;
5493     default:
5494       fatal("Unsupported type %s", type2name(bt));
5495       break;
5496   }
5497 }
5498 
5499 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5500                                                       KRegister mask, bool merge, int vec_enc) {
5501   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5502   switch(bt) {
5503     case T_LONG:
5504       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5505       evpopcntq(dst, mask, src, merge, vec_enc);
5506       break;
5507     case T_INT:
5508       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5509       evpopcntd(dst, mask, src, merge, vec_enc);
5510       break;
5511     case T_CHAR:
5512     case T_SHORT:
5513       assert(VM_Version::supports_avx512_bitalg(), "");
5514       evpopcntw(dst, mask, src, merge, vec_enc);
5515       break;
5516     case T_BYTE:
5517     case T_BOOLEAN:
5518       assert(VM_Version::supports_avx512_bitalg(), "");
5519       evpopcntb(dst, mask, src, merge, vec_enc);
5520       break;
5521     default:
5522       fatal("Unsupported type %s", type2name(bt));
5523       break;
5524   }
5525 }
5526 
5527 #ifndef _LP64
5528 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
5529   assert(VM_Version::supports_avx512bw(), "");
5530   kmovdl(tmp, src);
5531   kunpckdql(dst, tmp, tmp);
5532 }
5533 #endif
5534 
// Bit reversal algorithm first reverses the bits of each byte, followed by
// a byte-level reversal for multi-byte primitive types (short/int/long).
// The algorithm uses a lookup table to obtain the reverse bit sequence of a
// 4 bit value; the reverse bit sequence of a byte is then formed by combining
// the reversed lower nibble (shifted into the upper position) with the
// reversed upper nibble (shifted into the lower position).
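//
// A scalar sketch of the per-byte step (for explanation only, not the emitted code):
//
//   static const uint8_t reverse_nibble_lut[16] = {0x0, 0x8, 0x4, 0xC, 0x2, 0xA, 0x6, 0xE,
//                                                  0x1, 0x9, 0x5, 0xD, 0x3, 0xB, 0x7, 0xF};
//   uint8_t reverse8(uint8_t b) {
//     return (uint8_t)((reverse_nibble_lut[b & 0x0F] << 4) | reverse_nibble_lut[b >> 4]);
//   }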
5541 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5542                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
5543   if (VM_Version::supports_avx512vlbw()) {
5544 
5545     // Get the reverse bit sequence of lower nibble of each byte.
5546     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5547     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5548     evpandq(dst, xtmp2, src, vec_enc);
5549     vpshufb(dst, xtmp1, dst, vec_enc);
5550     vpsllq(dst, dst, 4, vec_enc);
5551 
5552     // Get the reverse bit sequence of upper nibble of each byte.
5553     vpandn(xtmp2, xtmp2, src, vec_enc);
5554     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5555     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5556 
    // OR the left-shifted reverse bit sequence of the lower nibble with the right-shifted
    // reverse bit sequence of the upper nibble to obtain the reverse bit sequence of each byte.
5559     evporq(xtmp2, dst, xtmp2, vec_enc);
5560     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5561 
  } else if (vec_enc == Assembler::AVX_512bit) {
5563     // Shift based bit reversal.
5564     assert(bt == T_LONG || bt == T_INT, "");
5565 
5566     // Swap lower and upper nibble of each byte.
5567     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
5568 
5569     // Swap two least and most significant bits of each nibble.
5570     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
5571 
5572     // Swap adjacent pair of bits.
5573     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5574     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
5575 
5576     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5577     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
5578   } else {
5579     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
5580     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5581 
5582     // Get the reverse bit sequence of lower nibble of each byte.
5583     vpand(dst, xtmp2, src, vec_enc);
5584     vpshufb(dst, xtmp1, dst, vec_enc);
5585     vpsllq(dst, dst, 4, vec_enc);
5586 
5587     // Get the reverse bit sequence of upper nibble of each byte.
5588     vpandn(xtmp2, xtmp2, src, vec_enc);
5589     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5590     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5591 
    // OR the left-shifted reverse bit sequence of the lower nibble with the right-shifted
    // reverse bit sequence of the upper nibble to obtain the reverse bit sequence of each byte.
5594     vpor(xtmp2, dst, xtmp2, vec_enc);
5595     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5596   }
5597 }
5598 
5599 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
5600                                                 XMMRegister xtmp, Register rscratch) {
5601   assert(VM_Version::supports_gfni(), "");
5602   assert(rscratch != noreg || always_reachable(mask), "missing");
5603 
  // Galois field instruction based bit reversal, as per the following algorithm:
5605   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5606   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
5607   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
5608   vector_reverse_byte(bt, dst, xtmp, vec_enc);
5609 }
5610 
5611 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
5612                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
5613   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
5614   evpandq(dst, xtmp1, src, vec_enc);
5615   vpsllq(dst, dst, nbits, vec_enc);
5616   vpandn(xtmp1, xtmp1, src, vec_enc);
5617   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
5618   evporq(dst, dst, xtmp1, vec_enc);
5619 }
5620 
5621 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5622                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5623   // Shift based bit reversal.
5624   assert(VM_Version::supports_evex(), "");
5625   switch(bt) {
5626     case T_LONG:
5627       // Swap upper and lower double word of each quad word.
5628       evprorq(xtmp1, k0, src, 32, true, vec_enc);
5629       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
5630       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5631       break;
5632     case T_INT:
5633       // Swap upper and lower word of each double word.
5634       evprord(xtmp1, k0, src, 16, true, vec_enc);
5635       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5636       break;
5637     case T_CHAR:
5638     case T_SHORT:
5639       // Swap upper and lower byte of each word.
5640       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
5641       break;
5642     case T_BYTE:
5643       evmovdquq(dst, k0, src, true, vec_enc);
5644       break;
5645     default:
5646       fatal("Unsupported type %s", type2name(bt));
5647       break;
5648   }
5649 }
5650 
5651 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5652   if (bt == T_BYTE) {
5653     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
5654       evmovdquq(dst, k0, src, true, vec_enc);
5655     } else {
5656       vmovdqu(dst, src);
5657     }
5658     return;
5659   }
5660   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
5661   // pre-computed shuffle indices.
5662   switch(bt) {
5663     case T_LONG:
5664       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
5665       break;
5666     case T_INT:
5667       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
5668       break;
5669     case T_CHAR:
5670     case T_SHORT:
5671       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
5672       break;
5673     default:
5674       fatal("Unsupported type %s", type2name(bt));
5675       break;
5676   }
5677   vpshufb(dst, src, dst, vec_enc);
5678 }
5679 
5680 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5681                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5682                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
5683   assert(is_integral_type(bt), "");
5684   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5685   assert(VM_Version::supports_avx512cd(), "");
5686   switch(bt) {
5687     case T_LONG:
5688       evplzcntq(dst, ktmp, src, merge, vec_enc);
5689       break;
5690     case T_INT:
5691       evplzcntd(dst, ktmp, src, merge, vec_enc);
5692       break;
5693     case T_SHORT:
5694       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
5695       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
5696       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
5697       vpunpckhwd(dst, xtmp1, src, vec_enc);
5698       evplzcntd(dst, ktmp, dst, merge, vec_enc);
5699       vpackusdw(dst, xtmp2, dst, vec_enc);
5700       break;
5701     case T_BYTE:
5702       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
5703       // accessing the lookup table.
5704       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
5705       // accessing the lookup table.
5706       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
5707       assert(VM_Version::supports_avx512bw(), "");
5708       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
5709       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
5710       vpand(xtmp2, dst, src, vec_enc);
5711       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5712       vpsrlw(xtmp3, src, 4, vec_enc);
5713       vpand(xtmp3, dst, xtmp3, vec_enc);
5714       vpshufb(dst, xtmp1, xtmp3, vec_enc);
5715       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
5716       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
5717       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
5718       break;
5719     default:
5720       fatal("Unsupported type %s", type2name(bt));
5721       break;
5722   }
5723 }
5724 
5725 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5726                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5727   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
5728   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5729   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
5730   // accessing the lookup table.
5731   vpand(dst, xtmp2, src, vec_enc);
5732   vpshufb(dst, xtmp1, dst, vec_enc);
5733   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
5734   // accessing the lookup table.
5735   vpsrlw(xtmp3, src, 4, vec_enc);
5736   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
5737   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
5738   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
5739   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
5740   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
5741   vpaddb(dst, dst, xtmp2, vec_enc);
5742   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
5743 }
5744 
5745 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5746                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5747   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5748   // Add zero counts of lower byte and upper byte of a word if
5749   // upper byte holds a zero value.
5750   vpsrlw(xtmp3, src, 8, vec_enc);
5751   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
5752   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
5753   vpsllw(xtmp2, dst, 8, vec_enc);
5754   vpaddw(xtmp2, xtmp2, dst, vec_enc);
5755   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5756   vpsrlw(dst, dst, 8, vec_enc);
5757 }
5758 
5759 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5760                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // Since the IEEE 754 floating point format represents the mantissa in 1.x form,
  // the biased exponent of the value converted to float can be used to compute the
  // leading zero count as per the following formula:
  //   LZCNT = 31 - (biased_exp - 127)
  // Special handling is needed for zero, max_int and negative source values.
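  //
  // A scalar sketch of the conversion trick (for explanation only), assuming src > 0
  // and ignoring the special cases handled below:
  //
  //   int lzcnt32(int src) {
  //     float f = (float)src;                      // vcvtdq2ps
  //     int bits;
  //     memcpy(&bits, &f, sizeof(bits));           // reinterpret the float bits
  //     int biased_exp = (bits >> 23) & 0xFF;
  //     return 31 - (biased_exp - 127);
  //   }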
5766 
5767   // Broadcast 0xFF
5768   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
5769   vpsrld(xtmp1, xtmp1, 24, vec_enc);
5770 
5771   // Extract biased exponent.
5772   vcvtdq2ps(dst, src, vec_enc);
5773   vpsrld(dst, dst, 23, vec_enc);
5774   vpand(dst, dst, xtmp1, vec_enc);
5775 
5776   // Broadcast 127.
5777   vpsrld(xtmp1, xtmp1, 1, vec_enc);
5778   // Exponent = biased_exp - 127
5779   vpsubd(dst, dst, xtmp1, vec_enc);
5780 
  // Exponent = Exponent + 1
5782   vpsrld(xtmp3, xtmp1, 6, vec_enc);
5783   vpaddd(dst, dst, xtmp3, vec_enc);
5784 
  // Replace a negative exponent with zero; the exponent is negative when the
  // src lane contains a zero value.
5787   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5788   vblendvps(dst, dst, xtmp2, dst, vec_enc);
5789 
5790   // Rematerialize broadcast 32.
5791   vpslld(xtmp1, xtmp3, 5, vec_enc);
5792   // Exponent is 32 if corresponding source lane contains max_int value.
5793   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
5794   // LZCNT = 32 - exponent
5795   vpsubd(dst, xtmp1, dst, vec_enc);
5796 
5797   // Replace LZCNT with a value 1 if corresponding source lane
5798   // contains max_int value.
5799   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
5800 
  // Replace LZCNT with zero if the source lane value is negative.
5802   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5803   vblendvps(dst, dst, xtmp2, src, vec_enc);
5804 }
5805 
5806 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5807                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5808   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5809   // Add zero counts of lower word and upper word of a double word if
5810   // upper word holds a zero value.
5811   vpsrld(xtmp3, src, 16, vec_enc);
5812   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
5813   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
5814   vpslld(xtmp2, dst, 16, vec_enc);
5815   vpaddd(xtmp2, xtmp2, dst, vec_enc);
5816   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5817   vpsrld(dst, dst, 16, vec_enc);
5818   // Add zero counts of lower doubleword and upper doubleword of a
5819   // quadword if upper doubleword holds a zero value.
5820   vpsrlq(xtmp3, src, 32, vec_enc);
5821   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
5822   vpsllq(xtmp2, dst, 32, vec_enc);
5823   vpaddq(xtmp2, xtmp2, dst, vec_enc);
5824   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5825   vpsrlq(dst, dst, 32, vec_enc);
5826 }
5827 
5828 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
5829                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5830                                                        Register rtmp, int vec_enc) {
5831   assert(is_integral_type(bt), "unexpected type");
5832   assert(vec_enc < Assembler::AVX_512bit, "");
5833   switch(bt) {
5834     case T_LONG:
5835       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5836       break;
5837     case T_INT:
5838       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
5839       break;
5840     case T_SHORT:
5841       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5842       break;
5843     case T_BYTE:
5844       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5845       break;
5846     default:
5847       fatal("Unsupported type %s", type2name(bt));
5848       break;
5849   }
5850 }
5851 
5852 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
5853   switch(bt) {
5854     case T_BYTE:
5855       vpsubb(dst, src1, src2, vec_enc);
5856       break;
5857     case T_SHORT:
5858       vpsubw(dst, src1, src2, vec_enc);
5859       break;
5860     case T_INT:
5861       vpsubd(dst, src1, src2, vec_enc);
5862       break;
5863     case T_LONG:
5864       vpsubq(dst, src1, src2, vec_enc);
5865       break;
5866     default:
5867       fatal("Unsupported type %s", type2name(bt));
5868       break;
5869   }
5870 }
5871 
// Trailing zero count computation is based on the leading zero count operation as per
// the following equation. All AVX3 targets support the AVX512CD feature, which offers
// a direct vector instruction to compute the leading zero count.
//      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
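//
// For example, for a 32-bit lane (a scalar sketch, for explanation only, where
// clz32 stands in for the vector leading zero count used below):
//   int ctz32(int x) {
//     return 32 - clz32(~x & (x - 1));   // (x - 1) & ~x keeps exactly the trailing-zero bits
//   }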
5876 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5877                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5878                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
5879   assert(is_integral_type(bt), "");
5880   // xtmp = -1
5881   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
5882   // xtmp = xtmp + src
5883   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
5884   // xtmp = xtmp & ~src
5885   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
5886   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
5887   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
5888   vpsub(bt, dst, xtmp4, dst, vec_enc);
5889 }
5890 
// Trailing zero count computation for AVX2 targets is based on the popcount operation
// as per the following equation.
//      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
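//
// For example, for a 32-bit lane (a scalar sketch, for explanation only, where
// popcount32 stands in for the vector popcount used below):
//   int ctz32(int x) {
//     return 32 - popcount32(x | -x);   // x | -x sets the lowest set bit and every bit above it
//   }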
5893 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5894                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5895   assert(is_integral_type(bt), "");
5896   // xtmp = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
5898   // xtmp = 0 - src
5899   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
5900   // xtmp = xtmp | src
5901   vpor(xtmp3, xtmp3, src, vec_enc);
5902   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
5903   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
5904   vpsub(bt, dst, xtmp1, dst, vec_enc);
5905 }
5906 
5907 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
5908   Label done;
5909   Label neg_divisor_fastpath;
5910   cmpl(divisor, 0);
5911   jccb(Assembler::less, neg_divisor_fastpath);
5912   xorl(rdx, rdx);
5913   divl(divisor);
5914   jmpb(done);
5915   bind(neg_divisor_fastpath);
5916   // Fastpath for divisor < 0:
5917   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
5918   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
5919   movl(rdx, rax);
5920   subl(rdx, divisor);
5921   if (VM_Version::supports_bmi1()) {
5922     andnl(rax, rdx, rax);
5923   } else {
5924     notl(rdx);
5925     andl(rax, rdx);
5926   }
5927   shrl(rax, 31);
5928   bind(done);
5929 }
5930 
5931 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
5932   Label done;
5933   Label neg_divisor_fastpath;
5934   cmpl(divisor, 0);
5935   jccb(Assembler::less, neg_divisor_fastpath);
5936   xorl(rdx, rdx);
5937   divl(divisor);
5938   jmpb(done);
5939   bind(neg_divisor_fastpath);
5940   // Fastpath when divisor < 0:
5941   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
5942   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
5943   movl(rdx, rax);
5944   subl(rax, divisor);
5945   if (VM_Version::supports_bmi1()) {
5946     andnl(rax, rax, rdx);
5947   } else {
5948     notl(rax);
5949     andl(rax, rdx);
5950   }
5951   sarl(rax, 31);
5952   andl(rax, divisor);
5953   subl(rdx, rax);
5954   bind(done);
5955 }
5956 
5957 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
5958   Label done;
5959   Label neg_divisor_fastpath;
5960 
5961   cmpl(divisor, 0);
5962   jccb(Assembler::less, neg_divisor_fastpath);
5963   xorl(rdx, rdx);
5964   divl(divisor);
5965   jmpb(done);
5966   bind(neg_divisor_fastpath);
5967   // Fastpath for divisor < 0:
5968   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
5969   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
5970   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
5971   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
5972   movl(rdx, rax);
5973   subl(rax, divisor);
5974   if (VM_Version::supports_bmi1()) {
5975     andnl(rax, rax, rdx);
5976   } else {
5977     notl(rax);
5978     andl(rax, rdx);
5979   }
5980   movl(tmp, rax);
5981   shrl(rax, 31); // quotient
5982   sarl(tmp, 31);
5983   andl(tmp, divisor);
5984   subl(rdx, tmp); // remainder
5985   bind(done);
5986 }
5987 
5988 #ifdef _LP64
5989 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
5990                                  XMMRegister xtmp2, Register rtmp) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, as per the following algorithm:
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5994     mov64(rtmp, 0x8040201008040201L);
5995     movq(xtmp1, src);
5996     movq(xtmp2, rtmp);
5997     gf2p8affineqb(xtmp1, xtmp2, 0);
5998     movq(dst, xtmp1);
5999   } else {
6000     // Swap even and odd numbered bits.
6001     movl(rtmp, src);
6002     andl(rtmp, 0x55555555);
6003     shll(rtmp, 1);
6004     movl(dst, src);
6005     andl(dst, 0xAAAAAAAA);
6006     shrl(dst, 1);
6007     orl(dst, rtmp);
6008 
6009     // Swap LSB and MSB 2 bits of each nibble.
6010     movl(rtmp, dst);
6011     andl(rtmp, 0x33333333);
6012     shll(rtmp, 2);
6013     andl(dst, 0xCCCCCCCC);
6014     shrl(dst, 2);
6015     orl(dst, rtmp);
6016 
6017     // Swap LSB and MSB 4 bits of each byte.
6018     movl(rtmp, dst);
6019     andl(rtmp, 0x0F0F0F0F);
6020     shll(rtmp, 4);
6021     andl(dst, 0xF0F0F0F0);
6022     shrl(dst, 4);
6023     orl(dst, rtmp);
6024   }
6025   bswapl(dst);
6026 }
6027 
6028 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6029                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, as per the following algorithm:
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6033     mov64(rtmp1, 0x8040201008040201L);
6034     movq(xtmp1, src);
6035     movq(xtmp2, rtmp1);
6036     gf2p8affineqb(xtmp1, xtmp2, 0);
6037     movq(dst, xtmp1);
6038   } else {
6039     // Swap even and odd numbered bits.
6040     movq(rtmp1, src);
6041     mov64(rtmp2, 0x5555555555555555L);
6042     andq(rtmp1, rtmp2);
6043     shlq(rtmp1, 1);
6044     movq(dst, src);
6045     notq(rtmp2);
6046     andq(dst, rtmp2);
6047     shrq(dst, 1);
6048     orq(dst, rtmp1);
6049 
6050     // Swap LSB and MSB 2 bits of each nibble.
6051     movq(rtmp1, dst);
6052     mov64(rtmp2, 0x3333333333333333L);
6053     andq(rtmp1, rtmp2);
6054     shlq(rtmp1, 2);
6055     notq(rtmp2);
6056     andq(dst, rtmp2);
6057     shrq(dst, 2);
6058     orq(dst, rtmp1);
6059 
6060     // Swap LSB and MSB 4 bits of each byte.
6061     movq(rtmp1, dst);
6062     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6063     andq(rtmp1, rtmp2);
6064     shlq(rtmp1, 4);
6065     notq(rtmp2);
6066     andq(dst, rtmp2);
6067     shrq(dst, 4);
6068     orq(dst, rtmp1);
6069   }
6070   bswapq(dst);
6071 }
6072 
6073 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6074   Label done;
6075   Label neg_divisor_fastpath;
6076   cmpq(divisor, 0);
6077   jccb(Assembler::less, neg_divisor_fastpath);
6078   xorl(rdx, rdx);
6079   divq(divisor);
6080   jmpb(done);
6081   bind(neg_divisor_fastpath);
6082   // Fastpath for divisor < 0:
6083   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6084   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6085   movq(rdx, rax);
6086   subq(rdx, divisor);
6087   if (VM_Version::supports_bmi1()) {
6088     andnq(rax, rdx, rax);
6089   } else {
6090     notq(rdx);
6091     andq(rax, rdx);
6092   }
6093   shrq(rax, 63);
6094   bind(done);
6095 }
6096 
6097 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6098   Label done;
6099   Label neg_divisor_fastpath;
6100   cmpq(divisor, 0);
6101   jccb(Assembler::less, neg_divisor_fastpath);
6102   xorq(rdx, rdx);
6103   divq(divisor);
6104   jmp(done);
6105   bind(neg_divisor_fastpath);
6106   // Fastpath when divisor < 0:
6107   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6108   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6109   movq(rdx, rax);
6110   subq(rax, divisor);
6111   if (VM_Version::supports_bmi1()) {
6112     andnq(rax, rax, rdx);
6113   } else {
6114     notq(rax);
6115     andq(rax, rdx);
6116   }
6117   sarq(rax, 63);
6118   andq(rax, divisor);
6119   subq(rdx, rax);
6120   bind(done);
6121 }
6122 
6123 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6124   Label done;
6125   Label neg_divisor_fastpath;
6126   cmpq(divisor, 0);
6127   jccb(Assembler::less, neg_divisor_fastpath);
6128   xorq(rdx, rdx);
6129   divq(divisor);
6130   jmp(done);
6131   bind(neg_divisor_fastpath);
6132   // Fastpath for divisor < 0:
6133   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6134   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6135   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6136   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6137   movq(rdx, rax);
6138   subq(rax, divisor);
6139   if (VM_Version::supports_bmi1()) {
6140     andnq(rax, rax, rdx);
6141   } else {
6142     notq(rax);
6143     andq(rax, rdx);
6144   }
6145   movq(tmp, rax);
6146   shrq(rax, 63); // quotient
6147   sarq(tmp, 63);
6148   andq(tmp, divisor);
6149   subq(rdx, tmp); // remainder
6150   bind(done);
6151 }
6152 #endif
6153 
6154 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6155                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6156                                         int vlen_enc) {
6157   assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and indices are determined using
  // the lower 4 bits of each shuffle lane, thus all shuffle indices are
  // normalized to the index range 0-15. This ensures that indices which differ
  // by a multiple of 16 map to the same relative position within a 128 bit
  // lane, e.g. shuffle indices 16, 32 and 48 all select the first element of
  // their respective 128 bit lanes.
6164   movl(rtmp, 16);
6165   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6166 
6167   // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
6168   // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
6169   // original shuffle indices and move the shuffled lanes corresponding to true
6170   // mask to destination vector.
6171   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6172   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6173   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6174 
6175   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6176   // and broadcasting second 128 bit lane.
6177   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6178   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6179   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6180   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6181   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6182 
6183   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6184   // and broadcasting third 128 bit lane.
6185   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6186   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6187   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6188   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6189   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6190 
  // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting the fourth 128 bit lane.
6193   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6194   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6195   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6196   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6197   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6198 }
6199 
6200 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6201                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6202   if (vlen_enc == AVX_128bit) {
6203     vpermilps(dst, src, shuffle, vlen_enc);
6204   } else if (bt == T_INT) {
6205     vpermd(dst, shuffle, src, vlen_enc);
6206   } else {
6207     assert(bt == T_FLOAT, "");
6208     vpermps(dst, shuffle, src, vlen_enc);
6209   }
6210 }