1 /*
   2  * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "gc/shared/barrierSet.hpp"
  29 #include "gc/shared/barrierSetAssembler.hpp"
  30 #include "oops/methodData.hpp"
  31 #include "opto/c2_MacroAssembler.hpp"
  32 #include "opto/intrinsicnode.hpp"
  33 #include "opto/output.hpp"
  34 #include "opto/opcodes.hpp"
  35 #include "opto/subnode.hpp"
  36 #include "runtime/objectMonitor.hpp"
  37 #include "runtime/stubRoutines.hpp"
  38 #include "utilities/checkedCast.hpp"
  39 
  40 #ifdef PRODUCT
  41 #define BLOCK_COMMENT(str) /* nothing */
  42 #define STOP(error) stop(error)
  43 #else
  44 #define BLOCK_COMMENT(str) block_comment(str)
  45 #define STOP(error) block_comment(error); stop(error)
  46 #endif
  47 
  48 // C2 compiled method's prolog code.
  49 void C2_MacroAssembler::verified_entry(Compile* C, int sp_inc) {
  50   if (C->clinit_barrier_on_entry()) {
  51     assert(VM_Version::supports_fast_class_init_checks(), "sanity");
  52     assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started");
  53 
  54     Label L_skip_barrier;
  55     Register klass = rscratch1;
  56 
  57     mov_metadata(klass, C->method()->holder()->constant_encoding());
  58     clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
  59 
  60     jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
  61 
  62     bind(L_skip_barrier);
  63   }
  64 
  65   int framesize = C->output()->frame_size_in_bytes();
  66   int bangsize = C->output()->bang_size_in_bytes();
  67   bool fp_mode_24b = false;
  68   int stack_bang_size = C->output()->need_stack_bang(bangsize) ? bangsize : 0;
  69 
  70   // WARNING: Initial instruction MUST be 5 bytes or longer so that
  71   // NativeJump::patch_verified_entry will be able to patch out the entry
  72   // code safely. The push to verify stack depth is ok at 5 bytes,
  73   // the frame allocation can be either 3 or 6 bytes. So if we don't do
  74   // stack bang then we must use the 6 byte frame allocation even if
  75   // we have no frame. :-(
  76   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  77 
  78   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  79   // Remove word for return addr
  80   framesize -= wordSize;
  81   stack_bang_size -= wordSize;
  82 
  83   // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers bang for them.  But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack; the stack safety zone should account for that.
  87   // See bugs 4446381, 4468289, 4497237.
  88   if (stack_bang_size > 0) {
  89     generate_stack_overflow_check(stack_bang_size);
  90 
    // We always push rbp so that on return to the interpreter rbp will be
    // restored correctly and we can correct the stack.
  93     push(rbp);
  94     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  95     if (PreserveFramePointer) {
  96       mov(rbp, rsp);
  97     }
  98     // Remove word for ebp
  99     framesize -= wordSize;
 100 
 101     // Create frame
 102     if (framesize) {
 103       subptr(rsp, framesize);
 104     }
 105   } else {
 106     // Create frame (force generation of a 4 byte immediate value)
 107     subptr_imm32(rsp, framesize);
 108 
 109     // Save RBP register now.
 110     framesize -= wordSize;
 111     movptr(Address(rsp, framesize), rbp);
 112     // Save caller's stack pointer into RBP if the frame pointer is preserved.
 113     if (PreserveFramePointer) {
 114       movptr(rbp, rsp);
 115       if (framesize > 0) {
 116         addptr(rbp, framesize);
 117       }
 118     }
 119   }
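  // In both branches the frame now looks roughly like this (higher addresses first);
  // the optional stack-repair word / VerifyStackAtCalls cookie (below) go just under
  // the saved rbp:
  //   [ return address  ]
  //   [ saved rbp       ]
  //   [ spills / locals ]   <- rsp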
 120 
 121   if (C->needs_stack_repair()) {
 122     // Save stack increment just below the saved rbp (also account for fixed framesize and rbp)
 123     assert((sp_inc & (StackAlignmentInBytes-1)) == 0, "stack increment not aligned");
 124     movptr(Address(rsp, framesize - wordSize), sp_inc + framesize + wordSize);
 125   }
 126 
 127   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
 128     framesize -= wordSize;
 129     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 130   }
 131 
 132 #ifndef _LP64
 133   // If method sets FPU control word do it now
 134   if (fp_mode_24b) {
 135     fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
 136   }
 137   if (UseSSE >= 2 && VerifyFPU) {
 138     verify_FPU(0, "FPU stack must be clean on entry");
 139   }
 140 #endif
 141 
 142 #ifdef ASSERT
 143   if (VerifyStackAtCalls) {
 144     Label L;
 145     push(rax);
 146     mov(rax, rsp);
 147     andptr(rax, StackAlignmentInBytes-1);
 148     cmpptr(rax, StackAlignmentInBytes-wordSize);
 149     pop(rax);
 150     jcc(Assembler::equal, L);
 151     STOP("Stack is not properly aligned!");
 152     bind(L);
 153   }
 154 #endif
 155 }
 156 
 157 void C2_MacroAssembler::entry_barrier() {
 158   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 159 #ifdef _LP64
 160   if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
 161     // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 162     Label dummy_slow_path;
 163     Label dummy_continuation;
 164     Label* slow_path = &dummy_slow_path;
 165     Label* continuation = &dummy_continuation;
 166     if (!Compile::current()->output()->in_scratch_emit_size()) {
 167       // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 168       C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 169       Compile::current()->output()->add_stub(stub);
 170       slow_path = &stub->entry();
 171       continuation = &stub->continuation();
 172     }
 173     bs->nmethod_entry_barrier(this, slow_path, continuation);
 174   }
 175 #else
 176   // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
 177   bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
 178 #endif
 179 }
 180 
 181 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 182   switch (vlen_in_bytes) {
 183     case  4: // fall-through
 184     case  8: // fall-through
 185     case 16: return Assembler::AVX_128bit;
 186     case 32: return Assembler::AVX_256bit;
 187     case 64: return Assembler::AVX_512bit;
 188 
 189     default: {
 190       ShouldNotReachHere();
 191       return Assembler::AVX_NoVec;
 192     }
 193   }
 194 }
 195 
 196 #if INCLUDE_RTM_OPT
 197 
 198 // Update rtm_counters based on abort status
 199 // input: abort_status
 200 //        rtm_counters (RTMLockingCounters*)
 201 // flags are killed
 202 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
 203 
 204   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
 205   if (PrintPreciseRTMLockingStatistics) {
 206     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
 207       Label check_abort;
 208       testl(abort_status, (1<<i));
 209       jccb(Assembler::equal, check_abort);
 210       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
 211       bind(check_abort);
 212     }
 213   }
 214 }
 215 
 216 // Branch if (random & (count-1) != 0), count is 2^n
 217 // tmp, scr and flags are killed
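// For example, with count == 64 only the low six bits of the TSC are tested,
// so the branch falls through (letting the caller's counter increment run)
// roughly once every 64 calls, matching an RTMTotalCountIncrRate of 64.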
 218 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
 219   assert(tmp == rax, "");
 220   assert(scr == rdx, "");
 221   rdtsc(); // modifies EDX:EAX
 222   andptr(tmp, count-1);
 223   jccb(Assembler::notZero, brLabel);
 224 }
 225 
 226 // Perform abort ratio calculation, set no_rtm bit if high ratio
 227 // input:  rtm_counters_Reg (RTMLockingCounters* address)
 228 // tmpReg, rtm_counters_Reg and flags are killed
 229 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
 230                                                     Register rtm_counters_Reg,
 231                                                     RTMLockingCounters* rtm_counters,
 232                                                     Metadata* method_data) {
 233   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
 234 
 235   if (RTMLockingCalculationDelay > 0) {
 236     // Delay calculation
 237     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()));
 238     testptr(tmpReg, tmpReg);
 239     jccb(Assembler::equal, L_done);
 240   }
 241   // Abort ratio calculation only if abort_count > RTMAbortThreshold
 242   //   Aborted transactions = abort_count * 100
 243   //   All transactions = total_count *  RTMTotalCountIncrRate
 244   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
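  // For example (values chosen purely for illustration): with RTMTotalCountIncrRate == 64
  // and RTMAbortRatio == 50, abort_count == 1000 and total_count == 30 gives
  // 1000*100 == 100000 >= 30*64*50 == 96000, so the no_rtm bit would be set.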
 245 
 246   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
 247   cmpptr(tmpReg, RTMAbortThreshold);
 248   jccb(Assembler::below, L_check_always_rtm2);
 249   imulptr(tmpReg, tmpReg, 100);
 250 
 251   Register scrReg = rtm_counters_Reg;
 252   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 253   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
 254   imulptr(scrReg, scrReg, RTMAbortRatio);
 255   cmpptr(tmpReg, scrReg);
 256   jccb(Assembler::below, L_check_always_rtm1);
 257   if (method_data != nullptr) {
 258     // set rtm_state to "no rtm" in MDO
 259     mov_metadata(tmpReg, method_data);
 260     lock();
 261     orl(Address(tmpReg, MethodData::rtm_state_offset()), NoRTM);
 262   }
 263   jmpb(L_done);
 264   bind(L_check_always_rtm1);
 265   // Reload RTMLockingCounters* address
 266   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 267   bind(L_check_always_rtm2);
 268   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 269   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
 270   jccb(Assembler::below, L_done);
 271   if (method_data != nullptr) {
 272     // set rtm_state to "always rtm" in MDO
 273     mov_metadata(tmpReg, method_data);
 274     lock();
 275     orl(Address(tmpReg, MethodData::rtm_state_offset()), UseRTM);
 276   }
 277   bind(L_done);
 278 }
 279 
 280 // Update counters and perform abort ratio calculation
 281 // input:  abort_status_Reg
 282 // rtm_counters_Reg, flags are killed
 283 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
 284                                       Register rtm_counters_Reg,
 285                                       RTMLockingCounters* rtm_counters,
 286                                       Metadata* method_data,
 287                                       bool profile_rtm) {
 288 
 289   assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 290   // update rtm counters based on rax value at abort
 291   // reads abort_status_Reg, updates flags
 292   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 293   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
 294   if (profile_rtm) {
 295     // Save abort status because abort_status_Reg is used by following code.
 296     if (RTMRetryCount > 0) {
 297       push(abort_status_Reg);
 298     }
 299     assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 300     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
 301     // restore abort status
 302     if (RTMRetryCount > 0) {
 303       pop(abort_status_Reg);
 304     }
 305   }
 306 }
 307 
 308 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
 309 // inputs: retry_count_Reg
 310 //       : abort_status_Reg
 311 // output: retry_count_Reg decremented by 1
 312 // flags are killed
 313 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
 314   Label doneRetry;
 315   assert(abort_status_Reg == rax, "");
 316   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
 317   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
 318   // if reason is in 0x6 and retry count != 0 then retry
 319   andptr(abort_status_Reg, 0x6);
 320   jccb(Assembler::zero, doneRetry);
 321   testl(retry_count_Reg, retry_count_Reg);
 322   jccb(Assembler::zero, doneRetry);
 323   pause();
 324   decrementl(retry_count_Reg);
 325   jmp(retryLabel);
 326   bind(doneRetry);
 327 }
 328 
 329 // Spin and retry if lock is busy,
 330 // inputs: box_Reg (monitor address)
 331 //       : retry_count_Reg
 332 // output: retry_count_Reg decremented by 1
 333 //       : clear z flag if retry count exceeded
 334 // tmp_Reg, scr_Reg, flags are killed
 335 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
 336                                                Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
 337   Label SpinLoop, SpinExit, doneRetry;
 338   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 339 
 340   testl(retry_count_Reg, retry_count_Reg);
 341   jccb(Assembler::zero, doneRetry);
 342   decrementl(retry_count_Reg);
 343   movptr(scr_Reg, RTMSpinLoopCount);
 344 
 345   bind(SpinLoop);
 346   pause();
 347   decrementl(scr_Reg);
 348   jccb(Assembler::lessEqual, SpinExit);
 349   movptr(tmp_Reg, Address(box_Reg, owner_offset));
 350   testptr(tmp_Reg, tmp_Reg);
 351   jccb(Assembler::notZero, SpinLoop);
 352 
 353   bind(SpinExit);
 354   jmp(retryLabel);
 355   bind(doneRetry);
 356   incrementl(retry_count_Reg); // clear z flag
 357 }
 358 
 359 // Use RTM for normal stack locks
 360 // Input: objReg (object to lock)
 361 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
 362                                          Register retry_on_abort_count_Reg,
 363                                          RTMLockingCounters* stack_rtm_counters,
 364                                          Metadata* method_data, bool profile_rtm,
 365                                          Label& DONE_LABEL, Label& IsInflated) {
 366   assert(UseRTMForStackLocks, "why call this otherwise?");
 367   assert(tmpReg == rax, "");
 368   assert(scrReg == rdx, "");
 369   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 370 
 371   if (RTMRetryCount > 0) {
 372     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 373     bind(L_rtm_retry);
 374   }
 375   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 376   testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral
 377   jcc(Assembler::notZero, IsInflated);
 378 
 379   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 380     Label L_noincrement;
 381     if (RTMTotalCountIncrRate > 1) {
 382       // tmpReg, scrReg and flags are killed
 383       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 384     }
 385     assert(stack_rtm_counters != nullptr, "should not be null when profiling RTM");
 386     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
 387     bind(L_noincrement);
 388   }
 389   xbegin(L_on_abort);
 390   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
 391   andptr(tmpReg, markWord::lock_mask_in_place);     // look at 2 lock bits
 392   cmpptr(tmpReg, markWord::unlocked_value);         // bits = 01 unlocked
 393   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
 394 
 395   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 396   if (UseRTMXendForLockBusy) {
 397     xend();
 398     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
 399     jmp(L_decrement_retry);
 400   }
 401   else {
 402     xabort(0);
 403   }
 404   bind(L_on_abort);
 405   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 406     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
 407   }
 408   bind(L_decrement_retry);
 409   if (RTMRetryCount > 0) {
 410     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 411     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 412   }
 413 }
 414 
// Use RTM for inflated locks
 416 // inputs: objReg (object to lock)
 417 //         boxReg (on-stack box address (displaced header location) - KILLED)
 418 //         tmpReg (ObjectMonitor address + markWord::monitor_value)
 419 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
 420                                             Register scrReg, Register retry_on_busy_count_Reg,
 421                                             Register retry_on_abort_count_Reg,
 422                                             RTMLockingCounters* rtm_counters,
 423                                             Metadata* method_data, bool profile_rtm,
 424                                             Label& DONE_LABEL) {
 425   assert(UseRTMLocking, "why call this otherwise?");
 426   assert(tmpReg == rax, "");
 427   assert(scrReg == rdx, "");
 428   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 429   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 430 
 431   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 432   movptr(boxReg, tmpReg); // Save ObjectMonitor address
 433 
 434   if (RTMRetryCount > 0) {
 435     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
 436     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 437     bind(L_rtm_retry);
 438   }
 439   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 440     Label L_noincrement;
 441     if (RTMTotalCountIncrRate > 1) {
 442       // tmpReg, scrReg and flags are killed
 443       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 444     }
 445     assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 446     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
 447     bind(L_noincrement);
 448   }
 449   xbegin(L_on_abort);
 450   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 451   movptr(tmpReg, Address(tmpReg, owner_offset));
 452   testptr(tmpReg, tmpReg);
 453   jcc(Assembler::zero, DONE_LABEL);
 454   if (UseRTMXendForLockBusy) {
 455     xend();
 456     jmp(L_decrement_retry);
 457   }
 458   else {
 459     xabort(0);
 460   }
 461   bind(L_on_abort);
 462   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 463   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 464     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
 465   }
 466   if (RTMRetryCount > 0) {
 467     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 468     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 469   }
 470 
  movptr(tmpReg, Address(boxReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jccb(Assembler::notZero, L_decrement_retry);
 474 
 475   // Appears unlocked - try to swing _owner from null to non-null.
 476   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 477 #ifdef _LP64
 478   Register threadReg = r15_thread;
 479 #else
 480   get_thread(scrReg);
 481   Register threadReg = scrReg;
 482 #endif
 483   lock();
 484   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
 485 
 486   if (RTMRetryCount > 0) {
 487     // success done else retry
    jccb(Assembler::equal, DONE_LABEL);
 489     bind(L_decrement_retry);
 490     // Spin and retry if lock is busy.
 491     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
 492   }
 493   else {
 494     bind(L_decrement_retry);
 495   }
 496 }
 497 
 498 #endif //  INCLUDE_RTM_OPT
 499 
 500 // fast_lock and fast_unlock used by C2
 501 
 502 // Because the transitions from emitted code to the runtime
 503 // monitorenter/exit helper stubs are so slow it's critical that
 504 // we inline both the stack-locking fast path and the inflated fast path.
 505 //
 506 // See also: cmpFastLock and cmpFastUnlock.
 507 //
 508 // What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat, another
 510 // option would be to emit TrySlowEnter and TrySlowExit methods
 511 // at startup-time.  These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 513 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 514 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 515 // In practice, however, the # of lock sites is bounded and is usually small.
 516 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
 519 // sites.
 520 //
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
 522 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 523 // to those specialized methods.  That'd give us a mostly platform-independent
 524 // implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
 526 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 527 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 528 // (b) explicit barriers or fence operations.
 529 //
 530 // TODO:
 531 //
 532 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 533 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 534 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 535 //    the lock operators would typically be faster than reifying Self.
 536 //
 537 // *  Ideally I'd define the primitives as:
 538 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 539 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 540 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
 542 //    Furthermore the register assignments are overconstrained, possibly resulting in
 543 //    sub-optimal code near the synchronization site.
 544 //
 545 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 546 //    Alternately, use a better sp-proximity test.
 547 //
 548 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 549 //    Either one is sufficient to uniquely identify a thread.
 550 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 551 //
 552 // *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty,
//    avoiding the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 555 //
 556 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 557 //    But beware of excessive branch density on AMD Opterons.
 558 //
 559 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 560 //    or failure of the fast path.  If the fast path fails then we pass
 561 //    control to the slow path, typically in C.  In fast_lock and
 562 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 563 //    will emit a conditional branch immediately after the node.
 564 //    So we have branches to branches and lots of ICC.ZF games.
 565 //    Instead, it might be better to have C2 pass a "FailureLabel"
 566 //    into fast_lock and fast_unlock.  In the case of success, control
 567 //    will drop through the node.  ICC.ZF is undefined at exit.
 568 //    In the case of failure, the node will branch directly to the
 569 //    FailureLabel
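//
//    In outline, the code C2 currently emits around these nodes looks
//    roughly like this (a sketch only, not the exact .ad expansion):
//
//      fast_lock(obj, box, rax, scr, ...)   // leaves ZF == 1 on fast-path success
//      jne     slow_path_call               // ZF == 0 -> call the runtime helper
//      ...                                  // ZF == 1 -> fall through, lock held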
 570 
 571 
 572 // obj: object to lock
 573 // box: on-stack box address (displaced header location) - KILLED
 574 // rax,: tmp -- KILLED
 575 // scr: tmp -- KILLED
 576 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 577                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
 578                                  RTMLockingCounters* rtm_counters,
 579                                  RTMLockingCounters* stack_rtm_counters,
 580                                  Metadata* method_data,
 581                                  bool use_rtm, bool profile_rtm) {
 582   // Ensure the register assignments are disjoint
 583   assert(tmpReg == rax, "");
 584 
 585   if (use_rtm) {
 586     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
 587   } else {
 588     assert(cx1Reg == noreg, "");
 589     assert(cx2Reg == noreg, "");
 590     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 591   }
 592 
 593   // Possible cases that we'll encounter in fast_lock
 594   // ------------------------------------------------
 595   // * Inflated
 596   //    -- unlocked
 597   //    -- Locked
 598   //       = by self
 599   //       = by other
 600   // * neutral
 601   // * stack-locked
 602   //    -- by self
 603   //       = sp-proximity test hits
 604   //       = sp-proximity test generates false-negative
 605   //    -- by other
 606   //
 607 
 608   Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;
 609 
 610   if (DiagnoseSyncOnValueBasedClasses != 0) {
 611     load_klass(tmpReg, objReg, scrReg);
 612     movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
 613     testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
 614     jcc(Assembler::notZero, DONE_LABEL);
 615   }
 616 
 617 #if INCLUDE_RTM_OPT
 618   if (UseRTMForStackLocks && use_rtm) {
 619     assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
 620     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
 621                       stack_rtm_counters, method_data, profile_rtm,
 622                       DONE_LABEL, IsInflated);
 623   }
 624 #endif // INCLUDE_RTM_OPT
 625 
 626   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 627   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 628   jcc(Assembler::notZero, IsInflated);
 629 
 630   if (LockingMode == LM_MONITOR) {
 631     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 632     testptr(objReg, objReg);
 633   } else if (LockingMode == LM_LEGACY) {
 634     // Attempt stack-locking ...
 635     orptr (tmpReg, markWord::unlocked_value);
 636     if (EnableValhalla) {
 637       // Mask inline_type bit such that we go to the slow path if object is an inline type
 638       andptr(tmpReg, ~((int) markWord::inline_type_bit_in_place));
 639     }
 640     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 641     lock();
 642     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 643     jcc(Assembler::equal, COUNT);           // Success
 644 
 645     // Recursive locking.
 646     // The object is stack-locked: markword contains stack pointer to BasicLock.
 647     // Locked by current thread if difference with current SP is less than one page.
 648     subptr(tmpReg, rsp);
    // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
 650     andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
 651     movptr(Address(boxReg, 0), tmpReg);
 652   } else {
 653     assert(LockingMode == LM_LIGHTWEIGHT, "");
 654     lightweight_lock(objReg, tmpReg, thread, scrReg, NO_COUNT);
 655     jmp(COUNT);
 656   }
 657   jmp(DONE_LABEL);
 658 
 659   bind(IsInflated);
  // The object is inflated. tmpReg contains the ObjectMonitor address + markWord::monitor_value
 661 
 662 #if INCLUDE_RTM_OPT
 663   // Use the same RTM locking code in 32- and 64-bit VM.
 664   if (use_rtm) {
 665     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
 666                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
 667   } else {
 668 #endif // INCLUDE_RTM_OPT
 669 
 670 #ifndef _LP64
 671   // The object is inflated.
 672 
 673   // boxReg refers to the on-stack BasicLock in the current frame.
 674   // We'd like to write:
 675   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
  // This is convenient but results in a ST-before-CAS penalty.  The following CAS suffers
 677   // additional latency as we have another ST in the store buffer that must drain.
 678 
 679   // avoid ST-before-CAS
 680   // register juggle because we need tmpReg for cmpxchgptr below
 681   movptr(scrReg, boxReg);
 682   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 683 
 684   // Optimistic form: consider XORL tmpReg,tmpReg
 685   movptr(tmpReg, NULL_WORD);
 686 
 687   // Appears unlocked - try to swing _owner from null to non-null.
 688   // Ideally, I'd manifest "Self" with get_thread and then attempt
 689   // to CAS the register containing Self into m->Owner.
 690   // But we don't have enough registers, so instead we can either try to CAS
 691   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 692   // we later store "Self" into m->Owner.  Transiently storing a stack address
 693   // (rsp or the address of the box) into  m->owner is harmless.
 694   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 695   lock();
 696   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 697   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 698   // If we weren't able to swing _owner from null to the BasicLock
 699   // then take the slow path.
 700   jccb  (Assembler::notZero, NO_COUNT);
 701   // update _owner from BasicLock to thread
 702   get_thread (scrReg);                    // beware: clobbers ICCs
 703   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
 704   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
 705 
 706   // If the CAS fails we can either retry or pass control to the slow path.
 707   // We use the latter tactic.
 708   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 709   // If the CAS was successful ...
 710   //   Self has acquired the lock
 711   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 712   // Intentional fall-through into DONE_LABEL ...
 713 #else // _LP64
 714   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 715   movq(scrReg, tmpReg);
 716   xorq(tmpReg, tmpReg);
 717   lock();
 718   cmpxchgptr(thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 719   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 720   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 721   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 722   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 723   jccb(Assembler::equal, COUNT);          // CAS above succeeded; propagate ZF = 1 (success)
 724 
 725   cmpptr(thread, rax);                // Check if we are already the owner (recursive lock)
 726   jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
 727   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 728   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 729 #endif // _LP64
 730 #if INCLUDE_RTM_OPT
 731   } // use_rtm()
 732 #endif
 733   bind(DONE_LABEL);
 734 
 735   // ZFlag == 1 count in fast path
 736   // ZFlag == 0 count in slow path
 737   jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0
 738 
 739   bind(COUNT);
 740   // Count monitors in fast path
 741   increment(Address(thread, JavaThread::held_monitor_count_offset()));
 742 
 743   xorl(tmpReg, tmpReg); // Set ZF == 1
 744 
 745   bind(NO_COUNT);
 746 
 747   // At NO_COUNT the icc ZFlag is set as follows ...
 748   // fast_unlock uses the same protocol.
 749   // ZFlag == 1 -> Success
 750   // ZFlag == 0 -> Failure - force control through the slow path
 751 }
 752 
 753 // obj: object to unlock
 754 // box: box address (displaced header location), killed.  Must be EAX.
 755 // tmp: killed, cannot be obj nor box.
 756 //
 757 // Some commentary on balanced locking:
 758 //
 759 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 760 // Methods that don't have provably balanced locking are forced to run in the
 761 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 762 // The interpreter provides two properties:
 763 // I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired in the current activation (frame).  Recall that the
 765 //      interpreter maintains an on-stack list of locks currently held by
 766 //      a frame.
// I2:  If a method attempts to unlock an object that is not held by
//      the frame, the interpreter throws IMSX.
 769 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
 771 // B() doesn't have provably balanced locking so it runs in the interpreter.
 772 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 773 // is still locked by A().
 774 //
 775 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 776 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 777 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 778 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking the owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
 782 // A perfectly viable alternative is to elide the owner check except when
 783 // Xcheck:jni is enabled.
 784 
 785 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
 786   assert(boxReg == rax, "");
 787   assert_different_registers(objReg, boxReg, tmpReg);
 788 
 789   Label DONE_LABEL, Stacked, COUNT, NO_COUNT;
 790 
 791 #if INCLUDE_RTM_OPT
 792   if (UseRTMForStackLocks && use_rtm) {
 793     assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
 794     Label L_regular_unlock;
 795     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
 796     andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
 797     cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
 798     jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
 799     xend();                                                           // otherwise end...
 800     jmp(DONE_LABEL);                                                  // ... and we're done
 801     bind(L_regular_unlock);
 802   }
 803 #endif
 804 
 805   if (LockingMode == LM_LEGACY) {
 806     cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
 807     jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
 808   }
 809   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
 810   if (LockingMode != LM_MONITOR) {
 811     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 812     jcc(Assembler::zero, Stacked);
 813   }
 814 
 815   // It's inflated.
 816   if (LockingMode == LM_LIGHTWEIGHT) {
    // If the owner is ANONYMOUS, we need to fix it - in an out-of-line stub.
 818     testb(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t) ObjectMonitor::ANONYMOUS_OWNER);
 819 #ifdef _LP64
 820     if (!Compile::current()->output()->in_scratch_emit_size()) {
 821       C2HandleAnonOMOwnerStub* stub = new (Compile::current()->comp_arena()) C2HandleAnonOMOwnerStub(tmpReg, boxReg);
 822       Compile::current()->output()->add_stub(stub);
 823       jcc(Assembler::notEqual, stub->entry());
 824       bind(stub->continuation());
 825     } else
 826 #endif
 827     {
 828       // We can't easily implement this optimization on 32 bit because we don't have a thread register.
 829       // Call the slow-path instead.
 830       jcc(Assembler::notEqual, NO_COUNT);
 831     }
 832   }
 833 
 834 #if INCLUDE_RTM_OPT
 835   if (use_rtm) {
 836     Label L_regular_inflated_unlock;
 837     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 838     movptr(boxReg, Address(tmpReg, owner_offset));
 839     testptr(boxReg, boxReg);
 840     jccb(Assembler::notZero, L_regular_inflated_unlock);
 841     xend();
 842     jmp(DONE_LABEL);
 843     bind(L_regular_inflated_unlock);
 844   }
 845 #endif
 846 
 847   // Despite our balanced locking property we still check that m->_owner == Self
 848   // as java routines or native JNI code called by this thread might
 849   // have released the lock.
 850   // Refer to the comments in synchronizer.cpp for how we might encode extra
 851   // state in _succ so we can avoid fetching EntryList|cxq.
 852   //
 853   // If there's no contention try a 1-0 exit.  That is, exit without
 854   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 855   // we detect and recover from the race that the 1-0 exit admits.
 856   //
 857   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 858   // before it STs null into _owner, releasing the lock.  Updates
 859   // to data protected by the critical section must be visible before
 860   // we drop the lock (and thus before any other thread could acquire
 861   // the lock and observe the fields protected by the lock).
  // IA32's memory model is TSO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
 864   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
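  //
  // In outline, the 64-bit (LP64) inflated-exit path below does (a sketch, not exact code):
  //   if (_recursions != 0)                  { _recursions--;   success }
  //   if ((_cxq | _EntryList) == 0)          { _owner = null;   success }   // no waiters
  //   if (_succ == null)                     { slow path }                  // no apparent successor
  //   _owner = null; fence;                                                 // 1-0 exit
  //   if (_succ != null)                     { success }                    // successor will take over
  //   if (CAS(&_owner, null, Self) != null)  { success }                    // someone else re-acquired
  //   slow path                                                             // we re-own it; hand off in runtime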
 865 #ifndef _LP64
 866   // Note that we could employ various encoding schemes to reduce
 867   // the number of loads below (currently 4) to just 2 or 3.
 868   // Refer to the comments in synchronizer.cpp.
 869   // In practice the chain of fetches doesn't seem to impact performance, however.
 870   xorptr(boxReg, boxReg);
 871   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 872   jccb  (Assembler::notZero, DONE_LABEL);
 873   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 874   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 875   jccb  (Assembler::notZero, DONE_LABEL);
 876   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 877   jmpb  (DONE_LABEL);
 878 #else // _LP64
 879   // It's inflated
 880   Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath;
 881 
 882   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 883   jccb(Assembler::equal, LNotRecursive);
 884 
 885   // Recursive inflated unlock
 886   decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 887   jmpb(LSuccess);
 888 
 889   bind(LNotRecursive);
 890   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 891   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 892   jccb  (Assembler::notZero, CheckSucc);
 893   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 894   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 895   jmpb  (DONE_LABEL);
 896 
 897   // Try to avoid passing control into the slow_path ...
 898   bind  (CheckSucc);
 899 
  // The following optional optimization can be elided if necessary.
  // Effectively: if (succ == null) goto slow path
  // The check reduces the window for a race and thus benefits performance.
 904   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 905   jccb  (Assembler::zero, LGoSlowPath);
 906 
 907   xorptr(boxReg, boxReg);
 908   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 909   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 910 
 911   // Memory barrier/fence
 912   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
 913   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
 914   // This is faster on Nehalem and AMD Shanghai/Barcelona.
 915   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
 916   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
 917   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
 918   lock(); addl(Address(rsp, 0), 0);
 919 
 920   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 921   jccb  (Assembler::notZero, LSuccess);
 922 
 923   // Rare inopportune interleaving - race.
 924   // The successor vanished in the small window above.
 925   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
 926   // We need to ensure progress and succession.
 927   // Try to reacquire the lock.
 928   // If that fails then the new owner is responsible for succession and this
 929   // thread needs to take no further action and can exit via the fast path (success).
 930   // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generate more
  // coherence traffic on the lock *and* artificially extend the critical section
  // length by virtue of passing control into the slow path.
 934 
 935   // box is really RAX -- the following CMPXCHG depends on that binding
 936   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
 937   lock();
 938   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 939   // There's no successor so we tried to regrab the lock.
 940   // If that didn't work, then another thread grabbed the
 941   // lock so we're done (and exit was a success).
 942   jccb  (Assembler::notEqual, LSuccess);
 943   // Intentional fall-through into slow path
 944 
 945   bind  (LGoSlowPath);
 946   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 947   jmpb  (DONE_LABEL);
 948 
 949   bind  (LSuccess);
 950   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 951   jmpb  (DONE_LABEL);
 952 
 953 #endif
 954   if (LockingMode != LM_MONITOR) {
 955     bind  (Stacked);
 956     if (LockingMode == LM_LIGHTWEIGHT) {
 957       mov(boxReg, tmpReg);
 958       lightweight_unlock(objReg, boxReg, tmpReg, NO_COUNT);
 959       jmp(COUNT);
 960     } else if (LockingMode == LM_LEGACY) {
 961       movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 962       lock();
 963       cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 964     }
 965     // Intentional fall-thru into DONE_LABEL
 966   }
 967   bind(DONE_LABEL);
 968 
 969   // ZFlag == 1 count in fast path
 970   // ZFlag == 0 count in slow path
 971   jccb(Assembler::notZero, NO_COUNT);
 972 
 973   bind(COUNT);
 974   // Count monitors in fast path
 975 #ifndef _LP64
 976   get_thread(tmpReg);
 977   decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
 978 #else // _LP64
 979   decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 980 #endif
 981 
 982   xorl(tmpReg, tmpReg); // Set ZF == 1
 983 
 984   bind(NO_COUNT);
 985 }
 986 
 987 //-------------------------------------------------------------------------------------------
// Generic instruction support for use in .ad files (C2 code generation)
 989 
 990 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 991   if (dst != src) {
 992     movdqu(dst, src);
 993   }
 994   if (opcode == Op_AbsVD) {
 995     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 996   } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
 998     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 999   }
1000 }
1001 
1002 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
1003   if (opcode == Op_AbsVD) {
1004     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
1005   } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
1007     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
1008   }
1009 }
1010 
1011 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
1012   if (dst != src) {
1013     movdqu(dst, src);
1014   }
1015   if (opcode == Op_AbsVF) {
1016     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
1017   } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
1019     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1020   }
1021 }
1022 
1023 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
1024   if (opcode == Op_AbsVF) {
1025     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
1026   } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
1028     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
1029   }
1030 }
1031 
1032 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
1033   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1034   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
1035 
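  // Note: the T_LONG paths below rely on SSE4.1 blendvpd, which uses xmm0 as an
  // implicit selector, hence the tmp == xmm0 requirement asserted on those paths.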
1036   if (opcode == Op_MinV) {
1037     if (elem_bt == T_BYTE) {
1038       pminsb(dst, src);
1039     } else if (elem_bt == T_SHORT) {
1040       pminsw(dst, src);
1041     } else if (elem_bt == T_INT) {
1042       pminsd(dst, src);
1043     } else {
1044       assert(elem_bt == T_LONG, "required");
1045       assert(tmp == xmm0, "required");
1046       assert_different_registers(dst, src, tmp);
1047       movdqu(xmm0, dst);
1048       pcmpgtq(xmm0, src);
1049       blendvpd(dst, src);  // xmm0 as mask
1050     }
1051   } else { // opcode == Op_MaxV
1052     if (elem_bt == T_BYTE) {
1053       pmaxsb(dst, src);
1054     } else if (elem_bt == T_SHORT) {
1055       pmaxsw(dst, src);
1056     } else if (elem_bt == T_INT) {
1057       pmaxsd(dst, src);
1058     } else {
1059       assert(elem_bt == T_LONG, "required");
1060       assert(tmp == xmm0, "required");
1061       assert_different_registers(dst, src, tmp);
1062       movdqu(xmm0, src);
1063       pcmpgtq(xmm0, dst);
1064       blendvpd(dst, src);  // xmm0 as mask
1065     }
1066   }
1067 }
1068 
1069 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
1070                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
1071                                  int vlen_enc) {
1072   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1073 
1074   if (opcode == Op_MinV) {
1075     if (elem_bt == T_BYTE) {
1076       vpminsb(dst, src1, src2, vlen_enc);
1077     } else if (elem_bt == T_SHORT) {
1078       vpminsw(dst, src1, src2, vlen_enc);
1079     } else if (elem_bt == T_INT) {
1080       vpminsd(dst, src1, src2, vlen_enc);
1081     } else {
1082       assert(elem_bt == T_LONG, "required");
1083       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1084         vpminsq(dst, src1, src2, vlen_enc);
1085       } else {
1086         assert_different_registers(dst, src1, src2);
1087         vpcmpgtq(dst, src1, src2, vlen_enc);
1088         vblendvpd(dst, src1, src2, dst, vlen_enc);
1089       }
1090     }
1091   } else { // opcode == Op_MaxV
1092     if (elem_bt == T_BYTE) {
1093       vpmaxsb(dst, src1, src2, vlen_enc);
1094     } else if (elem_bt == T_SHORT) {
1095       vpmaxsw(dst, src1, src2, vlen_enc);
1096     } else if (elem_bt == T_INT) {
1097       vpmaxsd(dst, src1, src2, vlen_enc);
1098     } else {
1099       assert(elem_bt == T_LONG, "required");
1100       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1101         vpmaxsq(dst, src1, src2, vlen_enc);
1102       } else {
1103         assert_different_registers(dst, src1, src2);
1104         vpcmpgtq(dst, src1, src2, vlen_enc);
1105         vblendvpd(dst, src2, src1, dst, vlen_enc);
1106       }
1107     }
1108   }
1109 }
1110 
1111 // Float/Double min max
1112 
1113 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
1114                                    XMMRegister dst, XMMRegister a, XMMRegister b,
1115                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1116                                    int vlen_enc) {
1117   assert(UseAVX > 0, "required");
1118   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1119          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1120   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1121   assert_different_registers(a, b, tmp, atmp, btmp);
1122 
1123   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1124   bool is_double_word = is_double_word_type(elem_bt);
1125 
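  // Sketch of the approach used below: the first two blends use the sign bit of a (for min)
  // or b (for max) as the selector so that -0.0 orders below +0.0, vminps/vmaxps then pick
  // the ordered result, and the trailing unordered compare + blend route a NaN input to dst.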
1126   if (!is_double_word && is_min) {
1127     vblendvps(atmp, a, b, a, vlen_enc);
1128     vblendvps(btmp, b, a, a, vlen_enc);
1129     vminps(tmp, atmp, btmp, vlen_enc);
1130     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1131     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
1132   } else if (!is_double_word && !is_min) {
1133     vblendvps(btmp, b, a, b, vlen_enc);
1134     vblendvps(atmp, a, b, b, vlen_enc);
1135     vmaxps(tmp, atmp, btmp, vlen_enc);
1136     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1137     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
1138   } else if (is_double_word && is_min) {
1139     vblendvpd(atmp, a, b, a, vlen_enc);
1140     vblendvpd(btmp, b, a, a, vlen_enc);
1141     vminpd(tmp, atmp, btmp, vlen_enc);
1142     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1143     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
1144   } else {
1145     assert(is_double_word && !is_min, "sanity");
1146     vblendvpd(btmp, b, a, b, vlen_enc);
1147     vblendvpd(atmp, a, b, b, vlen_enc);
1148     vmaxpd(tmp, atmp, btmp, vlen_enc);
1149     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1150     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
1151   }
1152 }
1153 
1154 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1155                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1156                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1157                                     int vlen_enc) {
1158   assert(UseAVX > 2, "required");
1159   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1160          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1161   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1162   assert_different_registers(dst, a, b, atmp, btmp);
1163 
1164   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1165   bool is_double_word = is_double_word_type(elem_bt);
1166   bool merge = true;
1167 
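  // Same idea as vminmax_fp above, expressed with AVX-512 mask registers: evpmovd2m /
  // evpmovq2m extract the per-lane sign bit of a (min) or b (max) into ktmp to drive the
  // operand swap, and the final masked evmovdqu patches NaN lanes (a sketch of the intent).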
1168   if (!is_double_word && is_min) {
1169     evpmovd2m(ktmp, a, vlen_enc);
1170     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1171     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1172     vminps(dst, atmp, btmp, vlen_enc);
1173     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1174     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1175   } else if (!is_double_word && !is_min) {
1176     evpmovd2m(ktmp, b, vlen_enc);
1177     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1178     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1179     vmaxps(dst, atmp, btmp, vlen_enc);
1180     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1181     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1182   } else if (is_double_word && is_min) {
1183     evpmovq2m(ktmp, a, vlen_enc);
1184     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1185     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1186     vminpd(dst, atmp, btmp, vlen_enc);
1187     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1188     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1189   } else {
1190     assert(is_double_word && !is_min, "sanity");
1191     evpmovq2m(ktmp, b, vlen_enc);
1192     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1193     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1194     vmaxpd(dst, atmp, btmp, vlen_enc);
1195     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1196     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1197   }
1198 }
1199 
1200 // Float/Double signum
1201 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1202   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1203 
1204   Label DONE_LABEL;
1205 
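  // Outline (as read from the code below): dst already holds the argument; +/-0.0 and NaN
  // return the argument unchanged, otherwise dst is loaded with 1.0 and, using the flags
  // still live from the ucomis against zero, the sign is flipped if the argument was negative.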
1206   if (opcode == Op_SignumF) {
1207     assert(UseSSE > 0, "required");
1208     ucomiss(dst, zero);
1209     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1210     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1211     movflt(dst, one);
1212     jcc(Assembler::above, DONE_LABEL);
1213     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1214   } else if (opcode == Op_SignumD) {
1215     assert(UseSSE > 1, "required");
1216     ucomisd(dst, zero);
1217     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1218     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1219     movdbl(dst, one);
1220     jcc(Assembler::above, DONE_LABEL);
1221     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1222   }
1223 
1224   bind(DONE_LABEL);
1225 }
1226 
1227 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1228   if (sign) {
1229     pmovsxbw(dst, src);
1230   } else {
1231     pmovzxbw(dst, src);
1232   }
1233 }
1234 
1235 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1236   if (sign) {
1237     vpmovsxbw(dst, src, vector_len);
1238   } else {
1239     vpmovzxbw(dst, src, vector_len);
1240   }
1241 }
1242 
1243 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1244   if (sign) {
1245     vpmovsxbd(dst, src, vector_len);
1246   } else {
1247     vpmovzxbd(dst, src, vector_len);
1248   }
1249 }
1250 
1251 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1252   if (sign) {
1253     vpmovsxwd(dst, src, vector_len);
1254   } else {
1255     vpmovzxwd(dst, src, vector_len);
1256   }
1257 }
1258 
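// Vector rotate by an immediate or by a per-lane variable count. These map
// directly onto the EVEX rotate instructions (vprold/vprolq/vprord/vprorq and
// their variable-count forms), so callers are expected to have checked for
// the required AVX-512 support.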
1259 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1260                                      int shift, int vector_len) {
1261   if (opcode == Op_RotateLeftV) {
1262     if (etype == T_INT) {
1263       evprold(dst, src, shift, vector_len);
1264     } else {
1265       assert(etype == T_LONG, "expected type T_LONG");
1266       evprolq(dst, src, shift, vector_len);
1267     }
1268   } else {
1269     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1270     if (etype == T_INT) {
1271       evprord(dst, src, shift, vector_len);
1272     } else {
1273       assert(etype == T_LONG, "expected type T_LONG");
1274       evprorq(dst, src, shift, vector_len);
1275     }
1276   }
1277 }
1278 
1279 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1280                                      XMMRegister shift, int vector_len) {
1281   if (opcode == Op_RotateLeftV) {
1282     if (etype == T_INT) {
1283       evprolvd(dst, src, shift, vector_len);
1284     } else {
1285       assert(etype == T_LONG, "expected type T_LONG");
1286       evprolvq(dst, src, shift, vector_len);
1287     }
1288   } else {
1289     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1290     if (etype == T_INT) {
1291       evprorvd(dst, src, shift, vector_len);
1292     } else {
1293       assert(etype == T_LONG, "expected type T_LONG");
1294       evprorvq(dst, src, shift, vector_len);
1295     }
1296   }
1297 }
1298 
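// Vector shift helpers, dispatching on the C2 shift opcode (left, logical
// right, arithmetic right). The *_imm variants take an immediate shift count;
// the other variants take the count in an XMM register.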
1299 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1300   if (opcode == Op_RShiftVI) {
1301     psrad(dst, shift);
1302   } else if (opcode == Op_LShiftVI) {
1303     pslld(dst, shift);
1304   } else {
1305     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1306     psrld(dst, shift);
1307   }
1308 }
1309 
1310 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1311   switch (opcode) {
1312     case Op_RShiftVI:  psrad(dst, shift); break;
1313     case Op_LShiftVI:  pslld(dst, shift); break;
1314     case Op_URShiftVI: psrld(dst, shift); break;
1315 
1316     default: assert(false, "%s", NodeClassNames[opcode]);
1317   }
1318 }
1319 
1320 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1321   if (opcode == Op_RShiftVI) {
1322     vpsrad(dst, nds, shift, vector_len);
1323   } else if (opcode == Op_LShiftVI) {
1324     vpslld(dst, nds, shift, vector_len);
1325   } else {
1326     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1327     vpsrld(dst, nds, shift, vector_len);
1328   }
1329 }
1330 
1331 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1332   switch (opcode) {
1333     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1334     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1335     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1336 
1337     default: assert(false, "%s", NodeClassNames[opcode]);
1338   }
1339 }
1340 
1341 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1342   switch (opcode) {
1343     case Op_RShiftVB:  // fall-through
1344     case Op_RShiftVS:  psraw(dst, shift); break;
1345 
1346     case Op_LShiftVB:  // fall-through
1347     case Op_LShiftVS:  psllw(dst, shift);   break;
1348 
1349     case Op_URShiftVS: // fall-through
1350     case Op_URShiftVB: psrlw(dst, shift);  break;
1351 
1352     default: assert(false, "%s", NodeClassNames[opcode]);
1353   }
1354 }
1355 
1356 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1357   switch (opcode) {
1358     case Op_RShiftVB:  // fall-through
1359     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1360 
1361     case Op_LShiftVB:  // fall-through
1362     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1363 
1364     case Op_URShiftVS: // fall-through
1365     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1366 
1367     default: assert(false, "%s", NodeClassNames[opcode]);
1368   }
1369 }
1370 
1371 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1372   switch (opcode) {
1373     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1374     case Op_LShiftVL:  psllq(dst, shift); break;
1375     case Op_URShiftVL: psrlq(dst, shift); break;
1376 
1377     default: assert(false, "%s", NodeClassNames[opcode]);
1378   }
1379 }
1380 
1381 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1382   if (opcode == Op_RShiftVL) {
1383     psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1384   } else if (opcode == Op_LShiftVL) {
1385     psllq(dst, shift);
1386   } else {
1387     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1388     psrlq(dst, shift);
1389   }
1390 }
1391 
1392 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1393   switch (opcode) {
1394     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1395     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1396     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1397 
1398     default: assert(false, "%s", NodeClassNames[opcode]);
1399   }
1400 }
1401 
1402 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1403   if (opcode == Op_RShiftVL) {
1404     evpsraq(dst, nds, shift, vector_len);
1405   } else if (opcode == Op_LShiftVL) {
1406     vpsllq(dst, nds, shift, vector_len);
1407   } else {
1408     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1409     vpsrlq(dst, nds, shift, vector_len);
1410   }
1411 }
1412 
1413 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1414   switch (opcode) {
1415     case Op_RShiftVB:  // fall-through
1416     case Op_RShiftVS:  // fall-through
1417     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1418 
1419     case Op_LShiftVB:  // fall-through
1420     case Op_LShiftVS:  // fall-through
1421     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1422 
1423     case Op_URShiftVB: // fall-through
1424     case Op_URShiftVS: // fall-through
1425     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1426 
1427     default: assert(false, "%s", NodeClassNames[opcode]);
1428   }
1429 }
1430 
1431 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1432   switch (opcode) {
1433     case Op_RShiftVB:  // fall-through
1434     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1435 
1436     case Op_LShiftVB:  // fall-through
1437     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1438 
1439     case Op_URShiftVB: // fall-through
1440     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1441 
1442     default: assert(false, "%s", NodeClassNames[opcode]);
1443   }
1444 }
1445 
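// Per-lane variable shifts of 64-bit lanes. Without AVX-512 there is no
// variable arithmetic right shift for quadwords, so Op_RShiftVL is emulated
// with the usual sign-mask identity (m = 0x8000000000000000 per lane):
//   sra(x, n) == ((x >>> n) ^ (m >>> n)) - (m >>> n)
// which is exactly what the vpsrlvq/vpxor/vpsubq sequence below computes.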
1446 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1447   assert(UseAVX >= 2, "required");
1448   switch (opcode) {
1449     case Op_RShiftVL: {
1450       if (UseAVX > 2) {
1451         assert(tmp == xnoreg, "not used");
1452         if (!VM_Version::supports_avx512vl()) {
1453           vlen_enc = Assembler::AVX_512bit;
1454         }
1455         evpsravq(dst, src, shift, vlen_enc);
1456       } else {
1457         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1458         vpsrlvq(dst, src, shift, vlen_enc);
1459         vpsrlvq(tmp, tmp, shift, vlen_enc);
1460         vpxor(dst, dst, tmp, vlen_enc);
1461         vpsubq(dst, dst, tmp, vlen_enc);
1462       }
1463       break;
1464     }
1465     case Op_LShiftVL: {
1466       assert(tmp == xnoreg, "not used");
1467       vpsllvq(dst, src, shift, vlen_enc);
1468       break;
1469     }
1470     case Op_URShiftVL: {
1471       assert(tmp == xnoreg, "not used");
1472       vpsrlvq(dst, src, shift, vlen_enc);
1473       break;
1474     }
1475     default: assert(false, "%s", NodeClassNames[opcode]);
1476   }
1477 }
1478 
1479 // Variable shift src by shift using vtmp as a TEMP, giving a word result in dst
1480 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1481   assert(opcode == Op_LShiftVB ||
1482          opcode == Op_RShiftVB ||
1483          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1484   bool sign = (opcode != Op_URShiftVB);
1485   assert(vector_len == 0, "required");
1486   vextendbd(sign, dst, src, 1);
1487   vpmovzxbd(vtmp, shift, 1);
1488   varshiftd(opcode, dst, dst, vtmp, 1);
1489   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1490   vextracti128_high(vtmp, dst);
1491   vpackusdw(dst, dst, vtmp, 0);
1492 }
1493 
1494 // Variable shift src by shift using vtmp as a TEMP, giving a byte result in dst
1495 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1496   assert(opcode == Op_LShiftVB ||
1497          opcode == Op_RShiftVB ||
1498          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1499   bool sign = (opcode != Op_URShiftVB);
1500   int ext_vector_len = vector_len + 1;
1501   vextendbw(sign, dst, src, ext_vector_len);
1502   vpmovzxbw(vtmp, shift, ext_vector_len);
1503   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1504   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1505   if (vector_len == 0) {
1506     vextracti128_high(vtmp, dst);
1507     vpackuswb(dst, dst, vtmp, vector_len);
1508   } else {
1509     vextracti64x4_high(vtmp, dst);
1510     vpackuswb(dst, dst, vtmp, vector_len);
1511     vpermq(dst, dst, 0xD8, vector_len);
1512   }
1513 }
1514 
1515 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1516   switch(typ) {
1517     case T_BYTE:
1518       pinsrb(dst, val, idx);
1519       break;
1520     case T_SHORT:
1521       pinsrw(dst, val, idx);
1522       break;
1523     case T_INT:
1524       pinsrd(dst, val, idx);
1525       break;
1526     case T_LONG:
1527       pinsrq(dst, val, idx);
1528       break;
1529     default:
1530       assert(false,"Should not reach here.");
1531       break;
1532   }
1533 }
1534 
1535 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1536   switch(typ) {
1537     case T_BYTE:
1538       vpinsrb(dst, src, val, idx);
1539       break;
1540     case T_SHORT:
1541       vpinsrw(dst, src, val, idx);
1542       break;
1543     case T_INT:
1544       vpinsrd(dst, src, val, idx);
1545       break;
1546     case T_LONG:
1547       vpinsrq(dst, src, val, idx);
1548       break;
1549     default:
1550       assert(false,"Should not reach here.");
1551       break;
1552   }
1553 }
1554 
1555 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1556   switch(typ) {
1557     case T_INT:
1558       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1559       break;
1560     case T_FLOAT:
1561       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1562       break;
1563     case T_LONG:
1564       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1565       break;
1566     case T_DOUBLE:
1567       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1568       break;
1569     default:
1570       assert(false,"Should not reach here.");
1571       break;
1572   }
1573 }
1574 
1575 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1576   switch(typ) {
1577     case T_INT:
1578       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1579       break;
1580     case T_FLOAT:
1581       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1582       break;
1583     case T_LONG:
1584       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1585       break;
1586     case T_DOUBLE:
1587       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1588       break;
1589     default:
1590       assert(false,"Should not reach here.");
1591       break;
1592   }
1593 }
1594 
1595 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1596   switch(typ) {
1597     case T_INT:
1598       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1599       break;
1600     case T_FLOAT:
1601       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1602       break;
1603     case T_LONG:
1604       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1605       break;
1606     case T_DOUBLE:
1607       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1608       break;
1609     default:
1610       assert(false,"Should not reach here.");
1611       break;
1612   }
1613 }
1614 
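// Turn a vector of booleans (one 0/1 byte per element) into a vector mask:
// 0 - x yields 0x00 or 0xFF per byte, which is then sign-extended so that
// every lane of the requested element type becomes all-zeros or all-ones.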
1615 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1616   if (vlen_in_bytes <= 16) {
1617     pxor (dst, dst);
1618     psubb(dst, src);
1619     switch (elem_bt) {
1620       case T_BYTE:   /* nothing to do */ break;
1621       case T_SHORT:  pmovsxbw(dst, dst); break;
1622       case T_INT:    pmovsxbd(dst, dst); break;
1623       case T_FLOAT:  pmovsxbd(dst, dst); break;
1624       case T_LONG:   pmovsxbq(dst, dst); break;
1625       case T_DOUBLE: pmovsxbq(dst, dst); break;
1626 
1627       default: assert(false, "%s", type2name(elem_bt));
1628     }
1629   } else {
1630     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1631     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1632 
1633     vpxor (dst, dst, dst, vlen_enc);
1634     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1635 
1636     switch (elem_bt) {
1637       case T_BYTE:   /* nothing to do */            break;
1638       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1639       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1640       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1641       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1642       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1643 
1644       default: assert(false, "%s", type2name(elem_bt));
1645     }
1646   }
1647 }
1648 
1649 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1650   if (novlbwdq) {
1651     vpmovsxbd(xtmp, src, vlen_enc);
1652     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1653             Assembler::eq, true, vlen_enc, noreg);
1654   } else {
1655     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1656     vpsubb(xtmp, xtmp, src, vlen_enc);
1657     evpmovb2m(dst, xtmp, vlen_enc);
1658   }
1659 }
1660 
1661 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) {
1662   switch (vlen_in_bytes) {
1663     case 4:  movdl(dst, src);   break;
1664     case 8:  movq(dst, src);    break;
1665     case 16: movdqu(dst, src);  break;
1666     case 32: vmovdqu(dst, src); break;
1667     case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1668     default: ShouldNotReachHere();
1669   }
1670 }
1671 
1672 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1673   assert(rscratch != noreg || always_reachable(src), "missing");
1674 
1675   if (reachable(src)) {
1676     load_vector(dst, as_Address(src), vlen_in_bytes);
1677   } else {
1678     lea(rscratch, src);
1679     load_vector(dst, Address(rscratch, 0), vlen_in_bytes);
1680   }
1681 }
1682 
1683 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1684   int vlen_enc = vector_length_encoding(vlen);
1685   if (VM_Version::supports_avx()) {
1686     if (bt == T_LONG) {
1687       if (VM_Version::supports_avx2()) {
1688         vpbroadcastq(dst, src, vlen_enc);
1689       } else {
1690         vmovddup(dst, src, vlen_enc);
1691       }
1692     } else if (bt == T_DOUBLE) {
1693       if (vlen_enc != Assembler::AVX_128bit) {
1694         vbroadcastsd(dst, src, vlen_enc, noreg);
1695       } else {
1696         vmovddup(dst, src, vlen_enc);
1697       }
1698     } else {
1699       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1700         vpbroadcastd(dst, src, vlen_enc);
1701       } else {
1702         vbroadcastss(dst, src, vlen_enc);
1703       }
1704     }
1705   } else if (VM_Version::supports_sse3()) {
1706     movddup(dst, src);
1707   } else {
1708     movq(dst, src);
1709     if (vlen == 16) {
1710       punpcklqdq(dst, dst);
1711     }
1712   }
1713 }
1714 
1715 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1716   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
1717   int offset = exact_log2(type2aelembytes(bt)) << 6;
1718   if (is_floating_point_type(bt)) {
1719     offset += 128;
1720   }
1721   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1722   load_vector(dst, addr, vlen_in_bytes);
1723 }
1724 
1725 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1726 
1727 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1728   int vector_len = Assembler::AVX_128bit;
1729 
1730   switch (opcode) {
1731     case Op_AndReductionV:  pand(dst, src); break;
1732     case Op_OrReductionV:   por (dst, src); break;
1733     case Op_XorReductionV:  pxor(dst, src); break;
1734     case Op_MinReductionV:
1735       switch (typ) {
1736         case T_BYTE:        pminsb(dst, src); break;
1737         case T_SHORT:       pminsw(dst, src); break;
1738         case T_INT:         pminsd(dst, src); break;
1739         case T_LONG:        assert(UseAVX > 2, "required");
1740                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1741         default:            assert(false, "wrong type");
1742       }
1743       break;
1744     case Op_MaxReductionV:
1745       switch (typ) {
1746         case T_BYTE:        pmaxsb(dst, src); break;
1747         case T_SHORT:       pmaxsw(dst, src); break;
1748         case T_INT:         pmaxsd(dst, src); break;
1749         case T_LONG:        assert(UseAVX > 2, "required");
1750                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1751         default:            assert(false, "wrong type");
1752       }
1753       break;
1754     case Op_AddReductionVF: addss(dst, src); break;
1755     case Op_AddReductionVD: addsd(dst, src); break;
1756     case Op_AddReductionVI:
1757       switch (typ) {
1758         case T_BYTE:        paddb(dst, src); break;
1759         case T_SHORT:       paddw(dst, src); break;
1760         case T_INT:         paddd(dst, src); break;
1761         default:            assert(false, "wrong type");
1762       }
1763       break;
1764     case Op_AddReductionVL: paddq(dst, src); break;
1765     case Op_MulReductionVF: mulss(dst, src); break;
1766     case Op_MulReductionVD: mulsd(dst, src); break;
1767     case Op_MulReductionVI:
1768       switch (typ) {
1769         case T_SHORT:       pmullw(dst, src); break;
1770         case T_INT:         pmulld(dst, src); break;
1771         default:            assert(false, "wrong type");
1772       }
1773       break;
1774     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1775                             evpmullq(dst, dst, src, vector_len); break;
1776     default:                assert(false, "wrong opcode");
1777   }
1778 }
1779 
1780 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1781   int vector_len = Assembler::AVX_256bit;
1782 
1783   switch (opcode) {
1784     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1785     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1786     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1787     case Op_MinReductionV:
1788       switch (typ) {
1789         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1790         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1791         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1792         case T_LONG:        assert(UseAVX > 2, "required");
1793                             vpminsq(dst, src1, src2, vector_len); break;
1794         default:            assert(false, "wrong type");
1795       }
1796       break;
1797     case Op_MaxReductionV:
1798       switch (typ) {
1799         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1800         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1801         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1802         case T_LONG:        assert(UseAVX > 2, "required");
1803                             vpmaxsq(dst, src1, src2, vector_len); break;
1804         default:            assert(false, "wrong type");
1805       }
1806       break;
1807     case Op_AddReductionVI:
1808       switch (typ) {
1809         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1810         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1811         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1812         default:            assert(false, "wrong type");
1813       }
1814       break;
1815     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1816     case Op_MulReductionVI:
1817       switch (typ) {
1818         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1819         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1820         default:            assert(false, "wrong type");
1821       }
1822       break;
1823     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
1824     default:                assert(false, "wrong opcode");
1825   }
1826 }
1827 
1828 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1829                                   XMMRegister dst, XMMRegister src,
1830                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1831   switch (opcode) {
1832     case Op_AddReductionVF:
1833     case Op_MulReductionVF:
1834       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1835       break;
1836 
1837     case Op_AddReductionVD:
1838     case Op_MulReductionVD:
1839       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1840       break;
1841 
1842     default: assert(false, "wrong opcode");
1843   }
1844 }
1845 
1846 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1847                              Register dst, Register src1, XMMRegister src2,
1848                              XMMRegister vtmp1, XMMRegister vtmp2) {
1849   switch (vlen) {
1850     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1851     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1852     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1853     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1854 
1855     default: assert(false, "wrong vector length");
1856   }
1857 }
1858 
1859 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1860                              Register dst, Register src1, XMMRegister src2,
1861                              XMMRegister vtmp1, XMMRegister vtmp2) {
1862   switch (vlen) {
1863     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1864     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1865     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1866     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1867 
1868     default: assert(false, "wrong vector length");
1869   }
1870 }
1871 
1872 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1873                              Register dst, Register src1, XMMRegister src2,
1874                              XMMRegister vtmp1, XMMRegister vtmp2) {
1875   switch (vlen) {
1876     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1877     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1878     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1879     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1880 
1881     default: assert(false, "wrong vector length");
1882   }
1883 }
1884 
1885 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1886                              Register dst, Register src1, XMMRegister src2,
1887                              XMMRegister vtmp1, XMMRegister vtmp2) {
1888   switch (vlen) {
1889     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1890     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1891     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1892     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1893 
1894     default: assert(false, "wrong vector length");
1895   }
1896 }
1897 
1898 #ifdef _LP64
1899 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1900                              Register dst, Register src1, XMMRegister src2,
1901                              XMMRegister vtmp1, XMMRegister vtmp2) {
1902   switch (vlen) {
1903     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1904     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1905     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1906 
1907     default: assert(false, "wrong vector length");
1908   }
1909 }
1910 #endif // _LP64
1911 
1912 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1913   switch (vlen) {
1914     case 2:
1915       assert(vtmp2 == xnoreg, "");
1916       reduce2F(opcode, dst, src, vtmp1);
1917       break;
1918     case 4:
1919       assert(vtmp2 == xnoreg, "");
1920       reduce4F(opcode, dst, src, vtmp1);
1921       break;
1922     case 8:
1923       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1924       break;
1925     case 16:
1926       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1927       break;
1928     default: assert(false, "wrong vector length");
1929   }
1930 }
1931 
1932 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1933   switch (vlen) {
1934     case 2:
1935       assert(vtmp2 == xnoreg, "");
1936       reduce2D(opcode, dst, src, vtmp1);
1937       break;
1938     case 4:
1939       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1940       break;
1941     case 8:
1942       reduce8D(opcode, dst, src, vtmp1, vtmp2);
1943       break;
1944     default: assert(false, "wrong vector length");
1945   }
1946 }
1947 
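// The integral reduceN<T> routines below all follow the same pattern:
// repeatedly fold the upper half of the vector onto the lower half
// (extract/shuffle plus the scalar reduce operation, or a horizontal add for
// Op_AddReductionVI), then combine the single remaining element with the
// incoming scalar src1 and move the result into dst. The float/double
// variants fold directly into dst, which already holds the scalar input.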
1948 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1949   if (opcode == Op_AddReductionVI) {
1950     if (vtmp1 != src2) {
1951       movdqu(vtmp1, src2);
1952     }
1953     phaddd(vtmp1, vtmp1);
1954   } else {
1955     pshufd(vtmp1, src2, 0x1);
1956     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1957   }
1958   movdl(vtmp2, src1);
1959   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1960   movdl(dst, vtmp1);
1961 }
1962 
1963 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1964   if (opcode == Op_AddReductionVI) {
1965     if (vtmp1 != src2) {
1966       movdqu(vtmp1, src2);
1967     }
1968     phaddd(vtmp1, src2);
1969     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1970   } else {
1971     pshufd(vtmp2, src2, 0xE);
1972     reduce_operation_128(T_INT, opcode, vtmp2, src2);
1973     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1974   }
1975 }
1976 
1977 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1978   if (opcode == Op_AddReductionVI) {
1979     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
1980     vextracti128_high(vtmp2, vtmp1);
1981     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
1982     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1983   } else {
1984     vextracti128_high(vtmp1, src2);
1985     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1986     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1987   }
1988 }
1989 
1990 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1991   vextracti64x4_high(vtmp2, src2);
1992   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
1993   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1994 }
1995 
1996 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1997   pshufd(vtmp2, src2, 0x1);
1998   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1999   movdqu(vtmp1, vtmp2);
2000   psrldq(vtmp1, 2);
2001   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2002   movdqu(vtmp2, vtmp1);
2003   psrldq(vtmp2, 1);
2004   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2005   movdl(vtmp2, src1);
2006   pmovsxbd(vtmp1, vtmp1);
2007   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2008   pextrb(dst, vtmp1, 0x0);
2009   movsbl(dst, dst);
2010 }
2011 
2012 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2013   pshufd(vtmp1, src2, 0xE);
2014   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2015   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2016 }
2017 
2018 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2019   vextracti128_high(vtmp2, src2);
2020   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2021   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2022 }
2023 
2024 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2025   vextracti64x4_high(vtmp1, src2);
2026   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2027   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2028 }
2029 
2030 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2031   pmovsxbw(vtmp2, src2);
2032   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2033 }
2034 
2035 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2036   if (UseAVX > 1) {
2037     int vector_len = Assembler::AVX_256bit;
2038     vpmovsxbw(vtmp1, src2, vector_len);
2039     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2040   } else {
2041     pmovsxbw(vtmp2, src2);
2042     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2043     pshufd(vtmp2, src2, 0xE); // bring the high 8 bytes into the low half
2044     pmovsxbw(vtmp2, vtmp2);
2045     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2046   }
2047 }
2048 
2049 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2050   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2051     int vector_len = Assembler::AVX_512bit;
2052     vpmovsxbw(vtmp1, src2, vector_len);
2053     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2054   } else {
2055     assert(UseAVX >= 2, "required");
2056     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2057     vextracti128_high(vtmp2, src2);
2058     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2059   }
2060 }
2061 
2062 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2063   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2064   vextracti64x4_high(vtmp2, src2);
2065   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2066 }
2067 
2068 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2069   if (opcode == Op_AddReductionVI) {
2070     if (vtmp1 != src2) {
2071       movdqu(vtmp1, src2);
2072     }
2073     phaddw(vtmp1, vtmp1);
2074     phaddw(vtmp1, vtmp1);
2075   } else {
2076     pshufd(vtmp2, src2, 0x1);
2077     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2078     movdqu(vtmp1, vtmp2);
2079     psrldq(vtmp1, 2);
2080     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2081   }
2082   movdl(vtmp2, src1);
2083   pmovsxwd(vtmp1, vtmp1);
2084   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2085   pextrw(dst, vtmp1, 0x0);
2086   movswl(dst, dst);
2087 }
2088 
2089 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2090   if (opcode == Op_AddReductionVI) {
2091     if (vtmp1 != src2) {
2092       movdqu(vtmp1, src2);
2093     }
2094     phaddw(vtmp1, src2);
2095   } else {
2096     pshufd(vtmp1, src2, 0xE);
2097     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2098   }
2099   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2100 }
2101 
2102 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2103   if (opcode == Op_AddReductionVI) {
2104     int vector_len = Assembler::AVX_256bit;
2105     vphaddw(vtmp2, src2, src2, vector_len);
2106     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2107   } else {
2108     vextracti128_high(vtmp2, src2);
2109     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2110   }
2111   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2112 }
2113 
2114 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2115   int vector_len = Assembler::AVX_256bit;
2116   vextracti64x4_high(vtmp1, src2);
2117   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2118   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2119 }
2120 
2121 #ifdef _LP64
2122 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2123   pshufd(vtmp2, src2, 0xE);
2124   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2125   movdq(vtmp1, src1);
2126   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2127   movdq(dst, vtmp1);
2128 }
2129 
2130 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2131   vextracti128_high(vtmp1, src2);
2132   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2133   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2134 }
2135 
2136 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2137   vextracti64x4_high(vtmp2, src2);
2138   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2139   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2140 }
2141 
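// Materialize a k-register mask with the low 'len' bits set (e.g. len == 5
// gives 0b11111): BZHI clears every bit of the all-ones temp at positions
// greater than or equal to len.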
2142 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
2143   mov64(temp, -1L);
2144   bzhiq(temp, temp, len);
2145   kmovql(dst, temp);
2146 }
2147 #endif // _LP64
2148 
2149 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2150   reduce_operation_128(T_FLOAT, opcode, dst, src);
2151   pshufd(vtmp, src, 0x1);
2152   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2153 }
2154 
2155 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2156   reduce2F(opcode, dst, src, vtmp);
2157   pshufd(vtmp, src, 0x2);
2158   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2159   pshufd(vtmp, src, 0x3);
2160   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2161 }
2162 
2163 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2164   reduce4F(opcode, dst, src, vtmp2);
2165   vextractf128_high(vtmp2, src);
2166   reduce4F(opcode, dst, vtmp2, vtmp1);
2167 }
2168 
2169 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2170   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2171   vextracti64x4_high(vtmp1, src);
2172   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2173 }
2174 
2175 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2176   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2177   pshufd(vtmp, src, 0xE);
2178   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2179 }
2180 
2181 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2182   reduce2D(opcode, dst, src, vtmp2);
2183   vextractf128_high(vtmp2, src);
2184   reduce2D(opcode, dst, vtmp2, vtmp1);
2185 }
2186 
2187 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2188   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2189   vextracti64x4_high(vtmp1, src);
2190   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2191 }
2192 
2193 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2194   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2195 }
2196 
2197 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2198   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2199 }
2200 
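// Masked vector load/store via vmaskmovps/vmaskmovpd. Integral and FP types
// of the same width share one form, since the masking only depends on the
// element size.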
2201 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2202                                  int vec_enc) {
2203   switch(elem_bt) {
2204     case T_INT:
2205     case T_FLOAT:
2206       vmaskmovps(dst, src, mask, vec_enc);
2207       break;
2208     case T_LONG:
2209     case T_DOUBLE:
2210       vmaskmovpd(dst, src, mask, vec_enc);
2211       break;
2212     default:
2213       fatal("Unsupported type %s", type2name(elem_bt));
2214       break;
2215   }
2216 }
2217 
2218 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2219                                  int vec_enc) {
2220   switch(elem_bt) {
2221     case T_INT:
2222     case T_FLOAT:
2223       vmaskmovps(dst, src, mask, vec_enc);
2224       break;
2225     case T_LONG:
2226     case T_DOUBLE:
2227       vmaskmovpd(dst, src, mask, vec_enc);
2228       break;
2229     default:
2230       fatal("Unsupported type %s", type2name(elem_bt));
2231       break;
2232   }
2233 }
2234 
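// Min/max reduction for a float vector: log2(vlen) folding passes, each
// combining two halves with vminmax_fp so its NaN/-0.0 handling is preserved
// at every step. Passes above 128 bits extract the upper half; the in-lane
// passes shuffle with vpermilps (immediates 0x0E and 1). When is_dst_valid,
// the value already in dst is folded in as a final step.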
2235 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2236                                           XMMRegister dst, XMMRegister src,
2237                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2238                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2239   const int permconst[] = {1, 14};
2240   XMMRegister wsrc = src;
2241   XMMRegister wdst = xmm_0;
2242   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2243 
2244   int vlen_enc = Assembler::AVX_128bit;
2245   if (vlen == 16) {
2246     vlen_enc = Assembler::AVX_256bit;
2247   }
2248 
2249   for (int i = log2(vlen) - 1; i >= 0; i--) {
2250     if (i == 0 && !is_dst_valid) {
2251       wdst = dst;
2252     }
2253     if (i == 3) {
2254       vextracti64x4_high(wtmp, wsrc);
2255     } else if (i == 2) {
2256       vextracti128_high(wtmp, wsrc);
2257     } else { // i = [0,1]
2258       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2259     }
2260     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2261     wsrc = wdst;
2262     vlen_enc = Assembler::AVX_128bit;
2263   }
2264   if (is_dst_valid) {
2265     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2266   }
2267 }
2268 
2269 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2270                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2271                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2272   XMMRegister wsrc = src;
2273   XMMRegister wdst = xmm_0;
2274   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2275   int vlen_enc = Assembler::AVX_128bit;
2276   if (vlen == 8) {
2277     vlen_enc = Assembler::AVX_256bit;
2278   }
2279   for (int i = log2(vlen) - 1; i >= 0; i--) {
2280     if (i == 0 && !is_dst_valid) {
2281       wdst = dst;
2282     }
2283     if (i == 1) {
2284       vextracti128_high(wtmp, wsrc);
2285     } else if (i == 2) {
2286       vextracti64x4_high(wtmp, wsrc);
2287     } else {
2288       assert(i == 0, "%d", i);
2289       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2290     }
2291     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2292     wsrc = wdst;
2293     vlen_enc = Assembler::AVX_128bit;
2294   }
2295   if (is_dst_valid) {
2296     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2297   }
2298 }
2299 
2300 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2301   switch (bt) {
2302     case T_BYTE:  pextrb(dst, src, idx); break;
2303     case T_SHORT: pextrw(dst, src, idx); break;
2304     case T_INT:   pextrd(dst, src, idx); break;
2305     case T_LONG:  pextrq(dst, src, idx); break;
2306 
2307     default:
2308       assert(false,"Should not reach here.");
2309       break;
2310   }
2311 }
2312 
2313 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2314   int esize =  type2aelembytes(typ);
2315   int elem_per_lane = 16/esize;
2316   int lane = elemindex / elem_per_lane;
2317   int eindex = elemindex % elem_per_lane;
2318 
2319   if (lane >= 2) {
2320     assert(UseAVX > 2, "required");
2321     vextractf32x4(dst, src, lane & 3);
2322     return dst;
2323   } else if (lane > 0) {
2324     assert(UseAVX > 0, "required");
2325     vextractf128(dst, src, lane);
2326     return dst;
2327   } else {
2328     return src;
2329   }
2330 }
2331 
2332 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2333   if (typ == T_BYTE) {
2334     movsbl(dst, dst);
2335   } else if (typ == T_SHORT) {
2336     movswl(dst, dst);
2337   }
2338 }
2339 
2340 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2341   int esize =  type2aelembytes(typ);
2342   int elem_per_lane = 16/esize;
2343   int eindex = elemindex % elem_per_lane;
2344   assert(is_integral_type(typ),"required");
2345 
2346   if (eindex == 0) {
2347     if (typ == T_LONG) {
2348       movq(dst, src);
2349     } else {
2350       movdl(dst, src);
2351       movsxl(typ, dst);
2352     }
2353   } else {
2354     extract(typ, dst, src, eindex);
2355     movsxl(typ, dst);
2356   }
2357 }
2358 
2359 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2360   int esize =  type2aelembytes(typ);
2361   int elem_per_lane = 16/esize;
2362   int eindex = elemindex % elem_per_lane;
2363   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2364 
2365   if (eindex == 0) {
2366     movq(dst, src);
2367   } else {
2368     if (typ == T_FLOAT) {
2369       if (UseAVX == 0) {
2370         movdqu(dst, src);
2371         shufps(dst, dst, eindex);
2372       } else {
2373         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2374       }
2375     } else {
2376       if (UseAVX == 0) {
2377         movdqu(dst, src);
2378         psrldq(dst, eindex*esize);
2379       } else {
2380         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2381       }
2382       movq(dst, dst);
2383     }
2384   }
2385   // Zero upper bits
2386   if (typ == T_FLOAT) {
2387     if (UseAVX == 0) {
2388       assert(vtmp != xnoreg, "required.");
2389       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2390       pand(dst, vtmp);
2391     } else {
2392       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2393     }
2394   }
2395 }
2396 
2397 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2398   switch(typ) {
2399     case T_BYTE:
2400     case T_BOOLEAN:
2401       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2402       break;
2403     case T_SHORT:
2404     case T_CHAR:
2405       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2406       break;
2407     case T_INT:
2408     case T_FLOAT:
2409       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2410       break;
2411     case T_LONG:
2412     case T_DOUBLE:
2413       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2414       break;
2415     default:
2416       assert(false,"Should not reach here.");
2417       break;
2418   }
2419 }
2420 
2421 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2422   assert(rscratch != noreg || always_reachable(src2), "missing");
2423 
2424   switch(typ) {
2425     case T_BOOLEAN:
2426     case T_BYTE:
2427       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2428       break;
2429     case T_CHAR:
2430     case T_SHORT:
2431       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2432       break;
2433     case T_INT:
2434     case T_FLOAT:
2435       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2436       break;
2437     case T_LONG:
2438     case T_DOUBLE:
2439       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2440       break;
2441     default:
2442       assert(false,"Should not reach here.");
2443       break;
2444   }
2445 }
2446 
2447 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2448   switch(typ) {
2449     case T_BYTE:
2450       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2451       break;
2452     case T_SHORT:
2453       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2454       break;
2455     case T_INT:
2456     case T_FLOAT:
2457       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2458       break;
2459     case T_LONG:
2460     case T_DOUBLE:
2461       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2462       break;
2463     default:
2464       assert(false,"Should not reach here.");
2465       break;
2466   }
2467 }
2468 
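// Sets the condition flags by testing src1 against src2 with ptest/vptest
// (or vtestps when the elements are at least 4 bytes wide). Vectors shorter
// than 16 bytes are first widened by replicating their low dwords into vtmp
// so that the undefined upper bytes cannot influence the result.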
2469 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2470   assert(vlen_in_bytes <= 32, "");
2471   int esize = type2aelembytes(bt);
2472   if (vlen_in_bytes == 32) {
2473     assert(vtmp == xnoreg, "required.");
2474     if (esize >= 4) {
2475       vtestps(src1, src2, AVX_256bit);
2476     } else {
2477       vptest(src1, src2, AVX_256bit);
2478     }
2479     return;
2480   }
2481   if (vlen_in_bytes < 16) {
2482     // Duplicate the lower part to fill the whole register;
2483     // there is no need to do so for src2.
2484     assert(vtmp != xnoreg, "required");
2485     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2486     pshufd(vtmp, src1, shuffle_imm);
2487   } else {
2488     assert(vtmp == xnoreg, "required");
2489     vtmp = src1;
2490   }
2491   if (esize >= 4 && VM_Version::supports_avx()) {
2492     vtestps(vtmp, src2, AVX_128bit);
2493   } else {
2494     ptest(vtmp, src2);
2495   }
2496 }
2497 
2498 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2499   assert(UseAVX >= 2, "required");
2500 #ifdef ASSERT
2501   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2502   bool is_bw_supported = VM_Version::supports_avx512bw();
2503   if (is_bw && !is_bw_supported) {
2504     assert(vlen_enc != Assembler::AVX_512bit, "required");
2505     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2506            "XMM register should be 0-15");
2507   }
2508 #endif // ASSERT
2509   switch (elem_bt) {
2510     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2511     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2512     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2513     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2514     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2515     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2516     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2517   }
2518 }
2519 
2520 #ifdef _LP64
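// Broadcast a value held in a general-purpose register to every vector lane.
// With AVX-512 (plus BW/VL where the element type or vector length requires
// them) the EVEX GPR-source broadcasts are used directly; otherwise the value
// is first moved into the destination XMM register and broadcast from there,
// which restricts dst to xmm0-xmm15.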
2521 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2522   assert(UseAVX >= 2, "required");
2523   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2524   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2525   if ((UseAVX > 2) &&
2526       (!is_bw || VM_Version::supports_avx512bw()) &&
2527       (!is_vl || VM_Version::supports_avx512vl())) {
2528     switch (elem_bt) {
2529       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2530       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2531       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2532       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2533       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2534     }
2535   } else {
2536     assert(vlen_enc != Assembler::AVX_512bit, "required");
2537     assert((dst->encoding() < 16),"XMM register should be 0-15");
2538     switch (elem_bt) {
2539       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2540       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2541       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2542       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2543       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2544       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2545       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2546     }
2547   }
2548 }
2549 #endif
2550 
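// Sign-extend byte lanes to the requested element type. T_FLOAT and T_DOUBLE
// first widen the bytes to ints and then convert with vcvtdq2ps/vcvtdq2pd;
// the double case does the int widening at a narrower vector length because
// the conversion doubles each lane from 32 to 64 bits.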
2551 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2552   switch (to_elem_bt) {
2553     case T_SHORT:
2554       vpmovsxbw(dst, src, vlen_enc);
2555       break;
2556     case T_INT:
2557       vpmovsxbd(dst, src, vlen_enc);
2558       break;
2559     case T_FLOAT:
2560       vpmovsxbd(dst, src, vlen_enc);
2561       vcvtdq2ps(dst, dst, vlen_enc);
2562       break;
2563     case T_LONG:
2564       vpmovsxbq(dst, src, vlen_enc);
2565       break;
2566     case T_DOUBLE: {
2567       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2568       vpmovsxbd(dst, src, mid_vlen_enc);
2569       vcvtdq2pd(dst, dst, vlen_enc);
2570       break;
2571     }
2572     default:
2573       fatal("Unsupported type %s", type2name(to_elem_bt));
2574       break;
2575   }
2576 }
2577 
2578 //-------------------------------------------------------------------------------------------
2579 
2580 // IndexOf for constant substrings with size >= 8 chars
2581 // which don't need to be loaded through the stack.
2582 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2583                                          Register cnt1, Register cnt2,
2584                                          int int_cnt2,  Register result,
2585                                          XMMRegister vec, Register tmp,
2586                                          int ae) {
2587   ShortBranchVerifier sbv(this);
2588   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2589   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2590 
2591   // This method uses the pcmpestri instruction with bound registers
2592   //   inputs:
2593   //     xmm - substring
2594   //     rax - substring length (elements count)
2595   //     mem - scanned string
2596   //     rdx - string length (elements count)
2597   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2598   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2599   //   outputs:
2600   //     rcx - matched index in string
2601   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2602   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2603   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2604   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2605   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2606 
2607   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2608         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2609         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2610 
2611   // Note, inline_string_indexOf() generates checks:
2612   // if (substr.count > string.count) return -1;
2613   // if (substr.count == 0) return 0;
2614   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2615 
2616   // Load substring.
2617   if (ae == StrIntrinsicNode::UL) {
2618     pmovzxbw(vec, Address(str2, 0));
2619   } else {
2620     movdqu(vec, Address(str2, 0));
2621   }
2622   movl(cnt2, int_cnt2);
2623   movptr(result, str1); // string addr
2624 
2625   if (int_cnt2 > stride) {
2626     jmpb(SCAN_TO_SUBSTR);
2627 
2628     // Reload substr for rescan; this code
2629     // is executed only for large substrings (> 8 chars).
2630     bind(RELOAD_SUBSTR);
2631     if (ae == StrIntrinsicNode::UL) {
2632       pmovzxbw(vec, Address(str2, 0));
2633     } else {
2634       movdqu(vec, Address(str2, 0));
2635     }
2636     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2637 
2638     bind(RELOAD_STR);
2639     // We came here after the beginning of the substring was
2640     // matched but the rest of it was not, so we need to search
2641     // again. Start from the next element after the previous match.
2642 
2643     // cnt2 is the number of remaining substring elements and
2644     // cnt1 is the number of remaining string elements when the compare failed.
2645     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2646     subl(cnt1, cnt2);
2647     addl(cnt1, int_cnt2);
2648     movl(cnt2, int_cnt2); // Now restore cnt2
2649 
2650     decrementl(cnt1);     // Shift to next element
2651     cmpl(cnt1, cnt2);
2652     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2653 
2654     addptr(result, (1<<scale1));
2655 
2656   } // (int_cnt2 > 8)
2657 
2658   // Scan string for start of substr in 16-byte vectors
2659   bind(SCAN_TO_SUBSTR);
2660   pcmpestri(vec, Address(result, 0), mode);
2661   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2662   subl(cnt1, stride);
2663   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2664   cmpl(cnt1, cnt2);
2665   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2666   addptr(result, 16);
2667   jmpb(SCAN_TO_SUBSTR);
2668 
2669   // Found a potential substr
2670   bind(FOUND_CANDIDATE);
2671   // Matched whole vector if first element matched (tmp(rcx) == 0).
2672   if (int_cnt2 == stride) {
2673     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2674   } else { // int_cnt2 > 8
2675     jccb(Assembler::overflow, FOUND_SUBSTR);
2676   }
2677   // After pcmpestri tmp(rcx) contains matched element index
2678   // Compute start addr of substr
2679   lea(result, Address(result, tmp, scale1));
2680 
2681   // Make sure string is still long enough
2682   subl(cnt1, tmp);
2683   cmpl(cnt1, cnt2);
2684   if (int_cnt2 == stride) {
2685     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2686   } else { // int_cnt2 > 8
2687     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2688   }
2689   // Left less than substring.
2690 
2691   bind(RET_NOT_FOUND);
2692   movl(result, -1);
2693   jmp(EXIT);
2694 
2695   if (int_cnt2 > stride) {
2696     // This code is optimized for the case when whole substring
2697     // is matched if its head is matched.
2698     bind(MATCH_SUBSTR_HEAD);
2699     pcmpestri(vec, Address(result, 0), mode);
2700     // Reload only the string if it does not match
2701     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2702 
2703     Label CONT_SCAN_SUBSTR;
2704     // Compare the rest of substring (> 8 chars).
2705     bind(FOUND_SUBSTR);
2706     // First 8 chars are already matched.
2707     negptr(cnt2);
2708     addptr(cnt2, stride);
2709 
2710     bind(SCAN_SUBSTR);
2711     subl(cnt1, stride);
2712     cmpl(cnt2, -stride); // Do not read beyond substring
2713     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2714     // Back-up strings to avoid reading beyond substring:
2715     // cnt1 = cnt1 - cnt2 + 8
2716     addl(cnt1, cnt2); // cnt2 is negative
2717     addl(cnt1, stride);
2718     movl(cnt2, stride); negptr(cnt2);
2719     bind(CONT_SCAN_SUBSTR);
2720     if (int_cnt2 < (int)G) {
2721       int tail_off1 = int_cnt2<<scale1;
2722       int tail_off2 = int_cnt2<<scale2;
2723       if (ae == StrIntrinsicNode::UL) {
2724         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2725       } else {
2726         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2727       }
2728       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2729     } else {
2730       // calculate index in register to avoid integer overflow (int_cnt2*2)
2731       movl(tmp, int_cnt2);
2732       addptr(tmp, cnt2);
2733       if (ae == StrIntrinsicNode::UL) {
2734         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2735       } else {
2736         movdqu(vec, Address(str2, tmp, scale2, 0));
2737       }
2738       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2739     }
2740     // Need to reload strings pointers if not matched whole vector
2741     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2742     addptr(cnt2, stride);
2743     jcc(Assembler::negative, SCAN_SUBSTR);
2744     // Fall through if found full substring
2745 
2746   } // (int_cnt2 > 8)
2747 
2748   bind(RET_FOUND);
2749   // Found result if we matched full small substring.
2750   // Compute substr offset
2751   subptr(result, str1);
2752   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2753     shrl(result, 1); // index
2754   }
2755   bind(EXIT);
2756 
2757 } // string_indexofC8
2758 
2759 // Small strings are loaded through the stack if they cross a page boundary.
2760 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2761                                        Register cnt1, Register cnt2,
2762                                        int int_cnt2,  Register result,
2763                                        XMMRegister vec, Register tmp,
2764                                        int ae) {
2765   ShortBranchVerifier sbv(this);
2766   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2767   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2768 
2769   //
2770   // int_cnt2 is length of small (< 8 chars) constant substring
2771   // or (-1) for non constant substring in which case its length
2772   // is in cnt2 register.
2773   //
2774   // Note, inline_string_indexOf() generates checks:
2775   // if (substr.count > string.count) return -1;
2776   // if (substr.count == 0) return 0;
2777   //
2778   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2779   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2780   // This method uses the pcmpestri instruction with bound registers
2781   //   inputs:
2782   //     xmm - substring
2783   //     rax - substring length (elements count)
2784   //     mem - scanned string
2785   //     rdx - string length (elements count)
2786   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2787   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2788   //   outputs:
2789   //     rcx - matched index in string
2790   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2791   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2792   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2793   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2794 
2795   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2796         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2797         FOUND_CANDIDATE;
2798 
2799   { //========================================================
2800     // We don't know where these strings are located
2801     // and we can't read beyond them. Load them through stack.
2802     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2803 
2804     movptr(tmp, rsp); // save old SP
2805 
2806     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2807       if (int_cnt2 == (1>>scale2)) { // One byte
2808         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2809         load_unsigned_byte(result, Address(str2, 0));
2810         movdl(vec, result); // move 32 bits
2811       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2812         // Not enough header space in 32-bit VM: 12+3 = 15.
2813         movl(result, Address(str2, -1));
2814         shrl(result, 8);
2815         movdl(vec, result); // move 32 bits
2816       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2817         load_unsigned_short(result, Address(str2, 0));
2818         movdl(vec, result); // move 32 bits
2819       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2820         movdl(vec, Address(str2, 0)); // move 32 bits
2821       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2822         movq(vec, Address(str2, 0));  // move 64 bits
2823       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
2824         // Array header size is 12 bytes in 32-bit VM
2825         // + 6 bytes for 3 chars == 18 bytes,
2826         // enough space to load vec and shift.
2827         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2828         if (ae == StrIntrinsicNode::UL) {
2829           int tail_off = int_cnt2-8;
2830           pmovzxbw(vec, Address(str2, tail_off));
2831           psrldq(vec, -2*tail_off);
2832         }
2833         else {
2834           int tail_off = int_cnt2*(1<<scale2);
2835           movdqu(vec, Address(str2, tail_off-16));
2836           psrldq(vec, 16-tail_off);
2837         }
2838       }
2839     } else { // not constant substring
2840       cmpl(cnt2, stride);
2841       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2842 
2843       // We can read beyond the string if str+16 does not cross a page boundary
2844       // since heaps are aligned and mapped by pages.
2845       assert(os::vm_page_size() < (int)G, "default page should be small");
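           // Worked example, assuming a 4K page: if (str2 & 0xfff) <= 0xff0, then
           // str2 .. str2+15 stays within one page, so reading 16 bytes from str2 cannot
           // fault even if the substring is shorter than that.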
2846       movl(result, str2); // We need only low 32 bits
2847       andl(result, ((int)os::vm_page_size()-1));
2848       cmpl(result, ((int)os::vm_page_size()-16));
2849       jccb(Assembler::belowEqual, CHECK_STR);
2850 
2851       // Move small strings to the stack to allow loading 16 bytes into vec.
2852       subptr(rsp, 16);
2853       int stk_offset = wordSize-(1<<scale2);
2854       push(cnt2);
2855 
2856       bind(COPY_SUBSTR);
2857       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2858         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2859         movb(Address(rsp, cnt2, scale2, stk_offset), result);
2860       } else if (ae == StrIntrinsicNode::UU) {
2861         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2862         movw(Address(rsp, cnt2, scale2, stk_offset), result);
2863       }
2864       decrement(cnt2);
2865       jccb(Assembler::notZero, COPY_SUBSTR);
2866 
2867       pop(cnt2);
2868       movptr(str2, rsp);  // New substring address
2869     } // non constant
2870 
2871     bind(CHECK_STR);
2872     cmpl(cnt1, stride);
2873     jccb(Assembler::aboveEqual, BIG_STRINGS);
2874 
2875     // Check cross page boundary.
2876     movl(result, str1); // We need only low 32 bits
2877     andl(result, ((int)os::vm_page_size()-1));
2878     cmpl(result, ((int)os::vm_page_size()-16));
2879     jccb(Assembler::belowEqual, BIG_STRINGS);
2880 
2881     subptr(rsp, 16);
2882     int stk_offset = -(1<<scale1);
2883     if (int_cnt2 < 0) { // not constant
2884       push(cnt2);
2885       stk_offset += wordSize;
2886     }
2887     movl(cnt2, cnt1);
2888 
2889     bind(COPY_STR);
2890     if (ae == StrIntrinsicNode::LL) {
2891       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
2892       movb(Address(rsp, cnt2, scale1, stk_offset), result);
2893     } else {
2894       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
2895       movw(Address(rsp, cnt2, scale1, stk_offset), result);
2896     }
2897     decrement(cnt2);
2898     jccb(Assembler::notZero, COPY_STR);
2899 
2900     if (int_cnt2 < 0) { // not constant
2901       pop(cnt2);
2902     }
2903     movptr(str1, rsp);  // New string address
2904 
2905     bind(BIG_STRINGS);
2906     // Load substring.
2907     if (int_cnt2 < 0) { // -1
2908       if (ae == StrIntrinsicNode::UL) {
2909         pmovzxbw(vec, Address(str2, 0));
2910       } else {
2911         movdqu(vec, Address(str2, 0));
2912       }
2913       push(cnt2);       // substr count
2914       push(str2);       // substr addr
2915       push(str1);       // string addr
2916     } else {
2917       // Small (< 8 chars) constant substrings are loaded already.
2918       movl(cnt2, int_cnt2);
2919     }
2920     push(tmp);  // original SP
2921 
2922   } // Finished loading
2923 
2924   //========================================================
2925   // Start search
2926   //
2927 
2928   movptr(result, str1); // string addr
2929 
2930   if (int_cnt2  < 0) {  // Only for non constant substring
2931     jmpb(SCAN_TO_SUBSTR);
2932 
2933     // SP saved at sp+0
2934     // String saved at sp+1*wordSize
2935     // Substr saved at sp+2*wordSize
2936     // Substr count saved at sp+3*wordSize
2937 
2938     // Reload substr for rescan, this code
2939     // is executed only for large substrings (> 8 chars)
2940     bind(RELOAD_SUBSTR);
2941     movptr(str2, Address(rsp, 2*wordSize));
2942     movl(cnt2, Address(rsp, 3*wordSize));
2943     if (ae == StrIntrinsicNode::UL) {
2944       pmovzxbw(vec, Address(str2, 0));
2945     } else {
2946       movdqu(vec, Address(str2, 0));
2947     }
2948     // We came here after the beginning of the substring was
2949     // matched but the rest of it was not so we need to search
2950     // again. Start from the next element after the previous match.
2951     subptr(str1, result); // Restore counter
2952     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2953       shrl(str1, 1);
2954     }
2955     addl(cnt1, str1);
2956     decrementl(cnt1);   // Shift to next element
2957     cmpl(cnt1, cnt2);
2958     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2959 
2960     addptr(result, (1<<scale1));
2961   } // non constant
2962 
2963   // Scan string for start of substr in 16-byte vectors
2964   bind(SCAN_TO_SUBSTR);
2965   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2966   pcmpestri(vec, Address(result, 0), mode);
2967   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2968   subl(cnt1, stride);
2969   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2970   cmpl(cnt1, cnt2);
2971   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2972   addptr(result, 16);
2973 
2974   bind(ADJUST_STR);
2975   cmpl(cnt1, stride); // Do not read beyond string
2976   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2977   // Back-up string to avoid reading beyond string.
2978   lea(result, Address(result, cnt1, scale1, -16));
2979   movl(cnt1, stride);
2980   jmpb(SCAN_TO_SUBSTR);
2981 
2982   // Found a potential substr
2983   bind(FOUND_CANDIDATE);
2984   // After pcmpestri tmp(rcx) contains matched element index
2985 
2986   // Make sure string is still long enough
2987   subl(cnt1, tmp);
2988   cmpl(cnt1, cnt2);
2989   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
2990   // Left less than substring.
2991 
2992   bind(RET_NOT_FOUND);
2993   movl(result, -1);
2994   jmp(CLEANUP);
2995 
2996   bind(FOUND_SUBSTR);
2997   // Compute start addr of substr
2998   lea(result, Address(result, tmp, scale1));
2999   if (int_cnt2 > 0) { // Constant substring
3000     // Repeat search for small substring (< 8 chars)
3001     // from new point without reloading substring.
3002     // Have to check that we don't read beyond string.
3003     cmpl(tmp, stride-int_cnt2);
3004     jccb(Assembler::greater, ADJUST_STR);
3005     // Fall through if matched whole substring.
3006   } else { // non constant
3007     assert(int_cnt2 == -1, "should be != 0");
3008 
3009     addl(tmp, cnt2);
3010     // Found result if we matched whole substring.
3011     cmpl(tmp, stride);
3012     jcc(Assembler::lessEqual, RET_FOUND);
3013 
3014     // Repeat search for small substring (<= 8 chars)
3015     // from new point 'str1' without reloading substring.
3016     cmpl(cnt2, stride);
3017     // Have to check that we don't read beyond string.
3018     jccb(Assembler::lessEqual, ADJUST_STR);
3019 
3020     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3021     // Compare the rest of substring (> 8 chars).
3022     movptr(str1, result);
3023 
3024     cmpl(tmp, cnt2);
3025     // First 8 chars are already matched.
3026     jccb(Assembler::equal, CHECK_NEXT);
3027 
3028     bind(SCAN_SUBSTR);
3029     pcmpestri(vec, Address(str1, 0), mode);
3030     // Need to reload strings pointers if not matched whole vector
3031     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3032 
3033     bind(CHECK_NEXT);
3034     subl(cnt2, stride);
3035     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3036     addptr(str1, 16);
3037     if (ae == StrIntrinsicNode::UL) {
3038       addptr(str2, 8);
3039     } else {
3040       addptr(str2, 16);
3041     }
3042     subl(cnt1, stride);
3043     cmpl(cnt2, stride); // Do not read beyond substring
3044     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3045     // Back-up strings to avoid reading beyond substring.
3046 
3047     if (ae == StrIntrinsicNode::UL) {
3048       lea(str2, Address(str2, cnt2, scale2, -8));
3049       lea(str1, Address(str1, cnt2, scale1, -16));
3050     } else {
3051       lea(str2, Address(str2, cnt2, scale2, -16));
3052       lea(str1, Address(str1, cnt2, scale1, -16));
3053     }
3054     subl(cnt1, cnt2);
3055     movl(cnt2, stride);
3056     addl(cnt1, stride);
3057     bind(CONT_SCAN_SUBSTR);
3058     if (ae == StrIntrinsicNode::UL) {
3059       pmovzxbw(vec, Address(str2, 0));
3060     } else {
3061       movdqu(vec, Address(str2, 0));
3062     }
3063     jmp(SCAN_SUBSTR);
3064 
3065     bind(RET_FOUND_LONG);
3066     movptr(str1, Address(rsp, wordSize));
3067   } // non constant
3068 
3069   bind(RET_FOUND);
3070   // Compute substr offset
3071   subptr(result, str1);
3072   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3073     shrl(result, 1); // index
3074   }
3075   bind(CLEANUP);
3076   pop(rsp); // restore SP
3077 
3078 } // string_indexof
3079 
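     // A minimal scalar sketch of what the char-search stubs below compute (illustrative
     // only; 'indexof_char_ref' is our name):
     //
     //   static int indexof_char_ref(const jchar* s, int len, jchar ch) {
     //     for (int i = 0; i < len; i++) {
     //       if (s[i] == ch) return i;   // index in elements
     //     }
     //     return -1;
     //   }
     //
     // The UTF-16 variant below vectorizes the loop with pcmpeqw, the Latin-1 variant
     // with pcmpeqb, both combined with ptest/vptest to detect a hit per vector.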
3080 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3081                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3082   ShortBranchVerifier sbv(this);
3083   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3084 
3085   int stride = 8;
3086 
3087   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3088         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3089         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3090         FOUND_SEQ_CHAR, DONE_LABEL;
3091 
3092   movptr(result, str1);
3093   if (UseAVX >= 2) {
3094     cmpl(cnt1, stride);
3095     jcc(Assembler::less, SCAN_TO_CHAR);
3096     cmpl(cnt1, 2*stride);
3097     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3098     movdl(vec1, ch);
3099     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3100     vpxor(vec2, vec2);
3101     movl(tmp, cnt1);
3102     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3103     andl(cnt1,0x0000000F);  //tail count (in chars)
3104 
3105     bind(SCAN_TO_16_CHAR_LOOP);
3106     vmovdqu(vec3, Address(result, 0));
3107     vpcmpeqw(vec3, vec3, vec1, 1);
3108     vptest(vec2, vec3);
3109     jcc(Assembler::carryClear, FOUND_CHAR);
3110     addptr(result, 32);
3111     subl(tmp, 2*stride);
3112     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3113     jmp(SCAN_TO_8_CHAR);
3114     bind(SCAN_TO_8_CHAR_INIT);
3115     movdl(vec1, ch);
3116     pshuflw(vec1, vec1, 0x00);
3117     pshufd(vec1, vec1, 0);
3118     pxor(vec2, vec2);
3119   }
3120   bind(SCAN_TO_8_CHAR);
3121   cmpl(cnt1, stride);
3122   jcc(Assembler::less, SCAN_TO_CHAR);
3123   if (UseAVX < 2) {
3124     movdl(vec1, ch);
3125     pshuflw(vec1, vec1, 0x00);
3126     pshufd(vec1, vec1, 0);
3127     pxor(vec2, vec2);
3128   }
3129   movl(tmp, cnt1);
3130   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3131   andl(cnt1,0x00000007);  //tail count (in chars)
3132 
3133   bind(SCAN_TO_8_CHAR_LOOP);
3134   movdqu(vec3, Address(result, 0));
3135   pcmpeqw(vec3, vec1);
3136   ptest(vec2, vec3);
3137   jcc(Assembler::carryClear, FOUND_CHAR);
3138   addptr(result, 16);
3139   subl(tmp, stride);
3140   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3141   bind(SCAN_TO_CHAR);
3142   testl(cnt1, cnt1);
3143   jcc(Assembler::zero, RET_NOT_FOUND);
3144   bind(SCAN_TO_CHAR_LOOP);
3145   load_unsigned_short(tmp, Address(result, 0));
3146   cmpl(ch, tmp);
3147   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3148   addptr(result, 2);
3149   subl(cnt1, 1);
3150   jccb(Assembler::zero, RET_NOT_FOUND);
3151   jmp(SCAN_TO_CHAR_LOOP);
3152 
3153   bind(RET_NOT_FOUND);
3154   movl(result, -1);
3155   jmpb(DONE_LABEL);
3156 
3157   bind(FOUND_CHAR);
3158   if (UseAVX >= 2) {
3159     vpmovmskb(tmp, vec3);
3160   } else {
3161     pmovmskb(tmp, vec3);
3162   }
3163   bsfl(ch, tmp);
3164   addptr(result, ch);
3165 
3166   bind(FOUND_SEQ_CHAR);
3167   subptr(result, str1);
3168   shrl(result, 1);
3169 
3170   bind(DONE_LABEL);
3171 } // string_indexof_char
3172 
3173 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3174                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3175   ShortBranchVerifier sbv(this);
3176   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3177 
3178   int stride = 16;
3179 
3180   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3181         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3182         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3183         FOUND_SEQ_CHAR, DONE_LABEL;
3184 
3185   movptr(result, str1);
3186   if (UseAVX >= 2) {
3187     cmpl(cnt1, stride);
3188     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3189     cmpl(cnt1, stride*2);
3190     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3191     movdl(vec1, ch);
3192     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3193     vpxor(vec2, vec2);
3194     movl(tmp, cnt1);
3195     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3196     andl(cnt1,0x0000001F);  //tail count (in chars)
3197 
3198     bind(SCAN_TO_32_CHAR_LOOP);
3199     vmovdqu(vec3, Address(result, 0));
3200     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3201     vptest(vec2, vec3);
3202     jcc(Assembler::carryClear, FOUND_CHAR);
3203     addptr(result, 32);
3204     subl(tmp, stride*2);
3205     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3206     jmp(SCAN_TO_16_CHAR);
3207 
3208     bind(SCAN_TO_16_CHAR_INIT);
3209     movdl(vec1, ch);
3210     pxor(vec2, vec2);
3211     pshufb(vec1, vec2);
3212   }
3213 
3214   bind(SCAN_TO_16_CHAR);
3215   cmpl(cnt1, stride);
3216   jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left
3217   if (UseAVX < 2) {
3218     movdl(vec1, ch);
3219     pxor(vec2, vec2);
3220     pshufb(vec1, vec2);
3221   }
3222   movl(tmp, cnt1);
3223   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3224   andl(cnt1,0x0000000F);  //tail count (in bytes)
3225 
3226   bind(SCAN_TO_16_CHAR_LOOP);
3227   movdqu(vec3, Address(result, 0));
3228   pcmpeqb(vec3, vec1);
3229   ptest(vec2, vec3);
3230   jcc(Assembler::carryClear, FOUND_CHAR);
3231   addptr(result, 16);
3232   subl(tmp, stride);
3233   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...
3234 
3235   bind(SCAN_TO_CHAR_INIT);
3236   testl(cnt1, cnt1);
3237   jcc(Assembler::zero, RET_NOT_FOUND);
3238   bind(SCAN_TO_CHAR_LOOP);
3239   load_unsigned_byte(tmp, Address(result, 0));
3240   cmpl(ch, tmp);
3241   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3242   addptr(result, 1);
3243   subl(cnt1, 1);
3244   jccb(Assembler::zero, RET_NOT_FOUND);
3245   jmp(SCAN_TO_CHAR_LOOP);
3246 
3247   bind(RET_NOT_FOUND);
3248   movl(result, -1);
3249   jmpb(DONE_LABEL);
3250 
3251   bind(FOUND_CHAR);
3252   if (UseAVX >= 2) {
3253     vpmovmskb(tmp, vec3);
3254   } else {
3255     pmovmskb(tmp, vec3);
3256   }
3257   bsfl(ch, tmp);
3258   addptr(result, ch);
3259 
3260   bind(FOUND_SEQ_CHAR);
3261   subptr(result, str1);
3262 
3263   bind(DONE_LABEL);
3264 } // stringL_indexof_char
3265 
3266 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3267   switch (eltype) {
3268   case T_BOOLEAN: return sizeof(jboolean);
3269   case T_BYTE:  return sizeof(jbyte);
3270   case T_SHORT: return sizeof(jshort);
3271   case T_CHAR:  return sizeof(jchar);
3272   case T_INT:   return sizeof(jint);
3273   default:
3274     ShouldNotReachHere();
3275     return -1;
3276   }
3277 }
3278 
3279 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3280   switch (eltype) {
3281   // T_BOOLEAN used as surrogate for unsigned byte
3282   case T_BOOLEAN: movzbl(dst, src);   break;
3283   case T_BYTE:    movsbl(dst, src);   break;
3284   case T_SHORT:   movswl(dst, src);   break;
3285   case T_CHAR:    movzwl(dst, src);   break;
3286   case T_INT:     movl(dst, src);     break;
3287   default:
3288     ShouldNotReachHere();
3289   }
3290 }
3291 
3292 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3293   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3294 }
3295 
3296 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3297   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3298 }
3299 
3300 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3301   const int vlen = Assembler::AVX_256bit;
3302   switch (eltype) {
3303   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3304   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3305   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3306   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3307   case T_INT:
3308     // do nothing
3309     break;
3310   default:
3311     ShouldNotReachHere();
3312   }
3313 }
3314 
3315 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3316                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3317                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3318                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3319                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3320                                         BasicType eltype) {
3321   ShortBranchVerifier sbv(this);
3322   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3323   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3324   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3325 
3326   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3327         SHORT_UNROLLED_LOOP_EXIT,
3328         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3329         UNROLLED_VECTOR_LOOP_BEGIN,
3330         END;
3331   switch (eltype) {
3332   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3333   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3334   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3335   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3336   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3337   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3338   }
3339 
3340   // "Renaming" for readability of the code below
3341   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3342                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3343                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3344 
3345   const int elsize = arrays_hashcode_elsize(eltype);
3346 
3347   /*
3348     if (cnt1 >= 2) {
3349       if (cnt1 >= 32) {
3350         UNROLLED VECTOR LOOP
3351       }
3352       UNROLLED SCALAR LOOP
3353     }
3354     SINGLE SCALAR
3355    */
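       //
       // Scalar reference for the polynomial hash being computed (a sketch with our names;
       // 'widened' stands for the array elements after the per-eltype widening to jint):
       //
       //   static jint hash_ref(jint result, const jint* widened, int cnt) {
       //     for (int i = 0; i < cnt; i++) {
       //       result = 31 * result + widened[i];
       //     }
       //     return result;
       //   }
       //
       // The vector loop keeps four 8-lane partial sums (32 elements per iteration); each
       // lane is weighted by the matching entry of arrays_hashcode_powers_of_31() before
       // the final reduction into result.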
3356 
3357   cmpl(cnt1, 32);
3358   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3359 
3360   // cnt1 >= 32 && generate_vectorized_loop
3361   xorl(index, index);
3362 
3363   // vresult = IntVector.zero(I256);
3364   for (int idx = 0; idx < 4; idx++) {
3365     vpxor(vresult[idx], vresult[idx]);
3366   }
3367   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3368   Register bound = tmp2;
3369   Register next = tmp3;
3370   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3371   movl(next, Address(tmp2, 0));
3372   movdl(vnext, next);
3373   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3374 
3375   // index = 0;
3376   // bound = cnt1 & ~(32 - 1);
3377   movl(bound, cnt1);
3378   andl(bound, ~(32 - 1));
3379   // for (; index < bound; index += 32) {
3380   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3381   // result *= next;
3382   imull(result, next);
3383   // Loop fission to front-load the cost of fetching from memory; OOO execution
3384   // can then hopefully do a better job of prefetching.
3385   for (int idx = 0; idx < 4; idx++) {
3386     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3387   }
3388   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3389   for (int idx = 0; idx < 4; idx++) {
3390     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3391     arrays_hashcode_elvcast(vtmp[idx], eltype);
3392     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3393   }
3394   // index += 32;
3395   addl(index, 32);
3396   // index < bound;
3397   cmpl(index, bound);
3398   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3399   // }
3400 
3401   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3402   subl(cnt1, bound);
3403   // release bound
3404 
3405   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3406   for (int idx = 0; idx < 4; idx++) {
3407     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3408     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3409     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3410   }
3411   // result += vresult.reduceLanes(ADD);
3412   for (int idx = 0; idx < 4; idx++) {
3413     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3414   }
3415 
3416   // } else if (cnt1 < 32) {
3417 
3418   bind(SHORT_UNROLLED_BEGIN);
3419   // int i = 1;
3420   movl(index, 1);
3421   cmpl(index, cnt1);
3422   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3423 
3424   // for (; i < cnt1 ; i += 2) {
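       // Two scalar steps are folded together per iteration:
       //   h = (h*31 + a[i-1])*31 + a[i] = h*961 + 31*a[i-1] + a[i],
       // where 31*x is computed as (x << 5) - x.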
3425   bind(SHORT_UNROLLED_LOOP_BEGIN);
3426   movl(tmp3, 961);
3427   imull(result, tmp3);
3428   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3429   movl(tmp3, tmp2);
3430   shll(tmp3, 5);
3431   subl(tmp3, tmp2);
3432   addl(result, tmp3);
3433   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3434   addl(result, tmp3);
3435   addl(index, 2);
3436   cmpl(index, cnt1);
3437   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3438 
3439   // }
3440   // if (i >= cnt1) {
3441   bind(SHORT_UNROLLED_LOOP_EXIT);
3442   jccb(Assembler::greater, END);
3443   movl(tmp2, result);
3444   shll(result, 5);
3445   subl(result, tmp2);
3446   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3447   addl(result, tmp3);
3448   // }
3449   bind(END);
3450 
3451   BLOCK_COMMENT("} // arrays_hashcode");
3452 
3453 } // arrays_hashcode
3454 
3455 // helper function for string_compare
3456 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3457                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3458                                            Address::ScaleFactor scale2, Register index, int ae) {
3459   if (ae == StrIntrinsicNode::LL) {
3460     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3461     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3462   } else if (ae == StrIntrinsicNode::UU) {
3463     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3464     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3465   } else {
3466     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3467     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3468   }
3469 }
3470 
3471 // Compare strings, used for char[] and byte[].
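     // A minimal scalar sketch of the comparison (illustrative only; element widths depend
     // on the encoding 'ae', and the UL case negates the result at DONE_LABEL):
     //
     //   static int compare_ref(const jchar* s1, int len1, const jchar* s2, int len2) {
     //     int min_len = len1 < len2 ? len1 : len2;
     //     for (int i = 0; i < min_len; i++) {
     //       if (s1[i] != s2[i]) return s1[i] - s2[i];
     //     }
     //     return len1 - len2;
     //   }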
3472 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3473                                        Register cnt1, Register cnt2, Register result,
3474                                        XMMRegister vec1, int ae, KRegister mask) {
3475   ShortBranchVerifier sbv(this);
3476   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3477   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3478   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3479   int stride2x2 = 0x40;
3480   Address::ScaleFactor scale = Address::no_scale;
3481   Address::ScaleFactor scale1 = Address::no_scale;
3482   Address::ScaleFactor scale2 = Address::no_scale;
3483 
3484   if (ae != StrIntrinsicNode::LL) {
3485     stride2x2 = 0x20;
3486   }
3487 
3488   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3489     shrl(cnt2, 1);
3490   }
3491   // Compute the minimum of the string lengths and push the
3492   // difference of the string lengths onto the stack.
3493   // The minimum is computed with a conditional move.
3494   movl(result, cnt1);
3495   subl(cnt1, cnt2);
3496   push(cnt1);
3497   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3498 
3499   // Is the minimum length zero?
3500   testl(cnt2, cnt2);
3501   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3502   if (ae == StrIntrinsicNode::LL) {
3503     // Load first bytes
3504     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3505     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3506   } else if (ae == StrIntrinsicNode::UU) {
3507     // Load first characters
3508     load_unsigned_short(result, Address(str1, 0));
3509     load_unsigned_short(cnt1, Address(str2, 0));
3510   } else {
3511     load_unsigned_byte(result, Address(str1, 0));
3512     load_unsigned_short(cnt1, Address(str2, 0));
3513   }
3514   subl(result, cnt1);
3515   jcc(Assembler::notZero,  POP_LABEL);
3516 
3517   if (ae == StrIntrinsicNode::UU) {
3518     // Divide length by 2 to get number of chars
3519     shrl(cnt2, 1);
3520   }
3521   cmpl(cnt2, 1);
3522   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3523 
3524   // Check if the strings start at the same location and setup scale and stride
3525   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3526     cmpptr(str1, str2);
3527     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3528     if (ae == StrIntrinsicNode::LL) {
3529       scale = Address::times_1;
3530       stride = 16;
3531     } else {
3532       scale = Address::times_2;
3533       stride = 8;
3534     }
3535   } else {
3536     scale1 = Address::times_1;
3537     scale2 = Address::times_2;
3538     // scale not used
3539     stride = 8;
3540   }
3541 
3542   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3543     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3544     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3545     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3546     Label COMPARE_TAIL_LONG;
3547     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3548 
3549     int pcmpmask = 0x19;
3550     if (ae == StrIntrinsicNode::LL) {
3551       pcmpmask &= ~0x01;
3552     }
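         // 0x19 = 0b011001: unsigned words (01), 'equal each' aggregation (10), negative
         // polarity (01), so rcx reports the first index at which the elements differ;
         // clearing bit 0 (0x18) switches the element format to unsigned bytes.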
3553 
3554     // Set up to compare 16-char (32-byte) vectors,
3555     // starting from the first character again because it has an aligned address.
3556     if (ae == StrIntrinsicNode::LL) {
3557       stride2 = 32;
3558     } else {
3559       stride2 = 16;
3560     }
3561     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3562       adr_stride = stride << scale;
3563     } else {
3564       adr_stride1 = 8;  //stride << scale1;
3565       adr_stride2 = 16; //stride << scale2;
3566     }
3567 
3568     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3569     // rax and rdx are used by pcmpestri as elements counters
3570     movl(result, cnt2);
3571     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3572     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3573 
3574     // fast path : compare first 2 8-char vectors.
3575     bind(COMPARE_16_CHARS);
3576     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3577       movdqu(vec1, Address(str1, 0));
3578     } else {
3579       pmovzxbw(vec1, Address(str1, 0));
3580     }
3581     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3582     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3583 
3584     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3585       movdqu(vec1, Address(str1, adr_stride));
3586       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3587     } else {
3588       pmovzxbw(vec1, Address(str1, adr_stride1));
3589       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3590     }
3591     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3592     addl(cnt1, stride);
3593 
3594     // Compare the characters at index in cnt1
3595     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3596     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3597     subl(result, cnt2);
3598     jmp(POP_LABEL);
3599 
3600     // Setup the registers to start vector comparison loop
3601     bind(COMPARE_WIDE_VECTORS);
3602     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3603       lea(str1, Address(str1, result, scale));
3604       lea(str2, Address(str2, result, scale));
3605     } else {
3606       lea(str1, Address(str1, result, scale1));
3607       lea(str2, Address(str2, result, scale2));
3608     }
3609     subl(result, stride2);
3610     subl(cnt2, stride2);
3611     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3612     negptr(result);
3613 
3614     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
3615     bind(COMPARE_WIDE_VECTORS_LOOP);
3616 
3617 #ifdef _LP64
3618     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3619       cmpl(cnt2, stride2x2);
3620       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3621       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3622       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3623 
3624       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3625       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3626         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3627         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3628       } else {
3629         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3630         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3631       }
3632       kortestql(mask, mask);
3633       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3634       addptr(result, stride2x2);  // update since we already compared at this addr
3635       subl(cnt2, stride2x2);      // and sub the size too
3636       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3637 
3638       vpxor(vec1, vec1);
3639       jmpb(COMPARE_WIDE_TAIL);
3640     }//if (VM_Version::supports_avx512vlbw())
3641 #endif // _LP64
3642 
3643 
3644     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3645     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3646       vmovdqu(vec1, Address(str1, result, scale));
3647       vpxor(vec1, Address(str2, result, scale));
3648     } else {
3649       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3650       vpxor(vec1, Address(str2, result, scale2));
3651     }
3652     vptest(vec1, vec1);
3653     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3654     addptr(result, stride2);
3655     subl(cnt2, stride2);
3656     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3657     // clean upper bits of YMM registers
3658     vpxor(vec1, vec1);
3659 
3660     // compare wide vectors tail
3661     bind(COMPARE_WIDE_TAIL);
3662     testptr(result, result);
3663     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3664 
3665     movl(result, stride2);
3666     movl(cnt2, result);
3667     negptr(result);
3668     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3669 
3670     // Identify the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3671     bind(VECTOR_NOT_EQUAL);
3672     // clean upper bits of YMM registers
3673     vpxor(vec1, vec1);
3674     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3675       lea(str1, Address(str1, result, scale));
3676       lea(str2, Address(str2, result, scale));
3677     } else {
3678       lea(str1, Address(str1, result, scale1));
3679       lea(str2, Address(str2, result, scale2));
3680     }
3681     jmp(COMPARE_16_CHARS);
3682 
3683     // Compare tail chars, length between 1 and 15 chars
3684     bind(COMPARE_TAIL_LONG);
3685     movl(cnt2, result);
3686     cmpl(cnt2, stride);
3687     jcc(Assembler::less, COMPARE_SMALL_STR);
3688 
3689     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3690       movdqu(vec1, Address(str1, 0));
3691     } else {
3692       pmovzxbw(vec1, Address(str1, 0));
3693     }
3694     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3695     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3696     subptr(cnt2, stride);
3697     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3698     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3699       lea(str1, Address(str1, result, scale));
3700       lea(str2, Address(str2, result, scale));
3701     } else {
3702       lea(str1, Address(str1, result, scale1));
3703       lea(str2, Address(str2, result, scale2));
3704     }
3705     negptr(cnt2);
3706     jmpb(WHILE_HEAD_LABEL);
3707 
3708     bind(COMPARE_SMALL_STR);
3709   } else if (UseSSE42Intrinsics) {
3710     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3711     int pcmpmask = 0x19;
3712     // Set up to compare 8-char (16-byte) vectors,
3713     // starting from the first character again because it has an aligned address.
3714     movl(result, cnt2);
3715     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3716     if (ae == StrIntrinsicNode::LL) {
3717       pcmpmask &= ~0x01;
3718     }
3719     jcc(Assembler::zero, COMPARE_TAIL);
3720     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3721       lea(str1, Address(str1, result, scale));
3722       lea(str2, Address(str2, result, scale));
3723     } else {
3724       lea(str1, Address(str1, result, scale1));
3725       lea(str2, Address(str2, result, scale2));
3726     }
3727     negptr(result);
3728 
3729     // pcmpestri
3730     //   inputs:
3731     //     vec1- substring
3732     //     rax - negative string length (elements count)
3733     //     mem - scanned string
3734     //     rdx - string length (elements count)
3735     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3736     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3737     //   outputs:
3738     //     rcx - first mismatched element index
3739     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3740 
3741     bind(COMPARE_WIDE_VECTORS);
3742     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3743       movdqu(vec1, Address(str1, result, scale));
3744       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3745     } else {
3746       pmovzxbw(vec1, Address(str1, result, scale1));
3747       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3748     }
3749     // After pcmpestri cnt1(rcx) contains mismatched element index
3750 
3751     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3752     addptr(result, stride);
3753     subptr(cnt2, stride);
3754     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3755 
3756     // compare wide vectors tail
3757     testptr(result, result);
3758     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3759 
3760     movl(cnt2, stride);
3761     movl(result, stride);
3762     negptr(result);
3763     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3764       movdqu(vec1, Address(str1, result, scale));
3765       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3766     } else {
3767       pmovzxbw(vec1, Address(str1, result, scale1));
3768       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3769     }
3770     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3771 
3772     // Mismatched characters in the vectors
3773     bind(VECTOR_NOT_EQUAL);
3774     addptr(cnt1, result);
3775     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3776     subl(result, cnt2);
3777     jmpb(POP_LABEL);
3778 
3779     bind(COMPARE_TAIL); // limit is zero
3780     movl(cnt2, result);
3781     // Fallthru to tail compare
3782   }
3783   // Shift str2 and str1 to the end of the arrays, negate min
3784   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3785     lea(str1, Address(str1, cnt2, scale));
3786     lea(str2, Address(str2, cnt2, scale));
3787   } else {
3788     lea(str1, Address(str1, cnt2, scale1));
3789     lea(str2, Address(str2, cnt2, scale2));
3790   }
3791   decrementl(cnt2);  // first character was compared already
3792   negptr(cnt2);
3793 
3794   // Compare the rest of the elements
3795   bind(WHILE_HEAD_LABEL);
3796   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3797   subl(result, cnt1);
3798   jccb(Assembler::notZero, POP_LABEL);
3799   increment(cnt2);
3800   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3801 
3802   // Strings are equal up to min length.  Return the length difference.
3803   bind(LENGTH_DIFF_LABEL);
3804   pop(result);
3805   if (ae == StrIntrinsicNode::UU) {
3806     // Divide diff by 2 to get number of chars
3807     sarl(result, 1);
3808   }
3809   jmpb(DONE_LABEL);
3810 
3811 #ifdef _LP64
3812   if (VM_Version::supports_avx512vlbw()) {
3813 
3814     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3815 
3816     kmovql(cnt1, mask);
3817     notq(cnt1);
3818     bsfq(cnt2, cnt1);
3819     if (ae != StrIntrinsicNode::LL) {
3820       // Divide diff by 2 to get number of chars
3821       sarl(cnt2, 1);
3822     }
3823     addq(result, cnt2);
3824     if (ae == StrIntrinsicNode::LL) {
3825       load_unsigned_byte(cnt1, Address(str2, result));
3826       load_unsigned_byte(result, Address(str1, result));
3827     } else if (ae == StrIntrinsicNode::UU) {
3828       load_unsigned_short(cnt1, Address(str2, result, scale));
3829       load_unsigned_short(result, Address(str1, result, scale));
3830     } else {
3831       load_unsigned_short(cnt1, Address(str2, result, scale2));
3832       load_unsigned_byte(result, Address(str1, result, scale1));
3833     }
3834     subl(result, cnt1);
3835     jmpb(POP_LABEL);
3836   }//if (VM_Version::supports_avx512vlbw())
3837 #endif // _LP64
3838 
3839   // Discard the stored length difference
3840   bind(POP_LABEL);
3841   pop(cnt1);
3842 
3843   // That's it
3844   bind(DONE_LABEL);
3845   if(ae == StrIntrinsicNode::UL) {
3846     negl(result);
3847   }
3848 
3849 }
3850 
3851 // Search for Non-ASCII character (Negative byte value) in a byte array,
3852 // return the index of the first such character, otherwise the length
3853 // of the array segment searched.
3854 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3855 //   @IntrinsicCandidate
3856 //   public static int countPositives(byte[] ba, int off, int len) {
3857 //     for (int i = off; i < off + len; i++) {
3858 //       if (ba[i] < 0) {
3859 //         return i - off;
3860 //       }
3861 //     }
3862 //     return len;
3863 //   }
3864 void C2_MacroAssembler::count_positives(Register ary1, Register len,
3865   Register result, Register tmp1,
3866   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
3867   // rsi: byte array
3868   // rcx: len
3869   // rax: result
3870   ShortBranchVerifier sbv(this);
3871   assert_different_registers(ary1, len, result, tmp1);
3872   assert_different_registers(vec1, vec2);
3873   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
3874 
3875   movl(result, len); // copy
3876   // len == 0
3877   testl(len, len);
3878   jcc(Assembler::zero, DONE);
3879 
3880   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
3881     VM_Version::supports_avx512vlbw() &&
3882     VM_Version::supports_bmi2()) {
3883 
3884     Label test_64_loop, test_tail, BREAK_LOOP;
3885     movl(tmp1, len);
3886     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
3887 
3888     andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
3889     andl(len,  0xffffffc0); // vector count (in chars)
3890     jccb(Assembler::zero, test_tail);
3891 
3892     lea(ary1, Address(ary1, len, Address::times_1));
3893     negptr(len);
3894 
3895     bind(test_64_loop);
3896     // Check whether any of these 64 byte-sized elements are negative
3897     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
3898     kortestql(mask1, mask1);
3899     jcc(Assembler::notZero, BREAK_LOOP);
3900 
3901     addptr(len, 64);
3902     jccb(Assembler::notZero, test_64_loop);
3903 
3904     bind(test_tail);
3905     // bail out when there is nothing to be done
3906     testl(tmp1, -1);
3907     jcc(Assembler::zero, DONE);
3908 
3909 
3910     // Check the tail for the absence of negatives
3911     // ~(~0 << len) applied up to two times (for the 32-bit scenario)
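         // e.g. tmp1 == 5  ==>  ~(~0 << 5) == 0b11111, a mask covering the 5 tail bytes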
3912 #ifdef _LP64
3913     {
3914       Register tmp3_aliased = len;
3915       mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
3916       shlxq(tmp3_aliased, tmp3_aliased, tmp1);
3917       notq(tmp3_aliased);
3918       kmovql(mask2, tmp3_aliased);
3919     }
3920 #else
3921     Label k_init;
3922     jmp(k_init);
3923 
3924     // We cannot load 64 bits into a k register from a general-purpose register here,
3925     // so we place the data required to compose the mask into the instruction stream:
3926     // a 64-byte series of values 0..63 which is later compared against the tail
3927     // count held in the tmp1 register.
3928     // The result is a k register holding tmp1 consecutive 1s,
3929     // counting from the least significant bit.
3930     address tmp = pc();
3931     emit_int64(0x0706050403020100);
3932     emit_int64(0x0F0E0D0C0B0A0908);
3933     emit_int64(0x1716151413121110);
3934     emit_int64(0x1F1E1D1C1B1A1918);
3935     emit_int64(0x2726252423222120);
3936     emit_int64(0x2F2E2D2C2B2A2928);
3937     emit_int64(0x3736353433323130);
3938     emit_int64(0x3F3E3D3C3B3A3938);
3939 
3940     bind(k_init);
3941     lea(len, InternalAddress(tmp));
3942     // create mask to test for negative byte inside a vector
3943     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
3944     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
3945 
3946 #endif
3947     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
3948     ktestq(mask1, mask2);
3949     jcc(Assembler::zero, DONE);
3950 
3951     // do a full check for negative bytes in the tail
3952     movl(len, tmp1); // tmp1 holds the low 6 bits of the original len;
3953                      // ary1 already pointing to the right place
3954     jmpb(TAIL_START);
3955 
3956     bind(BREAK_LOOP);
3957     // At least one byte in the last 64 byte block was negative.
3958     // Set up to look at the last 64 bytes as if they were a tail
3959     lea(ary1, Address(ary1, len, Address::times_1));
3960     addptr(result, len);
3961     // Ignore the very last byte: if all the other bytes are positive,
3962     // the last one must be the negative one, so we can skip right to the
3963     // 2+1 byte end comparison at this point
3964     orl(result, 63);
3965     movl(len, 63);
3966     // Fallthru to tail compare
3967   } else {
3968 
3969     if (UseAVX >= 2 && UseSSE >= 2) {
3970       // With AVX2, use 32-byte vector compare
3971       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
3972 
3973       // Compare 32-byte vectors
3974       testl(len, 0xffffffe0);   // vector count (in bytes)
3975       jccb(Assembler::zero, TAIL_START);
3976 
3977       andl(len, 0xffffffe0);
3978       lea(ary1, Address(ary1, len, Address::times_1));
3979       negptr(len);
3980 
3981       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
3982       movdl(vec2, tmp1);
3983       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
3984 
3985       bind(COMPARE_WIDE_VECTORS);
3986       vmovdqu(vec1, Address(ary1, len, Address::times_1));
3987       vptest(vec1, vec2);
3988       jccb(Assembler::notZero, BREAK_LOOP);
3989       addptr(len, 32);
3990       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3991 
3992       testl(result, 0x0000001f);   // any bytes remaining?
3993       jcc(Assembler::zero, DONE);
3994 
3995       // Quick test using the already prepared vector mask
3996       movl(len, result);
3997       andl(len, 0x0000001f);
3998       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
3999       vptest(vec1, vec2);
4000       jcc(Assembler::zero, DONE);
4001       // There are zeros, jump to the tail to determine exactly where
4002       jmpb(TAIL_START);
4003 
4004       bind(BREAK_LOOP);
4005       // At least one byte in the last 32-byte vector is negative.
4006       // Set up to look at the last 32 bytes as if they were a tail
4007       lea(ary1, Address(ary1, len, Address::times_1));
4008       addptr(result, len);
4009       // Ignore the very last byte: if all the other bytes are positive,
4010       // the last one must be the negative one, so we can skip right to the
4011       // 2+1 byte end comparison at this point
4012       orl(result, 31);
4013       movl(len, 31);
4014       // Fallthru to tail compare
4015     } else if (UseSSE42Intrinsics) {
4016       // With SSE4.2, use double quad vector compare
4017       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4018 
4019       // Compare 16-byte vectors
4020       testl(len, 0xfffffff0);   // vector count (in bytes)
4021       jcc(Assembler::zero, TAIL_START);
4022 
4023       andl(len, 0xfffffff0);
4024       lea(ary1, Address(ary1, len, Address::times_1));
4025       negptr(len);
4026 
4027       movl(tmp1, 0x80808080);
4028       movdl(vec2, tmp1);
4029       pshufd(vec2, vec2, 0);
4030 
4031       bind(COMPARE_WIDE_VECTORS);
4032       movdqu(vec1, Address(ary1, len, Address::times_1));
4033       ptest(vec1, vec2);
4034       jccb(Assembler::notZero, BREAK_LOOP);
4035       addptr(len, 16);
4036       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4037 
4038       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4039       jcc(Assembler::zero, DONE);
4040 
4041       // Quick test using the already prepared vector mask
4042       movl(len, result);
4043       andl(len, 0x0000000f);   // tail count (in bytes)
4044       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4045       ptest(vec1, vec2);
4046       jcc(Assembler::zero, DONE);
4047       jmpb(TAIL_START);
4048 
4049       bind(BREAK_LOOP);
4050       // At least one byte in the last 16-byte vector is negative.
4051       // Set up and look at the last 16 bytes as if they were a tail
4052       lea(ary1, Address(ary1, len, Address::times_1));
4053       addptr(result, len);
4054       // Ignore the very last byte: if all the other bytes are positive,
4055       // the last one must be the negative one, so we can skip right to the
4056       // 2+1 byte end comparison at this point
4057       orl(result, 15);
4058       movl(len, 15);
4059       // Fallthru to tail compare
4060     }
4061   }
4062 
4063   bind(TAIL_START);
4064   // Compare 4-byte vectors
4065   andl(len, 0xfffffffc); // vector count (in bytes)
4066   jccb(Assembler::zero, COMPARE_CHAR);
4067 
4068   lea(ary1, Address(ary1, len, Address::times_1));
4069   negptr(len);
4070 
4071   bind(COMPARE_VECTORS);
4072   movl(tmp1, Address(ary1, len, Address::times_1));
4073   andl(tmp1, 0x80808080);
4074   jccb(Assembler::notZero, TAIL_ADJUST);
4075   addptr(len, 4);
4076   jccb(Assembler::notZero, COMPARE_VECTORS);
4077 
4078   // Compare trailing char (final 2-3 bytes), if any
4079   bind(COMPARE_CHAR);
4080 
4081   testl(result, 0x2);   // tail  char
4082   jccb(Assembler::zero, COMPARE_BYTE);
4083   load_unsigned_short(tmp1, Address(ary1, 0));
4084   andl(tmp1, 0x00008080);
4085   jccb(Assembler::notZero, CHAR_ADJUST);
4086   lea(ary1, Address(ary1, 2));
4087 
4088   bind(COMPARE_BYTE);
4089   testl(result, 0x1);   // tail  byte
4090   jccb(Assembler::zero, DONE);
4091   load_unsigned_byte(tmp1, Address(ary1, 0));
4092   testl(tmp1, 0x00000080);
4093   jccb(Assembler::zero, DONE);
4094   subptr(result, 1);
4095   jmpb(DONE);
4096 
4097   bind(TAIL_ADJUST);
4098   // there are negative bits in the last 4 byte block.
4099   // Adjust result and check the next three bytes
4100   addptr(result, len);
4101   orl(result, 3);
4102   lea(ary1, Address(ary1, len, Address::times_1));
4103   jmpb(COMPARE_CHAR);
4104 
4105   bind(CHAR_ADJUST);
4106   // We are looking at a char + optional byte tail, and found that one
4107   // of the bytes in the char is negative. Adjust the result, check the
4108   // first byte and readjust if needed.
4109   andl(result, 0xfffffffc);
4110   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4111   jccb(Assembler::notZero, DONE);
4112   addptr(result, 1);
4113 
4114   // That's it
4115   bind(DONE);
4116   if (UseAVX >= 2 && UseSSE >= 2) {
4117     // clean upper bits of YMM registers
4118     vpxor(vec1, vec1);
4119     vpxor(vec2, vec2);
4120   }
4121 }
4122 
4123 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
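     // On return, result holds 1 if the compared ranges are equal and 0 otherwise; limit is
     // consumed as the (negated) byte count for the wide loops, while result temporarily
     // carries the tail byte count before receiving the boolean answer.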
4124 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4125                                       Register limit, Register result, Register chr,
4126                                       XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
4127   ShortBranchVerifier sbv(this);
4128   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4129 
4130   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4131   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4132 
4133   if (is_array_equ) {
4134     // Check the input args
4135     cmpoop(ary1, ary2);
4136     jcc(Assembler::equal, TRUE_LABEL);
4137 
4138     // Need additional checks for arrays_equals.
4139     testptr(ary1, ary1);
4140     jcc(Assembler::zero, FALSE_LABEL);
4141     testptr(ary2, ary2);
4142     jcc(Assembler::zero, FALSE_LABEL);
4143 
4144     // Check the lengths
4145     movl(limit, Address(ary1, length_offset));
4146     cmpl(limit, Address(ary2, length_offset));
4147     jcc(Assembler::notEqual, FALSE_LABEL);
4148   }
4149 
4150   // count == 0
4151   testl(limit, limit);
4152   jcc(Assembler::zero, TRUE_LABEL);
4153 
4154   if (is_array_equ) {
4155     // Load array address
4156     lea(ary1, Address(ary1, base_offset));
4157     lea(ary2, Address(ary2, base_offset));
4158   }
4159 
4160   if (is_array_equ && is_char) {
4161     // arrays_equals when used for char[].
4162     shll(limit, 1);      // byte count != 0
4163   }
4164   movl(result, limit); // copy
4165 
4166   if (UseAVX >= 2) {
4167     // With AVX2, use 32-byte vector compare
4168     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4169 
4170     // Compare 32-byte vectors
4171     andl(result, 0x0000001f);  //   tail count (in bytes)
4172     andl(limit, 0xffffffe0);   // vector count (in bytes)
4173     jcc(Assembler::zero, COMPARE_TAIL);
4174 
4175     lea(ary1, Address(ary1, limit, Address::times_1));
4176     lea(ary2, Address(ary2, limit, Address::times_1));
4177     negptr(limit);
4178 
4179 #ifdef _LP64
4180     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4181       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4182 
4183       cmpl(limit, -64);
4184       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4185 
4186       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4187 
4188       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4189       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4190       kortestql(mask, mask);
4191       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4192       addptr(limit, 64);  // update since we already compared at this addr
4193       cmpl(limit, -64);
4194       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4195 
4196       // At this point we may still need to compare -limit+result bytes.
4197       // We could execute the next two instructions and just continue via the non-wide path:
4198       //  cmpl(limit, 0);
4199       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
4200       // But since we stopped at the points ary{1,2}+limit, which are
4201       // no farther than 64 bytes from the array ends ary{1,2}+result
4202       // (|limit| <= 32 and result < 32),
4203       // we may just compare the last 64 bytes.
4204       //
4205       addptr(result, -64);   // it is safe, because we just came from this area
4206       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4207       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4208       kortestql(mask, mask);
4209       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4210 
4211       jmp(TRUE_LABEL);
4212 
4213       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4214 
4215     }//if (VM_Version::supports_avx512vlbw())
4216 #endif //_LP64
4217     bind(COMPARE_WIDE_VECTORS);
4218     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
4219     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4220     vpxor(vec1, vec2);
4221 
4222     vptest(vec1, vec1);
4223     jcc(Assembler::notZero, FALSE_LABEL);
4224     addptr(limit, 32);
4225     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4226 
4227     testl(result, result);
4228     jcc(Assembler::zero, TRUE_LABEL);
4229 
4230     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
4231     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4232     vpxor(vec1, vec2);
4233 
4234     vptest(vec1, vec1);
4235     jccb(Assembler::notZero, FALSE_LABEL);
4236     jmpb(TRUE_LABEL);
4237 
4238     bind(COMPARE_TAIL); // limit is zero
4239     movl(limit, result);
4240     // Fallthru to tail compare
4241   } else if (UseSSE42Intrinsics) {
4242     // With SSE4.2, use double quad vector compare
4243     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4244 
4245     // Compare 16-byte vectors
4246     andl(result, 0x0000000f);  //   tail count (in bytes)
4247     andl(limit, 0xfffffff0);   // vector count (in bytes)
4248     jcc(Assembler::zero, COMPARE_TAIL);
4249 
4250     lea(ary1, Address(ary1, limit, Address::times_1));
4251     lea(ary2, Address(ary2, limit, Address::times_1));
4252     negptr(limit);
4253 
4254     bind(COMPARE_WIDE_VECTORS);
4255     movdqu(vec1, Address(ary1, limit, Address::times_1));
4256     movdqu(vec2, Address(ary2, limit, Address::times_1));
4257     pxor(vec1, vec2);
4258 
4259     ptest(vec1, vec1);
4260     jcc(Assembler::notZero, FALSE_LABEL);
4261     addptr(limit, 16);
4262     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4263 
4264     testl(result, result);
4265     jcc(Assembler::zero, TRUE_LABEL);
4266 
4267     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4268     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4269     pxor(vec1, vec2);
4270 
4271     ptest(vec1, vec1);
4272     jccb(Assembler::notZero, FALSE_LABEL);
4273     jmpb(TRUE_LABEL);
4274 
4275     bind(COMPARE_TAIL); // limit is zero
4276     movl(limit, result);
4277     // Fallthru to tail compare
4278   }
4279 
4280   // Compare 4-byte vectors
4281   andl(limit, 0xfffffffc); // vector count (in bytes)
4282   jccb(Assembler::zero, COMPARE_CHAR);
4283 
4284   lea(ary1, Address(ary1, limit, Address::times_1));
4285   lea(ary2, Address(ary2, limit, Address::times_1));
4286   negptr(limit);
4287 
4288   bind(COMPARE_VECTORS);
4289   movl(chr, Address(ary1, limit, Address::times_1));
4290   cmpl(chr, Address(ary2, limit, Address::times_1));
4291   jccb(Assembler::notEqual, FALSE_LABEL);
4292   addptr(limit, 4);
4293   jcc(Assembler::notZero, COMPARE_VECTORS);
4294 
4295   // Compare trailing char (final 2 bytes), if any
4296   bind(COMPARE_CHAR);
4297   testl(result, 0x2);   // tail  char
4298   jccb(Assembler::zero, COMPARE_BYTE);
4299   load_unsigned_short(chr, Address(ary1, 0));
4300   load_unsigned_short(limit, Address(ary2, 0));
4301   cmpl(chr, limit);
4302   jccb(Assembler::notEqual, FALSE_LABEL);
4303 
4304   if (is_array_equ && is_char) {
4305     bind(COMPARE_BYTE);
4306   } else {
4307     lea(ary1, Address(ary1, 2));
4308     lea(ary2, Address(ary2, 2));
4309 
4310     bind(COMPARE_BYTE);
4311     testl(result, 0x1);   // tail  byte
4312     jccb(Assembler::zero, TRUE_LABEL);
4313     load_unsigned_byte(chr, Address(ary1, 0));
4314     load_unsigned_byte(limit, Address(ary2, 0));
4315     cmpl(chr, limit);
4316     jccb(Assembler::notEqual, FALSE_LABEL);
4317   }
4318   bind(TRUE_LABEL);
4319   movl(result, 1);   // return true
4320   jmpb(DONE);
4321 
4322   bind(FALSE_LABEL);
4323   xorl(result, result); // return false
4324 
4325   // That's it
4326   bind(DONE);
4327   if (UseAVX >= 2) {
4328     // clean upper bits of YMM registers
4329     vpxor(vec1, vec1);
4330     vpxor(vec2, vec2);
4331   }
4332 }
4333 
4334 #ifdef _LP64
4335 
4336 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4337 #define __ masm.
4338   Register dst = stub.data<0>();
4339   XMMRegister src = stub.data<1>();
4340   address target = stub.data<2>();
4341   __ bind(stub.entry());
4342   __ subptr(rsp, 8);
4343   __ movdbl(Address(rsp, 0), src);
4344   __ call(RuntimeAddress(target));
4345   __ pop(dst);
4346   __ jmp(stub.continuation());
4347 #undef __
4348 }
4349 
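     // Note on the sentinel checks below: the x86 truncating converts (cvttss2si,
     // cvttsd2si and their 64-bit forms) return the "integer indefinite" value
     // (0x80000000, or 0x8000000000000000 for 64-bit destinations) when the source
     // is NaN or out of range. A rough sketch of the resulting control flow, with
     // hypothetical names, is:
     //
     //   dst = truncating_convert(src);               // fast path: plain cvtt* instruction
     //   if (dst == sentinel) dst = fixup_stub(src);  // slow path recomputes the Java result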
4350 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4351   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4352   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4353 
4354   address slowpath_target;
4355   if (dst_bt == T_INT) {
4356     if (src_bt == T_FLOAT) {
4357       cvttss2sil(dst, src);
4358       cmpl(dst, 0x80000000);
4359       slowpath_target = StubRoutines::x86::f2i_fixup();
4360     } else {
4361       cvttsd2sil(dst, src);
4362       cmpl(dst, 0x80000000);
4363       slowpath_target = StubRoutines::x86::d2i_fixup();
4364     }
4365   } else {
4366     if (src_bt == T_FLOAT) {
4367       cvttss2siq(dst, src);
4368       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4369       slowpath_target = StubRoutines::x86::f2l_fixup();
4370     } else {
4371       cvttsd2siq(dst, src);
4372       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4373       slowpath_target = StubRoutines::x86::d2l_fixup();
4374     }
4375   }
4376 
4377   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath);
4378   jcc(Assembler::equal, stub->entry());
4379   bind(stub->continuation());
4380 }
4381 
4382 #endif // _LP64
4383 
4384 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4385                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4386   switch(ideal_opc) {
4387     case Op_LShiftVS:
4388       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4389     case Op_LShiftVI:
4390       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4391     case Op_LShiftVL:
4392       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4393     case Op_RShiftVS:
4394       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4395     case Op_RShiftVI:
4396       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4397     case Op_RShiftVL:
4398       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4399     case Op_URShiftVS:
4400       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4401     case Op_URShiftVI:
4402       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4403     case Op_URShiftVL:
4404       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4405     case Op_RotateRightV:
4406       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4407     case Op_RotateLeftV:
4408       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4409     default:
4410       fatal("Unsupported masked operation"); break;
4411   }
4412 }
4413 
4414 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4415                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4416                                     bool is_varshift) {
4417   switch (ideal_opc) {
4418     case Op_AddVB:
4419       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4420     case Op_AddVS:
4421       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4422     case Op_AddVI:
4423       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4424     case Op_AddVL:
4425       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4426     case Op_AddVF:
4427       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4428     case Op_AddVD:
4429       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4430     case Op_SubVB:
4431       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4432     case Op_SubVS:
4433       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4434     case Op_SubVI:
4435       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4436     case Op_SubVL:
4437       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4438     case Op_SubVF:
4439       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4440     case Op_SubVD:
4441       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4442     case Op_MulVS:
4443       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4444     case Op_MulVI:
4445       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4446     case Op_MulVL:
4447       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4448     case Op_MulVF:
4449       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4450     case Op_MulVD:
4451       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4452     case Op_DivVF:
4453       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4454     case Op_DivVD:
4455       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4456     case Op_SqrtVF:
4457       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4458     case Op_SqrtVD:
4459       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4460     case Op_AbsVB:
4461       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4462     case Op_AbsVS:
4463       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4464     case Op_AbsVI:
4465       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4466     case Op_AbsVL:
4467       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4468     case Op_FmaVF:
4469       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4470     case Op_FmaVD:
4471       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4472     case Op_VectorRearrange:
4473       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4474     case Op_LShiftVS:
4475       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4476     case Op_LShiftVI:
4477       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4478     case Op_LShiftVL:
4479       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4480     case Op_RShiftVS:
4481       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4482     case Op_RShiftVI:
4483       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4484     case Op_RShiftVL:
4485       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4486     case Op_URShiftVS:
4487       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4488     case Op_URShiftVI:
4489       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4490     case Op_URShiftVL:
4491       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4492     case Op_RotateLeftV:
4493       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4494     case Op_RotateRightV:
4495       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4496     case Op_MaxV:
4497       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4498     case Op_MinV:
4499       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4500     case Op_XorV:
4501       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4502     case Op_OrV:
4503       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4504     case Op_AndV:
4505       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4506     default:
4507       fatal("Unsupported masked operation"); break;
4508   }
4509 }
4510 
4511 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4512                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4513   switch (ideal_opc) {
4514     case Op_AddVB:
4515       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4516     case Op_AddVS:
4517       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4518     case Op_AddVI:
4519       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4520     case Op_AddVL:
4521       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4522     case Op_AddVF:
4523       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4524     case Op_AddVD:
4525       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4526     case Op_SubVB:
4527       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4528     case Op_SubVS:
4529       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4530     case Op_SubVI:
4531       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4532     case Op_SubVL:
4533       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4534     case Op_SubVF:
4535       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4536     case Op_SubVD:
4537       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4538     case Op_MulVS:
4539       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4540     case Op_MulVI:
4541       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4542     case Op_MulVL:
4543       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4544     case Op_MulVF:
4545       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4546     case Op_MulVD:
4547       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4548     case Op_DivVF:
4549       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4550     case Op_DivVD:
4551       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4552     case Op_FmaVF:
4553       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4554     case Op_FmaVD:
4555       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4556     case Op_MaxV:
4557       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4558     case Op_MinV:
4559       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4560     case Op_XorV:
4561       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4562     case Op_OrV:
4563       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4564     case Op_AndV:
4565       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4566     default:
4567       fatal("Unsupported masked operation"); break;
4568   }
4569 }
4570 
4571 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4572                                   KRegister src1, KRegister src2) {
4573   BasicType etype = T_ILLEGAL;
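       // The mask length determines the k-register operand width: up to 8 lanes use the
       // byte form of kand/kor/kxor, 16 lanes the word form, 32 the doubleword form and
       // 64 the quadword form (encoded here via T_BYTE/T_SHORT/T_INT/T_LONG).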
4574   switch(mask_len) {
4575     case 2:
4576     case 4:
4577     case 8:  etype = T_BYTE; break;
4578     case 16: etype = T_SHORT; break;
4579     case 32: etype = T_INT; break;
4580     case 64: etype = T_LONG; break;
4581     default: fatal("Unsupported type"); break;
4582   }
4583   assert(etype != T_ILLEGAL, "");
4584   switch(ideal_opc) {
4585     case Op_AndVMask:
4586       kand(etype, dst, src1, src2); break;
4587     case Op_OrVMask:
4588       kor(etype, dst, src1, src2); break;
4589     case Op_XorVMask:
4590       kxor(etype, dst, src1, src2); break;
4591     default:
4592       fatal("Unsupported masked operation"); break;
4593   }
4594 }
4595 
4596 /*
4597  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4598  * If src is NaN, the result is 0.
4599  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4600  * the result is equal to the value of Integer.MIN_VALUE.
4601  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4602  * the result is equal to the value of Integer.MAX_VALUE.
4603  */
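     // For example, per the rules above: src = NaN  -> dst = 0
     //                                   src = +Inf -> dst = Integer.MAX_VALUE (0x7fffffff)
     //                                   src = -Inf -> dst = Integer.MIN_VALUE (0x80000000)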
4604 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4605                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4606                                                                    Register rscratch, AddressLiteral float_sign_flip,
4607                                                                    int vec_enc) {
4608   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4609   Label done;
4610   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4611   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4612   vptest(xtmp2, xtmp2, vec_enc);
4613   jccb(Assembler::equal, done);
4614 
4615   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4616   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4617 
4618   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4619   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4620   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4621 
4622   // Recompute the mask for remaining special value.
4623   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4624   // Extract SRC values corresponding to TRUE mask lanes.
4625   vpand(xtmp4, xtmp2, src, vec_enc);
4626   // Flip mask bits so that the MSB of MASK lanes corresponding to +ve special
4627   // values is set.
4628   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4629 
4630   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4631   bind(done);
4632 }
4633 
4634 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4635                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4636                                                                     Register rscratch, AddressLiteral float_sign_flip,
4637                                                                     int vec_enc) {
4638   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4639   Label done;
4640   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4641   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4642   kortestwl(ktmp1, ktmp1);
4643   jccb(Assembler::equal, done);
4644 
4645   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4646   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4647   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4648 
4649   kxorwl(ktmp1, ktmp1, ktmp2);
4650   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4651   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4652   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4653   bind(done);
4654 }
4655 
4656 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4657                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4658                                                                      Register rscratch, AddressLiteral double_sign_flip,
4659                                                                      int vec_enc) {
4660   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4661 
4662   Label done;
4663   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4664   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
4665   kortestwl(ktmp1, ktmp1);
4666   jccb(Assembler::equal, done);
4667 
4668   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4669   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4670   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4671 
4672   kxorwl(ktmp1, ktmp1, ktmp2);
4673   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4674   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4675   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4676   bind(done);
4677 }
4678 
4679 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4680                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4681                                                                      Register rscratch, AddressLiteral float_sign_flip,
4682                                                                      int vec_enc) {
4683   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4684   Label done;
4685   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4686   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4687   kortestwl(ktmp1, ktmp1);
4688   jccb(Assembler::equal, done);
4689 
4690   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4691   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4692   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4693 
4694   kxorwl(ktmp1, ktmp1, ktmp2);
4695   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4696   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4697   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4698   bind(done);
4699 }
4700 
4701 /*
4702  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4703  * If src is NaN, the result is 0.
4704  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
4705  * the result is equal to the value of Long.MIN_VALUE.
4706  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
4707  * the result is equal to the value of Long.MAX_VALUE.
4708  */
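     // For example, per the rules above: src = NaN  -> dst = 0
     //                                   src = +Inf -> dst = Long.MAX_VALUE
     //                                   src = -Inf -> dst = Long.MIN_VALUE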
4709 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4710                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4711                                                                       Register rscratch, AddressLiteral double_sign_flip,
4712                                                                       int vec_enc) {
4713   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4714 
4715   Label done;
4716   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4717   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
4718   kortestwl(ktmp1, ktmp1);
4719   jccb(Assembler::equal, done);
4720 
4721   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4722   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4723   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4724 
4725   kxorwl(ktmp1, ktmp1, ktmp2);
4726   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4727   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4728   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4729   bind(done);
4730 }
4731 
4732 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
4733                                                              XMMRegister xtmp, int index, int vec_enc) {
4734    assert(vec_enc < Assembler::AVX_512bit, "");
4735    if (vec_enc == Assembler::AVX_256bit) {
4736      vextractf128_high(xtmp, src);
4737      vshufps(dst, src, xtmp, index, vec_enc);
4738    } else {
4739      vshufps(dst, src, zero, index, vec_enc);
4740    }
4741 }
4742 
4743 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4744                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
4745                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
4746   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4747 
4748   Label done;
4749   // Compare the destination lanes with float_sign_flip
4750   // value to get mask for all special values.
4751   movdqu(xtmp1, float_sign_flip, rscratch);
4752   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
4753   ptest(xtmp2, xtmp2);
4754   jccb(Assembler::equal, done);
4755 
4756   // Flip float_sign_flip to get max integer value.
4757   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
4758   pxor(xtmp1, xtmp4);
4759 
4760   // Set destination lanes corresponding to unordered source lanes to zero.
4761   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
4762   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
4763 
4764   // Shuffle the mask vector and pack the lower double word from each quadword lane.
4765   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
4766   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
4767 
4768   // Recompute the mask for remaining special value.
4769   pxor(xtmp2, xtmp3);
4770   // Extract mask corresponding to non-negative source lanes.
4771   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
4772 
4773   // Shuffle the mask vector and pack the lower double word from each quadword lane.
4774   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
4775   pand(xtmp3, xtmp2);
4776 
4777   // Replace destination lanes holding special value(0x80000000) with max int
4778   // if corresponding source lane holds a +ve value.
4779   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
4780   bind(done);
4781 }
4782 
4783 
4784 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
4785                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
4786   switch(to_elem_bt) {
4787     case T_SHORT:
4788       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
4789       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
4790       vpackusdw(dst, dst, zero, vec_enc);
4791       if (vec_enc == Assembler::AVX_256bit) {
4792         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
4793       }
4794       break;
4795     case  T_BYTE:
4796       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
4797       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
4798       vpackusdw(dst, dst, zero, vec_enc);
4799       if (vec_enc == Assembler::AVX_256bit) {
4800         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
4801       }
4802       vpackuswb(dst, dst, zero, vec_enc);
4803       break;
4804     default: assert(false, "%s", type2name(to_elem_bt));
4805   }
4806 }
4807 
4808 /*
4809  * Algorithm for vector D2L and F2I conversions:
4810  * a) Perform the vector D2L/F2I cast.
4811  * b) Choose the fast path if none of the result vector lanes contains the 0x80000000 value.
4812  *    A lane holding 0x80000000 signifies that the source value could be any of the special
4813  *    floating point values (NaN, -Inf, Inf, Max, -Min).
4814  * c) Set the destination to zero if the source is a NaN value.
4815  * d) Replace 0x80000000 with MaxInt if the source lane contains a +ve value.
4816  */
4817 
4818 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4819                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4820                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
4821   int to_elem_sz = type2aelembytes(to_elem_bt);
4822   assert(to_elem_sz <= 4, "");
4823   vcvttps2dq(dst, src, vec_enc);
4824   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
4825   if (to_elem_sz < 4) {
4826     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4827     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
4828   }
4829 }
4830 
4831 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4832                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
4833                                             Register rscratch, int vec_enc) {
4834   int to_elem_sz = type2aelembytes(to_elem_bt);
4835   assert(to_elem_sz <= 4, "");
4836   vcvttps2dq(dst, src, vec_enc);
4837   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
4838   switch(to_elem_bt) {
4839     case T_INT:
4840       break;
4841     case T_SHORT:
4842       evpmovdw(dst, dst, vec_enc);
4843       break;
4844     case T_BYTE:
4845       evpmovdb(dst, dst, vec_enc);
4846       break;
4847     default: assert(false, "%s", type2name(to_elem_bt));
4848   }
4849 }
4850 
4851 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4852                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
4853                                             Register rscratch, int vec_enc) {
4854   evcvttps2qq(dst, src, vec_enc);
4855   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
4856 }
4857 
4858 // Handling for downcasting from double to integer or sub-word types on AVX2.
4859 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4860                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
4861                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
4862   int to_elem_sz = type2aelembytes(to_elem_bt);
4863   assert(to_elem_sz < 8, "");
4864   vcvttpd2dq(dst, src, vec_enc);
4865   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
4866                                               float_sign_flip, vec_enc);
4867   if (to_elem_sz < 4) {
4868     // xtmp4 holds all zero lanes.
4869     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
4870   }
4871 }
4872 
4873 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
4874                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
4875                                             KRegister ktmp2, AddressLiteral sign_flip,
4876                                             Register rscratch, int vec_enc) {
4877   if (VM_Version::supports_avx512dq()) {
4878     evcvttpd2qq(dst, src, vec_enc);
4879     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
4880     switch(to_elem_bt) {
4881       case T_LONG:
4882         break;
4883       case T_INT:
4884         evpmovsqd(dst, dst, vec_enc);
4885         break;
4886       case T_SHORT:
4887         evpmovsqd(dst, dst, vec_enc);
4888         evpmovdw(dst, dst, vec_enc);
4889         break;
4890       case T_BYTE:
4891         evpmovsqd(dst, dst, vec_enc);
4892         evpmovdb(dst, dst, vec_enc);
4893         break;
4894       default: assert(false, "%s", type2name(to_elem_bt));
4895     }
4896   } else {
4897     assert(type2aelembytes(to_elem_bt) <= 4, "");
4898     vcvttpd2dq(dst, src, vec_enc);
4899     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
4900     switch(to_elem_bt) {
4901       case T_INT:
4902         break;
4903       case T_SHORT:
4904         evpmovdw(dst, dst, vec_enc);
4905         break;
4906       case T_BYTE:
4907         evpmovdb(dst, dst, vec_enc);
4908         break;
4909       default: assert(false, "%s", type2name(to_elem_bt));
4910     }
4911   }
4912 }
4913 
4914 #ifdef _LP64
4915 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
4916                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4917                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
4918   // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round towards -inf,
4919   // and restore the original MXCSR.RC mode after that.
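       // For example, val = -2.5: -2.5 + 0.5 = -2.0 and floor(-2.0) = -2, which matches
       // Math.round(-2.5) == -2; an ordinary value such as 2.3 gives floor(2.8) = 2.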
4920   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4921 
4922   mov64(tmp, julong_cast(0.5L));
4923   evpbroadcastq(xtmp1, tmp, vec_enc);
4924   vaddpd(xtmp1, src , xtmp1, vec_enc);
4925   evcvtpd2qq(dst, xtmp1, vec_enc);
4926   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
4927                                                 double_sign_flip, vec_enc);
4928 
4929   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4930 }
4931 
4932 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
4933                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4934                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
4935   // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round towards -inf,
4936   // and restore the original MXCSR.RC mode after that.
4937   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4938 
4939   movl(tmp, jint_cast(0.5));
4940   movq(xtmp1, tmp);
4941   vbroadcastss(xtmp1, xtmp1, vec_enc);
4942   vaddps(xtmp1, src , xtmp1, vec_enc);
4943   vcvtps2dq(dst, xtmp1, vec_enc);
4944   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
4945                                               float_sign_flip, vec_enc);
4946 
4947   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4948 }
4949 
4950 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
4951                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4952                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
4953   // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round towards -inf,
4954   // and restore the original MXCSR.RC mode after that.
4955   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4956 
4957   movl(tmp, jint_cast(0.5));
4958   movq(xtmp1, tmp);
4959   vbroadcastss(xtmp1, xtmp1, vec_enc);
4960   vaddps(xtmp1, src , xtmp1, vec_enc);
4961   vcvtps2dq(dst, xtmp1, vec_enc);
4962   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
4963 
4964   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4965 }
4966 #endif // _LP64
4967 
4968 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
4969                                              BasicType from_elem_bt, BasicType to_elem_bt) {
4970   switch (from_elem_bt) {
4971     case T_BYTE:
4972       switch (to_elem_bt) {
4973         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
4974         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
4975         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
4976         default: ShouldNotReachHere();
4977       }
4978       break;
4979     case T_SHORT:
4980       switch (to_elem_bt) {
4981         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
4982         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
4983         default: ShouldNotReachHere();
4984       }
4985       break;
4986     case T_INT:
4987       assert(to_elem_bt == T_LONG, "");
4988       vpmovzxdq(dst, src, vlen_enc);
4989       break;
4990     default:
4991       ShouldNotReachHere();
4992   }
4993 }
4994 
4995 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
4996                                            BasicType from_elem_bt, BasicType to_elem_bt) {
4997   switch (from_elem_bt) {
4998     case T_BYTE:
4999       switch (to_elem_bt) {
5000         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5001         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5002         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5003         default: ShouldNotReachHere();
5004       }
5005       break;
5006     case T_SHORT:
5007       switch (to_elem_bt) {
5008         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5009         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5010         default: ShouldNotReachHere();
5011       }
5012       break;
5013     case T_INT:
5014       assert(to_elem_bt == T_LONG, "");
5015       vpmovsxdq(dst, src, vlen_enc);
5016       break;
5017     default:
5018       ShouldNotReachHere();
5019   }
5020 }
5021 
5022 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5023                                          BasicType dst_bt, BasicType src_bt, int vlen) {
5024   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5025   assert(vlen_enc != AVX_512bit, "");
5026 
5027   int dst_bt_size = type2aelembytes(dst_bt);
5028   int src_bt_size = type2aelembytes(src_bt);
5029   if (dst_bt_size > src_bt_size) {
5030     switch (dst_bt_size / src_bt_size) {
5031       case 2: vpmovsxbw(dst, src, vlen_enc); break;
5032       case 4: vpmovsxbd(dst, src, vlen_enc); break;
5033       case 8: vpmovsxbq(dst, src, vlen_enc); break;
5034       default: ShouldNotReachHere();
5035     }
5036   } else {
5037     assert(dst_bt_size < src_bt_size, "");
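         // Narrowing relies on signed saturating packs: mask lanes are either 0 or -1 and
         // both values survive vpacksswb/vpackssdw unchanged. For 256-bit vectors the packs
         // operate within 128-bit halves, so vpermq(.., 0x08, ..) moves the two meaningful
         // quadwords next to each other in the low 128 bits.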
5038     switch (src_bt_size / dst_bt_size) {
5039       case 2: {
5040         if (vlen_enc == AVX_128bit) {
5041           vpacksswb(dst, src, src, vlen_enc);
5042         } else {
5043           vpacksswb(dst, src, src, vlen_enc);
5044           vpermq(dst, dst, 0x08, vlen_enc);
5045         }
5046         break;
5047       }
5048       case 4: {
5049         if (vlen_enc == AVX_128bit) {
5050           vpackssdw(dst, src, src, vlen_enc);
5051           vpacksswb(dst, dst, dst, vlen_enc);
5052         } else {
5053           vpackssdw(dst, src, src, vlen_enc);
5054           vpermq(dst, dst, 0x08, vlen_enc);
5055           vpacksswb(dst, dst, dst, AVX_128bit);
5056         }
5057         break;
5058       }
5059       case 8: {
5060         if (vlen_enc == AVX_128bit) {
5061           vpshufd(dst, src, 0x08, vlen_enc);
5062           vpackssdw(dst, dst, dst, vlen_enc);
5063           vpacksswb(dst, dst, dst, vlen_enc);
5064         } else {
5065           vpshufd(dst, src, 0x08, vlen_enc);
5066           vpermq(dst, dst, 0x08, vlen_enc);
5067           vpackssdw(dst, dst, dst, AVX_128bit);
5068           vpacksswb(dst, dst, dst, AVX_128bit);
5069         }
5070         break;
5071       }
5072       default: ShouldNotReachHere();
5073     }
5074   }
5075 }
5076 
5077 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5078                                    bool merge, BasicType bt, int vlen_enc) {
5079   if (bt == T_INT) {
5080     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5081   } else {
5082     assert(bt == T_LONG, "");
5083     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5084   }
5085 }
5086 
5087 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5088                                    bool merge, BasicType bt, int vlen_enc) {
5089   if (bt == T_INT) {
5090     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5091   } else {
5092     assert(bt == T_LONG, "");
5093     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5094   }
5095 }
5096 
5097 #ifdef _LP64
5098 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5099                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5100                                                int vec_enc) {
5101   int index = 0;
5102   int vindex = 0;
5103   mov64(rtmp1, 0x0101010101010101L);
5104   pdepq(rtmp1, src, rtmp1);
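       // pdepq deposits one mask bit into the least significant bit of each byte of
       // rtmp1, turning the low 8 mask bits into a byte-per-lane boolean vector.
       // For example, src = 0b1011 gives rtmp1 = 0x0000000001000101.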
5105   if (mask_len > 8) {
5106     movq(rtmp2, src);
5107     vpxor(xtmp, xtmp, xtmp, vec_enc);
5108     movq(xtmp, rtmp1);
5109   }
5110   movq(dst, rtmp1);
5111 
5112   mask_len -= 8;
5113   while (mask_len > 0) {
5114     assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
5115     index++;
5116     if ((index % 2) == 0) {
5117       pxor(xtmp, xtmp);
5118     }
5119     mov64(rtmp1, 0x0101010101010101L);
5120     shrq(rtmp2, 8);
5121     pdepq(rtmp1, rtmp2, rtmp1);
5122     pinsrq(xtmp, rtmp1, index % 2);
5123     vindex = index / 2;
5124     if (vindex) {
5125       // Write the entire 16 byte vector when both 64 bit
5126       // lanes are updated, to save redundant instructions.
5127       if (index % 2) {
5128         vinsertf128(dst, dst, xtmp, vindex);
5129       }
5130     } else {
5131       vmovdqu(dst, xtmp);
5132     }
5133     mask_len -= 8;
5134   }
5135 }
5136 
5137 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5138   switch(opc) {
5139     case Op_VectorMaskTrueCount:
5140       popcntq(dst, tmp);
5141       break;
5142     case Op_VectorMaskLastTrue:
5143       if (VM_Version::supports_lzcnt()) {
5144         lzcntq(tmp, tmp);
5145         movl(dst, 63);
5146         subl(dst, tmp);
5147       } else {
5148         movl(dst, -1);
5149         bsrq(tmp, tmp);
5150         cmov32(Assembler::notZero, dst, tmp);
5151       }
5152       break;
5153     case Op_VectorMaskFirstTrue:
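           // For masks shorter than 32 bits, a sentinel bit is set just past the mask so
           // that tzcnt/bsf return masklen (i.e. "no lane set") on an all-zero mask.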
5154       if (VM_Version::supports_bmi1()) {
5155         if (masklen < 32) {
5156           orl(tmp, 1 << masklen);
5157           tzcntl(dst, tmp);
5158         } else if (masklen == 32) {
5159           tzcntl(dst, tmp);
5160         } else {
5161           assert(masklen == 64, "");
5162           tzcntq(dst, tmp);
5163         }
5164       } else {
5165         if (masklen < 32) {
5166           orl(tmp, 1 << masklen);
5167           bsfl(dst, tmp);
5168         } else {
5169           assert(masklen == 32 || masklen == 64, "");
5170           movl(dst, masklen);
5171           if (masklen == 32)  {
5172             bsfl(tmp, tmp);
5173           } else {
5174             bsfq(tmp, tmp);
5175           }
5176           cmov32(Assembler::notZero, dst, tmp);
5177         }
5178       }
5179       break;
5180     case Op_VectorMaskToLong:
5181       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5182       break;
5183     default: assert(false, "Unhandled mask operation");
5184   }
5185 }
5186 
5187 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5188                                               int masklen, int masksize, int vec_enc) {
5189   assert(VM_Version::supports_popcnt(), "");
5190 
5191   if(VM_Version::supports_avx512bw()) {
5192     kmovql(tmp, mask);
5193   } else {
5194     assert(masklen <= 16, "");
5195     kmovwl(tmp, mask);
5196   }
5197 
5198   // A mask generated by partial vector comparison/replicate/mask manipulation
5199   // operations needs to be clipped.
5200   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5201     andq(tmp, (1 << masklen) - 1);
5202   }
5203 
5204   vector_mask_operation_helper(opc, dst, tmp, masklen);
5205 }
5206 
5207 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5208                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5209   assert(vec_enc == AVX_128bit && VM_Version::supports_avx() ||
5210          vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4), "");
5211   assert(VM_Version::supports_popcnt(), "");
5212 
5213   bool need_clip = false;
5214   switch(bt) {
5215     case T_BOOLEAN:
5216       // While masks of other types contain 0 or -1, boolean masks contain lane values of 0 or 1
5217       vpxor(xtmp, xtmp, xtmp, vec_enc);
5218       vpsubb(xtmp, xtmp, mask, vec_enc);
5219       vpmovmskb(tmp, xtmp, vec_enc);
5220       need_clip = masklen < 16;
5221       break;
5222     case T_BYTE:
5223       vpmovmskb(tmp, mask, vec_enc);
5224       need_clip = masklen < 16;
5225       break;
5226     case T_SHORT:
5227       vpacksswb(xtmp, mask, mask, vec_enc);
5228       if (masklen >= 16) {
5229         vpermpd(xtmp, xtmp, 8, vec_enc);
5230       }
5231       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5232       need_clip = masklen < 16;
5233       break;
5234     case T_INT:
5235     case T_FLOAT:
5236       vmovmskps(tmp, mask, vec_enc);
5237       need_clip = masklen < 4;
5238       break;
5239     case T_LONG:
5240     case T_DOUBLE:
5241       vmovmskpd(tmp, mask, vec_enc);
5242       need_clip = masklen < 2;
5243       break;
5244     default: assert(false, "Unhandled type, %s", type2name(bt));
5245   }
5246 
5247   // A mask generated by partial vector comparison/replicate/mask manipulation
5248   // operations needs to be clipped.
5249   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5250     // need_clip implies masklen < 32
5251     andq(tmp, (1 << masklen) - 1);
5252   }
5253 
5254   vector_mask_operation_helper(opc, dst, tmp, masklen);
5255 }
5256 
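     // Compresses a vector mask: the resulting mask has popcount(src) contiguous low
     // bits set. pextq gathers the all-ones bits of rtmp2 at the positions selected by
     // the (clipped) source mask and packs them toward bit 0; for example, src = 0b1010
     // with mask_len = 4 yields dst = 0b0011.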
5257 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5258                                              Register rtmp2, int mask_len) {
5259   kmov(rtmp1, src);
5260   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5261   mov64(rtmp2, -1L);
5262   pextq(rtmp2, rtmp2, rtmp1);
5263   kmov(dst, rtmp2);
5264 }
5265 
5266 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5267                                                bool merge, BasicType bt, int vec_enc) {
5268   if (opcode == Op_CompressV) {
5269     switch(bt) {
5270     case T_BYTE:
5271       evpcompressb(dst, mask, src, merge, vec_enc);
5272       break;
5273     case T_CHAR:
5274     case T_SHORT:
5275       evpcompressw(dst, mask, src, merge, vec_enc);
5276       break;
5277     case T_INT:
5278       evpcompressd(dst, mask, src, merge, vec_enc);
5279       break;
5280     case T_FLOAT:
5281       evcompressps(dst, mask, src, merge, vec_enc);
5282       break;
5283     case T_LONG:
5284       evpcompressq(dst, mask, src, merge, vec_enc);
5285       break;
5286     case T_DOUBLE:
5287       evcompresspd(dst, mask, src, merge, vec_enc);
5288       break;
5289     default:
5290       fatal("Unsupported type %s", type2name(bt));
5291       break;
5292     }
5293   } else {
5294     assert(opcode == Op_ExpandV, "");
5295     switch(bt) {
5296     case T_BYTE:
5297       evpexpandb(dst, mask, src, merge, vec_enc);
5298       break;
5299     case T_CHAR:
5300     case T_SHORT:
5301       evpexpandw(dst, mask, src, merge, vec_enc);
5302       break;
5303     case T_INT:
5304       evpexpandd(dst, mask, src, merge, vec_enc);
5305       break;
5306     case T_FLOAT:
5307       evexpandps(dst, mask, src, merge, vec_enc);
5308       break;
5309     case T_LONG:
5310       evpexpandq(dst, mask, src, merge, vec_enc);
5311       break;
5312     case T_DOUBLE:
5313       evexpandpd(dst, mask, src, merge, vec_enc);
5314       break;
5315     default:
5316       fatal("Unsupported type %s", type2name(bt));
5317       break;
5318     }
5319   }
5320 }
5321 #endif
5322 
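     // Vector implementation of Math.signum semantics: negative lanes produce -1.0,
     // positive lanes produce 1.0, and NaN, -0.0 and 0.0 lanes are passed through unchanged.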
5323 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5324                                            KRegister ktmp1, int vec_enc) {
5325   if (opcode == Op_SignumVD) {
5326     vsubpd(dst, zero, one, vec_enc);
5327     // if src < 0 ? -1 : 1
5328     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5329     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
5330     // if src == NaN, -0.0 or 0.0 return src.
5331     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5332     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5333   } else {
5334     assert(opcode == Op_SignumVF, "");
5335     vsubps(dst, zero, one, vec_enc);
5336     // if src < 0 ? -1 : 1
5337     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5338     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
5339     // if src == NaN, -0.0 or 0.0 return src.
5340     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5341     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5342   }
5343 }
5344 
5345 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5346                                           XMMRegister xtmp1, int vec_enc) {
5347   if (opcode == Op_SignumVD) {
5348     vsubpd(dst, zero, one, vec_enc);
5349     // if src < 0 ? -1 : 1
5350     vblendvpd(dst, one, dst, src, vec_enc);
5351     // if src == NaN, -0.0 or 0.0 return src.
5352     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5353     vblendvpd(dst, dst, src, xtmp1, vec_enc);
5354   } else {
5355     assert(opcode == Op_SignumVF, "");
5356     vsubps(dst, zero, one, vec_enc);
5357     // if src < 0 ? -1 : 1
5358     vblendvps(dst, one, dst, src, vec_enc);
5359     // if src == NaN, -0.0 or 0.0 return src.
5360     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5361     vblendvps(dst, dst, src, xtmp1, vec_enc);
5362   }
5363 }
5364 
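     // MaskAll: src is expected to hold 0 (all lanes false) or -1 (all lanes true); the
     // kshift trims the broadcast all-ones pattern down to exactly mask_len mask bits.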
5365 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5366   if (VM_Version::supports_avx512bw()) {
5367     if (mask_len > 32) {
5368       kmovql(dst, src);
5369     } else {
5370       kmovdl(dst, src);
5371       if (mask_len != 32) {
5372         kshiftrdl(dst, dst, 32 - mask_len);
5373       }
5374     }
5375   } else {
5376     assert(mask_len <= 16, "");
5377     kmovwl(dst, src);
5378     if (mask_len != 16) {
5379       kshiftrwl(dst, dst, 16 - mask_len);
5380     }
5381   }
5382 }
5383 
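     // Broadcasts the immediate imm32 to every lane of dst: when AVX512VL (or VLBW for
     // subword types) is available the GPR-source EVEX broadcast forms are used directly,
     // otherwise the value is first moved into the XMM register and then vpbroadcast-ed.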
5384 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5385   int lane_size = type2aelembytes(bt);
5386   bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
5387   if ((is_LP64 || lane_size < 8) &&
5388       ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5389        (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
5390     movptr(rtmp, imm32);
5391     switch(lane_size) {
5392       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5393       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5394       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5395       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5396       default: fatal("Unsupported lane size %d", lane_size);
5397       break;
5398     }
5399   } else {
5400     movptr(rtmp, imm32);
5401     LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
5402     switch(lane_size) {
5403       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5404       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5405       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5406       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5407       default: fatal("Unsupported lane size %d", lane_size);
5408       break;
5409     }
5410   }
5411 }
5412 
5413 //
5414 // Following is the lookup-table based popcount computation algorithm:
5415 //       Index   Bit set count
5416 //     [ 0000 ->   0,
5417 //       0001 ->   1,
5418 //       0010 ->   1,
5419 //       0011 ->   2,
5420 //       0100 ->   1,
5421 //       0101 ->   2,
5422 //       0110 ->   2,
5423 //       0111 ->   3,
5424 //       1000 ->   1,
5425 //       1001 ->   2,
5426 //       1010 ->   2,
5427 //       1011 ->   3,
5428 //       1100 ->   2,
5429 //       1101 ->   3,
     //       1110 ->   3,
5430 //       1111 ->   4 ]
5431 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5432 //     shuffle indices for lookup table access.
5433 //  b. Right shift each byte of vector lane by 4 positions.
5434 //  c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
5435 //     shuffle indices for lookup table access.
5436 //  d. Add the bitset count of upper and lower 4 bits of each byte.
5437 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
5438 //     count of all the bytes of a quadword.
5439 //  f. Perform step e. for upper 128bit vector lane.
5440 //  g. Pack the bitset count of quadwords back to double word.
5441 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
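     //  For example, for the byte 0xE5 = 1110 0101 the lower nibble 0101 yields a
     //  table value of 2, the upper nibble 1110 (after the shift in step b.) yields
     //  3, and step d. adds them to give popcount(0xE5) = 5.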
5442 
5443 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5444                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5445   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5446   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5447   vpsrlw(dst, src, 4, vec_enc);
5448   vpand(dst, dst, xtmp1, vec_enc);
5449   vpand(xtmp1, src, xtmp1, vec_enc);
5450   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5451   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5452   vpshufb(dst, xtmp2, dst, vec_enc);
5453   vpaddb(dst, dst, xtmp1, vec_enc);
5454 }
5455 
5456 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5457                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5458   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5459   // The following code corresponds to steps e, f, g and h of the above algorithm.
5460   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5461   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5462   vpsadbw(dst, dst, xtmp2, vec_enc);
5463   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5464   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5465   vpackuswb(dst, xtmp1, dst, vec_enc);
5466 }
5467 
5468 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5469                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5470   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5471   // Add the popcount of upper and lower bytes of word.
5472   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5473   vpsrlw(dst, xtmp1, 8, vec_enc);
5474   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5475   vpaddw(dst, dst, xtmp1, vec_enc);
5476 }
5477 
5478 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5479                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5480   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5481   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5482   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5483 }
5484 
5485 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5486                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5487   switch(bt) {
5488     case T_LONG:
5489       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5490       break;
5491     case T_INT:
5492       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5493       break;
5494     case T_CHAR:
5495     case T_SHORT:
5496       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5497       break;
5498     case T_BYTE:
5499     case T_BOOLEAN:
5500       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5501       break;
5502     default:
5503       fatal("Unsupported type %s", type2name(bt));
5504       break;
5505   }
5506 }
5507 
5508 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5509                                                       KRegister mask, bool merge, int vec_enc) {
5510   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5511   switch(bt) {
5512     case T_LONG:
5513       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5514       evpopcntq(dst, mask, src, merge, vec_enc);
5515       break;
5516     case T_INT:
5517       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5518       evpopcntd(dst, mask, src, merge, vec_enc);
5519       break;
5520     case T_CHAR:
5521     case T_SHORT:
5522       assert(VM_Version::supports_avx512_bitalg(), "");
5523       evpopcntw(dst, mask, src, merge, vec_enc);
5524       break;
5525     case T_BYTE:
5526     case T_BOOLEAN:
5527       assert(VM_Version::supports_avx512_bitalg(), "");
5528       evpopcntb(dst, mask, src, merge, vec_enc);
5529       break;
5530     default:
5531       fatal("Unsupported type %s", type2name(bt));
5532       break;
5533   }
5534 }
5535 
5536 #ifndef _LP64
5537 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
5538   assert(VM_Version::supports_avx512bw(), "");
5539   kmovdl(tmp, src);
5540   kunpckdql(dst, tmp, tmp);
5541 }
5542 #endif
5543 
5544 // The bit reversal algorithm first reverses the bits of each byte and then
5545 // performs a byte level reversal for multi-byte primitive types (short/int/long).
5546 // A lookup table access yields the reversed bit sequence of a 4 bit value,
5547 // so the reversed bit sequence of a byte is obtained by reversing each
5548 // nibble through the table and swapping the positions of the upper and
5549 // lower nibbles.
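     // For example, reversing the byte 0xB4 = 1011 0100: the lower nibble 0100
     // maps to 0010 and becomes the upper nibble of the result (0x20), while the
     // upper nibble 1011 maps to 1101 and becomes the lower nibble (0x0D),
     // giving 0x2D = 0010 1101.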
5550 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5551                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
5552   if (VM_Version::supports_avx512vlbw()) {
5553 
5554     // Get the reverse bit sequence of lower nibble of each byte.
5555     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5556     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5557     evpandq(dst, xtmp2, src, vec_enc);
5558     vpshufb(dst, xtmp1, dst, vec_enc);
5559     vpsllq(dst, dst, 4, vec_enc);
5560 
5561     // Get the reverse bit sequence of upper nibble of each byte.
5562     vpandn(xtmp2, xtmp2, src, vec_enc);
5563     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5564     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5565 
5566     // Perform a logical OR between the left shifted reverse bit sequence of the lower nibble
5567     // and the right shifted reverse bit sequence of the upper nibble to obtain the reverse bit sequence of each byte.
5568     evporq(xtmp2, dst, xtmp2, vec_enc);
5569     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5570 
5571   } else if (vec_enc == Assembler::AVX_512bit) {
5572     // Shift based bit reversal.
5573     assert(bt == T_LONG || bt == T_INT, "");
5574 
5575     // Swap lower and upper nibble of each byte.
5576     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
5577 
5578     // Swap two least and most significant bits of each nibble.
5579     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
5580 
5581     // Swap adjacent pair of bits.
5582     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5583     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
5584 
5585     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5586     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
5587   } else {
5588     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
5589     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5590 
5591     // Get the reverse bit sequence of lower nibble of each byte.
5592     vpand(dst, xtmp2, src, vec_enc);
5593     vpshufb(dst, xtmp1, dst, vec_enc);
5594     vpsllq(dst, dst, 4, vec_enc);
5595 
5596     // Get the reverse bit sequence of upper nibble of each byte.
5597     vpandn(xtmp2, xtmp2, src, vec_enc);
5598     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5599     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5600 
5601     // Perform a logical OR between the left shifted reverse bit sequence of the lower nibble
5602     // and the right shifted reverse bit sequence of the upper nibble to obtain the reverse bit sequence of each byte.
5603     vpor(xtmp2, dst, xtmp2, vec_enc);
5604     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5605   }
5606 }
5607 
5608 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
5609                                                 XMMRegister xtmp, Register rscratch) {
5610   assert(VM_Version::supports_gfni(), "");
5611   assert(rscratch != noreg || always_reachable(mask), "missing");
5612 
5613   // Galois field instruction based bit reversal, following the algorithm at
5614   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
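       // The affine matrix supplied via 'mask' reverses the bit order within each
       // byte (see the article above); vector_reverse_byte below then reverses the
       // byte order of each element to complete the element-wise bit reversal.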
5615   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
5616   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
5617   vector_reverse_byte(bt, dst, xtmp, vec_enc);
5618 }
5619 
5620 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
5621                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
5622   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
5623   evpandq(dst, xtmp1, src, vec_enc);
5624   vpsllq(dst, dst, nbits, vec_enc);
5625   vpandn(xtmp1, xtmp1, src, vec_enc);
5626   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
5627   evporq(dst, dst, xtmp1, vec_enc);
5628 }
5629 
5630 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5631                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5632   // Shift based bit reversal.
5633   assert(VM_Version::supports_evex(), "");
5634   switch(bt) {
5635     case T_LONG:
5636       // Swap upper and lower double word of each quad word.
5637       evprorq(xtmp1, k0, src, 32, true, vec_enc);
5638       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
5639       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5640       break;
5641     case T_INT:
5642       // Swap upper and lower word of each double word.
5643       evprord(xtmp1, k0, src, 16, true, vec_enc);
5644       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5645       break;
5646     case T_CHAR:
5647     case T_SHORT:
5648       // Swap upper and lower byte of each word.
5649       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
5650       break;
5651     case T_BYTE:
5652       evmovdquq(dst, k0, src, true, vec_enc);
5653       break;
5654     default:
5655       fatal("Unsupported type %s", type2name(bt));
5656       break;
5657   }
5658 }
5659 
5660 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5661   if (bt == T_BYTE) {
5662     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
5663       evmovdquq(dst, k0, src, true, vec_enc);
5664     } else {
5665       vmovdqu(dst, src);
5666     }
5667     return;
5668   }
5669   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
5670   // pre-computed shuffle indices.
5671   switch(bt) {
5672     case T_LONG:
5673       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
5674       break;
5675     case T_INT:
5676       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
5677       break;
5678     case T_CHAR:
5679     case T_SHORT:
5680       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
5681       break;
5682     default:
5683       fatal("Unsupported type %s", type2name(bt));
5684       break;
5685   }
5686   vpshufb(dst, src, dst, vec_enc);
5687 }
5688 
5689 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5690                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5691                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
5692   assert(is_integral_type(bt), "");
5693   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5694   assert(VM_Version::supports_avx512cd(), "");
5695   switch(bt) {
5696     case T_LONG:
5697       evplzcntq(dst, ktmp, src, merge, vec_enc);
5698       break;
5699     case T_INT:
5700       evplzcntd(dst, ktmp, src, merge, vec_enc);
5701       break;
5702     case T_SHORT:
5703       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
5704       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
5705       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
5706       vpunpckhwd(dst, xtmp1, src, vec_enc);
5707       evplzcntd(dst, ktmp, dst, merge, vec_enc);
5708       vpackusdw(dst, xtmp2, dst, vec_enc);
5709       break;
5710     case T_BYTE:
5711       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
5712       // accessing the lookup table.
5713       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
5714       // accessing the lookup table.
5715       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
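           // For example, for the byte 0x0A = 0000 1010 the 4 MSB bits are all zero,
           // so T2 = 4 and T1 = 0 (1010 has no leading zeros) add up to a count of 4;
           // for 0x13 = 0001 0011 the 4 MSB bits are non-zero, so the result is just
           // T2 = 3.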
5716       assert(VM_Version::supports_avx512bw(), "");
5717       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
5718       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
5719       vpand(xtmp2, dst, src, vec_enc);
5720       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5721       vpsrlw(xtmp3, src, 4, vec_enc);
5722       vpand(xtmp3, dst, xtmp3, vec_enc);
5723       vpshufb(dst, xtmp1, xtmp3, vec_enc);
5724       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
5725       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
5726       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
5727       break;
5728     default:
5729       fatal("Unsupported type %s", type2name(bt));
5730       break;
5731   }
5732 }
5733 
5734 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5735                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5736   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
5737   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5738   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
5739   // accessing the lookup table.
5740   vpand(dst, xtmp2, src, vec_enc);
5741   vpshufb(dst, xtmp1, dst, vec_enc);
5742   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
5743   // accessing the lookup table.
5744   vpsrlw(xtmp3, src, 4, vec_enc);
5745   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
5746   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
5747   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
5748   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
5749   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
5750   vpaddb(dst, dst, xtmp2, vec_enc);
5751   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
5752 }
5753 
5754 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5755                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5756   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5757   // Add zero counts of lower byte and upper byte of a word if
5758   // upper byte holds a zero value.
5759   vpsrlw(xtmp3, src, 8, vec_enc);
5760   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
5761   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
5762   vpsllw(xtmp2, dst, 8, vec_enc);
5763   vpaddw(xtmp2, xtmp2, dst, vec_enc);
5764   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5765   vpsrlw(dst, dst, 8, vec_enc);
5766 }
5767 
5768 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5769                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
5770   // Since the IEEE 754 floating point format represents the mantissa in
5771   // normalized 1.x form, the biased exponent of the converted value can be
5772   // used to compute the leading zero count as per the following formula:
5773   // LZCNT = 32 - ((biased_exp - 127) + 1)
5774   // Special handling is needed for zero, max_int and negative source values.
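       // For example, for a lane holding 12 the conversion yields 1.5 * 2^3 with a
       // biased exponent of 130, so LZCNT = 32 - ((130 - 127) + 1) = 28, which is
       // the number of leading zeros of 0x0000000C.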
5775 
5776   // Broadcast 0xFF
5777   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
5778   vpsrld(xtmp1, xtmp1, 24, vec_enc);
5779 
5780   // Extract biased exponent.
5781   vcvtdq2ps(dst, src, vec_enc);
5782   vpsrld(dst, dst, 23, vec_enc);
5783   vpand(dst, dst, xtmp1, vec_enc);
5784 
5785   // Broadcast 127.
5786   vpsrld(xtmp1, xtmp1, 1, vec_enc);
5787   // Exponent = biased_exp - 127
5788   vpsubd(dst, dst, xtmp1, vec_enc);
5789 
5790   // Exponent = Exponent  + 1
5791   vpsrld(xtmp3, xtmp1, 6, vec_enc);
5792   vpaddd(dst, dst, xtmp3, vec_enc);
5793 
5794   // Replace -ve exponent with zero, exponent is -ve when src
5795   // lane contains a zero value.
5796   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5797   vblendvps(dst, dst, xtmp2, dst, vec_enc);
5798 
5799   // Rematerialize broadcast 32.
5800   vpslld(xtmp1, xtmp3, 5, vec_enc);
5801   // Exponent is 32 if corresponding source lane contains max_int value.
5802   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
5803   // LZCNT = 32 - exponent
5804   vpsubd(dst, xtmp1, dst, vec_enc);
5805 
5806   // Replace LZCNT with a value 1 if corresponding source lane
5807   // contains max_int value.
5808   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
5809 
5810   // Replace the computed count with 0 if the source lane value is less than zero.
5811   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5812   vblendvps(dst, dst, xtmp2, src, vec_enc);
5813 }
5814 
5815 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5816                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5817   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5818   // Add zero counts of lower word and upper word of a double word if
5819   // upper word holds a zero value.
5820   vpsrld(xtmp3, src, 16, vec_enc);
5821   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
5822   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
5823   vpslld(xtmp2, dst, 16, vec_enc);
5824   vpaddd(xtmp2, xtmp2, dst, vec_enc);
5825   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5826   vpsrld(dst, dst, 16, vec_enc);
5827   // Add zero counts of lower doubleword and upper doubleword of a
5828   // quadword if upper doubleword holds a zero value.
5829   vpsrlq(xtmp3, src, 32, vec_enc);
5830   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
5831   vpsllq(xtmp2, dst, 32, vec_enc);
5832   vpaddq(xtmp2, xtmp2, dst, vec_enc);
5833   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5834   vpsrlq(dst, dst, 32, vec_enc);
5835 }
5836 
5837 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
5838                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5839                                                        Register rtmp, int vec_enc) {
5840   assert(is_integral_type(bt), "unexpected type");
5841   assert(vec_enc < Assembler::AVX_512bit, "");
5842   switch(bt) {
5843     case T_LONG:
5844       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5845       break;
5846     case T_INT:
5847       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
5848       break;
5849     case T_SHORT:
5850       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5851       break;
5852     case T_BYTE:
5853       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5854       break;
5855     default:
5856       fatal("Unsupported type %s", type2name(bt));
5857       break;
5858   }
5859 }
5860 
5861 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
5862   switch(bt) {
5863     case T_BYTE:
5864       vpsubb(dst, src1, src2, vec_enc);
5865       break;
5866     case T_SHORT:
5867       vpsubw(dst, src1, src2, vec_enc);
5868       break;
5869     case T_INT:
5870       vpsubd(dst, src1, src2, vec_enc);
5871       break;
5872     case T_LONG:
5873       vpsubq(dst, src1, src2, vec_enc);
5874       break;
5875     default:
5876       fatal("Unsupported type %s", type2name(bt));
5877       break;
5878   }
5879 }
5880 
5881 // Trailing zero count computation is based on the leading zero count operation as per
5882 // the following equation. All AVX3 targets support the AVX512CD feature, which offers
5883 // a direct vector instruction to compute the leading zero count.
5884 //      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
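     // For example, for a 32-bit lane holding x = 12 (0b1100), (x - 1) & ~x = 0b0011,
     // CLZ(0b0011) = 30, and 32 - 30 = 2 = CTZ(12).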
5885 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5886                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5887                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
5888   assert(is_integral_type(bt), "");
5889   // xtmp = -1
5890   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
5891   // xtmp = xtmp + src
5892   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
5893   // xtmp = xtmp & ~src
5894   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
5895   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
5896   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
5897   vpsub(bt, dst, xtmp4, dst, vec_enc);
5898 }
5899 
5900 // Trailing zero count computation for AVX2 targets is based on the popcount operation as per the following equation:
5901 //      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
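     // For example, for a 32-bit lane holding x = 12, x | -x = 0xFFFFFFFC, whose
     // popcount is 30, and 32 - 30 = 2 = CTZ(12).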
5902 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5903                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5904   assert(is_integral_type(bt), "");
5905   // xtmp = 0
5906   vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
5907   // xtmp = 0 - src
5908   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
5909   // xtmp = xtmp | src
5910   vpor(xtmp3, xtmp3, src, vec_enc);
5911   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
5912   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
5913   vpsub(bt, dst, xtmp1, dst, vec_enc);
5914 }
5915 
5916 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
5917   Label done;
5918   Label neg_divisor_fastpath;
5919   cmpl(divisor, 0);
5920   jccb(Assembler::less, neg_divisor_fastpath);
5921   xorl(rdx, rdx);
5922   divl(divisor);
5923   jmpb(done);
5924   bind(neg_divisor_fastpath);
5925   // Fastpath for divisor < 0:
5926   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
5927   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
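       // Since the divisor has its sign bit set, it is at least 2^31 as an unsigned
       // value, so the unsigned quotient can only be 0 or 1; the expression above is
       // 1 exactly when the dividend is >= the divisor in the unsigned sense, e.g.
       // udiv(0xFFFFFFFE, 0x80000000) = 1 while udiv(0x7FFFFFFF, 0x80000000) = 0.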
5928   movl(rdx, rax);
5929   subl(rdx, divisor);
5930   if (VM_Version::supports_bmi1()) {
5931     andnl(rax, rdx, rax);
5932   } else {
5933     notl(rdx);
5934     andl(rax, rdx);
5935   }
5936   shrl(rax, 31);
5937   bind(done);
5938 }
5939 
5940 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
5941   Label done;
5942   Label neg_divisor_fastpath;
5943   cmpl(divisor, 0);
5944   jccb(Assembler::less, neg_divisor_fastpath);
5945   xorl(rdx, rdx);
5946   divl(divisor);
5947   jmpb(done);
5948   bind(neg_divisor_fastpath);
5949   // Fastpath when divisor < 0:
5950   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
5951   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
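       // Since the divisor has its sign bit set, the unsigned quotient is either 0 or 1,
       // so the remainder is either the dividend itself or dividend - divisor; the
       // arithmetic shift below turns the quotient bit into a mask that keeps or clears
       // the divisor before the final subtraction.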
5952   movl(rdx, rax);
5953   subl(rax, divisor);
5954   if (VM_Version::supports_bmi1()) {
5955     andnl(rax, rax, rdx);
5956   } else {
5957     notl(rax);
5958     andl(rax, rdx);
5959   }
5960   sarl(rax, 31);
5961   andl(rax, divisor);
5962   subl(rdx, rax);
5963   bind(done);
5964 }
5965 
5966 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
5967   Label done;
5968   Label neg_divisor_fastpath;
5969 
5970   cmpl(divisor, 0);
5971   jccb(Assembler::less, neg_divisor_fastpath);
5972   xorl(rdx, rdx);
5973   divl(divisor);
5974   jmpb(done);
5975   bind(neg_divisor_fastpath);
5976   // Fastpath for divisor < 0:
5977   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
5978   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
5979   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
5980   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
5981   movl(rdx, rax);
5982   subl(rax, divisor);
5983   if (VM_Version::supports_bmi1()) {
5984     andnl(rax, rax, rdx);
5985   } else {
5986     notl(rax);
5987     andl(rax, rdx);
5988   }
5989   movl(tmp, rax);
5990   shrl(rax, 31); // quotient
5991   sarl(tmp, 31);
5992   andl(tmp, divisor);
5993   subl(rdx, tmp); // remainder
5994   bind(done);
5995 }
5996 
5997 #ifdef _LP64
5998 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
5999                                  XMMRegister xtmp2, Register rtmp) {
6000   if (VM_Version::supports_gfni()) {
6001     // Galois field instruction based bit reversal, following the algorithm at
6002     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6003     mov64(rtmp, 0x8040201008040201L);
6004     movq(xtmp1, src);
6005     movq(xtmp2, rtmp);
6006     gf2p8affineqb(xtmp1, xtmp2, 0);
6007     movq(dst, xtmp1);
6008   } else {
6009     // Swap even and odd numbered bits.
6010     movl(rtmp, src);
6011     andl(rtmp, 0x55555555);
6012     shll(rtmp, 1);
6013     movl(dst, src);
6014     andl(dst, 0xAAAAAAAA);
6015     shrl(dst, 1);
6016     orl(dst, rtmp);
6017 
6018     // Swap LSB and MSB 2 bits of each nibble.
6019     movl(rtmp, dst);
6020     andl(rtmp, 0x33333333);
6021     shll(rtmp, 2);
6022     andl(dst, 0xCCCCCCCC);
6023     shrl(dst, 2);
6024     orl(dst, rtmp);
6025 
6026     // Swap LSB and MSB 4 bits of each byte.
6027     movl(rtmp, dst);
6028     andl(rtmp, 0x0F0F0F0F);
6029     shll(rtmp, 4);
6030     andl(dst, 0xF0F0F0F0);
6031     shrl(dst, 4);
6032     orl(dst, rtmp);
6033   }
6034   bswapl(dst);
6035 }
6036 
6037 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6038                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
6039   if (VM_Version::supports_gfni()) {
6040     // Galois field instruction based bit reversal, following the algorithm at
6041     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6042     mov64(rtmp1, 0x8040201008040201L);
6043     movq(xtmp1, src);
6044     movq(xtmp2, rtmp1);
6045     gf2p8affineqb(xtmp1, xtmp2, 0);
6046     movq(dst, xtmp1);
6047   } else {
6048     // Swap even and odd numbered bits.
6049     movq(rtmp1, src);
6050     mov64(rtmp2, 0x5555555555555555L);
6051     andq(rtmp1, rtmp2);
6052     shlq(rtmp1, 1);
6053     movq(dst, src);
6054     notq(rtmp2);
6055     andq(dst, rtmp2);
6056     shrq(dst, 1);
6057     orq(dst, rtmp1);
6058 
6059     // Swap LSB and MSB 2 bits of each nibble.
6060     movq(rtmp1, dst);
6061     mov64(rtmp2, 0x3333333333333333L);
6062     andq(rtmp1, rtmp2);
6063     shlq(rtmp1, 2);
6064     notq(rtmp2);
6065     andq(dst, rtmp2);
6066     shrq(dst, 2);
6067     orq(dst, rtmp1);
6068 
6069     // Swap LSB and MSB 4 bits of each byte.
6070     movq(rtmp1, dst);
6071     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6072     andq(rtmp1, rtmp2);
6073     shlq(rtmp1, 4);
6074     notq(rtmp2);
6075     andq(dst, rtmp2);
6076     shrq(dst, 4);
6077     orq(dst, rtmp1);
6078   }
6079   bswapq(dst);
6080 }
6081 
6082 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6083   Label done;
6084   Label neg_divisor_fastpath;
6085   cmpq(divisor, 0);
6086   jccb(Assembler::less, neg_divisor_fastpath);
6087   xorl(rdx, rdx);
6088   divq(divisor);
6089   jmpb(done);
6090   bind(neg_divisor_fastpath);
6091   // Fastpath for divisor < 0:
6092   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6093   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6094   movq(rdx, rax);
6095   subq(rdx, divisor);
6096   if (VM_Version::supports_bmi1()) {
6097     andnq(rax, rdx, rax);
6098   } else {
6099     notq(rdx);
6100     andq(rax, rdx);
6101   }
6102   shrq(rax, 63);
6103   bind(done);
6104 }
6105 
6106 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6107   Label done;
6108   Label neg_divisor_fastpath;
6109   cmpq(divisor, 0);
6110   jccb(Assembler::less, neg_divisor_fastpath);
6111   xorq(rdx, rdx);
6112   divq(divisor);
6113   jmp(done);
6114   bind(neg_divisor_fastpath);
6115   // Fastpath when divisor < 0:
6116   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6117   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6118   movq(rdx, rax);
6119   subq(rax, divisor);
6120   if (VM_Version::supports_bmi1()) {
6121     andnq(rax, rax, rdx);
6122   } else {
6123     notq(rax);
6124     andq(rax, rdx);
6125   }
6126   sarq(rax, 63);
6127   andq(rax, divisor);
6128   subq(rdx, rax);
6129   bind(done);
6130 }
6131 
6132 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6133   Label done;
6134   Label neg_divisor_fastpath;
6135   cmpq(divisor, 0);
6136   jccb(Assembler::less, neg_divisor_fastpath);
6137   xorq(rdx, rdx);
6138   divq(divisor);
6139   jmp(done);
6140   bind(neg_divisor_fastpath);
6141   // Fastpath for divisor < 0:
6142   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6143   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6144   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6145   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6146   movq(rdx, rax);
6147   subq(rax, divisor);
6148   if (VM_Version::supports_bmi1()) {
6149     andnq(rax, rax, rdx);
6150   } else {
6151     notq(rax);
6152     andq(rax, rdx);
6153   }
6154   movq(tmp, rax);
6155   shrq(rax, 63); // quotient
6156   sarq(tmp, 63);
6157   andq(tmp, divisor);
6158   subq(rdx, tmp); // remainder
6159   bind(done);
6160 }
6161 #endif
6162 
6163 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6164                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6165                                         int vlen_enc) {
6166   assert(VM_Version::supports_avx512bw(), "");
6167   // Byte shuffles are inlane operations and indices are determined using
6168   // the lower 4 bits of each shuffle lane, thus all shuffle indices are
6169   // normalized to the index range 0-15. Indices that are equal modulo 16
6170   // therefore select the same relative byte position within a 128 bit
6171   // lane; which of the four source lanes supplies that byte is decided by
6172   // the range checks and lane broadcasts performed below.
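       // For example, shuffle index 37 (0x25) selects source byte 37: the range checks
       // pick the broadcast of the third 128 bit source lane (bytes 32..47), and
       // vpshufb uses the low 4 bits (5) to select byte 5 within that lane, i.e.
       // byte 32 + 5 = 37 of the original source vector.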
6173   movl(rtmp, 16);
6174   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6175 
6176   // Compute a mask for the shuffle vector by comparing indices against the expression
6177   // INDEX < 16, broadcast the first 128 bit lane across the entire vector, shuffle the
6178   // vector lanes using the original shuffle indices, and move the shuffled elements
6179   // corresponding to the true mask into the destination vector.
6180   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6181   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6182   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6183 
6184   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6185   // and broadcasting second 128 bit lane.
6186   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6187   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6188   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6189   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6190   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6191 
6192   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6193   // and broadcasting third 128 bit lane.
6194   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6195   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6196   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6197   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6198   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6199 
6200   // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
6201   // and broadcasting the fourth 128 bit lane.
6202   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6203   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6204   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6205   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6206   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6207 }
6208 
6209 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6210                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6211   if (vlen_enc == AVX_128bit) {
6212     vpermilps(dst, src, shuffle, vlen_enc);
6213   } else if (bt == T_INT) {
6214     vpermd(dst, shuffle, src, vlen_enc);
6215   } else {
6216     assert(bt == T_FLOAT, "");
6217     vpermps(dst, shuffle, src, vlen_enc);
6218   }
6219 }