1 /*
   2  * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "gc/shared/barrierSet.hpp"
  29 #include "gc/shared/barrierSetAssembler.hpp"
  30 #include "oops/methodData.hpp"
  31 #include "opto/c2_MacroAssembler.hpp"
  32 #include "opto/intrinsicnode.hpp"
  33 #include "opto/output.hpp"
  34 #include "opto/opcodes.hpp"
  35 #include "opto/subnode.hpp"
  36 #include "runtime/globals.hpp"
  37 #include "runtime/objectMonitor.hpp"
  38 #include "runtime/stubRoutines.hpp"
  39 #include "utilities/checkedCast.hpp"
  40 #include "utilities/globalDefinitions.hpp"
  41 #include "utilities/powerOfTwo.hpp"
  42 #include "utilities/sizes.hpp"
  43 
  44 #ifdef PRODUCT
  45 #define BLOCK_COMMENT(str) /* nothing */
  46 #define STOP(error) stop(error)
  47 #else
  48 #define BLOCK_COMMENT(str) block_comment(str)
  49 #define STOP(error) block_comment(error); stop(error)
  50 #endif
  51 
  52 // C2 compiled method's prolog code.
  53 void C2_MacroAssembler::verified_entry(Compile* C, int sp_inc) {
  54   if (C->clinit_barrier_on_entry()) {
  55     assert(VM_Version::supports_fast_class_init_checks(), "sanity");
  56     assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started");
  57 
  58     Label L_skip_barrier;
  59     Register klass = rscratch1;
  60 
  61     mov_metadata(klass, C->method()->holder()->constant_encoding());
  62     clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
  63 
  64     jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
  65 
  66     bind(L_skip_barrier);
  67   }
  68 
  69   int framesize = C->output()->frame_size_in_bytes();
  70   int bangsize = C->output()->bang_size_in_bytes();
  71   bool fp_mode_24b = false;
  72   int stack_bang_size = C->output()->need_stack_bang(bangsize) ? bangsize : 0;
  73 
  74   // WARNING: Initial instruction MUST be 5 bytes or longer so that
  75   // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes;
  77   // the frame allocation can be either 3 or 6 bytes. So if we don't do
  78   // stack bang then we must use the 6 byte frame allocation even if
  79   // we have no frame. :-(
  80   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  81 
  82   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  83   // Remove word for return addr
  84   framesize -= wordSize;
  85   stack_bang_size -= wordSize;
  86 
  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers bang the stack for them.  Be careful, though:
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack, but the stack safety zone should account for that.
  91   // See bugs 4446381, 4468289, 4497237.
  92   if (stack_bang_size > 0) {
  93     generate_stack_overflow_check(stack_bang_size);
  94 
    // We always push rbp so that on return to the interpreter rbp will be
    // restored correctly, allowing us to correct the stack.
  97     push(rbp);
  98     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  99     if (PreserveFramePointer) {
 100       mov(rbp, rsp);
 101     }
 102     // Remove word for ebp
 103     framesize -= wordSize;
 104 
 105     // Create frame
 106     if (framesize) {
 107       subptr(rsp, framesize);
 108     }
 109   } else {
 110     // Create frame (force generation of a 4 byte immediate value)
 111     subptr_imm32(rsp, framesize);
 112 
 113     // Save RBP register now.
 114     framesize -= wordSize;
 115     movptr(Address(rsp, framesize), rbp);
 116     // Save caller's stack pointer into RBP if the frame pointer is preserved.
 117     if (PreserveFramePointer) {
 118       movptr(rbp, rsp);
 119       if (framesize > 0) {
 120         addptr(rbp, framesize);
 121       }
 122     }
 123   }
 124 
 125   if (C->needs_stack_repair()) {
 126     // Save stack increment just below the saved rbp (also account for fixed framesize and rbp)
 127     assert((sp_inc & (StackAlignmentInBytes-1)) == 0, "stack increment not aligned");
 128     movptr(Address(rsp, framesize - wordSize), sp_inc + framesize + wordSize);
 129   }
 130 
 131   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
 132     framesize -= wordSize;
 133     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 134   }
 135 
 136 #ifndef _LP64
 137   // If method sets FPU control word do it now
 138   if (fp_mode_24b) {
 139     fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
 140   }
 141   if (UseSSE >= 2 && VerifyFPU) {
 142     verify_FPU(0, "FPU stack must be clean on entry");
 143   }
 144 #endif
 145 
 146 #ifdef ASSERT
 147   if (VerifyStackAtCalls) {
 148     Label L;
 149     push(rax);
 150     mov(rax, rsp);
 151     andptr(rax, StackAlignmentInBytes-1);
 152     cmpptr(rax, StackAlignmentInBytes-wordSize);
 153     pop(rax);
 154     jcc(Assembler::equal, L);
 155     STOP("Stack is not properly aligned!");
 156     bind(L);
 157   }
 158 #endif
 159 }
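
// Illustrative sketch (not emitted verbatim): for a method that needs a stack
// bang and has a non-trivial frame, the prolog laid down above is roughly
//
//   <bang stack_bang_size bytes beyond rsp>
//   push   rbp                    ; always saved so the interpreter can restore it
//   mov    rbp, rsp               ; only if PreserveFramePointer
//   sub    rsp, framesize         ; framesize already excludes return addr and rbp
//
// while the no-bang path must emit the 6-byte 'sub rsp, imm32' form first and
// then store rbp into the frame, keeping the initial instruction patchable.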
 160 
 161 void C2_MacroAssembler::entry_barrier() {
 162   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 163 #ifdef _LP64
 164   if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
 165     // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 166     Label dummy_slow_path;
 167     Label dummy_continuation;
 168     Label* slow_path = &dummy_slow_path;
 169     Label* continuation = &dummy_continuation;
 170     if (!Compile::current()->output()->in_scratch_emit_size()) {
 171       // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 172       C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 173       Compile::current()->output()->add_stub(stub);
 174       slow_path = &stub->entry();
 175       continuation = &stub->continuation();
 176     }
 177     bs->nmethod_entry_barrier(this, slow_path, continuation);
 178   }
 179 #else
 180   // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
 181   bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
 182 #endif
 183 }
 184 
 185 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 186   switch (vlen_in_bytes) {
 187     case  4: // fall-through
 188     case  8: // fall-through
 189     case 16: return Assembler::AVX_128bit;
 190     case 32: return Assembler::AVX_256bit;
 191     case 64: return Assembler::AVX_512bit;
 192 
 193     default: {
 194       ShouldNotReachHere();
 195       return Assembler::AVX_NoVec;
 196     }
 197   }
 198 }
 199 
 200 #if INCLUDE_RTM_OPT
 201 
 202 // Update rtm_counters based on abort status
 203 // input: abort_status
 204 //        rtm_counters (RTMLockingCounters*)
 205 // flags are killed
 206 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
 207 
 208   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
 209   if (PrintPreciseRTMLockingStatistics) {
 210     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
 211       Label check_abort;
 212       testl(abort_status, (1<<i));
 213       jccb(Assembler::equal, check_abort);
 214       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
 215       bind(check_abort);
 216     }
 217   }
 218 }
 219 
 220 // Branch if (random & (count-1) != 0), count is 2^n
 221 // tmp, scr and flags are killed
 222 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
 223   assert(tmp == rax, "");
 224   assert(scr == rdx, "");
 225   rdtsc(); // modifies EDX:EAX
 226   andptr(tmp, count-1);
 227   jccb(Assembler::notZero, brLabel);
 228 }
 229 
 230 // Perform abort ratio calculation, set no_rtm bit if high ratio
 231 // input:  rtm_counters_Reg (RTMLockingCounters* address)
 232 // tmpReg, rtm_counters_Reg and flags are killed
 233 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
 234                                                     Register rtm_counters_Reg,
 235                                                     RTMLockingCounters* rtm_counters,
 236                                                     Metadata* method_data) {
 237   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
 238 
 239   if (RTMLockingCalculationDelay > 0) {
 240     // Delay calculation
 241     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()));
 242     testptr(tmpReg, tmpReg);
 243     jccb(Assembler::equal, L_done);
 244   }
 245   // Abort ratio calculation only if abort_count > RTMAbortThreshold
 246   //   Aborted transactions = abort_count * 100
 247   //   All transactions = total_count *  RTMTotalCountIncrRate
 248   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
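  // Worked example (illustrative flag values only, an assumption): with
  // RTMTotalCountIncrRate = 64 and RTMAbortRatio = 50, a sampled total_count of
  // 1000 stands for ~64000 transactions, so no_rtm is set once
  // abort_count * 100 >= 1000 * 64 * 50, i.e. once abort_count reaches 32000,
  // roughly half of the estimated transactions.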
 249 
 250   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
 251   cmpptr(tmpReg, RTMAbortThreshold);
 252   jccb(Assembler::below, L_check_always_rtm2);
 253   imulptr(tmpReg, tmpReg, 100);
 254 
 255   Register scrReg = rtm_counters_Reg;
 256   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 257   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
 258   imulptr(scrReg, scrReg, RTMAbortRatio);
 259   cmpptr(tmpReg, scrReg);
 260   jccb(Assembler::below, L_check_always_rtm1);
 261   if (method_data != nullptr) {
 262     // set rtm_state to "no rtm" in MDO
 263     mov_metadata(tmpReg, method_data);
 264     lock();
 265     orl(Address(tmpReg, MethodData::rtm_state_offset()), NoRTM);
 266   }
 267   jmpb(L_done);
 268   bind(L_check_always_rtm1);
 269   // Reload RTMLockingCounters* address
 270   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 271   bind(L_check_always_rtm2);
 272   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 273   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
 274   jccb(Assembler::below, L_done);
 275   if (method_data != nullptr) {
 276     // set rtm_state to "always rtm" in MDO
 277     mov_metadata(tmpReg, method_data);
 278     lock();
 279     orl(Address(tmpReg, MethodData::rtm_state_offset()), UseRTM);
 280   }
 281   bind(L_done);
 282 }
 283 
 284 // Update counters and perform abort ratio calculation
 285 // input:  abort_status_Reg
 286 // rtm_counters_Reg, flags are killed
 287 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
 288                                       Register rtm_counters_Reg,
 289                                       RTMLockingCounters* rtm_counters,
 290                                       Metadata* method_data,
 291                                       bool profile_rtm) {
 292 
 293   assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 294   // update rtm counters based on rax value at abort
 295   // reads abort_status_Reg, updates flags
 296   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 297   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
 298   if (profile_rtm) {
 299     // Save abort status because abort_status_Reg is used by following code.
 300     if (RTMRetryCount > 0) {
 301       push(abort_status_Reg);
 302     }
 303     assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 304     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
 305     // restore abort status
 306     if (RTMRetryCount > 0) {
 307       pop(abort_status_Reg);
 308     }
 309   }
 310 }
 311 
 312 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
 313 // inputs: retry_count_Reg
 314 //       : abort_status_Reg
 315 // output: retry_count_Reg decremented by 1
 316 // flags are killed
 317 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
 318   Label doneRetry;
 319   assert(abort_status_Reg == rax, "");
 320   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
 321   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
 322   // if reason is in 0x6 and retry count != 0 then retry
 323   andptr(abort_status_Reg, 0x6);
 324   jccb(Assembler::zero, doneRetry);
 325   testl(retry_count_Reg, retry_count_Reg);
 326   jccb(Assembler::zero, doneRetry);
 327   pause();
 328   decrementl(retry_count_Reg);
 329   jmp(retryLabel);
 330   bind(doneRetry);
 331 }
 332 
// Spin and retry if lock is busy.
 334 // inputs: box_Reg (monitor address)
 335 //       : retry_count_Reg
 336 // output: retry_count_Reg decremented by 1
 337 //       : clear z flag if retry count exceeded
 338 // tmp_Reg, scr_Reg, flags are killed
 339 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
 340                                                Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
 341   Label SpinLoop, SpinExit, doneRetry;
 342   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 343 
 344   testl(retry_count_Reg, retry_count_Reg);
 345   jccb(Assembler::zero, doneRetry);
 346   decrementl(retry_count_Reg);
 347   movptr(scr_Reg, RTMSpinLoopCount);
 348 
 349   bind(SpinLoop);
 350   pause();
 351   decrementl(scr_Reg);
 352   jccb(Assembler::lessEqual, SpinExit);
 353   movptr(tmp_Reg, Address(box_Reg, owner_offset));
 354   testptr(tmp_Reg, tmp_Reg);
 355   jccb(Assembler::notZero, SpinLoop);
 356 
 357   bind(SpinExit);
 358   jmp(retryLabel);
 359   bind(doneRetry);
 360   incrementl(retry_count_Reg); // clear z flag
 361 }
 362 
 363 // Use RTM for normal stack locks
 364 // Input: objReg (object to lock)
 365 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
 366                                          Register retry_on_abort_count_Reg,
 367                                          RTMLockingCounters* stack_rtm_counters,
 368                                          Metadata* method_data, bool profile_rtm,
 369                                          Label& DONE_LABEL, Label& IsInflated) {
 370   assert(UseRTMForStackLocks, "why call this otherwise?");
 371   assert(tmpReg == rax, "");
 372   assert(scrReg == rdx, "");
 373   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 374 
 375   if (RTMRetryCount > 0) {
 376     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 377     bind(L_rtm_retry);
 378   }
 379   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 380   testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral
 381   jcc(Assembler::notZero, IsInflated);
 382 
 383   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 384     Label L_noincrement;
 385     if (RTMTotalCountIncrRate > 1) {
 386       // tmpReg, scrReg and flags are killed
 387       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 388     }
 389     assert(stack_rtm_counters != nullptr, "should not be null when profiling RTM");
 390     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
 391     bind(L_noincrement);
 392   }
 393   xbegin(L_on_abort);
 394   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
 395   andptr(tmpReg, markWord::lock_mask_in_place);     // look at 2 lock bits
 396   cmpptr(tmpReg, markWord::unlocked_value);         // bits = 01 unlocked
 397   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
 398 
 399   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 400   if (UseRTMXendForLockBusy) {
 401     xend();
 402     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
 403     jmp(L_decrement_retry);
 404   }
 405   else {
 406     xabort(0);
 407   }
 408   bind(L_on_abort);
 409   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 410     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
 411   }
 412   bind(L_decrement_retry);
 413   if (RTMRetryCount > 0) {
 414     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 415     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 416   }
 417 }
 418 
// Use RTM for inflated locks
 420 // inputs: objReg (object to lock)
 421 //         boxReg (on-stack box address (displaced header location) - KILLED)
 422 //         tmpReg (ObjectMonitor address + markWord::monitor_value)
 423 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
 424                                             Register scrReg, Register retry_on_busy_count_Reg,
 425                                             Register retry_on_abort_count_Reg,
 426                                             RTMLockingCounters* rtm_counters,
 427                                             Metadata* method_data, bool profile_rtm,
 428                                             Label& DONE_LABEL) {
 429   assert(UseRTMLocking, "why call this otherwise?");
 430   assert(tmpReg == rax, "");
 431   assert(scrReg == rdx, "");
 432   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 433   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 434 
 435   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 436   movptr(boxReg, tmpReg); // Save ObjectMonitor address
 437 
 438   if (RTMRetryCount > 0) {
 439     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
 440     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 441     bind(L_rtm_retry);
 442   }
 443   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 444     Label L_noincrement;
 445     if (RTMTotalCountIncrRate > 1) {
 446       // tmpReg, scrReg and flags are killed
 447       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 448     }
 449     assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 450     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
 451     bind(L_noincrement);
 452   }
 453   xbegin(L_on_abort);
 454   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 455   movptr(tmpReg, Address(tmpReg, owner_offset));
 456   testptr(tmpReg, tmpReg);
 457   jcc(Assembler::zero, DONE_LABEL);
 458   if (UseRTMXendForLockBusy) {
 459     xend();
 460     jmp(L_decrement_retry);
 461   }
 462   else {
 463     xabort(0);
 464   }
 465   bind(L_on_abort);
 466   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 467   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 468     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
 469   }
 470   if (RTMRetryCount > 0) {
 471     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 472     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 473   }
 474 
  movptr(tmpReg, Address(boxReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jccb(Assembler::notZero, L_decrement_retry);
 478 
 479   // Appears unlocked - try to swing _owner from null to non-null.
 480   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 481 #ifdef _LP64
 482   Register threadReg = r15_thread;
 483 #else
 484   get_thread(scrReg);
 485   Register threadReg = scrReg;
 486 #endif
 487   lock();
 488   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
 489 
 490   if (RTMRetryCount > 0) {
    // If the CAS succeeded we are done, otherwise retry
    jccb(Assembler::equal, DONE_LABEL);
 493     bind(L_decrement_retry);
 494     // Spin and retry if lock is busy.
 495     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
 496   }
 497   else {
 498     bind(L_decrement_retry);
 499   }
 500 }
 501 
 502 #endif //  INCLUDE_RTM_OPT
 503 
 504 // fast_lock and fast_unlock used by C2
 505 
 506 // Because the transitions from emitted code to the runtime
 507 // monitorenter/exit helper stubs are so slow it's critical that
 508 // we inline both the stack-locking fast path and the inflated fast path.
 509 //
 510 // See also: cmpFastLock and cmpFastUnlock.
 511 //
 512 // What follows is a specialized inline transliteration of the code
 513 // in enter() and exit(). If we're concerned about I$ bloat another
 514 // option would be to emit TrySlowEnter and TrySlowExit methods
 515 // at startup-time.  These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 517 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 518 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 519 // In practice, however, the # of lock sites is bounded and is usually small.
 520 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
 523 // sites.
 524 //
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
 526 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 527 // to those specialized methods.  That'd give us a mostly platform-independent
 528 // implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
 530 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 531 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) provide explicit barriers or fence operations.
 533 //
 534 // TODO:
 535 //
 536 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 537 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 538 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 539 //    the lock operators would typically be faster than reifying Self.
 540 //
 541 // *  Ideally I'd define the primitives as:
 542 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 543 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 544 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
 546 //    Furthermore the register assignments are overconstrained, possibly resulting in
 547 //    sub-optimal code near the synchronization site.
 548 //
 549 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 550 //    Alternately, use a better sp-proximity test.
 551 //
 552 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 553 //    Either one is sufficient to uniquely identify a thread.
 554 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 555 //
 556 // *  Intrinsify notify() and notifyAll() for the common cases where the
 557 //    object is locked by the calling thread but the waitlist is empty.
//    This avoids the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
 559 //
 560 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 561 //    But beware of excessive branch density on AMD Opterons.
 562 //
 563 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 564 //    or failure of the fast path.  If the fast path fails then we pass
 565 //    control to the slow path, typically in C.  In fast_lock and
 566 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 567 //    will emit a conditional branch immediately after the node.
 568 //    So we have branches to branches and lots of ICC.ZF games.
 569 //    Instead, it might be better to have C2 pass a "FailureLabel"
 570 //    into fast_lock and fast_unlock.  In the case of success, control
 571 //    will drop through the node.  ICC.ZF is undefined at exit.
 572 //    In the case of failure, the node will branch directly to the
 573 //    FailureLabel
 574 
 575 
 576 // obj: object to lock
 577 // box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
 579 // scr: tmp -- KILLED
 580 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 581                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
 582                                  RTMLockingCounters* rtm_counters,
 583                                  RTMLockingCounters* stack_rtm_counters,
 584                                  Metadata* method_data,
 585                                  bool use_rtm, bool profile_rtm) {
 586   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
 587   // Ensure the register assignments are disjoint
 588   assert(tmpReg == rax, "");
 589 
 590   if (use_rtm) {
 591     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
 592   } else {
 593     assert(cx1Reg == noreg, "");
 594     assert(cx2Reg == noreg, "");
 595     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 596   }
 597 
 598   // Possible cases that we'll encounter in fast_lock
 599   // ------------------------------------------------
 600   // * Inflated
 601   //    -- unlocked
 602   //    -- Locked
 603   //       = by self
 604   //       = by other
 605   // * neutral
 606   // * stack-locked
 607   //    -- by self
 608   //       = sp-proximity test hits
 609   //       = sp-proximity test generates false-negative
 610   //    -- by other
 611   //
 612 
 613   Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;
 614 
 615   if (DiagnoseSyncOnValueBasedClasses != 0) {
 616     load_klass(tmpReg, objReg, scrReg);
 617     movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
 618     testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
 619     jcc(Assembler::notZero, DONE_LABEL);
 620   }
 621 
 622 #if INCLUDE_RTM_OPT
 623   if (UseRTMForStackLocks && use_rtm) {
 624     assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
 625     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
 626                       stack_rtm_counters, method_data, profile_rtm,
 627                       DONE_LABEL, IsInflated);
 628   }
 629 #endif // INCLUDE_RTM_OPT
 630 
 631   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 632   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 633   jcc(Assembler::notZero, IsInflated);
 634 
 635   if (LockingMode == LM_MONITOR) {
 636     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 637     testptr(objReg, objReg);
 638   } else {
 639     assert(LockingMode == LM_LEGACY, "must be");
 640     // Attempt stack-locking ...
 641     orptr (tmpReg, markWord::unlocked_value);
 642     if (EnableValhalla) {
 643       // Mask inline_type bit such that we go to the slow path if object is an inline type
 644       andptr(tmpReg, ~((int) markWord::inline_type_bit_in_place));
 645     }
 646     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 647     lock();
 648     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 649     jcc(Assembler::equal, COUNT);           // Success
 650 
 651     // Recursive locking.
 652     // The object is stack-locked: markword contains stack pointer to BasicLock.
 653     // Locked by current thread if difference with current SP is less than one page.
 654     subptr(tmpReg, rsp);
    // Next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
 656     andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
 657     movptr(Address(boxReg, 0), tmpReg);
 658   }
 659   jmp(DONE_LABEL);
 660 
 661   bind(IsInflated);
  // The object is inflated. tmpReg contains the ObjectMonitor* plus markWord::monitor_value.
 663 
 664 #if INCLUDE_RTM_OPT
 665   // Use the same RTM locking code in 32- and 64-bit VM.
 666   if (use_rtm) {
 667     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
 668                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
 669   } else {
 670 #endif // INCLUDE_RTM_OPT
 671 
 672 #ifndef _LP64
 673   // The object is inflated.
 674 
 675   // boxReg refers to the on-stack BasicLock in the current frame.
 676   // We'd like to write:
 677   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
  // This is convenient but results in an ST-before-CAS penalty.  The following CAS suffers
 679   // additional latency as we have another ST in the store buffer that must drain.
 680 
 681   // avoid ST-before-CAS
 682   // register juggle because we need tmpReg for cmpxchgptr below
 683   movptr(scrReg, boxReg);
 684   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 685 
 686   // Optimistic form: consider XORL tmpReg,tmpReg
 687   movptr(tmpReg, NULL_WORD);
 688 
 689   // Appears unlocked - try to swing _owner from null to non-null.
 690   // Ideally, I'd manifest "Self" with get_thread and then attempt
 691   // to CAS the register containing Self into m->Owner.
 692   // But we don't have enough registers, so instead we can either try to CAS
 693   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 694   // we later store "Self" into m->Owner.  Transiently storing a stack address
 695   // (rsp or the address of the box) into  m->owner is harmless.
 696   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 697   lock();
 698   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 699   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 700   // If we weren't able to swing _owner from null to the BasicLock
 701   // then take the slow path.
 702   jccb  (Assembler::notZero, NO_COUNT);
 703   // update _owner from BasicLock to thread
 704   get_thread (scrReg);                    // beware: clobbers ICCs
 705   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
 706   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
 707 
 708   // If the CAS fails we can either retry or pass control to the slow path.
 709   // We use the latter tactic.
 710   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 711   // If the CAS was successful ...
 712   //   Self has acquired the lock
 713   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 714   // Intentional fall-through into DONE_LABEL ...
 715 #else // _LP64
 716   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 717   movq(scrReg, tmpReg);
 718   xorq(tmpReg, tmpReg);
 719   lock();
 720   cmpxchgptr(thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 721   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 722   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 723   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 724   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 725   jccb(Assembler::equal, COUNT);          // CAS above succeeded; propagate ZF = 1 (success)
 726 
 727   cmpptr(thread, rax);                // Check if we are already the owner (recursive lock)
 728   jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
 729   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 730   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 731 #endif // _LP64
 732 #if INCLUDE_RTM_OPT
 733   } // use_rtm()
 734 #endif
 735   bind(DONE_LABEL);
 736 
 737   // ZFlag == 1 count in fast path
 738   // ZFlag == 0 count in slow path
 739   jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0
 740 
 741   bind(COUNT);
 742   // Count monitors in fast path
 743   increment(Address(thread, JavaThread::held_monitor_count_offset()));
 744 
 745   xorl(tmpReg, tmpReg); // Set ZF == 1
 746 
 747   bind(NO_COUNT);
 748 
 749   // At NO_COUNT the icc ZFlag is set as follows ...
 750   // fast_unlock uses the same protocol.
 751   // ZFlag == 1 -> Success
 752   // ZFlag == 0 -> Failure - force control through the slow path
 753 }
 754 
 755 // obj: object to unlock
 756 // box: box address (displaced header location), killed.  Must be EAX.
 757 // tmp: killed, cannot be obj nor box.
 758 //
 759 // Some commentary on balanced locking:
 760 //
 761 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 762 // Methods that don't have provably balanced locking are forced to run in the
 763 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 764 // The interpreter provides two properties:
 765 // I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame).  Recall that the
 767 //      interpreter maintains an on-stack list of locks currently held by
 768 //      a frame.
 769 // I2:  If a method attempts to unlock an object that is not held by the
//      frame, the interpreter throws IMSX.
 771 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
 773 // B() doesn't have provably balanced locking so it runs in the interpreter.
 774 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 775 // is still locked by A().
 776 //
 777 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 778 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 779 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 780 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking the owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
 784 // A perfectly viable alternative is to elide the owner check except when
 785 // Xcheck:jni is enabled.
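//
// Illustrative sketch of the balanced-locking scenario above (hypothetical Java code, not from the source):
//   void A() { synchronized (o) { B(); } }   // provably balanced: compiled, uses fast_lock/fast_unlock
//   void B() { ... }                         // not provably balanced: runs in the interpreter
// By I1 and I2 the interpreter can neither leak nor prematurely release o,
// so A()'s fast_unlock always finds o still locked by A().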
 786 
 787 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
 788   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 789   assert(boxReg == rax, "");
 790   assert_different_registers(objReg, boxReg, tmpReg);
 791 
 792   Label DONE_LABEL, Stacked, COUNT, NO_COUNT;
 793 
 794 #if INCLUDE_RTM_OPT
 795   if (UseRTMForStackLocks && use_rtm) {
 796     assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
 797     Label L_regular_unlock;
 798     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
 799     andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
 800     cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
 801     jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
 802     xend();                                                           // otherwise end...
 803     jmp(DONE_LABEL);                                                  // ... and we're done
 804     bind(L_regular_unlock);
 805   }
 806 #endif
 807 
 808   if (LockingMode == LM_LEGACY) {
 809     cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
 810     jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
 811   }
 812   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
 813   if (LockingMode != LM_MONITOR) {
 814     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 815     jcc(Assembler::zero, Stacked);
 816   }
 817 
 818   // It's inflated.
 819 
 820 #if INCLUDE_RTM_OPT
 821   if (use_rtm) {
 822     Label L_regular_inflated_unlock;
 823     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 824     movptr(boxReg, Address(tmpReg, owner_offset));
 825     testptr(boxReg, boxReg);
 826     jccb(Assembler::notZero, L_regular_inflated_unlock);
 827     xend();
 828     jmp(DONE_LABEL);
 829     bind(L_regular_inflated_unlock);
 830   }
 831 #endif
 832 
  // Despite our balanced locking property we still check that m->_owner == Self,
  // as Java routines or native JNI code called by this thread might
 835   // have released the lock.
 836   // Refer to the comments in synchronizer.cpp for how we might encode extra
 837   // state in _succ so we can avoid fetching EntryList|cxq.
 838   //
 839   // If there's no contention try a 1-0 exit.  That is, exit without
 840   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 841   // we detect and recover from the race that the 1-0 exit admits.
 842   //
 843   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 844   // before it STs null into _owner, releasing the lock.  Updates
 845   // to data protected by the critical section must be visible before
 846   // we drop the lock (and thus before any other thread could acquire
 847   // the lock and observe the fields protected by the lock).
 848   // IA32's memory-model is SPO, so STs are ordered with respect to
 849   // each other and there's no need for an explicit barrier (fence).
 850   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
 851 #ifndef _LP64
 852   // Note that we could employ various encoding schemes to reduce
 853   // the number of loads below (currently 4) to just 2 or 3.
 854   // Refer to the comments in synchronizer.cpp.
 855   // In practice the chain of fetches doesn't seem to impact performance, however.
 856   xorptr(boxReg, boxReg);
 857   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 858   jccb  (Assembler::notZero, DONE_LABEL);
 859   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 860   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 861   jccb  (Assembler::notZero, DONE_LABEL);
 862   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 863   jmpb  (DONE_LABEL);
 864 #else // _LP64
 865   // It's inflated
 866   Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath;
 867 
 868   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 869   jccb(Assembler::equal, LNotRecursive);
 870 
 871   // Recursive inflated unlock
 872   decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 873   jmpb(LSuccess);
 874 
 875   bind(LNotRecursive);
 876   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 877   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 878   jccb  (Assembler::notZero, CheckSucc);
 879   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 880   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 881   jmpb  (DONE_LABEL);
 882 
 883   // Try to avoid passing control into the slow_path ...
 884   bind  (CheckSucc);
 885 
  // The following optional optimization can be elided if necessary.
  // Effectively: if (succ == null) goto slow path.
  // Keeping it reduces the window for a race
  // and thus benefits performance.
 890   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 891   jccb  (Assembler::zero, LGoSlowPath);
 892 
 893   xorptr(boxReg, boxReg);
 894   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 895   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 896 
 897   // Memory barrier/fence
 898   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
 899   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
 900   // This is faster on Nehalem and AMD Shanghai/Barcelona.
 901   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
 902   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
 903   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
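  //
  // Illustrative sketch (an assumption, not generated here) of that xchg-based
  // restructuring in MacroAssembler terms:
  //   xorptr(boxReg, boxReg);
  //   xchgptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // since a locked xchg both clears _owner and acts as the full fence.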
 904   lock(); addl(Address(rsp, 0), 0);
 905 
 906   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 907   jccb  (Assembler::notZero, LSuccess);
 908 
 909   // Rare inopportune interleaving - race.
 910   // The successor vanished in the small window above.
 911   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
 912   // We need to ensure progress and succession.
 913   // Try to reacquire the lock.
 914   // If that fails then the new owner is responsible for succession and this
 915   // thread needs to take no further action and can exit via the fast path (success).
 916   // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generate more
  // coherence traffic on the lock *and* artificially extend the critical section
  // length by virtue of passing control into the slow path.
 920 
 921   // box is really RAX -- the following CMPXCHG depends on that binding
 922   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
 923   lock();
 924   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 925   // There's no successor so we tried to regrab the lock.
 926   // If that didn't work, then another thread grabbed the
 927   // lock so we're done (and exit was a success).
 928   jccb  (Assembler::notEqual, LSuccess);
 929   // Intentional fall-through into slow path
 930 
 931   bind  (LGoSlowPath);
 932   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 933   jmpb  (DONE_LABEL);
 934 
 935   bind  (LSuccess);
 936   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 937   jmpb  (DONE_LABEL);
 938 
 939 #endif
 940   if (LockingMode == LM_LEGACY) {
 941     bind  (Stacked);
 942     movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 943     lock();
 944     cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 945     // Intentional fall-thru into DONE_LABEL
 946   }
 947 
 948   bind(DONE_LABEL);
 949 
 950   // ZFlag == 1 count in fast path
 951   // ZFlag == 0 count in slow path
 952   jccb(Assembler::notZero, NO_COUNT);
 953 
 954   bind(COUNT);
 955   // Count monitors in fast path
 956 #ifndef _LP64
 957   get_thread(tmpReg);
 958   decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
 959 #else // _LP64
 960   decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 961 #endif
 962 
 963   xorl(tmpReg, tmpReg); // Set ZF == 1
 964 
 965   bind(NO_COUNT);
 966 }
 967 
 968 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
 969                                               Register t, Register thread) {
 970   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 971   assert(rax_reg == rax, "Used for CAS");
 972   assert_different_registers(obj, box, rax_reg, t, thread);
 973 
 974   // Handle inflated monitor.
 975   Label inflated;
 976   // Finish fast lock successfully. ZF value is irrelevant.
 977   Label locked;
 978   // Finish fast lock unsuccessfully. MUST jump with ZF == 0
 979   Label slow_path;
 980 
 981   if (DiagnoseSyncOnValueBasedClasses != 0) {
 982     load_klass(rax_reg, obj, t);
 983     movl(rax_reg, Address(rax_reg, Klass::access_flags_offset()));
 984     testl(rax_reg, JVM_ACC_IS_VALUE_BASED_CLASS);
 985     jcc(Assembler::notZero, slow_path);
 986   }
 987 
 988   const Register mark = t;
 989 
 990   { // Lightweight Lock
 991 
 992     Label push;
 993 
 994     const Register top = box;
 995 
 996     // Load the mark.
 997     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 998 
 999     // Prefetch top.
1000     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
1001 
1002     // Check for monitor (0b10).
1003     testptr(mark, markWord::monitor_value);
1004     jcc(Assembler::notZero, inflated);
1005 
1006     // Check if lock-stack is full.
1007     cmpl(top, LockStack::end_offset() - 1);
1008     jcc(Assembler::greater, slow_path);
1009 
1010     // Check if recursive.
1011     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
1012     jccb(Assembler::equal, push);
1013 
1014     // Try to lock. Transition lock bits 0b01 => 0b00
1015     movptr(rax_reg, mark);
1016     orptr(rax_reg, markWord::unlocked_value);
1017     andptr(mark, ~(int32_t)markWord::unlocked_value);
1018     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
1019     jcc(Assembler::notEqual, slow_path);
1020 
1021     bind(push);
1022     // After successful lock, push object on lock-stack.
1023     movptr(Address(thread, top), obj);
1024     addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
1025     jmpb(locked);
1026   }
1027 
1028   { // Handle inflated monitor.
1029     bind(inflated);
1030 
1031     const Register tagged_monitor = mark;
1032 
1033     // CAS owner (null => current thread).
1034     xorptr(rax_reg, rax_reg);
1035     lock(); cmpxchgptr(thread, Address(tagged_monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1036     jccb(Assembler::equal, locked);
1037 
1038     // Check if recursive.
1039     cmpptr(thread, rax_reg);
1040     jccb(Assembler::notEqual, slow_path);
1041 
1042     // Recursive.
1043     increment(Address(tagged_monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
1044   }
1045 
1046   bind(locked);
1047   increment(Address(thread, JavaThread::held_monitor_count_offset()));
1048   // Set ZF = 1
1049   xorl(rax_reg, rax_reg);
1050 
1051 #ifdef ASSERT
1052   // Check that locked label is reached with ZF set.
1053   Label zf_correct;
1054   jccb(Assembler::zero, zf_correct);
1055   stop("Fast Lock ZF != 1");
1056 #endif
1057 
1058   bind(slow_path);
1059 #ifdef ASSERT
1060   // Check that slow_path label is reached with ZF not set.
1061   jccb(Assembler::notZero, zf_correct);
1062   stop("Fast Lock ZF != 0");
1063   bind(zf_correct);
1064 #endif
1065   // C2 uses the value of ZF to determine the continuation.
1066 }
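
// Illustrative pseudo-code for the lightweight fast-lock path above (a sketch,
// not the generated assembly):
//
//   if (mark is a monitor)                   goto inflated;
//   if (lock-stack is full)                  goto slow_path;
//   if (top of lock-stack already == obj)    { push obj; success; }   // recursive case
//   if (!CAS(obj->mark, 0b01 -> 0b00))       goto slow_path;
//   push obj on the thread's lock-stack;     // success, ZF == 1 afterwards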
1067 
1068 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
1069   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
1070   assert(reg_rax == rax, "Used for CAS");
1071   assert_different_registers(obj, reg_rax, t);
1072 
1073   // Handle inflated monitor.
1074   Label inflated, inflated_check_lock_stack;
1075   // Finish fast unlock successfully.  MUST jump with ZF == 1
1076   Label unlocked;
1077 
1078   // Assume success.
1079   decrement(Address(thread, JavaThread::held_monitor_count_offset()));
1080 
1081   const Register mark = t;
1082   const Register top = reg_rax;
1083 
1084   Label dummy;
1085   C2FastUnlockLightweightStub* stub = nullptr;
1086 
1087   if (!Compile::current()->output()->in_scratch_emit_size()) {
1088     stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
1089     Compile::current()->output()->add_stub(stub);
1090   }
1091 
1092   Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
1093   Label& check_successor = stub == nullptr ? dummy : stub->check_successor();
1094 
1095   { // Lightweight Unlock
1096 
1097     // Load top.
1098     movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
1099 
1100     // Prefetch mark.
1101     movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
1102 
1103     // Check if obj is top of lock-stack.
1104     cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
1105     // Top of lock stack was not obj. Must be monitor.
1106     jcc(Assembler::notEqual, inflated_check_lock_stack);
1107 
1108     // Pop lock-stack.
1109     DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
1110     subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
1111 
1112     // Check if recursive.
1113     cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
1114     jcc(Assembler::equal, unlocked);
1115 
    // We elide the monitor check and let the CAS fail instead.
1117 
1118     // Try to unlock. Transition lock bits 0b00 => 0b01
1119     movptr(reg_rax, mark);
1120     andptr(reg_rax, ~(int32_t)markWord::lock_mask);
1121     orptr(mark, markWord::unlocked_value);
1122     lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
1123     jcc(Assembler::notEqual, push_and_slow_path);
1124     jmp(unlocked);
1125   }
1126 
1127 
1128   { // Handle inflated monitor.
1129     bind(inflated_check_lock_stack);
1130 #ifdef ASSERT
1131     Label check_done;
1132     subl(top, oopSize);
1133     cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
1134     jcc(Assembler::below, check_done);
1135     cmpptr(obj, Address(thread, top));
1136     jccb(Assembler::notEqual, inflated_check_lock_stack);
1137     stop("Fast Unlock lock on stack");
1138     bind(check_done);
1139     testptr(mark, markWord::monitor_value);
1140     jccb(Assembler::notZero, inflated);
1141     stop("Fast Unlock not monitor");
1142 #endif
1143 
1144     bind(inflated);
1145 
1146     // mark contains the tagged ObjectMonitor*.
1147     const Register monitor = mark;
1148 
1149 #ifndef _LP64
1150     // Check if recursive.
1151     xorptr(reg_rax, reg_rax);
1152     orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
1153     jcc(Assembler::notZero, check_successor);
1154 
1155     // Check if the entry lists are empty.
1156     movptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
1157     orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
1158     jcc(Assembler::notZero, check_successor);
1159 
1160     // Release lock.
1161     movptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
1162 #else // _LP64
1163     Label recursive;
1164 
1165     // Check if recursive.
1166     cmpptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
1167     jccb(Assembler::notEqual, recursive);
1168 
1169     // Check if the entry lists are empty.
1170     movptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
1171     orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
1172     jcc(Assembler::notZero, check_successor);
1173 
1174     // Release lock.
1175     movptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
1176     jmpb(unlocked);
1177 
1178     // Recursive unlock.
1179     bind(recursive);
1180     decrement(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
1181     xorl(t, t);
1182 #endif
1183   }
1184 
1185   bind(unlocked);
1186   if (stub != nullptr) {
1187     bind(stub->unlocked_continuation());
1188   }
1189 
1190 #ifdef ASSERT
1191   // Check that unlocked label is reached with ZF set.
1192   Label zf_correct;
1193   jccb(Assembler::zero, zf_correct);
1194   stop("Fast Unlock ZF != 1");
1195 #endif
1196 
1197   if (stub != nullptr) {
1198     bind(stub->slow_path_continuation());
1199   }
1200 #ifdef ASSERT
1201   // Check that stub->continuation() label is reached with ZF not set.
1202   jccb(Assembler::notZero, zf_correct);
1203   stop("Fast Unlock ZF != 0");
1204   bind(zf_correct);
1205 #endif
1206   // C2 uses the value of ZF to determine the continuation.
1207 }
1208 
1209 //-------------------------------------------------------------------------------------------
1210 // Generic instructions support for use in .ad files C2 code generation
1211 
1212 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
1213   if (dst != src) {
1214     movdqu(dst, src);
1215   }
1216   if (opcode == Op_AbsVD) {
1217     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
1218   } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
1220     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1221   }
1222 }
1223 
1224 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
1225   if (opcode == Op_AbsVD) {
1226     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
1227   } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
1229     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
1230   }
1231 }
1232 
1233 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
1234   if (dst != src) {
1235     movdqu(dst, src);
1236   }
1237   if (opcode == Op_AbsVF) {
1238     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
1239   } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
1241     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1242   }
1243 }
1244 
1245 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
1246   if (opcode == Op_AbsVF) {
1247     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
1248   } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
1250     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
1251   }
1252 }
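
// The abs/neg helpers above use the usual IEEE-754 bit tricks; a sketch of the
// intended mask contents (an assumption about the stub constants, per lane):
//   AbsV:  dst = src & ~sign_bit   (vector_*_sign_mask, e.g. 0x7FFFFFFFFFFFFFFF for double)
//   NegV:  dst = src ^  sign_bit   (vector_*_sign_flip, e.g. 0x8000000000000000 for double)
// and analogously 0x7FFFFFFF / 0x80000000 for float.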
1253 
1254 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
1255   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1256   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
1257 
1258   if (opcode == Op_MinV) {
1259     if (elem_bt == T_BYTE) {
1260       pminsb(dst, src);
1261     } else if (elem_bt == T_SHORT) {
1262       pminsw(dst, src);
1263     } else if (elem_bt == T_INT) {
1264       pminsd(dst, src);
1265     } else {
1266       assert(elem_bt == T_LONG, "required");
1267       assert(tmp == xmm0, "required");
1268       assert_different_registers(dst, src, tmp);
1269       movdqu(xmm0, dst);
1270       pcmpgtq(xmm0, src);
1271       blendvpd(dst, src);  // xmm0 as mask
1272     }
1273   } else { // opcode == Op_MaxV
1274     if (elem_bt == T_BYTE) {
1275       pmaxsb(dst, src);
1276     } else if (elem_bt == T_SHORT) {
1277       pmaxsw(dst, src);
1278     } else if (elem_bt == T_INT) {
1279       pmaxsd(dst, src);
1280     } else {
1281       assert(elem_bt == T_LONG, "required");
1282       assert(tmp == xmm0, "required");
1283       assert_different_registers(dst, src, tmp);
1284       movdqu(xmm0, src);
1285       pcmpgtq(xmm0, dst);
1286       blendvpd(dst, src);  // xmm0 as mask
1287     }
1288   }
1289 }
1290 
1291 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
1292                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
1293                                  int vlen_enc) {
1294   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1295 
1296   if (opcode == Op_MinV) {
1297     if (elem_bt == T_BYTE) {
1298       vpminsb(dst, src1, src2, vlen_enc);
1299     } else if (elem_bt == T_SHORT) {
1300       vpminsw(dst, src1, src2, vlen_enc);
1301     } else if (elem_bt == T_INT) {
1302       vpminsd(dst, src1, src2, vlen_enc);
1303     } else {
1304       assert(elem_bt == T_LONG, "required");
1305       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1306         vpminsq(dst, src1, src2, vlen_enc);
1307       } else {
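        // No vpminsq without AVX-512(VL): dst = (src1 > src2) ? src2 : src1 via compare
        // and blend; the max case below mirrors this with the blend operands swapped.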
1308         assert_different_registers(dst, src1, src2);
1309         vpcmpgtq(dst, src1, src2, vlen_enc);
1310         vblendvpd(dst, src1, src2, dst, vlen_enc);
1311       }
1312     }
1313   } else { // opcode == Op_MaxV
1314     if (elem_bt == T_BYTE) {
1315       vpmaxsb(dst, src1, src2, vlen_enc);
1316     } else if (elem_bt == T_SHORT) {
1317       vpmaxsw(dst, src1, src2, vlen_enc);
1318     } else if (elem_bt == T_INT) {
1319       vpmaxsd(dst, src1, src2, vlen_enc);
1320     } else {
1321       assert(elem_bt == T_LONG, "required");
1322       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1323         vpmaxsq(dst, src1, src2, vlen_enc);
1324       } else {
1325         assert_different_registers(dst, src1, src2);
1326         vpcmpgtq(dst, src1, src2, vlen_enc);
1327         vblendvpd(dst, src2, src1, dst, vlen_enc);
1328       }
1329     }
1330   }
1331 }
1332 
1333 // Float/Double min max
1334 
1335 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
1336                                    XMMRegister dst, XMMRegister a, XMMRegister b,
1337                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1338                                    int vlen_enc) {
1339   assert(UseAVX > 0, "required");
1340   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1341          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1342   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1343   assert_different_registers(a, tmp, atmp, btmp);
1344   assert_different_registers(b, tmp, atmp, btmp);
1345 
1346   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1347   bool is_double_word = is_double_word_type(elem_bt);
1348 
  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *                (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   *  Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp, btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */
1367 
1368   void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
1369   void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
1370   void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
1371   XMMRegister mask;
1372 
1373   if (!is_double_word && is_min) {
1374     mask = a;
1375     vblend = &MacroAssembler::vblendvps;
1376     vmaxmin = &MacroAssembler::vminps;
1377     vcmp = &MacroAssembler::vcmpps;
1378   } else if (!is_double_word && !is_min) {
1379     mask = b;
1380     vblend = &MacroAssembler::vblendvps;
1381     vmaxmin = &MacroAssembler::vmaxps;
1382     vcmp = &MacroAssembler::vcmpps;
1383   } else if (is_double_word && is_min) {
1384     mask = a;
1385     vblend = &MacroAssembler::vblendvpd;
1386     vmaxmin = &MacroAssembler::vminpd;
1387     vcmp = &MacroAssembler::vcmppd;
1388   } else {
1389     assert(is_double_word && !is_min, "sanity");
1390     mask = b;
1391     vblend = &MacroAssembler::vblendvpd;
1392     vmaxmin = &MacroAssembler::vmaxpd;
1393     vcmp = &MacroAssembler::vcmppd;
1394   }
1395 
  // Pick the temporaries so that register overlaps (dst == btmp) do not defeat the EnableX86ECoreOpts path below
1397   XMMRegister maxmin, scratch;
1398   if (dst == btmp) {
1399     maxmin = btmp;
1400     scratch = tmp;
1401   } else {
1402     maxmin = tmp;
1403     scratch = btmp;
1404   }
1405 
  bool precompute_mask = EnableX86ECoreOpts && UseAVX > 1;
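  // When enabled, materialize the per-element sign mask of 'mask' once in tmp (arithmetic
  // shift by 32 for floats, 0 > x compare for doubles), presumably so the blends below can
  // consume it directly instead of re-deriving it from the sign bits each time.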
1407   if (precompute_mask && !is_double_word) {
1408     vpsrad(tmp, mask, 32, vlen_enc);
1409     mask = tmp;
1410   } else if (precompute_mask && is_double_word) {
1411     vpxor(tmp, tmp, tmp, vlen_enc);
1412     vpcmpgtq(tmp, tmp, mask, vlen_enc);
1413     mask = tmp;
1414   }
1415 
1416   (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
1417   (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
1418   (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
1419   (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1420   (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
1421 }
1422 
1423 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1424                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1425                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1426                                     int vlen_enc) {
1427   assert(UseAVX > 2, "required");
1428   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1429          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1430   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1431   assert_different_registers(dst, a, atmp, btmp);
1432   assert_different_registers(dst, b, atmp, btmp);
1433 
1434   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1435   bool is_double_word = is_double_word_type(elem_bt);
1436   bool merge = true;
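  // Same idea as vminmax_fp above, expressed with AVX-512 mask registers: evpmovd2m /
  // evpmovq2m captures the sign bits of a (min) or b (max) into ktmp, the blends order the
  // operands so vmin/vmax handles +/-0.0 the Java way, and the final UNORD_Q compare plus
  // merge-masked move forces the result to atmp wherever atmp is NaN.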
1437 
1438   if (!is_double_word && is_min) {
1439     evpmovd2m(ktmp, a, vlen_enc);
1440     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1441     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1442     vminps(dst, atmp, btmp, vlen_enc);
1443     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1444     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1445   } else if (!is_double_word && !is_min) {
1446     evpmovd2m(ktmp, b, vlen_enc);
1447     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1448     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1449     vmaxps(dst, atmp, btmp, vlen_enc);
1450     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1451     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1452   } else if (is_double_word && is_min) {
1453     evpmovq2m(ktmp, a, vlen_enc);
1454     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1455     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1456     vminpd(dst, atmp, btmp, vlen_enc);
1457     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1458     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1459   } else {
1460     assert(is_double_word && !is_min, "sanity");
1461     evpmovq2m(ktmp, b, vlen_enc);
1462     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1463     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1464     vmaxpd(dst, atmp, btmp, vlen_enc);
1465     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1466     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1467   }
1468 }
1469 
1470 // Float/Double signum
1471 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1472   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1473 
1474   Label DONE_LABEL;
1475 
1476   if (opcode == Op_SignumF) {
1477     assert(UseSSE > 0, "required");
1478     ucomiss(dst, zero);
1479     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1480     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1481     movflt(dst, one);
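    // Flags from the ucomiss above are still live: 'above' means dst > 0.0, so the 1.0 just
    // loaded is the result; otherwise flip the sign bit to produce -1.0. The double case
    // below works the same way.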
1482     jcc(Assembler::above, DONE_LABEL);
1483     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1484   } else if (opcode == Op_SignumD) {
1485     assert(UseSSE > 1, "required");
1486     ucomisd(dst, zero);
1487     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1488     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1489     movdbl(dst, one);
1490     jcc(Assembler::above, DONE_LABEL);
1491     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1492   }
1493 
1494   bind(DONE_LABEL);
1495 }
1496 
1497 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1498   if (sign) {
1499     pmovsxbw(dst, src);
1500   } else {
1501     pmovzxbw(dst, src);
1502   }
1503 }
1504 
1505 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1506   if (sign) {
1507     vpmovsxbw(dst, src, vector_len);
1508   } else {
1509     vpmovzxbw(dst, src, vector_len);
1510   }
1511 }
1512 
1513 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1514   if (sign) {
1515     vpmovsxbd(dst, src, vector_len);
1516   } else {
1517     vpmovzxbd(dst, src, vector_len);
1518   }
1519 }
1520 
1521 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1522   if (sign) {
1523     vpmovsxwd(dst, src, vector_len);
1524   } else {
1525     vpmovzxwd(dst, src, vector_len);
1526   }
1527 }
1528 
1529 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1530                                      int shift, int vector_len) {
1531   if (opcode == Op_RotateLeftV) {
1532     if (etype == T_INT) {
1533       evprold(dst, src, shift, vector_len);
1534     } else {
1535       assert(etype == T_LONG, "expected type T_LONG");
1536       evprolq(dst, src, shift, vector_len);
1537     }
1538   } else {
1539     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1540     if (etype == T_INT) {
1541       evprord(dst, src, shift, vector_len);
1542     } else {
1543       assert(etype == T_LONG, "expected type T_LONG");
1544       evprorq(dst, src, shift, vector_len);
1545     }
1546   }
1547 }
1548 
1549 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1550                                      XMMRegister shift, int vector_len) {
1551   if (opcode == Op_RotateLeftV) {
1552     if (etype == T_INT) {
1553       evprolvd(dst, src, shift, vector_len);
1554     } else {
1555       assert(etype == T_LONG, "expected type T_LONG");
1556       evprolvq(dst, src, shift, vector_len);
1557     }
1558   } else {
1559     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1560     if (etype == T_INT) {
1561       evprorvd(dst, src, shift, vector_len);
1562     } else {
1563       assert(etype == T_LONG, "expected type T_LONG");
1564       evprorvq(dst, src, shift, vector_len);
1565     }
1566   }
1567 }
1568 
1569 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1570   if (opcode == Op_RShiftVI) {
1571     psrad(dst, shift);
1572   } else if (opcode == Op_LShiftVI) {
1573     pslld(dst, shift);
1574   } else {
1575     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1576     psrld(dst, shift);
1577   }
1578 }
1579 
1580 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1581   switch (opcode) {
1582     case Op_RShiftVI:  psrad(dst, shift); break;
1583     case Op_LShiftVI:  pslld(dst, shift); break;
1584     case Op_URShiftVI: psrld(dst, shift); break;
1585 
1586     default: assert(false, "%s", NodeClassNames[opcode]);
1587   }
1588 }
1589 
1590 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1591   if (opcode == Op_RShiftVI) {
1592     vpsrad(dst, nds, shift, vector_len);
1593   } else if (opcode == Op_LShiftVI) {
1594     vpslld(dst, nds, shift, vector_len);
1595   } else {
1596     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1597     vpsrld(dst, nds, shift, vector_len);
1598   }
1599 }
1600 
1601 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1602   switch (opcode) {
1603     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1604     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1605     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1606 
1607     default: assert(false, "%s", NodeClassNames[opcode]);
1608   }
1609 }
1610 
1611 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1612   switch (opcode) {
1613     case Op_RShiftVB:  // fall-through
1614     case Op_RShiftVS:  psraw(dst, shift); break;
1615 
1616     case Op_LShiftVB:  // fall-through
1617     case Op_LShiftVS:  psllw(dst, shift);   break;
1618 
1619     case Op_URShiftVS: // fall-through
1620     case Op_URShiftVB: psrlw(dst, shift);  break;
1621 
1622     default: assert(false, "%s", NodeClassNames[opcode]);
1623   }
1624 }
1625 
1626 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1627   switch (opcode) {
1628     case Op_RShiftVB:  // fall-through
1629     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1630 
1631     case Op_LShiftVB:  // fall-through
1632     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1633 
1634     case Op_URShiftVS: // fall-through
1635     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1636 
1637     default: assert(false, "%s", NodeClassNames[opcode]);
1638   }
1639 }
1640 
1641 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1642   switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1644     case Op_LShiftVL:  psllq(dst, shift); break;
1645     case Op_URShiftVL: psrlq(dst, shift); break;
1646 
1647     default: assert(false, "%s", NodeClassNames[opcode]);
1648   }
1649 }
1650 
1651 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1652   if (opcode == Op_RShiftVL) {
    psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1654   } else if (opcode == Op_LShiftVL) {
1655     psllq(dst, shift);
1656   } else {
1657     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1658     psrlq(dst, shift);
1659   }
1660 }
1661 
1662 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1663   switch (opcode) {
1664     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1665     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1666     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1667 
1668     default: assert(false, "%s", NodeClassNames[opcode]);
1669   }
1670 }
1671 
1672 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1673   if (opcode == Op_RShiftVL) {
1674     evpsraq(dst, nds, shift, vector_len);
1675   } else if (opcode == Op_LShiftVL) {
1676     vpsllq(dst, nds, shift, vector_len);
1677   } else {
1678     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1679     vpsrlq(dst, nds, shift, vector_len);
1680   }
1681 }
1682 
1683 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1684   switch (opcode) {
1685     case Op_RShiftVB:  // fall-through
1686     case Op_RShiftVS:  // fall-through
1687     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1688 
1689     case Op_LShiftVB:  // fall-through
1690     case Op_LShiftVS:  // fall-through
1691     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1692 
1693     case Op_URShiftVB: // fall-through
1694     case Op_URShiftVS: // fall-through
1695     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1696 
1697     default: assert(false, "%s", NodeClassNames[opcode]);
1698   }
1699 }
1700 
1701 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1702   switch (opcode) {
1703     case Op_RShiftVB:  // fall-through
1704     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1705 
1706     case Op_LShiftVB:  // fall-through
1707     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1708 
1709     case Op_URShiftVB: // fall-through
1710     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1711 
1712     default: assert(false, "%s", NodeClassNames[opcode]);
1713   }
1714 }
1715 
1716 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1717   assert(UseAVX >= 2, "required");
1718   switch (opcode) {
1719     case Op_RShiftVL: {
1720       if (UseAVX > 2) {
1721         assert(tmp == xnoreg, "not used");
1722         if (!VM_Version::supports_avx512vl()) {
1723           vlen_enc = Assembler::AVX_512bit;
1724         }
1725         evpsravq(dst, src, shift, vlen_enc);
1726       } else {
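        // No vpsravq before AVX-512: emulate the arithmetic shift with logical shifts,
        // using sra(x, s) == ((x >>> s) ^ t) - t, where t is the sign-bit mask shifted
        // right by the same amount.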
1727         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1728         vpsrlvq(dst, src, shift, vlen_enc);
1729         vpsrlvq(tmp, tmp, shift, vlen_enc);
1730         vpxor(dst, dst, tmp, vlen_enc);
1731         vpsubq(dst, dst, tmp, vlen_enc);
1732       }
1733       break;
1734     }
1735     case Op_LShiftVL: {
1736       assert(tmp == xnoreg, "not used");
1737       vpsllvq(dst, src, shift, vlen_enc);
1738       break;
1739     }
1740     case Op_URShiftVL: {
1741       assert(tmp == xnoreg, "not used");
1742       vpsrlvq(dst, src, shift, vlen_enc);
1743       break;
1744     }
1745     default: assert(false, "%s", NodeClassNames[opcode]);
1746   }
1747 }
1748 
// Variable shift src by shift using vtmp as a TEMP, giving word result in dst
1750 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1751   assert(opcode == Op_LShiftVB ||
1752          opcode == Op_RShiftVB ||
1753          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1754   bool sign = (opcode != Op_URShiftVB);
1755   assert(vector_len == 0, "required");
1756   vextendbd(sign, dst, src, 1);
1757   vpmovzxbd(vtmp, shift, 1);
1758   varshiftd(opcode, dst, dst, vtmp, 1);
1759   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1760   vextracti128_high(vtmp, dst);
1761   vpackusdw(dst, dst, vtmp, 0);
1762 }
1763 
// Variable shift src by shift using vtmp as a TEMP, giving byte result in dst
1765 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1766   assert(opcode == Op_LShiftVB ||
1767          opcode == Op_RShiftVB ||
1768          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1769   bool sign = (opcode != Op_URShiftVB);
1770   int ext_vector_len = vector_len + 1;
1771   vextendbw(sign, dst, src, ext_vector_len);
1772   vpmovzxbw(vtmp, shift, ext_vector_len);
1773   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1774   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1775   if (vector_len == 0) {
1776     vextracti128_high(vtmp, dst);
1777     vpackuswb(dst, dst, vtmp, vector_len);
1778   } else {
1779     vextracti64x4_high(vtmp, dst);
1780     vpackuswb(dst, dst, vtmp, vector_len);
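    // vpackuswb packs within each 128-bit lane, so restore element order by
    // interleaving the lanes (qword order 0, 2, 1, 3).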
1781     vpermq(dst, dst, 0xD8, vector_len);
1782   }
1783 }
1784 
1785 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1786   switch(typ) {
1787     case T_BYTE:
1788       pinsrb(dst, val, idx);
1789       break;
1790     case T_SHORT:
1791       pinsrw(dst, val, idx);
1792       break;
1793     case T_INT:
1794       pinsrd(dst, val, idx);
1795       break;
1796     case T_LONG:
1797       pinsrq(dst, val, idx);
1798       break;
1799     default:
1800       assert(false,"Should not reach here.");
1801       break;
1802   }
1803 }
1804 
1805 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1806   switch(typ) {
1807     case T_BYTE:
1808       vpinsrb(dst, src, val, idx);
1809       break;
1810     case T_SHORT:
1811       vpinsrw(dst, src, val, idx);
1812       break;
1813     case T_INT:
1814       vpinsrd(dst, src, val, idx);
1815       break;
1816     case T_LONG:
1817       vpinsrq(dst, src, val, idx);
1818       break;
1819     default:
1820       assert(false,"Should not reach here.");
1821       break;
1822   }
1823 }
1824 
1825 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1826   switch(typ) {
1827     case T_INT:
1828       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1829       break;
1830     case T_FLOAT:
1831       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1832       break;
1833     case T_LONG:
1834       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1835       break;
1836     case T_DOUBLE:
1837       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1838       break;
1839     default:
1840       assert(false,"Should not reach here.");
1841       break;
1842   }
1843 }
1844 
1845 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1846   switch(typ) {
1847     case T_INT:
1848       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1849       break;
1850     case T_FLOAT:
1851       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1852       break;
1853     case T_LONG:
1854       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1855       break;
1856     case T_DOUBLE:
1857       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1858       break;
1859     default:
1860       assert(false,"Should not reach here.");
1861       break;
1862   }
1863 }
1864 
1865 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1866   switch(typ) {
1867     case T_INT:
1868       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1869       break;
1870     case T_FLOAT:
1871       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1872       break;
1873     case T_LONG:
1874       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1875       break;
1876     case T_DOUBLE:
1877       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1878       break;
1879     default:
1880       assert(false,"Should not reach here.");
1881       break;
1882   }
1883 }
1884 
1885 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
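  // Turn a boolean vector (expected to hold 0 or 1 in each byte) into a vector mask:
  // 0 - src gives 0x00 or 0xFF per byte, which is then sign-extended to the element width.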
1886   if (vlen_in_bytes <= 16) {
1887     pxor (dst, dst);
1888     psubb(dst, src);
1889     switch (elem_bt) {
1890       case T_BYTE:   /* nothing to do */ break;
1891       case T_SHORT:  pmovsxbw(dst, dst); break;
1892       case T_INT:    pmovsxbd(dst, dst); break;
1893       case T_FLOAT:  pmovsxbd(dst, dst); break;
1894       case T_LONG:   pmovsxbq(dst, dst); break;
1895       case T_DOUBLE: pmovsxbq(dst, dst); break;
1896 
1897       default: assert(false, "%s", type2name(elem_bt));
1898     }
1899   } else {
1900     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1901     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1902 
1903     vpxor (dst, dst, dst, vlen_enc);
1904     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1905 
1906     switch (elem_bt) {
1907       case T_BYTE:   /* nothing to do */            break;
1908       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1909       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1910       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1911       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1912       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1913 
1914       default: assert(false, "%s", type2name(elem_bt));
1915     }
1916   }
1917 }
1918 
1919 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1920   if (novlbwdq) {
1921     vpmovsxbd(xtmp, src, vlen_enc);
1922     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1923             Assembler::eq, true, vlen_enc, noreg);
1924   } else {
1925     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1926     vpsubb(xtmp, xtmp, src, vlen_enc);
1927     evpmovb2m(dst, xtmp, vlen_enc);
1928   }
1929 }
1930 
1931 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) {
1932   switch (vlen_in_bytes) {
1933     case 4:  movdl(dst, src);   break;
1934     case 8:  movq(dst, src);    break;
1935     case 16: movdqu(dst, src);  break;
1936     case 32: vmovdqu(dst, src); break;
1937     case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1938     default: ShouldNotReachHere();
1939   }
1940 }
1941 
1942 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1943   assert(rscratch != noreg || always_reachable(src), "missing");
1944 
1945   if (reachable(src)) {
1946     load_vector(dst, as_Address(src), vlen_in_bytes);
1947   } else {
1948     lea(rscratch, src);
1949     load_vector(dst, Address(rscratch, 0), vlen_in_bytes);
1950   }
1951 }
1952 
1953 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1954   int vlen_enc = vector_length_encoding(vlen);
1955   if (VM_Version::supports_avx()) {
1956     if (bt == T_LONG) {
1957       if (VM_Version::supports_avx2()) {
1958         vpbroadcastq(dst, src, vlen_enc);
1959       } else {
1960         vmovddup(dst, src, vlen_enc);
1961       }
1962     } else if (bt == T_DOUBLE) {
1963       if (vlen_enc != Assembler::AVX_128bit) {
1964         vbroadcastsd(dst, src, vlen_enc, noreg);
1965       } else {
1966         vmovddup(dst, src, vlen_enc);
1967       }
1968     } else {
1969       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1970         vpbroadcastd(dst, src, vlen_enc);
1971       } else {
1972         vbroadcastss(dst, src, vlen_enc);
1973       }
1974     }
1975   } else if (VM_Version::supports_sse3()) {
1976     movddup(dst, src);
1977   } else {
1978     movq(dst, src);
1979     if (vlen == 16) {
1980       punpcklqdq(dst, dst);
1981     }
1982   }
1983 }
1984 
1985 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1986   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
1987   int offset = exact_log2(type2aelembytes(bt)) << 6;
1988   if (is_floating_point_type(bt)) {
1989     offset += 128;
1990   }
1991   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1992   load_vector(dst, addr, vlen_in_bytes);
1993 }
1994 
1995 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1996 
1997 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1998   int vector_len = Assembler::AVX_128bit;
1999 
2000   switch (opcode) {
2001     case Op_AndReductionV:  pand(dst, src); break;
2002     case Op_OrReductionV:   por (dst, src); break;
2003     case Op_XorReductionV:  pxor(dst, src); break;
2004     case Op_MinReductionV:
2005       switch (typ) {
2006         case T_BYTE:        pminsb(dst, src); break;
2007         case T_SHORT:       pminsw(dst, src); break;
2008         case T_INT:         pminsd(dst, src); break;
2009         case T_LONG:        assert(UseAVX > 2, "required");
2010                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
2011         default:            assert(false, "wrong type");
2012       }
2013       break;
2014     case Op_MaxReductionV:
2015       switch (typ) {
2016         case T_BYTE:        pmaxsb(dst, src); break;
2017         case T_SHORT:       pmaxsw(dst, src); break;
2018         case T_INT:         pmaxsd(dst, src); break;
2019         case T_LONG:        assert(UseAVX > 2, "required");
2020                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
2021         default:            assert(false, "wrong type");
2022       }
2023       break;
2024     case Op_AddReductionVF: addss(dst, src); break;
2025     case Op_AddReductionVD: addsd(dst, src); break;
2026     case Op_AddReductionVI:
2027       switch (typ) {
2028         case T_BYTE:        paddb(dst, src); break;
2029         case T_SHORT:       paddw(dst, src); break;
2030         case T_INT:         paddd(dst, src); break;
2031         default:            assert(false, "wrong type");
2032       }
2033       break;
2034     case Op_AddReductionVL: paddq(dst, src); break;
2035     case Op_MulReductionVF: mulss(dst, src); break;
2036     case Op_MulReductionVD: mulsd(dst, src); break;
2037     case Op_MulReductionVI:
2038       switch (typ) {
2039         case T_SHORT:       pmullw(dst, src); break;
2040         case T_INT:         pmulld(dst, src); break;
2041         default:            assert(false, "wrong type");
2042       }
2043       break;
2044     case Op_MulReductionVL: assert(UseAVX > 2, "required");
2045                             evpmullq(dst, dst, src, vector_len); break;
2046     default:                assert(false, "wrong opcode");
2047   }
2048 }
2049 
2050 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
2051   int vector_len = Assembler::AVX_256bit;
2052 
2053   switch (opcode) {
2054     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
2055     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
2056     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
2057     case Op_MinReductionV:
2058       switch (typ) {
2059         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
2060         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
2061         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
2062         case T_LONG:        assert(UseAVX > 2, "required");
2063                             vpminsq(dst, src1, src2, vector_len); break;
2064         default:            assert(false, "wrong type");
2065       }
2066       break;
2067     case Op_MaxReductionV:
2068       switch (typ) {
2069         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
2070         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
2071         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
2072         case T_LONG:        assert(UseAVX > 2, "required");
2073                             vpmaxsq(dst, src1, src2, vector_len); break;
2074         default:            assert(false, "wrong type");
2075       }
2076       break;
2077     case Op_AddReductionVI:
2078       switch (typ) {
2079         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
2080         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
2081         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
2082         default:            assert(false, "wrong type");
2083       }
2084       break;
2085     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
2086     case Op_MulReductionVI:
2087       switch (typ) {
2088         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
2089         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
2090         default:            assert(false, "wrong type");
2091       }
2092       break;
2093     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
2094     default:                assert(false, "wrong opcode");
2095   }
2096 }
2097 
2098 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
2099                                   XMMRegister dst, XMMRegister src,
2100                                   XMMRegister vtmp1, XMMRegister vtmp2) {
2101   switch (opcode) {
2102     case Op_AddReductionVF:
2103     case Op_MulReductionVF:
2104       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
2105       break;
2106 
2107     case Op_AddReductionVD:
2108     case Op_MulReductionVD:
2109       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
2110       break;
2111 
2112     default: assert(false, "wrong opcode");
2113   }
2114 }
2115 
2116 void C2_MacroAssembler::reduceB(int opcode, int vlen,
2117                              Register dst, Register src1, XMMRegister src2,
2118                              XMMRegister vtmp1, XMMRegister vtmp2) {
2119   switch (vlen) {
2120     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2121     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2122     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2123     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2124 
2125     default: assert(false, "wrong vector length");
2126   }
2127 }
2128 
2129 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
2130                              Register dst, Register src1, XMMRegister src2,
2131                              XMMRegister vtmp1, XMMRegister vtmp2) {
2132   switch (vlen) {
2133     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2134     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2135     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2136     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2137 
2138     default: assert(false, "wrong vector length");
2139   }
2140 }
2141 
2142 void C2_MacroAssembler::reduceS(int opcode, int vlen,
2143                              Register dst, Register src1, XMMRegister src2,
2144                              XMMRegister vtmp1, XMMRegister vtmp2) {
2145   switch (vlen) {
2146     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2147     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2148     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2149     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2150 
2151     default: assert(false, "wrong vector length");
2152   }
2153 }
2154 
2155 void C2_MacroAssembler::reduceI(int opcode, int vlen,
2156                              Register dst, Register src1, XMMRegister src2,
2157                              XMMRegister vtmp1, XMMRegister vtmp2) {
2158   switch (vlen) {
2159     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2160     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2161     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2162     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2163 
2164     default: assert(false, "wrong vector length");
2165   }
2166 }
2167 
2168 #ifdef _LP64
2169 void C2_MacroAssembler::reduceL(int opcode, int vlen,
2170                              Register dst, Register src1, XMMRegister src2,
2171                              XMMRegister vtmp1, XMMRegister vtmp2) {
2172   switch (vlen) {
2173     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2174     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2175     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2176 
2177     default: assert(false, "wrong vector length");
2178   }
2179 }
2180 #endif // _LP64
2181 
2182 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2183   switch (vlen) {
2184     case 2:
2185       assert(vtmp2 == xnoreg, "");
2186       reduce2F(opcode, dst, src, vtmp1);
2187       break;
2188     case 4:
2189       assert(vtmp2 == xnoreg, "");
2190       reduce4F(opcode, dst, src, vtmp1);
2191       break;
2192     case 8:
2193       reduce8F(opcode, dst, src, vtmp1, vtmp2);
2194       break;
2195     case 16:
2196       reduce16F(opcode, dst, src, vtmp1, vtmp2);
2197       break;
2198     default: assert(false, "wrong vector length");
2199   }
2200 }
2201 
2202 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2203   switch (vlen) {
2204     case 2:
2205       assert(vtmp2 == xnoreg, "");
2206       reduce2D(opcode, dst, src, vtmp1);
2207       break;
2208     case 4:
2209       reduce4D(opcode, dst, src, vtmp1, vtmp2);
2210       break;
2211     case 8:
2212       reduce8D(opcode, dst, src, vtmp1, vtmp2);
2213       break;
2214     default: assert(false, "wrong vector length");
2215   }
2216 }
2217 
2218 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2219   if (opcode == Op_AddReductionVI) {
2220     if (vtmp1 != src2) {
2221       movdqu(vtmp1, src2);
2222     }
2223     phaddd(vtmp1, vtmp1);
2224   } else {
2225     pshufd(vtmp1, src2, 0x1);
2226     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2227   }
2228   movdl(vtmp2, src1);
2229   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2230   movdl(dst, vtmp1);
2231 }
2232 
2233 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2234   if (opcode == Op_AddReductionVI) {
2235     if (vtmp1 != src2) {
2236       movdqu(vtmp1, src2);
2237     }
2238     phaddd(vtmp1, src2);
2239     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2240   } else {
2241     pshufd(vtmp2, src2, 0xE);
2242     reduce_operation_128(T_INT, opcode, vtmp2, src2);
2243     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2244   }
2245 }
2246 
2247 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2248   if (opcode == Op_AddReductionVI) {
2249     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
2250     vextracti128_high(vtmp2, vtmp1);
2251     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2252     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2253   } else {
2254     vextracti128_high(vtmp1, src2);
2255     reduce_operation_128(T_INT, opcode, vtmp1, src2);
2256     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2257   }
2258 }
2259 
2260 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2261   vextracti64x4_high(vtmp2, src2);
2262   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2263   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2264 }
2265 
2266 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2267   pshufd(vtmp2, src2, 0x1);
2268   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2269   movdqu(vtmp1, vtmp2);
2270   psrldq(vtmp1, 2);
2271   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2272   movdqu(vtmp2, vtmp1);
2273   psrldq(vtmp2, 1);
2274   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2275   movdl(vtmp2, src1);
2276   pmovsxbd(vtmp1, vtmp1);
2277   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2278   pextrb(dst, vtmp1, 0x0);
2279   movsbl(dst, dst);
2280 }
2281 
2282 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2283   pshufd(vtmp1, src2, 0xE);
2284   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2285   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2286 }
2287 
2288 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2289   vextracti128_high(vtmp2, src2);
2290   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2291   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2292 }
2293 
2294 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2295   vextracti64x4_high(vtmp1, src2);
2296   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2297   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2298 }
2299 
2300 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2301   pmovsxbw(vtmp2, src2);
2302   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2303 }
2304 
2305 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2306   if (UseAVX > 1) {
2307     int vector_len = Assembler::AVX_256bit;
2308     vpmovsxbw(vtmp1, src2, vector_len);
2309     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2310   } else {
2311     pmovsxbw(vtmp2, src2);
2312     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
    pshufd(vtmp2, src2, 0xE); // bring the upper 8 bytes of src2 down before widening
    pmovsxbw(vtmp2, vtmp2);
2315     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2316   }
2317 }
2318 
2319 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2320   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2321     int vector_len = Assembler::AVX_512bit;
2322     vpmovsxbw(vtmp1, src2, vector_len);
2323     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2324   } else {
2325     assert(UseAVX >= 2,"Should not reach here.");
2326     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2327     vextracti128_high(vtmp2, src2);
2328     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2329   }
2330 }
2331 
2332 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2333   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2334   vextracti64x4_high(vtmp2, src2);
2335   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2336 }
2337 
2338 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2339   if (opcode == Op_AddReductionVI) {
2340     if (vtmp1 != src2) {
2341       movdqu(vtmp1, src2);
2342     }
2343     phaddw(vtmp1, vtmp1);
2344     phaddw(vtmp1, vtmp1);
2345   } else {
2346     pshufd(vtmp2, src2, 0x1);
2347     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2348     movdqu(vtmp1, vtmp2);
2349     psrldq(vtmp1, 2);
2350     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2351   }
2352   movdl(vtmp2, src1);
2353   pmovsxwd(vtmp1, vtmp1);
2354   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2355   pextrw(dst, vtmp1, 0x0);
2356   movswl(dst, dst);
2357 }
2358 
2359 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2360   if (opcode == Op_AddReductionVI) {
2361     if (vtmp1 != src2) {
2362       movdqu(vtmp1, src2);
2363     }
2364     phaddw(vtmp1, src2);
2365   } else {
2366     pshufd(vtmp1, src2, 0xE);
2367     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2368   }
2369   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2370 }
2371 
2372 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2373   if (opcode == Op_AddReductionVI) {
2374     int vector_len = Assembler::AVX_256bit;
2375     vphaddw(vtmp2, src2, src2, vector_len);
2376     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2377   } else {
2378     vextracti128_high(vtmp2, src2);
2379     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2380   }
2381   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2382 }
2383 
2384 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2385   int vector_len = Assembler::AVX_256bit;
2386   vextracti64x4_high(vtmp1, src2);
2387   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2388   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2389 }
2390 
2391 #ifdef _LP64
2392 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2393   pshufd(vtmp2, src2, 0xE);
2394   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2395   movdq(vtmp1, src1);
2396   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2397   movdq(dst, vtmp1);
2398 }
2399 
2400 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2401   vextracti128_high(vtmp1, src2);
2402   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2403   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2404 }
2405 
2406 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2407   vextracti64x4_high(vtmp2, src2);
2408   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2409   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2410 }
2411 
2412 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
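  // Build a k-register mask with the low 'len' bits set: bzhiq zeroes every bit of -1
  // from position 'len' upward.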
2413   mov64(temp, -1L);
2414   bzhiq(temp, temp, len);
2415   kmovql(dst, temp);
2416 }
2417 #endif // _LP64
2418 
2419 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2420   reduce_operation_128(T_FLOAT, opcode, dst, src);
2421   pshufd(vtmp, src, 0x1);
2422   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2423 }
2424 
2425 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2426   reduce2F(opcode, dst, src, vtmp);
2427   pshufd(vtmp, src, 0x2);
2428   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2429   pshufd(vtmp, src, 0x3);
2430   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2431 }
2432 
2433 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2434   reduce4F(opcode, dst, src, vtmp2);
2435   vextractf128_high(vtmp2, src);
2436   reduce4F(opcode, dst, vtmp2, vtmp1);
2437 }
2438 
2439 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2440   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2441   vextracti64x4_high(vtmp1, src);
2442   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2443 }
2444 
2445 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2446   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2447   pshufd(vtmp, src, 0xE);
2448   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2449 }
2450 
2451 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2452   reduce2D(opcode, dst, src, vtmp2);
2453   vextractf128_high(vtmp2, src);
2454   reduce2D(opcode, dst, vtmp2, vtmp1);
2455 }
2456 
2457 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2458   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2459   vextracti64x4_high(vtmp1, src);
2460   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2461 }
2462 
2463 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2464   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2465 }
2466 
2467 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2468   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2469 }
2470 
2471 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2472                                  int vec_enc) {
2473   switch(elem_bt) {
2474     case T_INT:
2475     case T_FLOAT:
2476       vmaskmovps(dst, src, mask, vec_enc);
2477       break;
2478     case T_LONG:
2479     case T_DOUBLE:
2480       vmaskmovpd(dst, src, mask, vec_enc);
2481       break;
2482     default:
2483       fatal("Unsupported type %s", type2name(elem_bt));
2484       break;
2485   }
2486 }
2487 
2488 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2489                                  int vec_enc) {
2490   switch(elem_bt) {
2491     case T_INT:
2492     case T_FLOAT:
2493       vmaskmovps(dst, src, mask, vec_enc);
2494       break;
2495     case T_LONG:
2496     case T_DOUBLE:
2497       vmaskmovpd(dst, src, mask, vec_enc);
2498       break;
2499     default:
2500       fatal("Unsupported type %s", type2name(elem_bt));
2501       break;
2502   }
2503 }
2504 
2505 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2506                                           XMMRegister dst, XMMRegister src,
2507                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2508                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2509   const int permconst[] = {1, 14};
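  // In-lane folding constants for vpermilps: 14 (0b1110) brings the upper 64 bits of each
  // 128-bit lane down, 1 brings element 1 down onto element 0.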
2510   XMMRegister wsrc = src;
2511   XMMRegister wdst = xmm_0;
2512   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2513 
2514   int vlen_enc = Assembler::AVX_128bit;
2515   if (vlen == 16) {
2516     vlen_enc = Assembler::AVX_256bit;
2517   }
2518 
2519   for (int i = log2(vlen) - 1; i >=0; i--) {
2520     if (i == 0 && !is_dst_valid) {
2521       wdst = dst;
2522     }
2523     if (i == 3) {
2524       vextracti64x4_high(wtmp, wsrc);
2525     } else if (i == 2) {
2526       vextracti128_high(wtmp, wsrc);
2527     } else { // i = [0,1]
2528       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2529     }
2530     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2531     wsrc = wdst;
2532     vlen_enc = Assembler::AVX_128bit;
2533   }
2534   if (is_dst_valid) {
2535     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2536   }
2537 }
2538 
2539 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2540                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2541                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2542   XMMRegister wsrc = src;
2543   XMMRegister wdst = xmm_0;
2544   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2545   int vlen_enc = Assembler::AVX_128bit;
2546   if (vlen == 8) {
2547     vlen_enc = Assembler::AVX_256bit;
2548   }
2549   for (int i = log2(vlen) - 1; i >=0; i--) {
2550     if (i == 0 && !is_dst_valid) {
2551       wdst = dst;
2552     }
2553     if (i == 1) {
2554       vextracti128_high(wtmp, wsrc);
2555     } else if (i == 2) {
2556       vextracti64x4_high(wtmp, wsrc);
2557     } else {
2558       assert(i == 0, "%d", i);
2559       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2560     }
2561     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2562     wsrc = wdst;
2563     vlen_enc = Assembler::AVX_128bit;
2564   }
2565   if (is_dst_valid) {
2566     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2567   }
2568 }
2569 
2570 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2571   switch (bt) {
2572     case T_BYTE:  pextrb(dst, src, idx); break;
2573     case T_SHORT: pextrw(dst, src, idx); break;
2574     case T_INT:   pextrd(dst, src, idx); break;
2575     case T_LONG:  pextrq(dst, src, idx); break;
2576 
2577     default:
2578       assert(false,"Should not reach here.");
2579       break;
2580   }
2581 }
2582 
2583 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2584   int esize =  type2aelembytes(typ);
2585   int elem_per_lane = 16/esize;
2586   int lane = elemindex / elem_per_lane;
2587   int eindex = elemindex % elem_per_lane;
2588 
2589   if (lane >= 2) {
2590     assert(UseAVX > 2, "required");
2591     vextractf32x4(dst, src, lane & 3);
2592     return dst;
2593   } else if (lane > 0) {
2594     assert(UseAVX > 0, "required");
2595     vextractf128(dst, src, lane);
2596     return dst;
2597   } else {
2598     return src;
2599   }
2600 }
2601 
2602 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2603   if (typ == T_BYTE) {
2604     movsbl(dst, dst);
2605   } else if (typ == T_SHORT) {
2606     movswl(dst, dst);
2607   }
2608 }
2609 
2610 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2611   int esize =  type2aelembytes(typ);
2612   int elem_per_lane = 16/esize;
2613   int eindex = elemindex % elem_per_lane;
2614   assert(is_integral_type(typ),"required");
2615 
2616   if (eindex == 0) {
2617     if (typ == T_LONG) {
2618       movq(dst, src);
2619     } else {
2620       movdl(dst, src);
2621       movsxl(typ, dst);
2622     }
2623   } else {
2624     extract(typ, dst, src, eindex);
2625     movsxl(typ, dst);
2626   }
2627 }
2628 
2629 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2630   int esize =  type2aelembytes(typ);
2631   int elem_per_lane = 16/esize;
2632   int eindex = elemindex % elem_per_lane;
2633   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2634 
2635   if (eindex == 0) {
2636     movq(dst, src);
2637   } else {
2638     if (typ == T_FLOAT) {
2639       if (UseAVX == 0) {
2640         movdqu(dst, src);
2641         shufps(dst, dst, eindex);
2642       } else {
2643         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2644       }
2645     } else {
2646       if (UseAVX == 0) {
2647         movdqu(dst, src);
2648         psrldq(dst, eindex*esize);
2649       } else {
2650         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2651       }
2652       movq(dst, dst);
2653     }
2654   }
2655   // Zero upper bits
2656   if (typ == T_FLOAT) {
2657     if (UseAVX == 0) {
2658       assert(vtmp != xnoreg, "required.");
2659       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2660       pand(dst, vtmp);
2661     } else {
2662       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2663     }
2664   }
2665 }
2666 
2667 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2668   switch(typ) {
2669     case T_BYTE:
2670     case T_BOOLEAN:
2671       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2672       break;
2673     case T_SHORT:
2674     case T_CHAR:
2675       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2676       break;
2677     case T_INT:
2678     case T_FLOAT:
2679       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2680       break;
2681     case T_LONG:
2682     case T_DOUBLE:
2683       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2684       break;
2685     default:
2686       assert(false,"Should not reach here.");
2687       break;
2688   }
2689 }
2690 
2691 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2692   assert(rscratch != noreg || always_reachable(src2), "missing");
2693 
2694   switch(typ) {
2695     case T_BOOLEAN:
2696     case T_BYTE:
2697       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2698       break;
2699     case T_CHAR:
2700     case T_SHORT:
2701       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2702       break;
2703     case T_INT:
2704     case T_FLOAT:
2705       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2706       break;
2707     case T_LONG:
2708     case T_DOUBLE:
2709       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2710       break;
2711     default:
2712       assert(false,"Should not reach here.");
2713       break;
2714   }
2715 }
2716 
2717 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2718   switch(typ) {
2719     case T_BYTE:
2720       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2721       break;
2722     case T_SHORT:
2723       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2724       break;
2725     case T_INT:
2726     case T_FLOAT:
2727       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2728       break;
2729     case T_LONG:
2730     case T_DOUBLE:
2731       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2732       break;
2733     default:
2734       assert(false,"Should not reach here.");
2735       break;
2736   }
2737 }
2738 
2739 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2740   assert(vlen_in_bytes <= 32, "");
2741   int esize = type2aelembytes(bt);
2742   if (vlen_in_bytes == 32) {
2743     assert(vtmp == xnoreg, "required.");
2744     if (esize >= 4) {
2745       vtestps(src1, src2, AVX_256bit);
2746     } else {
2747       vptest(src1, src2, AVX_256bit);
2748     }
2749     return;
2750   }
2751   if (vlen_in_bytes < 16) {
2752     // Duplicate the lower part to fill the whole register;
2753     // there is no need to do so for src2.
2754     assert(vtmp != xnoreg, "required");
2755     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2756     pshufd(vtmp, src1, shuffle_imm);
2757   } else {
2758     assert(vtmp == xnoreg, "required");
2759     vtmp = src1;
2760   }
2761   if (esize >= 4 && VM_Version::supports_avx()) {
2762     vtestps(vtmp, src2, AVX_128bit);
2763   } else {
2764     ptest(vtmp, src2);
2765   }
2766 }
2767 
2768 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2769   assert(UseAVX >= 2, "required");
2770 #ifdef ASSERT
2771   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2772   bool is_bw_supported = VM_Version::supports_avx512bw();
2773   if (is_bw && !is_bw_supported) {
2774     assert(vlen_enc != Assembler::AVX_512bit, "required");
2775     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2776            "XMM register should be 0-15");
2777   }
2778 #endif // ASSERT
2779   switch (elem_bt) {
2780     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2781     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2782     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2783     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2784     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2785     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2786     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2787   }
2788 }
2789 
2790 #ifdef _LP64
2791 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2792   assert(UseAVX >= 2, "required");
2793   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2794   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2795   if ((UseAVX > 2) &&
2796       (!is_bw || VM_Version::supports_avx512bw()) &&
2797       (!is_vl || VM_Version::supports_avx512vl())) {
2798     switch (elem_bt) {
2799       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2800       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2801       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2802       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2803       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2804     }
2805   } else {
2806     assert(vlen_enc != Assembler::AVX_512bit, "required");
2807     assert((dst->encoding() < 16),"XMM register should be 0-15");
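    // This fallback bounces through an XMM register using VEX-encoded
    // broadcasts, which can only address XMM0-15 (hence the assert above).
    // It is taken when the EVEX broadcast-from-GPR forms are unavailable:
    // UseAVX <= 2, or AVX512BW/AVX512VL missing for the requested element
    // type or vector length.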
2808     switch (elem_bt) {
2809       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2810       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2811       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2812       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2813       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2814       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2815       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2816     }
2817   }
2818 }
2819 #endif
2820 
2821 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2822   switch (to_elem_bt) {
2823     case T_SHORT:
2824       vpmovsxbw(dst, src, vlen_enc);
2825       break;
2826     case T_INT:
2827       vpmovsxbd(dst, src, vlen_enc);
2828       break;
2829     case T_FLOAT:
2830       vpmovsxbd(dst, src, vlen_enc);
2831       vcvtdq2ps(dst, dst, vlen_enc);
2832       break;
2833     case T_LONG:
2834       vpmovsxbq(dst, src, vlen_enc);
2835       break;
2836     case T_DOUBLE: {
2837       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
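      // Sign-extend bytes to ints at half the final vector width: vcvtdq2pd
      // doubles the element size, so a half-width int vector yields a
      // full-width double vector.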
2838       vpmovsxbd(dst, src, mid_vlen_enc);
2839       vcvtdq2pd(dst, dst, vlen_enc);
2840       break;
2841     }
2842     default:
2843       fatal("Unsupported type %s", type2name(to_elem_bt));
2844       break;
2845   }
2846 }
2847 
2848 //-------------------------------------------------------------------------------------------
2849 
2850 // IndexOf for constant substrings with size >= 8 chars
2851 // which don't need to be loaded through the stack.
2852 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2853                                          Register cnt1, Register cnt2,
2854                                          int int_cnt2,  Register result,
2855                                          XMMRegister vec, Register tmp,
2856                                          int ae) {
2857   ShortBranchVerifier sbv(this);
2858   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2859   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2860 
2861   // This method uses the pcmpestri instruction with bound registers
2862   //   inputs:
2863   //     xmm - substring
2864   //     rax - substring length (elements count)
2865   //     mem - scanned string
2866   //     rdx - string length (elements count)
2867   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2868   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2869   //   outputs:
2870   //     rcx - matched index in string
2871   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2872   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2873   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2874   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2875   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2876 
2877   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2878         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2879         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2880 
2881   // Note, inline_string_indexOf() generates checks:
2882   // if (substr.count > string.count) return -1;
2883   // if (substr.count == 0) return 0;
2884   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2885 
2886   // Load substring.
2887   if (ae == StrIntrinsicNode::UL) {
2888     pmovzxbw(vec, Address(str2, 0));
2889   } else {
2890     movdqu(vec, Address(str2, 0));
2891   }
2892   movl(cnt2, int_cnt2);
2893   movptr(result, str1); // string addr
2894 
2895   if (int_cnt2 > stride) {
2896     jmpb(SCAN_TO_SUBSTR);
2897 
2898     // Reload substr for rescan; this code
2899     // is executed only for large substrings (> 8 chars).
2900     bind(RELOAD_SUBSTR);
2901     if (ae == StrIntrinsicNode::UL) {
2902       pmovzxbw(vec, Address(str2, 0));
2903     } else {
2904       movdqu(vec, Address(str2, 0));
2905     }
2906     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2907 
2908     bind(RELOAD_STR);
2909     // We came here after the beginning of the substring was
2910     // matched but the rest of it was not, so we need to search
2911     // again. Start from the next element after the previous match.
2912 
2913     // cnt2 is the number of remaining substring elements and
2914     // cnt1 is the number of remaining string elements when cmp failed.
2915     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2916     subl(cnt1, cnt2);
2917     addl(cnt1, int_cnt2);
2918     movl(cnt2, int_cnt2); // Now restore cnt2
2919 
2920     decrementl(cnt1);     // Shift to next element
2921     cmpl(cnt1, cnt2);
2922     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2923 
2924     addptr(result, (1<<scale1));
2925 
2926   } // (int_cnt2 > 8)
2927 
2928   // Scan string for start of substr in 16-byte vectors
2929   bind(SCAN_TO_SUBSTR);
2930   pcmpestri(vec, Address(result, 0), mode);
2931   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2932   subl(cnt1, stride);
2933   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2934   cmpl(cnt1, cnt2);
2935   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2936   addptr(result, 16);
2937   jmpb(SCAN_TO_SUBSTR);
2938 
2939   // Found a potential substr
2940   bind(FOUND_CANDIDATE);
2941   // Matched whole vector if first element matched (tmp(rcx) == 0).
2942   if (int_cnt2 == stride) {
2943     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2944   } else { // int_cnt2 > 8
2945     jccb(Assembler::overflow, FOUND_SUBSTR);
2946   }
2947   // After pcmpestri tmp(rcx) contains matched element index
2948   // Compute start addr of substr
2949   lea(result, Address(result, tmp, scale1));
2950 
2951   // Make sure string is still long enough
2952   subl(cnt1, tmp);
2953   cmpl(cnt1, cnt2);
2954   if (int_cnt2 == stride) {
2955     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2956   } else { // int_cnt2 > 8
2957     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2958   }
2959   // Left less than substring.
2960 
2961   bind(RET_NOT_FOUND);
2962   movl(result, -1);
2963   jmp(EXIT);
2964 
2965   if (int_cnt2 > stride) {
2966     // This code is optimized for the case when the whole substring
2967     // is matched if its head is matched.
2968     bind(MATCH_SUBSTR_HEAD);
2969     pcmpestri(vec, Address(result, 0), mode);
2970     // Reload only string if does not match
2971     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2972 
2973     Label CONT_SCAN_SUBSTR;
2974     // Compare the rest of substring (> 8 chars).
2975     bind(FOUND_SUBSTR);
2976     // First 8 chars are already matched.
2977     negptr(cnt2);
2978     addptr(cnt2, stride);
2979 
2980     bind(SCAN_SUBSTR);
2981     subl(cnt1, stride);
2982     cmpl(cnt2, -stride); // Do not read beyond substring
2983     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2984     // Back-up strings to avoid reading beyond substring:
2985     // cnt1 = cnt1 - cnt2 + 8
2986     addl(cnt1, cnt2); // cnt2 is negative
2987     addl(cnt1, stride);
2988     movl(cnt2, stride); negptr(cnt2);
2989     bind(CONT_SCAN_SUBSTR);
2990     if (int_cnt2 < (int)G) {
2991       int tail_off1 = int_cnt2<<scale1;
2992       int tail_off2 = int_cnt2<<scale2;
2993       if (ae == StrIntrinsicNode::UL) {
2994         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2995       } else {
2996         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2997       }
2998       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2999     } else {
3000       // calculate index in register to avoid integer overflow (int_cnt2*2)
3001       movl(tmp, int_cnt2);
3002       addptr(tmp, cnt2);
3003       if (ae == StrIntrinsicNode::UL) {
3004         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
3005       } else {
3006         movdqu(vec, Address(str2, tmp, scale2, 0));
3007       }
3008       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
3009     }
3010     // Need to reload string pointers if the whole vector was not matched
3011     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3012     addptr(cnt2, stride);
3013     jcc(Assembler::negative, SCAN_SUBSTR);
3014     // Fall through if found full substring
3015 
3016   } // (int_cnt2 > 8)
3017 
3018   bind(RET_FOUND);
3019   // Found result if we matched full small substring.
3020   // Compute substr offset
3021   subptr(result, str1);
3022   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3023     shrl(result, 1); // index
3024   }
3025   bind(EXIT);
3026 
3027 } // string_indexofC8
3028 
3029 // Small strings are loaded through the stack if they cross a page boundary.
3030 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
3031                                        Register cnt1, Register cnt2,
3032                                        int int_cnt2,  Register result,
3033                                        XMMRegister vec, Register tmp,
3034                                        int ae) {
3035   ShortBranchVerifier sbv(this);
3036   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3037   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
3038 
3039   //
3040   // int_cnt2 is the length of a small (< 8 chars) constant substring,
3041   // or (-1) for a non-constant substring, in which case its length
3042   // is in the cnt2 register.
3043   //
3044   // Note, inline_string_indexOf() generates checks:
3045   // if (substr.count > string.count) return -1;
3046   // if (substr.count == 0) return 0;
3047   //
3048   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
3049   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
3050   // This method uses the pcmpestri instruction with bound registers
3051   //   inputs:
3052   //     xmm - substring
3053   //     rax - substring length (elements count)
3054   //     mem - scanned string
3055   //     rdx - string length (elements count)
3056   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
3057   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
3058   //   outputs:
3059   //     rcx - matched index in string
3060   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3061   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
3062   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
3063   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
3064 
3065   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
3066         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
3067         FOUND_CANDIDATE;
3068 
3069   { //========================================================
3070     // We don't know where these strings are located
3071     // and we can't read beyond them. Load them through the stack.
3072     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
3073 
3074     movptr(tmp, rsp); // save old SP
3075 
3076     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
3077       if (int_cnt2 == (1>>scale2)) { // One byte
3078         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
3079         load_unsigned_byte(result, Address(str2, 0));
3080         movdl(vec, result); // move 32 bits
3081       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
3082         // Not enough header space in 32-bit VM: 12+3 = 15.
3083         movl(result, Address(str2, -1));
3084         shrl(result, 8);
3085         movdl(vec, result); // move 32 bits
3086       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
3087         load_unsigned_short(result, Address(str2, 0));
3088         movdl(vec, result); // move 32 bits
3089       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
3090         movdl(vec, Address(str2, 0)); // move 32 bits
3091       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
3092         movq(vec, Address(str2, 0));  // move 64 bits
3093       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
3094         // Array header size is 12 bytes in 32-bit VM
3095         // + 6 bytes for 3 chars == 18 bytes,
3096         // enough space to load vec and shift.
3097         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
3098         if (ae == StrIntrinsicNode::UL) {
3099           int tail_off = int_cnt2-8;
3100           pmovzxbw(vec, Address(str2, tail_off));
3101           psrldq(vec, -2*tail_off);
3102         }
3103         else {
3104           int tail_off = int_cnt2*(1<<scale2);
3105           movdqu(vec, Address(str2, tail_off-16));
3106           psrldq(vec, 16-tail_off);
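          // Worked example (illustrative, UU encoding, int_cnt2 == 3):
          // tail_off = 6, so we load 16 bytes starting at str2 - 10 (still
          // inside the array object, see the header-size comment above) and
          // shift right by 10 bytes, leaving the three chars in bytes 0..5
          // of vec with the rest zeroed.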
3107         }
3108       }
3109     } else { // not constant substring
3110       cmpl(cnt2, stride);
3111       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
3112 
3113       // We can read beyond the string if str+16 does not cross a page boundary
3114       // since heaps are aligned and mapped by pages.
3115       assert(os::vm_page_size() < (int)G, "default page should be small");
3116       movl(result, str2); // We need only low 32 bits
3117       andl(result, ((int)os::vm_page_size()-1));
3118       cmpl(result, ((int)os::vm_page_size()-16));
3119       jccb(Assembler::belowEqual, CHECK_STR);
3120 
3121       // Move small strings to the stack to allow loading 16 bytes into vec.
3122       subptr(rsp, 16);
3123       int stk_offset = wordSize-(1<<scale2);
3124       push(cnt2);
3125 
3126       bind(COPY_SUBSTR);
3127       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
3128         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
3129         movb(Address(rsp, cnt2, scale2, stk_offset), result);
3130       } else if (ae == StrIntrinsicNode::UU) {
3131         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
3132         movw(Address(rsp, cnt2, scale2, stk_offset), result);
3133       }
3134       decrement(cnt2);
3135       jccb(Assembler::notZero, COPY_SUBSTR);
3136 
3137       pop(cnt2);
3138       movptr(str2, rsp);  // New substring address
3139     } // non constant
3140 
3141     bind(CHECK_STR);
3142     cmpl(cnt1, stride);
3143     jccb(Assembler::aboveEqual, BIG_STRINGS);
3144 
3145     // Check cross page boundary.
3146     movl(result, str1); // We need only low 32 bits
3147     andl(result, ((int)os::vm_page_size()-1));
3148     cmpl(result, ((int)os::vm_page_size()-16));
3149     jccb(Assembler::belowEqual, BIG_STRINGS);
3150 
3151     subptr(rsp, 16);
3152     int stk_offset = -(1<<scale1);
3153     if (int_cnt2 < 0) { // not constant
3154       push(cnt2);
3155       stk_offset += wordSize;
3156     }
3157     movl(cnt2, cnt1);
3158 
3159     bind(COPY_STR);
3160     if (ae == StrIntrinsicNode::LL) {
3161       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
3162       movb(Address(rsp, cnt2, scale1, stk_offset), result);
3163     } else {
3164       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
3165       movw(Address(rsp, cnt2, scale1, stk_offset), result);
3166     }
3167     decrement(cnt2);
3168     jccb(Assembler::notZero, COPY_STR);
3169 
3170     if (int_cnt2 < 0) { // not constant
3171       pop(cnt2);
3172     }
3173     movptr(str1, rsp);  // New string address
3174 
3175     bind(BIG_STRINGS);
3176     // Load substring.
3177     if (int_cnt2 < 0) { // -1
3178       if (ae == StrIntrinsicNode::UL) {
3179         pmovzxbw(vec, Address(str2, 0));
3180       } else {
3181         movdqu(vec, Address(str2, 0));
3182       }
3183       push(cnt2);       // substr count
3184       push(str2);       // substr addr
3185       push(str1);       // string addr
3186     } else {
3187       // Small (< 8 chars) constant substrings are loaded already.
3188       movl(cnt2, int_cnt2);
3189     }
3190     push(tmp);  // original SP
3191 
3192   } // Finished loading
3193 
3194   //========================================================
3195   // Start search
3196   //
3197 
3198   movptr(result, str1); // string addr
3199 
3200   if (int_cnt2  < 0) {  // Only for non constant substring
3201     jmpb(SCAN_TO_SUBSTR);
3202 
3203     // SP saved at sp+0
3204     // String saved at sp+1*wordSize
3205     // Substr saved at sp+2*wordSize
3206     // Substr count saved at sp+3*wordSize
3207 
3208     // Reload substr for rescan; this code
3209     // is executed only for large substrings (> 8 chars).
3210     bind(RELOAD_SUBSTR);
3211     movptr(str2, Address(rsp, 2*wordSize));
3212     movl(cnt2, Address(rsp, 3*wordSize));
3213     if (ae == StrIntrinsicNode::UL) {
3214       pmovzxbw(vec, Address(str2, 0));
3215     } else {
3216       movdqu(vec, Address(str2, 0));
3217     }
3218     // We came here after the beginning of the substring was
3219     // matched but the rest of it was not, so we need to search
3220     // again. Start from the next element after the previous match.
3221     subptr(str1, result); // Restore counter
3222     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3223       shrl(str1, 1);
3224     }
3225     addl(cnt1, str1);
3226     decrementl(cnt1);   // Shift to next element
3227     cmpl(cnt1, cnt2);
3228     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3229 
3230     addptr(result, (1<<scale1));
3231   } // non constant
3232 
3233   // Scan string for start of substr in 16-byte vectors
3234   bind(SCAN_TO_SUBSTR);
3235   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3236   pcmpestri(vec, Address(result, 0), mode);
3237   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3238   subl(cnt1, stride);
3239   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3240   cmpl(cnt1, cnt2);
3241   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3242   addptr(result, 16);
3243 
3244   bind(ADJUST_STR);
3245   cmpl(cnt1, stride); // Do not read beyond string
3246   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3247   // Back-up string to avoid reading beyond string.
3248   lea(result, Address(result, cnt1, scale1, -16));
3249   movl(cnt1, stride);
3250   jmpb(SCAN_TO_SUBSTR);
3251 
3252   // Found a potential substr
3253   bind(FOUND_CANDIDATE);
3254   // After pcmpestri tmp(rcx) contains matched element index
3255 
3256   // Make sure string is still long enough
3257   subl(cnt1, tmp);
3258   cmpl(cnt1, cnt2);
3259   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
3260   // Left less than substring.
3261 
3262   bind(RET_NOT_FOUND);
3263   movl(result, -1);
3264   jmp(CLEANUP);
3265 
3266   bind(FOUND_SUBSTR);
3267   // Compute start addr of substr
3268   lea(result, Address(result, tmp, scale1));
3269   if (int_cnt2 > 0) { // Constant substring
3270     // Repeat search for small substring (< 8 chars)
3271     // from new point without reloading substring.
3272     // Have to check that we don't read beyond string.
3273     cmpl(tmp, stride-int_cnt2);
3274     jccb(Assembler::greater, ADJUST_STR);
3275     // Fall through if matched whole substring.
3276   } else { // non constant
3277     assert(int_cnt2 == -1, "should be != 0");
3278 
3279     addl(tmp, cnt2);
3280     // Found result if we matched whole substring.
3281     cmpl(tmp, stride);
3282     jcc(Assembler::lessEqual, RET_FOUND);
3283 
3284     // Repeat search for small substring (<= 8 chars)
3285     // from new point 'str1' without reloading substring.
3286     cmpl(cnt2, stride);
3287     // Have to check that we don't read beyond string.
3288     jccb(Assembler::lessEqual, ADJUST_STR);
3289 
3290     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3291     // Compare the rest of substring (> 8 chars).
3292     movptr(str1, result);
3293 
3294     cmpl(tmp, cnt2);
3295     // First 8 chars are already matched.
3296     jccb(Assembler::equal, CHECK_NEXT);
3297 
3298     bind(SCAN_SUBSTR);
3299     pcmpestri(vec, Address(str1, 0), mode);
3300     // Need to reload string pointers if the whole vector was not matched
3301     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3302 
3303     bind(CHECK_NEXT);
3304     subl(cnt2, stride);
3305     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3306     addptr(str1, 16);
3307     if (ae == StrIntrinsicNode::UL) {
3308       addptr(str2, 8);
3309     } else {
3310       addptr(str2, 16);
3311     }
3312     subl(cnt1, stride);
3313     cmpl(cnt2, stride); // Do not read beyond substring
3314     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3315     // Back-up strings to avoid reading beyond substring.
3316 
3317     if (ae == StrIntrinsicNode::UL) {
3318       lea(str2, Address(str2, cnt2, scale2, -8));
3319       lea(str1, Address(str1, cnt2, scale1, -16));
3320     } else {
3321       lea(str2, Address(str2, cnt2, scale2, -16));
3322       lea(str1, Address(str1, cnt2, scale1, -16));
3323     }
3324     subl(cnt1, cnt2);
3325     movl(cnt2, stride);
3326     addl(cnt1, stride);
3327     bind(CONT_SCAN_SUBSTR);
3328     if (ae == StrIntrinsicNode::UL) {
3329       pmovzxbw(vec, Address(str2, 0));
3330     } else {
3331       movdqu(vec, Address(str2, 0));
3332     }
3333     jmp(SCAN_SUBSTR);
3334 
3335     bind(RET_FOUND_LONG);
3336     movptr(str1, Address(rsp, wordSize));
3337   } // non constant
3338 
3339   bind(RET_FOUND);
3340   // Compute substr offset
3341   subptr(result, str1);
3342   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3343     shrl(result, 1); // index
3344   }
3345   bind(CLEANUP);
3346   pop(rsp); // restore SP
3347 
3348 } // string_indexof
3349 
3350 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3351                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3352   ShortBranchVerifier sbv(this);
3353   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3354 
3355   int stride = 8;
3356 
3357   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3358         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3359         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3360         FOUND_SEQ_CHAR, DONE_LABEL;
3361 
3362   movptr(result, str1);
3363   if (UseAVX >= 2) {
3364     cmpl(cnt1, stride);
3365     jcc(Assembler::less, SCAN_TO_CHAR);
3366     cmpl(cnt1, 2*stride);
3367     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3368     movdl(vec1, ch);
3369     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3370     vpxor(vec2, vec2);
3371     movl(tmp, cnt1);
3372     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3373     andl(cnt1,0x0000000F);  //tail count (in chars)
3374 
3375     bind(SCAN_TO_16_CHAR_LOOP);
3376     vmovdqu(vec3, Address(result, 0));
3377     vpcmpeqw(vec3, vec3, vec1, 1);
3378     vptest(vec2, vec3);
3379     jcc(Assembler::carryClear, FOUND_CHAR);
3380     addptr(result, 32);
3381     subl(tmp, 2*stride);
3382     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3383     jmp(SCAN_TO_8_CHAR);
3384     bind(SCAN_TO_8_CHAR_INIT);
3385     movdl(vec1, ch);
3386     pshuflw(vec1, vec1, 0x00);
3387     pshufd(vec1, vec1, 0);
3388     pxor(vec2, vec2);
3389   }
3390   bind(SCAN_TO_8_CHAR);
3391   cmpl(cnt1, stride);
3392   jcc(Assembler::less, SCAN_TO_CHAR);
3393   if (UseAVX < 2) {
3394     movdl(vec1, ch);
3395     pshuflw(vec1, vec1, 0x00);
3396     pshufd(vec1, vec1, 0);
3397     pxor(vec2, vec2);
3398   }
3399   movl(tmp, cnt1);
3400   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3401   andl(cnt1,0x00000007);  //tail count (in chars)
3402 
3403   bind(SCAN_TO_8_CHAR_LOOP);
3404   movdqu(vec3, Address(result, 0));
3405   pcmpeqw(vec3, vec1);
3406   ptest(vec2, vec3);
3407   jcc(Assembler::carryClear, FOUND_CHAR);
3408   addptr(result, 16);
3409   subl(tmp, stride);
3410   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3411   bind(SCAN_TO_CHAR);
3412   testl(cnt1, cnt1);
3413   jcc(Assembler::zero, RET_NOT_FOUND);
3414   bind(SCAN_TO_CHAR_LOOP);
3415   load_unsigned_short(tmp, Address(result, 0));
3416   cmpl(ch, tmp);
3417   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3418   addptr(result, 2);
3419   subl(cnt1, 1);
3420   jccb(Assembler::zero, RET_NOT_FOUND);
3421   jmp(SCAN_TO_CHAR_LOOP);
3422 
3423   bind(RET_NOT_FOUND);
3424   movl(result, -1);
3425   jmpb(DONE_LABEL);
3426 
3427   bind(FOUND_CHAR);
3428   if (UseAVX >= 2) {
3429     vpmovmskb(tmp, vec3);
3430   } else {
3431     pmovmskb(tmp, vec3);
3432   }
3433   bsfl(ch, tmp);
3434   addptr(result, ch);
3435 
3436   bind(FOUND_SEQ_CHAR);
3437   subptr(result, str1);
3438   shrl(result, 1);
3439 
3440   bind(DONE_LABEL);
3441 } // string_indexof_char
3442 
3443 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3444                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3445   ShortBranchVerifier sbv(this);
3446   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3447 
3448   int stride = 16;
3449 
3450   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3451         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3452         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3453         FOUND_SEQ_CHAR, DONE_LABEL;
3454 
3455   movptr(result, str1);
3456   if (UseAVX >= 2) {
3457     cmpl(cnt1, stride);
3458     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3459     cmpl(cnt1, stride*2);
3460     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3461     movdl(vec1, ch);
3462     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3463     vpxor(vec2, vec2);
3464     movl(tmp, cnt1);
3465     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3466     andl(cnt1,0x0000001F);  //tail count (in chars)
3467 
3468     bind(SCAN_TO_32_CHAR_LOOP);
3469     vmovdqu(vec3, Address(result, 0));
3470     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3471     vptest(vec2, vec3);
3472     jcc(Assembler::carryClear, FOUND_CHAR);
3473     addptr(result, 32);
3474     subl(tmp, stride*2);
3475     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3476     jmp(SCAN_TO_16_CHAR);
3477 
3478     bind(SCAN_TO_16_CHAR_INIT);
3479     movdl(vec1, ch);
3480     pxor(vec2, vec2);
3481     pshufb(vec1, vec2);
3482   }
3483 
3484   bind(SCAN_TO_16_CHAR);
3485   cmpl(cnt1, stride);
3486   jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left
3487   if (UseAVX < 2) {
3488     movdl(vec1, ch);
3489     pxor(vec2, vec2);
3490     pshufb(vec1, vec2);
3491   }
3492   movl(tmp, cnt1);
3493   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3494   andl(cnt1,0x0000000F);  //tail count (in bytes)
3495 
3496   bind(SCAN_TO_16_CHAR_LOOP);
3497   movdqu(vec3, Address(result, 0));
3498   pcmpeqb(vec3, vec1);
3499   ptest(vec2, vec3);
3500   jcc(Assembler::carryClear, FOUND_CHAR);
3501   addptr(result, 16);
3502   subl(tmp, stride);
3503   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...
3504 
3505   bind(SCAN_TO_CHAR_INIT);
3506   testl(cnt1, cnt1);
3507   jcc(Assembler::zero, RET_NOT_FOUND);
3508   bind(SCAN_TO_CHAR_LOOP);
3509   load_unsigned_byte(tmp, Address(result, 0));
3510   cmpl(ch, tmp);
3511   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3512   addptr(result, 1);
3513   subl(cnt1, 1);
3514   jccb(Assembler::zero, RET_NOT_FOUND);
3515   jmp(SCAN_TO_CHAR_LOOP);
3516 
3517   bind(RET_NOT_FOUND);
3518   movl(result, -1);
3519   jmpb(DONE_LABEL);
3520 
3521   bind(FOUND_CHAR);
3522   if (UseAVX >= 2) {
3523     vpmovmskb(tmp, vec3);
3524   } else {
3525     pmovmskb(tmp, vec3);
3526   }
3527   bsfl(ch, tmp);
3528   addptr(result, ch);
3529 
3530   bind(FOUND_SEQ_CHAR);
3531   subptr(result, str1);
3532 
3533   bind(DONE_LABEL);
3534 } // stringL_indexof_char
3535 
3536 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3537   switch (eltype) {
3538   case T_BOOLEAN: return sizeof(jboolean);
3539   case T_BYTE:  return sizeof(jbyte);
3540   case T_SHORT: return sizeof(jshort);
3541   case T_CHAR:  return sizeof(jchar);
3542   case T_INT:   return sizeof(jint);
3543   default:
3544     ShouldNotReachHere();
3545     return -1;
3546   }
3547 }
3548 
3549 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3550   switch (eltype) {
3551   // T_BOOLEAN used as surrogate for unsigned byte
3552   case T_BOOLEAN: movzbl(dst, src);   break;
3553   case T_BYTE:    movsbl(dst, src);   break;
3554   case T_SHORT:   movswl(dst, src);   break;
3555   case T_CHAR:    movzwl(dst, src);   break;
3556   case T_INT:     movl(dst, src);     break;
3557   default:
3558     ShouldNotReachHere();
3559   }
3560 }
3561 
3562 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3563   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3564 }
3565 
3566 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3567   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3568 }
3569 
3570 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3571   const int vlen = Assembler::AVX_256bit;
3572   switch (eltype) {
3573   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3574   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3575   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3576   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3577   case T_INT:
3578     // do nothing
3579     break;
3580   default:
3581     ShouldNotReachHere();
3582   }
3583 }
3584 
3585 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3586                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3587                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3588                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3589                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3590                                         BasicType eltype) {
3591   ShortBranchVerifier sbv(this);
3592   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3593   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3594   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3595 
3596   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3597         SHORT_UNROLLED_LOOP_EXIT,
3598         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3599         UNROLLED_VECTOR_LOOP_BEGIN,
3600         END;
3601   switch (eltype) {
3602   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3603   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3604   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3605   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3606   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3607   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3608   }
3609 
3610   // "Renaming" for readability of the code
3611   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3612                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3613                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3614 
3615   const int elsize = arrays_hashcode_elsize(eltype);
3616 
3617   /*
3618     if (cnt1 >= 2) {
3619       if (cnt1 >= 32) {
3620         UNROLLED VECTOR LOOP
3621       }
3622       UNROLLED SCALAR LOOP
3623     }
3624     SINGLE SCALAR
3625    */
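  // Vectorization scheme (informal): the scalar recurrence h = 31*h + a[i] is
  // processed 32 elements per iteration. Both the running scalar result and
  // the four 8-lane accumulators are multiplied by power_of_31_backwards[0]
  // (the per-block factor) once per iteration; after the loop the accumulators
  // are scaled by the remaining backwards powers of 31 and reduced into result.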
3626 
3627   cmpl(cnt1, 32);
3628   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3629 
3630   // cnt1 >= 32 && generate_vectorized_loop
3631   xorl(index, index);
3632 
3633   // vresult = IntVector.zero(I256);
3634   for (int idx = 0; idx < 4; idx++) {
3635     vpxor(vresult[idx], vresult[idx]);
3636   }
3637   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3638   Register bound = tmp2;
3639   Register next = tmp3;
3640   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3641   movl(next, Address(tmp2, 0));
3642   movdl(vnext, next);
3643   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3644 
3645   // index = 0;
3646   // bound = cnt1 & ~(32 - 1);
3647   movl(bound, cnt1);
3648   andl(bound, ~(32 - 1));
3649   // for (; index < bound; index += 32) {
3650   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3651   // result *= next;
3652   imull(result, next);
3653   // loop fission to front-load the cost of fetching from memory; OOO execution
3654   // can then hopefully do a better job of prefetching
3655   for (int idx = 0; idx < 4; idx++) {
3656     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3657   }
3658   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3659   for (int idx = 0; idx < 4; idx++) {
3660     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3661     arrays_hashcode_elvcast(vtmp[idx], eltype);
3662     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3663   }
3664   // index += 32;
3665   addl(index, 32);
3666   // index < bound;
3667   cmpl(index, bound);
3668   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3669   // }
3670 
3671   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3672   subl(cnt1, bound);
3673   // release bound
3674 
3675   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3676   for (int idx = 0; idx < 4; idx++) {
3677     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3678     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3679     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3680   }
3681   // result += vresult.reduceLanes(ADD);
3682   for (int idx = 0; idx < 4; idx++) {
3683     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3684   }
3685 
3686   // } else if (cnt1 < 32) {
3687 
3688   bind(SHORT_UNROLLED_BEGIN);
3689   // int i = 1;
3690   movl(index, 1);
3691   cmpl(index, cnt1);
3692   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3693 
3694   // for (; i < cnt1 ; i += 2) {
3695   bind(SHORT_UNROLLED_LOOP_BEGIN);
3696   movl(tmp3, 961);
3697   imull(result, tmp3);
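  // 961 == 31*31: the hash is advanced by two array elements per iteration.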
3698   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3699   movl(tmp3, tmp2);
3700   shll(tmp3, 5);
3701   subl(tmp3, tmp2);
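  // tmp3 = (a[i-1] << 5) - a[i-1] = 31*a[i-1]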
3702   addl(result, tmp3);
3703   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3704   addl(result, tmp3);
3705   addl(index, 2);
3706   cmpl(index, cnt1);
3707   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3708 
3709   // }
3710   // if (i >= cnt1) {
3711   bind(SHORT_UNROLLED_LOOP_EXIT);
3712   jccb(Assembler::greater, END);
3713   movl(tmp2, result);
3714   shll(result, 5);
3715   subl(result, tmp2);
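  // result = (result << 5) - result = 31*result; the final element is added below.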
3716   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3717   addl(result, tmp3);
3718   // }
3719   bind(END);
3720 
3721   BLOCK_COMMENT("} // arrays_hashcode");
3722 
3723 } // arrays_hashcode
3724 
3725 // helper function for string_compare
3726 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3727                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3728                                            Address::ScaleFactor scale2, Register index, int ae) {
3729   if (ae == StrIntrinsicNode::LL) {
3730     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3731     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3732   } else if (ae == StrIntrinsicNode::UU) {
3733     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3734     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3735   } else {
3736     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3737     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3738   }
3739 }
3740 
3741 // Compare strings, used for char[] and byte[].
3742 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3743                                        Register cnt1, Register cnt2, Register result,
3744                                        XMMRegister vec1, int ae, KRegister mask) {
3745   ShortBranchVerifier sbv(this);
3746   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3747   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3748   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3749   int stride2x2 = 0x40;
3750   Address::ScaleFactor scale = Address::no_scale;
3751   Address::ScaleFactor scale1 = Address::no_scale;
3752   Address::ScaleFactor scale2 = Address::no_scale;
3753 
3754   if (ae != StrIntrinsicNode::LL) {
3755     stride2x2 = 0x20;
3756   }
3757 
3758   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3759     shrl(cnt2, 1);
3760   }
3761   // Compute the minimum of the string lengths and the
3762   // difference of the string lengths (stack).
3763   // Do the conditional move stuff
3764   movl(result, cnt1);
3765   subl(cnt1, cnt2);
3766   push(cnt1);
3767   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3768 
3769   // Is the minimum length zero?
3770   testl(cnt2, cnt2);
3771   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3772   if (ae == StrIntrinsicNode::LL) {
3773     // Load first bytes
3774     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3775     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3776   } else if (ae == StrIntrinsicNode::UU) {
3777     // Load first characters
3778     load_unsigned_short(result, Address(str1, 0));
3779     load_unsigned_short(cnt1, Address(str2, 0));
3780   } else {
3781     load_unsigned_byte(result, Address(str1, 0));
3782     load_unsigned_short(cnt1, Address(str2, 0));
3783   }
3784   subl(result, cnt1);
3785   jcc(Assembler::notZero,  POP_LABEL);
3786 
3787   if (ae == StrIntrinsicNode::UU) {
3788     // Divide length by 2 to get number of chars
3789     shrl(cnt2, 1);
3790   }
3791   cmpl(cnt2, 1);
3792   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3793 
3794   // Check if the strings start at the same location and setup scale and stride
3795   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3796     cmpptr(str1, str2);
3797     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3798     if (ae == StrIntrinsicNode::LL) {
3799       scale = Address::times_1;
3800       stride = 16;
3801     } else {
3802       scale = Address::times_2;
3803       stride = 8;
3804     }
3805   } else {
3806     scale1 = Address::times_1;
3807     scale2 = Address::times_2;
3808     // scale not used
3809     stride = 8;
3810   }
3811 
3812   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3813     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3814     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3815     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3816     Label COMPARE_TAIL_LONG;
3817     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3818 
3819     int pcmpmask = 0x19;
3820     if (ae == StrIntrinsicNode::LL) {
3821       pcmpmask &= ~0x01;
3822     }
3823 
3824     // Setup to compare 16-chars (32-bytes) vectors,
3825     // start from first character again because it has aligned address.
3826     if (ae == StrIntrinsicNode::LL) {
3827       stride2 = 32;
3828     } else {
3829       stride2 = 16;
3830     }
3831     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3832       adr_stride = stride << scale;
3833     } else {
3834       adr_stride1 = 8;  //stride << scale1;
3835       adr_stride2 = 16; //stride << scale2;
3836     }
3837 
3838     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3839     // rax and rdx are used by pcmpestri as element counters
3840     movl(result, cnt2);
3841     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3842     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3843 
3844     // fast path : compare first 2 8-char vectors.
3845     bind(COMPARE_16_CHARS);
3846     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3847       movdqu(vec1, Address(str1, 0));
3848     } else {
3849       pmovzxbw(vec1, Address(str1, 0));
3850     }
3851     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3852     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3853 
3854     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3855       movdqu(vec1, Address(str1, adr_stride));
3856       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3857     } else {
3858       pmovzxbw(vec1, Address(str1, adr_stride1));
3859       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3860     }
3861     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3862     addl(cnt1, stride);
3863 
3864     // Compare the characters at index in cnt1
3865     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3866     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3867     subl(result, cnt2);
3868     jmp(POP_LABEL);
3869 
3870     // Setup the registers to start vector comparison loop
3871     bind(COMPARE_WIDE_VECTORS);
3872     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3873       lea(str1, Address(str1, result, scale));
3874       lea(str2, Address(str2, result, scale));
3875     } else {
3876       lea(str1, Address(str1, result, scale1));
3877       lea(str2, Address(str2, result, scale2));
3878     }
3879     subl(result, stride2);
3880     subl(cnt2, stride2);
3881     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3882     negptr(result);
3883 
3884     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
3885     bind(COMPARE_WIDE_VECTORS_LOOP);
3886 
3887 #ifdef _LP64
3888     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3889       cmpl(cnt2, stride2x2);
3890       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3891       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3892       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3893 
3894       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3895       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3896         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3897         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // mask == 11..11 if operands are equal, otherwise mask has some 0s
3898       } else {
3899         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3900         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // mask == 11..11 if operands are equal, otherwise mask has some 0s
3901       }
3902       kortestql(mask, mask);
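      // kortestql sets CF only if the OR of the masks is all 1's, i.e. every
      // byte compared equal; aboveEqual (CF == 0) therefore means a mismatch.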
3903       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3904       addptr(result, stride2x2);  // update since we already compared at this addr
3905       subl(cnt2, stride2x2);      // and sub the size too
3906       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3907 
3908       vpxor(vec1, vec1);
3909       jmpb(COMPARE_WIDE_TAIL);
3910     }//if (VM_Version::supports_avx512vlbw())
3911 #endif // _LP64
3912 
3913 
3914     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3915     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3916       vmovdqu(vec1, Address(str1, result, scale));
3917       vpxor(vec1, Address(str2, result, scale));
3918     } else {
3919       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3920       vpxor(vec1, Address(str2, result, scale2));
3921     }
3922     vptest(vec1, vec1);
3923     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3924     addptr(result, stride2);
3925     subl(cnt2, stride2);
3926     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3927     // clean upper bits of YMM registers
3928     vpxor(vec1, vec1);
3929 
3930     // compare wide vectors tail
3931     bind(COMPARE_WIDE_TAIL);
3932     testptr(result, result);
3933     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3934 
3935     movl(result, stride2);
3936     movl(cnt2, result);
3937     negptr(result);
3938     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3939 
3940     // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3941     bind(VECTOR_NOT_EQUAL);
3942     // clean upper bits of YMM registers
3943     vpxor(vec1, vec1);
3944     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3945       lea(str1, Address(str1, result, scale));
3946       lea(str2, Address(str2, result, scale));
3947     } else {
3948       lea(str1, Address(str1, result, scale1));
3949       lea(str2, Address(str2, result, scale2));
3950     }
3951     jmp(COMPARE_16_CHARS);
3952 
3953     // Compare tail chars, length between 1 and 15 chars
3954     bind(COMPARE_TAIL_LONG);
3955     movl(cnt2, result);
3956     cmpl(cnt2, stride);
3957     jcc(Assembler::less, COMPARE_SMALL_STR);
3958 
3959     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3960       movdqu(vec1, Address(str1, 0));
3961     } else {
3962       pmovzxbw(vec1, Address(str1, 0));
3963     }
3964     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3965     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3966     subptr(cnt2, stride);
3967     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3968     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3969       lea(str1, Address(str1, result, scale));
3970       lea(str2, Address(str2, result, scale));
3971     } else {
3972       lea(str1, Address(str1, result, scale1));
3973       lea(str2, Address(str2, result, scale2));
3974     }
3975     negptr(cnt2);
3976     jmpb(WHILE_HEAD_LABEL);
3977 
3978     bind(COMPARE_SMALL_STR);
3979   } else if (UseSSE42Intrinsics) {
3980     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3981     int pcmpmask = 0x19;
3982     // Setup to compare 8-char (16-byte) vectors,
3983     // start from first character again because it has aligned address.
3984     movl(result, cnt2);
3985     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3986     if (ae == StrIntrinsicNode::LL) {
3987       pcmpmask &= ~0x01;
3988     }
3989     jcc(Assembler::zero, COMPARE_TAIL);
3990     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3991       lea(str1, Address(str1, result, scale));
3992       lea(str2, Address(str2, result, scale));
3993     } else {
3994       lea(str1, Address(str1, result, scale1));
3995       lea(str2, Address(str2, result, scale2));
3996     }
3997     negptr(result);
3998 
3999     // pcmpestri
4000     //   inputs:
4001     //     vec1- substring
4002     //     rax - negative string length (elements count)
4003     //     mem - scanned string
4004     //     rdx - string length (elements count)
4005     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
4006     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
4007     //   outputs:
4008     //     rcx - first mismatched element index
4009     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
4010 
4011     bind(COMPARE_WIDE_VECTORS);
4012     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4013       movdqu(vec1, Address(str1, result, scale));
4014       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
4015     } else {
4016       pmovzxbw(vec1, Address(str1, result, scale1));
4017       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
4018     }
4019     // After pcmpestri cnt1(rcx) contains mismatched element index
4020 
4021     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
4022     addptr(result, stride);
4023     subptr(cnt2, stride);
4024     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4025 
4026     // compare wide vectors tail
4027     testptr(result, result);
4028     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
4029 
4030     movl(cnt2, stride);
4031     movl(result, stride);
4032     negptr(result);
4033     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4034       movdqu(vec1, Address(str1, result, scale));
4035       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
4036     } else {
4037       pmovzxbw(vec1, Address(str1, result, scale1));
4038       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
4039     }
4040     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
4041 
4042     // Mismatched characters in the vectors
4043     bind(VECTOR_NOT_EQUAL);
4044     addptr(cnt1, result);
4045     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
4046     subl(result, cnt2);
4047     jmpb(POP_LABEL);
4048 
4049     bind(COMPARE_TAIL); // limit is zero
4050     movl(cnt2, result);
4051     // Fallthru to tail compare
4052   }
4053   // Shift str2 and str1 to the end of the arrays, negate min
4054   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4055     lea(str1, Address(str1, cnt2, scale));
4056     lea(str2, Address(str2, cnt2, scale));
4057   } else {
4058     lea(str1, Address(str1, cnt2, scale1));
4059     lea(str2, Address(str2, cnt2, scale2));
4060   }
4061   decrementl(cnt2);  // first character was compared already
4062   negptr(cnt2);
4063 
4064   // Compare the rest of the elements
4065   bind(WHILE_HEAD_LABEL);
4066   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
4067   subl(result, cnt1);
4068   jccb(Assembler::notZero, POP_LABEL);
4069   increment(cnt2);
4070   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
4071 
4072   // Strings are equal up to min length.  Return the length difference.
4073   bind(LENGTH_DIFF_LABEL);
4074   pop(result);
4075   if (ae == StrIntrinsicNode::UU) {
4076     // Divide diff by 2 to get number of chars
4077     sarl(result, 1);
4078   }
4079   jmpb(DONE_LABEL);
4080 
4081 #ifdef _LP64
4082   if (VM_Version::supports_avx512vlbw()) {
4083 
4084     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
4085 
4086     kmovql(cnt1, mask);
4087     notq(cnt1);
4088     bsfq(cnt2, cnt1);
4089     if (ae != StrIntrinsicNode::LL) {
4090       // Divide diff by 2 to get number of chars
4091       sarl(cnt2, 1);
4092     }
4093     addq(result, cnt2);
4094     if (ae == StrIntrinsicNode::LL) {
4095       load_unsigned_byte(cnt1, Address(str2, result));
4096       load_unsigned_byte(result, Address(str1, result));
4097     } else if (ae == StrIntrinsicNode::UU) {
4098       load_unsigned_short(cnt1, Address(str2, result, scale));
4099       load_unsigned_short(result, Address(str1, result, scale));
4100     } else {
4101       load_unsigned_short(cnt1, Address(str2, result, scale2));
4102       load_unsigned_byte(result, Address(str1, result, scale1));
4103     }
4104     subl(result, cnt1);
4105     jmpb(POP_LABEL);
4106   }//if (VM_Version::supports_avx512vlbw())
4107 #endif // _LP64
4108 
4109   // Discard the stored length difference
4110   bind(POP_LABEL);
4111   pop(cnt1);
4112 
4113   // That's it
4114   bind(DONE_LABEL);
4115   if(ae == StrIntrinsicNode::UL) {
4116     negl(result);
4117   }
4118 
4119 }
4120 
4121 // Search for a non-ASCII character (negative byte value) in a byte array,
4122 // return the index of the first such character, otherwise the length
4123 // of the array segment searched.
4124 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
4125 //   @IntrinsicCandidate
4126 //   public static int countPositives(byte[] ba, int off, int len) {
4127 //     for (int i = off; i < off + len; i++) {
4128 //       if (ba[i] < 0) {
4129 //         return i - off;
4130 //       }
4131 //     }
4132 //     return len;
4133 //   }
4134 void C2_MacroAssembler::count_positives(Register ary1, Register len,
4135   Register result, Register tmp1,
4136   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
4137   // rsi: byte array
4138   // rcx: len
4139   // rax: result
4140   ShortBranchVerifier sbv(this);
4141   assert_different_registers(ary1, len, result, tmp1);
4142   assert_different_registers(vec1, vec2);
4143   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
4144 
4145   movl(result, len); // copy
4146   // len == 0
4147   testl(len, len);
4148   jcc(Assembler::zero, DONE);
4149 
4150   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
4151     VM_Version::supports_avx512vlbw() &&
4152     VM_Version::supports_bmi2()) {
4153 
4154     Label test_64_loop, test_tail, BREAK_LOOP;
4155     movl(tmp1, len);
4156     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
4157 
4158     andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
4159     andl(len,  0xffffffc0); // vector count (in chars)
4160     jccb(Assembler::zero, test_tail);
4161 
4162     lea(ary1, Address(ary1, len, Address::times_1));
4163     negptr(len);
4164 
4165     bind(test_64_loop);
4166     // Check whether the next 64 bytes contain any negative values
4167     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
4168     kortestql(mask1, mask1);
4169     jcc(Assembler::notZero, BREAK_LOOP);
4170 
4171     addptr(len, 64);
4172     jccb(Assembler::notZero, test_64_loop);
4173 
4174     bind(test_tail);
4175     // bail out when there is nothing to be done
4176     testl(tmp1, -1);
4177     jcc(Assembler::zero, DONE);
4178 
4179 
4180     // check the tail for absence of negatives
4181     // ~(~0 << len) applied up to two times (for 32-bit scenario)
4182 #ifdef _LP64
4183     {
4184       Register tmp3_aliased = len;
4185       mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
4186       shlxq(tmp3_aliased, tmp3_aliased, tmp1);
4187       notq(tmp3_aliased);
4188       kmovql(mask2, tmp3_aliased);
4189     }
4190 #else
4191     Label k_init;
4192     jmp(k_init);
4193 
4194     // We cannot read 64 bits from a general purpose register, so we move the
4195     // data required to compose 64 1's into the instruction stream.
4196     // We emit a 64-byte-wide series of elements 0..63 which is later used as a
4197     // compare target against the tail count held in the tmp1 register.
4198     // The result is a k register with tmp1 consecutive 1's, counting from the
4199     // least significant bit.
4200     address tmp = pc();
4201     emit_int64(0x0706050403020100);
4202     emit_int64(0x0F0E0D0C0B0A0908);
4203     emit_int64(0x1716151413121110);
4204     emit_int64(0x1F1E1D1C1B1A1918);
4205     emit_int64(0x2726252423222120);
4206     emit_int64(0x2F2E2D2C2B2A2928);
4207     emit_int64(0x3736353433323130);
4208     emit_int64(0x3F3E3D3C3B3A3938);
4209 
4210     bind(k_init);
4211     lea(len, InternalAddress(tmp));
4212     // create mask to test for negative byte inside a vector
4213     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
4214     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
4215 
4216 #endif
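         // mask2 now has a set bit for each valid tail byte; the masked compare below
         // only produces results for those lanes, and ktestq checks whether any of the
         // valid tail bytes is negative.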
4217     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
4218     ktestq(mask1, mask2);
4219     jcc(Assembler::zero, DONE);
4220 
4221     // do a full check for negative bytes in the tail
4222     movl(len, tmp1); // tmp1 holds the low 6 bits of the original len;
4223                      // ary1 already pointing to the right place
4224     jmpb(TAIL_START);
4225 
4226     bind(BREAK_LOOP);
4227     // At least one byte in the last 64 byte block was negative.
4228     // Set up to look at the last 64 bytes as if they were a tail
4229     lea(ary1, Address(ary1, len, Address::times_1));
4230     addptr(result, len);
4231     // Ignore the very last byte: if all others are positive,
4232     // it must be negative, so we can skip right to the 2+1 byte
4233     // end comparison at this point
4234     orl(result, 63);
4235     movl(len, 63);
4236     // Fallthru to tail compare
4237   } else {
4238 
4239     if (UseAVX >= 2 && UseSSE >= 2) {
4240       // With AVX2, use 32-byte vector compare
4241       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4242 
4243       // Compare 32-byte vectors
4244       testl(len, 0xffffffe0);   // vector count (in bytes)
4245       jccb(Assembler::zero, TAIL_START);
4246 
4247       andl(len, 0xffffffe0);
4248       lea(ary1, Address(ary1, len, Address::times_1));
4249       negptr(len);
4250 
4251       movl(tmp1, 0x80808080);   // mask with the sign bit set in every byte, to test for negative bytes
4252       movdl(vec2, tmp1);
4253       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4254 
4255       bind(COMPARE_WIDE_VECTORS);
4256       vmovdqu(vec1, Address(ary1, len, Address::times_1));
4257       vptest(vec1, vec2);
4258       jccb(Assembler::notZero, BREAK_LOOP);
4259       addptr(len, 32);
4260       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4261 
4262       testl(result, 0x0000001f);   // any bytes remaining?
4263       jcc(Assembler::zero, DONE);
4264 
4265       // Quick test using the already prepared vector mask
4266       movl(len, result);
4267       andl(len, 0x0000001f);
4268       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4269       vptest(vec1, vec2);
4270       jcc(Assembler::zero, DONE);
4271       // There are zeros, jump to the tail to determine exactly where
4272       jmpb(TAIL_START);
4273 
4274       bind(BREAK_LOOP);
4275       // At least one byte in the last 32-byte vector is negative.
4276       // Set up to look at the last 32 bytes as if they were a tail
4277       lea(ary1, Address(ary1, len, Address::times_1));
4278       addptr(result, len);
4279       // Ignore the very last byte: if all others are positive,
4280       // it must be negative, so we can skip right to the 2+1 byte
4281       // end comparison at this point
4282       orl(result, 31);
4283       movl(len, 31);
4284       // Fallthru to tail compare
4285     } else if (UseSSE42Intrinsics) {
4286       // With SSE4.2, use double quad vector compare
4287       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4288 
4289       // Compare 16-byte vectors
4290       testl(len, 0xfffffff0);   // vector count (in bytes)
4291       jcc(Assembler::zero, TAIL_START);
4292 
4293       andl(len, 0xfffffff0);
4294       lea(ary1, Address(ary1, len, Address::times_1));
4295       negptr(len);
4296 
4297       movl(tmp1, 0x80808080);
4298       movdl(vec2, tmp1);
4299       pshufd(vec2, vec2, 0);
4300 
4301       bind(COMPARE_WIDE_VECTORS);
4302       movdqu(vec1, Address(ary1, len, Address::times_1));
4303       ptest(vec1, vec2);
4304       jccb(Assembler::notZero, BREAK_LOOP);
4305       addptr(len, 16);
4306       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4307 
4308       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4309       jcc(Assembler::zero, DONE);
4310 
4311       // Quick test using the already prepared vector mask
4312       movl(len, result);
4313       andl(len, 0x0000000f);   // tail count (in bytes)
4314       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4315       ptest(vec1, vec2);
4316       jcc(Assembler::zero, DONE);
4317       jmpb(TAIL_START);
4318 
4319       bind(BREAK_LOOP);
4320       // At least one byte in the last 16-byte vector is negative.
4321       // Set up and look at the last 16 bytes as if they were a tail
4322       lea(ary1, Address(ary1, len, Address::times_1));
4323       addptr(result, len);
4324       // Ignore the very last byte: if all others are positive,
4325       // it must be negative, so we can skip right to the 2+1 byte
4326       // end comparison at this point
4327       orl(result, 15);
4328       movl(len, 15);
4329       // Fallthru to tail compare
4330     }
4331   }
4332 
4333   bind(TAIL_START);
4334   // Compare 4-byte vectors
4335   andl(len, 0xfffffffc); // vector count (in bytes)
4336   jccb(Assembler::zero, COMPARE_CHAR);
4337 
4338   lea(ary1, Address(ary1, len, Address::times_1));
4339   negptr(len);
4340 
4341   bind(COMPARE_VECTORS);
4342   movl(tmp1, Address(ary1, len, Address::times_1));
4343   andl(tmp1, 0x80808080);
4344   jccb(Assembler::notZero, TAIL_ADJUST);
4345   addptr(len, 4);
4346   jccb(Assembler::notZero, COMPARE_VECTORS);
4347 
4348   // Compare trailing char (final 2-3 bytes), if any
4349   bind(COMPARE_CHAR);
4350 
4351   testl(result, 0x2);   // tail  char
4352   jccb(Assembler::zero, COMPARE_BYTE);
4353   load_unsigned_short(tmp1, Address(ary1, 0));
4354   andl(tmp1, 0x00008080);
4355   jccb(Assembler::notZero, CHAR_ADJUST);
4356   lea(ary1, Address(ary1, 2));
4357 
4358   bind(COMPARE_BYTE);
4359   testl(result, 0x1);   // tail  byte
4360   jccb(Assembler::zero, DONE);
4361   load_unsigned_byte(tmp1, Address(ary1, 0));
4362   testl(tmp1, 0x00000080);
4363   jccb(Assembler::zero, DONE);
4364   subptr(result, 1);
4365   jmpb(DONE);
4366 
4367   bind(TAIL_ADJUST);
4368   // there are negative bits in the last 4 byte block.
4369   // Adjust result and check the next three bytes
4370   addptr(result, len);
4371   orl(result, 3);
4372   lea(ary1, Address(ary1, len, Address::times_1));
4373   jmpb(COMPARE_CHAR);
4374 
4375   bind(CHAR_ADJUST);
4376   // We are looking at a char + optional byte tail, and found that one
4377   // of the bytes in the char is negative. Adjust the result, check the
4378   // first byte and readjust if needed.
4379   andl(result, 0xfffffffc);
4380   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4381   jccb(Assembler::notZero, DONE);
4382   addptr(result, 1);
4383 
4384   // That's it
4385   bind(DONE);
4386   if (UseAVX >= 2 && UseSSE >= 2) {
4387     // clean upper bits of YMM registers
4388     vpxor(vec1, vec1);
4389     vpxor(vec2, vec2);
4390   }
4391 }
4392 
4393 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
4394 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4395                                       Register limit, Register result, Register chr,
4396                                       XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
4397   ShortBranchVerifier sbv(this);
4398   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4399 
4400   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4401   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4402 
4403   if (is_array_equ) {
4404     // Check the input args
4405     cmpoop(ary1, ary2);
4406     jcc(Assembler::equal, TRUE_LABEL);
4407 
4408     // Need additional checks for arrays_equals.
4409     testptr(ary1, ary1);
4410     jcc(Assembler::zero, FALSE_LABEL);
4411     testptr(ary2, ary2);
4412     jcc(Assembler::zero, FALSE_LABEL);
4413 
4414     // Check the lengths
4415     movl(limit, Address(ary1, length_offset));
4416     cmpl(limit, Address(ary2, length_offset));
4417     jcc(Assembler::notEqual, FALSE_LABEL);
4418   }
4419 
4420   // count == 0
4421   testl(limit, limit);
4422   jcc(Assembler::zero, TRUE_LABEL);
4423 
4424   if (is_array_equ) {
4425     // Load array address
4426     lea(ary1, Address(ary1, base_offset));
4427     lea(ary2, Address(ary2, base_offset));
4428   }
4429 
4430   if (is_array_equ && is_char) {
4431     // arrays_equals when used for char[].
4432     shll(limit, 1);      // convert char count to byte count (count != 0)
4433   }
4434   movl(result, limit); // copy
4435 
4436   if (UseAVX >= 2) {
4437     // With AVX2, use 32-byte vector compare
4438     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4439 
4440     // Compare 32-byte vectors
4441     andl(result, 0x0000001f);  //   tail count (in bytes)
4442     andl(limit, 0xffffffe0);   // vector count (in bytes)
4443     jcc(Assembler::zero, COMPARE_TAIL);
4444 
4445     lea(ary1, Address(ary1, limit, Address::times_1));
4446     lea(ary2, Address(ary2, limit, Address::times_1));
4447     negptr(limit);
4448 
4449 #ifdef _LP64
4450     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4451       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4452 
4453       cmpl(limit, -64);
4454       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
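           // limit is the negative byte count; limit > -64 means fewer than 64 bytes
           // remain, so fall back to the 32-byte AVX2 loop below.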
4455 
4456       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4457 
4458       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4459       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4460       kortestql(mask, mask);
4461       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4462       addptr(limit, 64);  // update since we already compared at this addr
4463       cmpl(limit, -64);
4464       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4465 
4466       // At this point we may still need to compare -limit+result bytes.
4467       // We could execute the next two instructions and just continue via the non-wide path:
4468       //  cmpl(limit, 0);
4469       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
4470       // But since we stopped at the points ary{1,2}+limit which are
4471       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4472       // (|limit| <= 32 and result < 32),
4473       // we may just compare the last 64 bytes.
4474       //
4475       addptr(result, -64);   // it is safe, bc we just came from this area
4476       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4477       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4478       kortestql(mask, mask);
4479       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4480 
4481       jmp(TRUE_LABEL);
4482 
4483       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4484 
4485     }//if (VM_Version::supports_avx512vlbw())
4486 #endif //_LP64
4487     bind(COMPARE_WIDE_VECTORS);
4488     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
4489     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4490     vpxor(vec1, vec2);
4491 
4492     vptest(vec1, vec1);
4493     jcc(Assembler::notZero, FALSE_LABEL);
4494     addptr(limit, 32);
4495     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4496 
4497     testl(result, result);
4498     jcc(Assembler::zero, TRUE_LABEL);
4499 
4500     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
4501     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4502     vpxor(vec1, vec2);
4503 
4504     vptest(vec1, vec1);
4505     jccb(Assembler::notZero, FALSE_LABEL);
4506     jmpb(TRUE_LABEL);
4507 
4508     bind(COMPARE_TAIL); // limit is zero
4509     movl(limit, result);
4510     // Fallthru to tail compare
4511   } else if (UseSSE42Intrinsics) {
4512     // With SSE4.2, use double quad vector compare
4513     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4514 
4515     // Compare 16-byte vectors
4516     andl(result, 0x0000000f);  //   tail count (in bytes)
4517     andl(limit, 0xfffffff0);   // vector count (in bytes)
4518     jcc(Assembler::zero, COMPARE_TAIL);
4519 
4520     lea(ary1, Address(ary1, limit, Address::times_1));
4521     lea(ary2, Address(ary2, limit, Address::times_1));
4522     negptr(limit);
4523 
4524     bind(COMPARE_WIDE_VECTORS);
4525     movdqu(vec1, Address(ary1, limit, Address::times_1));
4526     movdqu(vec2, Address(ary2, limit, Address::times_1));
4527     pxor(vec1, vec2);
4528 
4529     ptest(vec1, vec1);
4530     jcc(Assembler::notZero, FALSE_LABEL);
4531     addptr(limit, 16);
4532     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4533 
4534     testl(result, result);
4535     jcc(Assembler::zero, TRUE_LABEL);
4536 
4537     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4538     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4539     pxor(vec1, vec2);
4540 
4541     ptest(vec1, vec1);
4542     jccb(Assembler::notZero, FALSE_LABEL);
4543     jmpb(TRUE_LABEL);
4544 
4545     bind(COMPARE_TAIL); // limit is zero
4546     movl(limit, result);
4547     // Fallthru to tail compare
4548   }
4549 
4550   // Compare 4-byte vectors
4551   andl(limit, 0xfffffffc); // vector count (in bytes)
4552   jccb(Assembler::zero, COMPARE_CHAR);
4553 
4554   lea(ary1, Address(ary1, limit, Address::times_1));
4555   lea(ary2, Address(ary2, limit, Address::times_1));
4556   negptr(limit);
4557 
4558   bind(COMPARE_VECTORS);
4559   movl(chr, Address(ary1, limit, Address::times_1));
4560   cmpl(chr, Address(ary2, limit, Address::times_1));
4561   jccb(Assembler::notEqual, FALSE_LABEL);
4562   addptr(limit, 4);
4563   jcc(Assembler::notZero, COMPARE_VECTORS);
4564 
4565   // Compare trailing char (final 2 bytes), if any
4566   bind(COMPARE_CHAR);
4567   testl(result, 0x2);   // tail  char
4568   jccb(Assembler::zero, COMPARE_BYTE);
4569   load_unsigned_short(chr, Address(ary1, 0));
4570   load_unsigned_short(limit, Address(ary2, 0));
4571   cmpl(chr, limit);
4572   jccb(Assembler::notEqual, FALSE_LABEL);
4573 
4574   if (is_array_equ && is_char) {
4575     bind(COMPARE_BYTE);
4576   } else {
4577     lea(ary1, Address(ary1, 2));
4578     lea(ary2, Address(ary2, 2));
4579 
4580     bind(COMPARE_BYTE);
4581     testl(result, 0x1);   // tail  byte
4582     jccb(Assembler::zero, TRUE_LABEL);
4583     load_unsigned_byte(chr, Address(ary1, 0));
4584     load_unsigned_byte(limit, Address(ary2, 0));
4585     cmpl(chr, limit);
4586     jccb(Assembler::notEqual, FALSE_LABEL);
4587   }
4588   bind(TRUE_LABEL);
4589   movl(result, 1);   // return true
4590   jmpb(DONE);
4591 
4592   bind(FALSE_LABEL);
4593   xorl(result, result); // return false
4594 
4595   // That's it
4596   bind(DONE);
4597   if (UseAVX >= 2) {
4598     // clean upper bits of YMM registers
4599     vpxor(vec1, vec1);
4600     vpxor(vec2, vec2);
4601   }
4602 }
4603 
4604 #ifdef _LP64
4605 
4606 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4607 #define __ masm.
4608   Register dst = stub.data<0>();
4609   XMMRegister src = stub.data<1>();
4610   address target = stub.data<2>();
4611   __ bind(stub.entry());
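       // Pass the offending value to the fixup stub on the stack; the stub is expected
       // to leave the corrected integer result in the same slot, which is popped into dst.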
4612   __ subptr(rsp, 8);
4613   __ movdbl(Address(rsp), src);
4614   __ call(RuntimeAddress(target));
4615   __ pop(dst);
4616   __ jmp(stub.continuation());
4617 #undef __
4618 }
4619 
4620 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4621   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4622   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4623 
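       // cvttss2si/cvttsd2si produce the 'integer indefinite' value (0x80000000 for
       // 32-bit, 0x8000000000000000 for 64-bit results) when the input is NaN or out of
       // range; comparing against that sentinel routes such inputs to the fixup stub.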
4624   address slowpath_target;
4625   if (dst_bt == T_INT) {
4626     if (src_bt == T_FLOAT) {
4627       cvttss2sil(dst, src);
4628       cmpl(dst, 0x80000000);
4629       slowpath_target = StubRoutines::x86::f2i_fixup();
4630     } else {
4631       cvttsd2sil(dst, src);
4632       cmpl(dst, 0x80000000);
4633       slowpath_target = StubRoutines::x86::d2i_fixup();
4634     }
4635   } else {
4636     if (src_bt == T_FLOAT) {
4637       cvttss2siq(dst, src);
4638       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4639       slowpath_target = StubRoutines::x86::f2l_fixup();
4640     } else {
4641       cvttsd2siq(dst, src);
4642       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4643       slowpath_target = StubRoutines::x86::d2l_fixup();
4644     }
4645   }
4646 
4647   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath);
4648   jcc(Assembler::equal, stub->entry());
4649   bind(stub->continuation());
4650 }
4651 
4652 #endif // _LP64
4653 
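     // Dispatch a C2 ideal vector opcode to the corresponding EVEX-encoded, k-masked
     // instruction; 'merge' selects merge-masking (masked-off lanes of dst are
     // preserved) versus zero-masking.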
4654 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4655                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4656   switch(ideal_opc) {
4657     case Op_LShiftVS:
4658       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4659     case Op_LShiftVI:
4660       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4661     case Op_LShiftVL:
4662       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4663     case Op_RShiftVS:
4664       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4665     case Op_RShiftVI:
4666       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4667     case Op_RShiftVL:
4668       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4669     case Op_URShiftVS:
4670       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4671     case Op_URShiftVI:
4672       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4673     case Op_URShiftVL:
4674       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4675     case Op_RotateRightV:
4676       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4677     case Op_RotateLeftV:
4678       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4679     default:
4680       fatal("Unsupported masked operation"); break;
4681   }
4682 }
4683 
4684 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4685                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4686                                     bool is_varshift) {
4687   switch (ideal_opc) {
4688     case Op_AddVB:
4689       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4690     case Op_AddVS:
4691       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4692     case Op_AddVI:
4693       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4694     case Op_AddVL:
4695       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4696     case Op_AddVF:
4697       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4698     case Op_AddVD:
4699       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4700     case Op_SubVB:
4701       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4702     case Op_SubVS:
4703       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4704     case Op_SubVI:
4705       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4706     case Op_SubVL:
4707       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4708     case Op_SubVF:
4709       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4710     case Op_SubVD:
4711       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4712     case Op_MulVS:
4713       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4714     case Op_MulVI:
4715       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4716     case Op_MulVL:
4717       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4718     case Op_MulVF:
4719       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4720     case Op_MulVD:
4721       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4722     case Op_DivVF:
4723       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4724     case Op_DivVD:
4725       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4726     case Op_SqrtVF:
4727       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4728     case Op_SqrtVD:
4729       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4730     case Op_AbsVB:
4731       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4732     case Op_AbsVS:
4733       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4734     case Op_AbsVI:
4735       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4736     case Op_AbsVL:
4737       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4738     case Op_FmaVF:
4739       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4740     case Op_FmaVD:
4741       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4742     case Op_VectorRearrange:
4743       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4744     case Op_LShiftVS:
4745       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4746     case Op_LShiftVI:
4747       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4748     case Op_LShiftVL:
4749       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4750     case Op_RShiftVS:
4751       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4752     case Op_RShiftVI:
4753       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4754     case Op_RShiftVL:
4755       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4756     case Op_URShiftVS:
4757       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4758     case Op_URShiftVI:
4759       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4760     case Op_URShiftVL:
4761       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4762     case Op_RotateLeftV:
4763       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4764     case Op_RotateRightV:
4765       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4766     case Op_MaxV:
4767       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4768     case Op_MinV:
4769       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4770     case Op_XorV:
4771       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4772     case Op_OrV:
4773       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4774     case Op_AndV:
4775       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4776     default:
4777       fatal("Unsupported masked operation"); break;
4778   }
4779 }
4780 
4781 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4782                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4783   switch (ideal_opc) {
4784     case Op_AddVB:
4785       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4786     case Op_AddVS:
4787       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4788     case Op_AddVI:
4789       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4790     case Op_AddVL:
4791       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4792     case Op_AddVF:
4793       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4794     case Op_AddVD:
4795       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4796     case Op_SubVB:
4797       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4798     case Op_SubVS:
4799       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4800     case Op_SubVI:
4801       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4802     case Op_SubVL:
4803       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4804     case Op_SubVF:
4805       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4806     case Op_SubVD:
4807       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4808     case Op_MulVS:
4809       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4810     case Op_MulVI:
4811       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4812     case Op_MulVL:
4813       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4814     case Op_MulVF:
4815       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4816     case Op_MulVD:
4817       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4818     case Op_DivVF:
4819       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4820     case Op_DivVD:
4821       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4822     case Op_FmaVF:
4823       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4824     case Op_FmaVD:
4825       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4826     case Op_MaxV:
4827       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4828     case Op_MinV:
4829       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4830     case Op_XorV:
4831       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4832     case Op_OrV:
4833       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4834     case Op_AndV:
4835       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4836     default:
4837       fatal("Unsupported masked operation"); break;
4838   }
4839 }
4840 
4841 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4842                                   KRegister src1, KRegister src2) {
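       // Map the mask length to the element type that the kand/kor/kxor helpers use to
       // pick the k-register instruction width (byte form for masks of up to 8 bits,
       // then word/dword/qword forms for 16/32/64-bit masks).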
4843   BasicType etype = T_ILLEGAL;
4844   switch(mask_len) {
4845     case 2:
4846     case 4:
4847     case 8:  etype = T_BYTE; break;
4848     case 16: etype = T_SHORT; break;
4849     case 32: etype = T_INT; break;
4850     case 64: etype = T_LONG; break;
4851     default: fatal("Unsupported type"); break;
4852   }
4853   assert(etype != T_ILLEGAL, "");
4854   switch(ideal_opc) {
4855     case Op_AndVMask:
4856       kand(etype, dst, src1, src2); break;
4857     case Op_OrVMask:
4858       kor(etype, dst, src1, src2); break;
4859     case Op_XorVMask:
4860       kxor(etype, dst, src1, src2); break;
4861     default:
4862       fatal("Unsupported masked operation"); break;
4863   }
4864 }
4865 
4866 /*
4867  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the cast operation.
4868  * If src is NaN, the result is 0.
4869  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4870  * the result is equal to the value of Integer.MIN_VALUE.
4871  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4872  * the result is equal to the value of Integer.MAX_VALUE.
4873  */
4874 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4875                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4876                                                                    Register rscratch, AddressLiteral float_sign_flip,
4877                                                                    int vec_enc) {
4878   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4879   Label done;
4880   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4881   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4882   vptest(xtmp2, xtmp2, vec_enc);
4883   jccb(Assembler::equal, done);
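       // Fast path: no destination lane holds the 0x80000000 sentinel, so there are no
       // special values to patch up.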
4884 
4885   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4886   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
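       // xtmp1 = float_sign_flip ^ ~0, i.e. 0x7FFFFFFF (Integer.MAX_VALUE) per lane,
       // assuming float_sign_flip holds 0x80000000 in every lane.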
4887 
4888   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4889   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4890   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4891 
4892   // Recompute the mask for remaining special value.
4893   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4894   // Extract SRC values corresponding to TRUE mask lanes.
4895   vpand(xtmp4, xtmp2, src, vec_enc);
4896   // Flip mask bits so that the MSB of MASK lanes corresponding to +ve special
4897   // values is set.
4898   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4899 
4900   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4901   bind(done);
4902 }
4903 
4904 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4905                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4906                                                                     Register rscratch, AddressLiteral float_sign_flip,
4907                                                                     int vec_enc) {
4908   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4909   Label done;
4910   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4911   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4912   kortestwl(ktmp1, ktmp1);
4913   jccb(Assembler::equal, done);
4914 
4915   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4916   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4917   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4918 
4919   kxorwl(ktmp1, ktmp1, ktmp2);
4920   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4921   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4922   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4923   bind(done);
4924 }
4925 
4926 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4927                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4928                                                                      Register rscratch, AddressLiteral double_sign_flip,
4929                                                                      int vec_enc) {
4930   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4931 
4932   Label done;
4933   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4934   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
4935   kortestwl(ktmp1, ktmp1);
4936   jccb(Assembler::equal, done);
4937 
4938   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4939   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4940   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4941 
4942   kxorwl(ktmp1, ktmp1, ktmp2);
4943   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4944   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4945   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4946   bind(done);
4947 }
4948 
4949 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4950                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4951                                                                      Register rscratch, AddressLiteral float_sign_flip,
4952                                                                      int vec_enc) {
4953   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4954   Label done;
4955   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4956   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4957   kortestwl(ktmp1, ktmp1);
4958   jccb(Assembler::equal, done);
4959 
4960   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4961   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4962   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4963 
4964   kxorwl(ktmp1, ktmp1, ktmp2);
4965   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4966   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4967   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4968   bind(done);
4969 }
4970 
4971 /*
4972  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the cast operation.
4973  * If src is NaN, the result is 0.
4974  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
4975  * the result is equal to the value of Long.MIN_VALUE.
4976  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
4977  * the result is equal to the value of Long.MAX_VALUE.
4978  */
4979 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4980                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4981                                                                       Register rscratch, AddressLiteral double_sign_flip,
4982                                                                       int vec_enc) {
4983   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4984 
4985   Label done;
4986   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4987   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
4988   kortestwl(ktmp1, ktmp1);
4989   jccb(Assembler::equal, done);
4990 
4991   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4992   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4993   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4994 
4995   kxorwl(ktmp1, ktmp1, ktmp2);
4996   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4997   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4998   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4999   bind(done);
5000 }
5001 
5002 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5003                                                              XMMRegister xtmp, int index, int vec_enc) {
5004    assert(vec_enc < Assembler::AVX_512bit, "");
5005    if (vec_enc == Assembler::AVX_256bit) {
5006      vextractf128_high(xtmp, src);
5007      vshufps(dst, src, xtmp, index, vec_enc);
5008    } else {
5009      vshufps(dst, src, zero, index, vec_enc);
5010    }
5011 }
5012 
5013 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5014                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
5015                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
5016   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5017 
5018   Label done;
5019   // Compare the destination lanes with float_sign_flip
5020   // value to get mask for all special values.
5021   movdqu(xtmp1, float_sign_flip, rscratch);
5022   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
5023   ptest(xtmp2, xtmp2);
5024   jccb(Assembler::equal, done);
5025 
5026   // Flip float_sign_flip to get max integer value.
5027   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
5028   pxor(xtmp1, xtmp4);
5029 
5030   // Set destination lanes corresponding to unordered source lanes to zero.
5031   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
5032   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
5033 
5034   // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5035   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5036   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
5037 
5038   // Recompute the mask for remaining special value.
5039   pxor(xtmp2, xtmp3);
5040   // Extract mask corresponding to non-negative source lanes.
5041   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
5042 
5043   // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
5044   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5045   pand(xtmp3, xtmp2);
5046 
5047   // Replace destination lanes holding the special value (0x80000000) with max int
5048   // if the corresponding source lane holds a +ve value.
5049   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
5050   bind(done);
5051 }
5052 
5053 
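     // Narrow packed ints in dst to short or byte lanes: first mask off the upper bits
     // (via the vector_int_to_short/byte_mask constants) so the unsigned saturating
     // packs below never actually saturate, then pack and, for 256-bit vectors, fix up
     // the 128-bit lane crossing.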
5054 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
5055                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
5056   switch(to_elem_bt) {
5057     case T_SHORT:
5058       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
5059       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
5060       vpackusdw(dst, dst, zero, vec_enc);
5061       if (vec_enc == Assembler::AVX_256bit) {
5062         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5063       }
5064       break;
5065     case  T_BYTE:
5066       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
5067       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
5068       vpackusdw(dst, dst, zero, vec_enc);
5069       if (vec_enc == Assembler::AVX_256bit) {
5070         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5071       }
5072       vpackuswb(dst, dst, zero, vec_enc);
5073       break;
5074     default: assert(false, "%s", type2name(to_elem_bt));
5075   }
5076 }
5077 
5078 /*
5079  * Algorithm for vector D2L and F2I conversions:
5080  * a) Perform the vector D2L/F2I cast.
5081  * b) Take the fast path if no result vector lane contains the 0x80000000 value,
5082  *    which would signify that the source value could be one of the special floating
5083  *    point values (NaN, -Inf, Inf, Max, -Min).
5084  * c) Set the destination to zero if the source is NaN.
5085  * d) Replace 0x80000000 with MaxInt if the source lane contains a +ve value.
5086  */
5087 
5088 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5089                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5090                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5091   int to_elem_sz = type2aelembytes(to_elem_bt);
5092   assert(to_elem_sz <= 4, "");
5093   vcvttps2dq(dst, src, vec_enc);
5094   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5095   if (to_elem_sz < 4) {
5096     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5097     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5098   }
5099 }
5100 
5101 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5102                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5103                                             Register rscratch, int vec_enc) {
5104   int to_elem_sz = type2aelembytes(to_elem_bt);
5105   assert(to_elem_sz <= 4, "");
5106   vcvttps2dq(dst, src, vec_enc);
5107   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5108   switch(to_elem_bt) {
5109     case T_INT:
5110       break;
5111     case T_SHORT:
5112       evpmovdw(dst, dst, vec_enc);
5113       break;
5114     case T_BYTE:
5115       evpmovdb(dst, dst, vec_enc);
5116       break;
5117     default: assert(false, "%s", type2name(to_elem_bt));
5118   }
5119 }
5120 
5121 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5122                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
5123                                             Register rscratch, int vec_enc) {
5124   evcvttps2qq(dst, src, vec_enc);
5125   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
5126 }
5127 
5128 // Handling for downcasting from double to integer or sub-word types on AVX2.
5129 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5130                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5131                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5132   int to_elem_sz = type2aelembytes(to_elem_bt);
5133   assert(to_elem_sz < 8, "");
5134   vcvttpd2dq(dst, src, vec_enc);
5135   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5136                                               float_sign_flip, vec_enc);
5137   if (to_elem_sz < 4) {
5138     // xtmp4 holds all zero lanes.
5139     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5140   }
5141 }
5142 
5143 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
5144                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
5145                                             KRegister ktmp2, AddressLiteral sign_flip,
5146                                             Register rscratch, int vec_enc) {
5147   if (VM_Version::supports_avx512dq()) {
5148     evcvttpd2qq(dst, src, vec_enc);
5149     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5150     switch(to_elem_bt) {
5151       case T_LONG:
5152         break;
5153       case T_INT:
5154         evpmovsqd(dst, dst, vec_enc);
5155         break;
5156       case T_SHORT:
5157         evpmovsqd(dst, dst, vec_enc);
5158         evpmovdw(dst, dst, vec_enc);
5159         break;
5160       case T_BYTE:
5161         evpmovsqd(dst, dst, vec_enc);
5162         evpmovdb(dst, dst, vec_enc);
5163         break;
5164       default: assert(false, "%s", type2name(to_elem_bt));
5165     }
5166   } else {
5167     assert(type2aelembytes(to_elem_bt) <= 4, "");
5168     vcvttpd2dq(dst, src, vec_enc);
5169     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5170     switch(to_elem_bt) {
5171       case T_INT:
5172         break;
5173       case T_SHORT:
5174         evpmovdw(dst, dst, vec_enc);
5175         break;
5176       case T_BYTE:
5177         evpmovdb(dst, dst, vec_enc);
5178         break;
5179       default: assert(false, "%s", type2name(to_elem_bt));
5180     }
5181   }
5182 }
5183 
5184 #ifdef _LP64
5185 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5186                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5187                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5188   // Perform the floor(val+0.5) operation with MXCSR.RC set to round towards -inf,
5189   // and restore the original MXCSR.RC mode afterwards.
5190   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5191 
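       // With RC = round towards -inf, the conversion below computes floor(src + 0.5),
       // which matches the round-half-up semantics of Math.round.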
5192   mov64(tmp, julong_cast(0.5L));
5193   evpbroadcastq(xtmp1, tmp, vec_enc);
5194   vaddpd(xtmp1, src , xtmp1, vec_enc);
5195   evcvtpd2qq(dst, xtmp1, vec_enc);
5196   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5197                                                 double_sign_flip, vec_enc);
5198 
5199   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5200 }
5201 
5202 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
5203                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5204                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5205   // Perform the floor(val+0.5) operation with MXCSR.RC set to round towards -inf,
5206   // and restore the original MXCSR.RC mode afterwards.
5207   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5208 
5209   movl(tmp, jint_cast(0.5));
5210   movq(xtmp1, tmp);
5211   vbroadcastss(xtmp1, xtmp1, vec_enc);
5212   vaddps(xtmp1, src , xtmp1, vec_enc);
5213   vcvtps2dq(dst, xtmp1, vec_enc);
5214   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5215                                               float_sign_flip, vec_enc);
5216 
5217   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5218 }
5219 
5220 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
5221                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5222                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
5223   // Perform the floor(val+0.5) operation with MXCSR.RC set to round towards -inf,
5224   // and restore the original MXCSR.RC mode afterwards.
5225   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5226 
5227   movl(tmp, jint_cast(0.5));
5228   movq(xtmp1, tmp);
5229   vbroadcastss(xtmp1, xtmp1, vec_enc);
5230   vaddps(xtmp1, src , xtmp1, vec_enc);
5231   vcvtps2dq(dst, xtmp1, vec_enc);
5232   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
5233 
5234   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5235 }
5236 #endif // _LP64
5237 
5238 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5239                                              BasicType from_elem_bt, BasicType to_elem_bt) {
5240   switch (from_elem_bt) {
5241     case T_BYTE:
5242       switch (to_elem_bt) {
5243         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5244         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
5245         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
5246         default: ShouldNotReachHere();
5247       }
5248       break;
5249     case T_SHORT:
5250       switch (to_elem_bt) {
5251         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
5252         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5253         default: ShouldNotReachHere();
5254       }
5255       break;
5256     case T_INT:
5257       assert(to_elem_bt == T_LONG, "");
5258       vpmovzxdq(dst, src, vlen_enc);
5259       break;
5260     default:
5261       ShouldNotReachHere();
5262   }
5263 }
5264 
5265 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5266                                            BasicType from_elem_bt, BasicType to_elem_bt) {
5267   switch (from_elem_bt) {
5268     case T_BYTE:
5269       switch (to_elem_bt) {
5270         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5271         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
5272         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
5273         default: ShouldNotReachHere();
5274       }
5275       break;
5276     case T_SHORT:
5277       switch (to_elem_bt) {
5278         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
5279         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5280         default: ShouldNotReachHere();
5281       }
5282       break;
5283     case T_INT:
5284       assert(to_elem_bt == T_LONG, "");
5285       vpmovsxdq(dst, src, vlen_enc);
5286       break;
5287     default:
5288       ShouldNotReachHere();
5289   }
5290 }
5291 
5292 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5293                                          BasicType dst_bt, BasicType src_bt, int vlen) {
5294   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5295   assert(vlen_enc != AVX_512bit, "");
5296 
5297   int dst_bt_size = type2aelembytes(dst_bt);
5298   int src_bt_size = type2aelembytes(src_bt);
5299   if (dst_bt_size > src_bt_size) {
5300     switch (dst_bt_size / src_bt_size) {
5301       case 2: vpmovsxbw(dst, src, vlen_enc); break;
5302       case 4: vpmovsxbd(dst, src, vlen_enc); break;
5303       case 8: vpmovsxbq(dst, src, vlen_enc); break;
5304       default: ShouldNotReachHere();
5305     }
5306   } else {
5307     assert(dst_bt_size < src_bt_size, "");
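         // Mask lanes are either all-zero or all-ones, so the signed saturating packs
         // below preserve them exactly while narrowing; vpermq gathers the valid low
         // halves of the 128-bit lanes for 256-bit vectors.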
5308     switch (src_bt_size / dst_bt_size) {
5309       case 2: {
5310         if (vlen_enc == AVX_128bit) {
5311           vpacksswb(dst, src, src, vlen_enc);
5312         } else {
5313           vpacksswb(dst, src, src, vlen_enc);
5314           vpermq(dst, dst, 0x08, vlen_enc);
5315         }
5316         break;
5317       }
5318       case 4: {
5319         if (vlen_enc == AVX_128bit) {
5320           vpackssdw(dst, src, src, vlen_enc);
5321           vpacksswb(dst, dst, dst, vlen_enc);
5322         } else {
5323           vpackssdw(dst, src, src, vlen_enc);
5324           vpermq(dst, dst, 0x08, vlen_enc);
5325           vpacksswb(dst, dst, dst, AVX_128bit);
5326         }
5327         break;
5328       }
5329       case 8: {
5330         if (vlen_enc == AVX_128bit) {
5331           vpshufd(dst, src, 0x08, vlen_enc);
5332           vpackssdw(dst, dst, dst, vlen_enc);
5333           vpacksswb(dst, dst, dst, vlen_enc);
5334         } else {
5335           vpshufd(dst, src, 0x08, vlen_enc);
5336           vpermq(dst, dst, 0x08, vlen_enc);
5337           vpackssdw(dst, dst, dst, AVX_128bit);
5338           vpacksswb(dst, dst, dst, AVX_128bit);
5339         }
5340         break;
5341       }
5342       default: ShouldNotReachHere();
5343     }
5344   }
5345 }
5346 
5347 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5348                                    bool merge, BasicType bt, int vlen_enc) {
5349   if (bt == T_INT) {
5350     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5351   } else {
5352     assert(bt == T_LONG, "");
5353     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5354   }
5355 }
5356 
5357 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5358                                    bool merge, BasicType bt, int vlen_enc) {
5359   if (bt == T_INT) {
5360     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5361   } else {
5362     assert(bt == T_LONG, "");
5363     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5364   }
5365 }
5366 
5367 #ifdef _LP64
5368 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5369                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5370                                                int vec_enc) {
5371   int index = 0;
5372   int vindex = 0;
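       // pdepq deposits the low mask bits of src at the bit positions set in
       // 0x0101010101010101, turning each of the low 8 mask bits into a 0x00/0x01 byte lane.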
5373   mov64(rtmp1, 0x0101010101010101L);
5374   pdepq(rtmp1, src, rtmp1);
5375   if (mask_len > 8) {
5376     movq(rtmp2, src);
5377     vpxor(xtmp, xtmp, xtmp, vec_enc);
5378     movq(xtmp, rtmp1);
5379   }
5380   movq(dst, rtmp1);
5381 
5382   mask_len -= 8;
5383   while (mask_len > 0) {
5384     assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
5385     index++;
5386     if ((index % 2) == 0) {
5387       pxor(xtmp, xtmp);
5388     }
5389     mov64(rtmp1, 0x0101010101010101L);
5390     shrq(rtmp2, 8);
5391     pdepq(rtmp1, rtmp2, rtmp1);
5392     pinsrq(xtmp, rtmp1, index % 2);
5393     vindex = index / 2;
5394     if (vindex) {
5395       // Write the entire 16-byte vector only when both 64-bit
5396       // lanes have been updated, to save redundant instructions.
5397       if (index % 2) {
5398         vinsertf128(dst, dst, xtmp, vindex);
5399       }
5400     } else {
5401       vmovdqu(dst, xtmp);
5402     }
5403     mask_len -= 8;
5404   }
5405 }
5406 
5407 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5408   switch(opc) {
5409     case Op_VectorMaskTrueCount:
5410       popcntq(dst, tmp);
5411       break;
5412     case Op_VectorMaskLastTrue:
5413       if (VM_Version::supports_lzcnt()) {
5414         lzcntq(tmp, tmp);
5415         movl(dst, 63);
5416         subl(dst, tmp);
5417       } else {
5418         movl(dst, -1);
5419         bsrq(tmp, tmp);
5420         cmov32(Assembler::notZero, dst, tmp);
5421       }
5422       break;
5423     case Op_VectorMaskFirstTrue:
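           // Returns masklen when no mask bit is set: for partial masks a sentinel bit is
           // or'ed in at position masklen, while for full 32/64 bit masks tzcnt of a zero
           // input (or the movl/cmov fallback) already yields masklen.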
5424       if (VM_Version::supports_bmi1()) {
5425         if (masklen < 32) {
5426           orl(tmp, 1 << masklen);
5427           tzcntl(dst, tmp);
5428         } else if (masklen == 32) {
5429           tzcntl(dst, tmp);
5430         } else {
5431           assert(masklen == 64, "");
5432           tzcntq(dst, tmp);
5433         }
5434       } else {
5435         if (masklen < 32) {
5436           orl(tmp, 1 << masklen);
5437           bsfl(dst, tmp);
5438         } else {
5439           assert(masklen == 32 || masklen == 64, "");
5440           movl(dst, masklen);
5441           if (masklen == 32)  {
5442             bsfl(tmp, tmp);
5443           } else {
5444             bsfq(tmp, tmp);
5445           }
5446           cmov32(Assembler::notZero, dst, tmp);
5447         }
5448       }
5449       break;
5450     case Op_VectorMaskToLong:
5451       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5452       break;
5453     default: assert(false, "Unhandled mask operation");
5454   }
5455 }
5456 
5457 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5458                                               int masklen, int masksize, int vec_enc) {
5459   assert(VM_Version::supports_popcnt(), "");
5460 
5461   if (VM_Version::supports_avx512bw()) {
5462     kmovql(tmp, mask);
5463   } else {
5464     assert(masklen <= 16, "");
5465     kmovwl(tmp, mask);
5466   }
5467 
5468   // A mask generated by partial vector comparison/replicate/mask manipulation
5469   // operations needs to be clipped.
5470   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5471     andq(tmp, (1 << masklen) - 1);
5472   }
5473 
5474   vector_mask_operation_helper(opc, dst, tmp, masklen);
5475 }
5476 
5477 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5478                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5479   assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5480          (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5481   assert(VM_Version::supports_popcnt(), "");
5482 
5483   bool need_clip = false;
5484   switch(bt) {
5485     case T_BOOLEAN:
5486       // While masks of other types contain lane values of 0 or -1, boolean masks contain lane values of 0 or 1.
5487       vpxor(xtmp, xtmp, xtmp, vec_enc);
5488       vpsubb(xtmp, xtmp, mask, vec_enc);
5489       vpmovmskb(tmp, xtmp, vec_enc);
5490       need_clip = masklen < 16;
5491       break;
5492     case T_BYTE:
5493       vpmovmskb(tmp, mask, vec_enc);
5494       need_clip = masklen < 16;
5495       break;
5496     case T_SHORT:
5497       vpacksswb(xtmp, mask, mask, vec_enc);
5498       if (masklen >= 16) {
5499         vpermpd(xtmp, xtmp, 8, vec_enc);
5500       }
5501       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5502       need_clip = masklen < 16;
5503       break;
5504     case T_INT:
5505     case T_FLOAT:
5506       vmovmskps(tmp, mask, vec_enc);
5507       need_clip = masklen < 4;
5508       break;
5509     case T_LONG:
5510     case T_DOUBLE:
5511       vmovmskpd(tmp, mask, vec_enc);
5512       need_clip = masklen < 2;
5513       break;
5514     default: assert(false, "Unhandled type, %s", type2name(bt));
5515   }
5516 
5517   // A mask generated by partial vector comparison/replicate/mask manipulation
5518   // operations needs to be clipped.
5519   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5520     // need_clip implies masklen < 32
5521     andq(tmp, (1 << masklen) - 1);
5522   }
5523 
5524   vector_mask_operation_helper(opc, dst, tmp, masklen);
5525 }
5526 
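     // Produces the mask of a compressed vector: PEXT of an all-ones word with the (clipped)
     // source mask packs one set bit per true lane into the low bits, so dst ends up with
     // popcount(src) contiguous set bits starting at bit 0.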
5527 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5528                                              Register rtmp2, int mask_len) {
5529   kmov(rtmp1, src);
5530   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5531   mov64(rtmp2, -1L);
5532   pextq(rtmp2, rtmp2, rtmp1);
5533   kmov(dst, rtmp2);
5534 }
5535 
5536 #ifdef _LP64
5537 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5538                                                     XMMRegister mask, Register rtmp, Register rscratch,
5539                                                     XMMRegister permv, XMMRegister xtmp, BasicType bt,
5540                                                     int vec_enc) {
5541   assert(type2aelembytes(bt) >= 4, "");
5542   assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5543   address compress_perm_table = nullptr;
5544   address expand_perm_table = nullptr;
5545   if (type2aelembytes(bt) == 8) {
5546     compress_perm_table = StubRoutines::x86::compress_perm_table64();
5547     expand_perm_table  = StubRoutines::x86::expand_perm_table64();
5548     vmovmskpd(rtmp, mask, vec_enc);
5549   } else {
5550     compress_perm_table = StubRoutines::x86::compress_perm_table32();
5551     expand_perm_table = StubRoutines::x86::expand_perm_table32();
5552     vmovmskps(rtmp, mask, vec_enc);
5553   }
5554   shlq(rtmp, 5); // for 32 byte permute row.
5555   if (opcode == Op_CompressV) {
5556     lea(rscratch, ExternalAddress(compress_perm_table));
5557   } else {
5558     lea(rscratch, ExternalAddress(expand_perm_table));
5559   }
5560   addptr(rtmp, rscratch);
5561   vmovdqu(permv, Address(rtmp));
5562   vpermps(dst, permv, src, Assembler::AVX_256bit);
5563   vpxor(xtmp, xtmp, xtmp, vec_enc);
5564   // Blend the result with a zero vector using the permute mask: each column entry
5565   // in a permute table row contains either a valid permute index or a -1 (default)
5566   // value, so the row can also serve as a blending mask after
5567   // compressing/expanding the source vector lanes.
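       // Illustration (assuming the row layout described above): a 32 bit lane compress with
       // mask 0b0101 uses a row beginning [0, 2, -1, -1, ...], so source lanes 0 and 2 are
       // packed into the low lanes of dst and the remaining lanes, flagged by -1 (sign bit
       // set), are zeroed by the blend below.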
5568   vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv);
5569 }
5570 #endif
5571 
5572 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5573                                                bool merge, BasicType bt, int vec_enc) {
5574   if (opcode == Op_CompressV) {
5575     switch(bt) {
5576     case T_BYTE:
5577       evpcompressb(dst, mask, src, merge, vec_enc);
5578       break;
5579     case T_CHAR:
5580     case T_SHORT:
5581       evpcompressw(dst, mask, src, merge, vec_enc);
5582       break;
5583     case T_INT:
5584       evpcompressd(dst, mask, src, merge, vec_enc);
5585       break;
5586     case T_FLOAT:
5587       evcompressps(dst, mask, src, merge, vec_enc);
5588       break;
5589     case T_LONG:
5590       evpcompressq(dst, mask, src, merge, vec_enc);
5591       break;
5592     case T_DOUBLE:
5593       evcompresspd(dst, mask, src, merge, vec_enc);
5594       break;
5595     default:
5596       fatal("Unsupported type %s", type2name(bt));
5597       break;
5598     }
5599   } else {
5600     assert(opcode == Op_ExpandV, "");
5601     switch(bt) {
5602     case T_BYTE:
5603       evpexpandb(dst, mask, src, merge, vec_enc);
5604       break;
5605     case T_CHAR:
5606     case T_SHORT:
5607       evpexpandw(dst, mask, src, merge, vec_enc);
5608       break;
5609     case T_INT:
5610       evpexpandd(dst, mask, src, merge, vec_enc);
5611       break;
5612     case T_FLOAT:
5613       evexpandps(dst, mask, src, merge, vec_enc);
5614       break;
5615     case T_LONG:
5616       evpexpandq(dst, mask, src, merge, vec_enc);
5617       break;
5618     case T_DOUBLE:
5619       evexpandpd(dst, mask, src, merge, vec_enc);
5620       break;
5621     default:
5622       fatal("Unsupported type %s", type2name(bt));
5623       break;
5624     }
5625   }
5626 }
5627 #endif
5628 
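     // Vector signum (Op_SignumVF/VD): each lane becomes 1.0 if the source lane is positive,
     // -1.0 if it is negative, and keeps the source value if it is +/-0.0 or NaN, matching
     // the semantics of Math.signum.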
5629 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5630                                            KRegister ktmp1, int vec_enc) {
5631   if (opcode == Op_SignumVD) {
5632     vsubpd(dst, zero, one, vec_enc);
5633     // dst = (src < 0) ? -1 : 1
5634     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5635     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
5636     // If src is NaN, -0.0 or 0.0, return src.
5637     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5638     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5639   } else {
5640     assert(opcode == Op_SignumVF, "");
5641     vsubps(dst, zero, one, vec_enc);
5642     // dst = (src < 0) ? -1 : 1
5643     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5644     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
5645     // If src is NaN, -0.0 or 0.0, return src.
5646     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5647     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5648   }
5649 }
5650 
5651 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5652                                           XMMRegister xtmp1, int vec_enc) {
5653   if (opcode == Op_SignumVD) {
5654     vsubpd(dst, zero, one, vec_enc);
5655     // dst = (src < 0) ? -1 : 1
5656     vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
5657     // If src is NaN, -0.0 or 0.0, return src.
5658     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5659     vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5660   } else {
5661     assert(opcode == Op_SignumVF, "");
5662     vsubps(dst, zero, one, vec_enc);
5663     // dst = (src < 0) ? -1 : 1
5664     vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
5665     // If src is NaN, -0.0 or 0.0, return src.
5666     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5667     vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5668   }
5669 }
5670 
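     // Materializes a k-mask whose low mask_len bits replicate the scalar condition held in
     // src (expected to be 0 or all ones, as produced by Op_MaskAll); for partial masks of 32
     // lanes or fewer the moved value is shifted right so that exactly mask_len mask bits remain.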
5671 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5672   if (VM_Version::supports_avx512bw()) {
5673     if (mask_len > 32) {
5674       kmovql(dst, src);
5675     } else {
5676       kmovdl(dst, src);
5677       if (mask_len != 32) {
5678         kshiftrdl(dst, dst, 32 - mask_len);
5679       }
5680     }
5681   } else {
5682     assert(mask_len <= 16, "");
5683     kmovwl(dst, src);
5684     if (mask_len != 16) {
5685       kshiftrwl(dst, dst, 16 - mask_len);
5686     }
5687   }
5688 }
5689 
5690 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5691   int lane_size = type2aelembytes(bt);
5692   bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
5693   if ((is_LP64 || lane_size < 8) &&
5694       ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5695        (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
5696     movptr(rtmp, imm32);
5697     switch(lane_size) {
5698       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5699       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5700       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5701       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5702       default : fatal("Unsupported lane size %d", lane_size); break;
5704     }
5705   } else {
5706     movptr(rtmp, imm32);
5707     LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
5708     switch(lane_size) {
5709       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5710       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5711       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5712       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5713       default : fatal("Unsupported lane size %d", lane_size); break;
5715     }
5716   }
5717 }
5718 
5719 //
5720 // Following is the lookup table based popcount computation algorithm:
5721 //       Index   Bit set count
5722 //     [ 0000 ->   0,
5723 //       0001 ->   1,
5724 //       0010 ->   1,
5725 //       0011 ->   2,
5726 //       0100 ->   1,
5727 //       0101 ->   2,
5728 //       0110 ->   2,
5729 //       0111 ->   3,
5730 //       1000 ->   1,
5731 //       1001 ->   2,
5732 //       1010 ->   2,
5733 //       1011 ->   3,
5734 //       1100 ->   2,
5735 //       1101 ->   3,
     //       1110 ->   3,
5736 //       1111 ->   4 ]
5737 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5738 //     shuffle indices for lookup table access.
5739 //  b. Right shift each byte of vector lane by 4 positions.
5740 //  c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
5741 //     shuffle indices for lookup table access.
5742 //  d. Add the bitset count of upper and lower 4 bits of each byte.
5743 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
5744 //     count of all the bytes of a quadword.
5745 //  f. Perform step e. for upper 128bit vector lane.
5746 //  g. Pack the bitset count of quadwords back to double word.
5747 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
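     //
     //  Worked example (illustrative): for the byte 0xB5 = 0b10110101 the lower nibble 0101
     //  indexes LUT[5] = 2 and the upper nibble 1011 indexes LUT[11] = 3, so step d. yields
     //  2 + 3 = 5 = popcount(0xB5).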
5748 
5749 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5750                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5751   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5752   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5753   vpsrlw(dst, src, 4, vec_enc);
5754   vpand(dst, dst, xtmp1, vec_enc);
5755   vpand(xtmp1, src, xtmp1, vec_enc);
5756   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5757   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5758   vpshufb(dst, xtmp2, dst, vec_enc);
5759   vpaddb(dst, dst, xtmp1, vec_enc);
5760 }
5761 
5762 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5763                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5764   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5765   // Following code is as per steps e,f,g and h of above algorithm.
5766   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5767   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5768   vpsadbw(dst, dst, xtmp2, vec_enc);
5769   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5770   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5771   vpackuswb(dst, xtmp1, dst, vec_enc);
5772 }
5773 
5774 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5775                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5776   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5777   // Add the popcount of upper and lower bytes of word.
5778   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5779   vpsrlw(dst, xtmp1, 8, vec_enc);
5780   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5781   vpaddw(dst, dst, xtmp1, vec_enc);
5782 }
5783 
5784 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5785                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5786   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5787   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5788   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5789 }
5790 
5791 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5792                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5793   switch(bt) {
5794     case T_LONG:
5795       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5796       break;
5797     case T_INT:
5798       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5799       break;
5800     case T_CHAR:
5801     case T_SHORT:
5802       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5803       break;
5804     case T_BYTE:
5805     case T_BOOLEAN:
5806       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5807       break;
5808     default:
5809       fatal("Unsupported type %s", type2name(bt));
5810       break;
5811   }
5812 }
5813 
5814 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5815                                                       KRegister mask, bool merge, int vec_enc) {
5816   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5817   switch(bt) {
5818     case T_LONG:
5819       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5820       evpopcntq(dst, mask, src, merge, vec_enc);
5821       break;
5822     case T_INT:
5823       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5824       evpopcntd(dst, mask, src, merge, vec_enc);
5825       break;
5826     case T_CHAR:
5827     case T_SHORT:
5828       assert(VM_Version::supports_avx512_bitalg(), "");
5829       evpopcntw(dst, mask, src, merge, vec_enc);
5830       break;
5831     case T_BYTE:
5832     case T_BOOLEAN:
5833       assert(VM_Version::supports_avx512_bitalg(), "");
5834       evpopcntb(dst, mask, src, merge, vec_enc);
5835       break;
5836     default:
5837       fatal("Unsupported type %s", type2name(bt));
5838       break;
5839   }
5840 }
5841 
5842 #ifndef _LP64
5843 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
5844   assert(VM_Version::supports_avx512bw(), "");
5845   kmovdl(tmp, src);
5846   kunpckdql(dst, tmp, tmp);
5847 }
5848 #endif
5849 
5850 // The bit reversal algorithm first reverses the bits of each byte, followed by
5851 // a byte level reversal for multi-byte primitive types (short/int/long).
5852 // The algorithm performs a lookup table access to get the reversed bit sequence
5853 // corresponding to a 4 bit value. Thus the reversed bit sequence of a byte
5854 // is obtained by swapping the reversed bit sequences of its upper and lower
5855 // nibbles.
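     // For example, for the byte 0x1E = 0b00011110 the lookup yields 0b0111 for the lower
     // nibble 1110 and 0b1000 for the upper nibble 0001; placing the reversed lower nibble in
     // the upper half and the reversed upper nibble in the lower half gives 0b01111000 = 0x78,
     // the bit reversed value of 0x1E.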
5856 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5857                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
5858   if (VM_Version::supports_avx512vlbw()) {
5859 
5860     // Get the reverse bit sequence of lower nibble of each byte.
5861     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5862     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5863     evpandq(dst, xtmp2, src, vec_enc);
5864     vpshufb(dst, xtmp1, dst, vec_enc);
5865     vpsllq(dst, dst, 4, vec_enc);
5866 
5867     // Get the reverse bit sequence of upper nibble of each byte.
5868     vpandn(xtmp2, xtmp2, src, vec_enc);
5869     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5870     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5871 
5872     // OR the left shifted reversed bit sequence of the lower nibble with the right
5873     // shifted reversed bit sequence of the upper nibble to obtain the reversed bit sequence of each byte.
5874     evporq(xtmp2, dst, xtmp2, vec_enc);
5875     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5876 
5877   } else if (vec_enc == Assembler::AVX_512bit) {
5878     // Shift based bit reversal.
5879     assert(bt == T_LONG || bt == T_INT, "");
5880 
5881     // Swap lower and upper nibble of each byte.
5882     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
5883 
5884     // Swap two least and most significant bits of each nibble.
5885     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
5886 
5887     // Swap adjacent pair of bits.
5888     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5889     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
5890 
5891     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5892     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
5893   } else {
5894     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
5895     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5896 
5897     // Get the reverse bit sequence of lower nibble of each byte.
5898     vpand(dst, xtmp2, src, vec_enc);
5899     vpshufb(dst, xtmp1, dst, vec_enc);
5900     vpsllq(dst, dst, 4, vec_enc);
5901 
5902     // Get the reverse bit sequence of upper nibble of each byte.
5903     vpandn(xtmp2, xtmp2, src, vec_enc);
5904     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5905     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5906 
5907     // OR the left shifted reversed bit sequence of the lower nibble with the right
5908     // shifted reversed bit sequence of the upper nibble to obtain the reversed bit sequence of each byte.
5909     vpor(xtmp2, dst, xtmp2, vec_enc);
5910     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5911   }
5912 }
5913 
5914 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
5915                                                 XMMRegister xtmp, Register rscratch) {
5916   assert(VM_Version::supports_gfni(), "");
5917   assert(rscratch != noreg || always_reachable(mask), "missing");
5918 
5919   // Galois field instruction based bit reversal, as per the following algorithm:
5920   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5921   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
5922   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
5923   vector_reverse_byte(bt, dst, xtmp, vec_enc);
5924 }
5925 
5926 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
5927                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
5928   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
5929   evpandq(dst, xtmp1, src, vec_enc);
5930   vpsllq(dst, dst, nbits, vec_enc);
5931   vpandn(xtmp1, xtmp1, src, vec_enc);
5932   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
5933   evporq(dst, dst, xtmp1, vec_enc);
5934 }
5935 
5936 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5937                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5938   // Shift based bit reversal.
5939   assert(VM_Version::supports_evex(), "");
5940   switch(bt) {
5941     case T_LONG:
5942       // Swap upper and lower double word of each quad word.
5943       evprorq(xtmp1, k0, src, 32, true, vec_enc);
5944       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
5945       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5946       break;
5947     case T_INT:
5948       // Swap upper and lower word of each double word.
5949       evprord(xtmp1, k0, src, 16, true, vec_enc);
5950       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5951       break;
5952     case T_CHAR:
5953     case T_SHORT:
5954       // Swap upper and lower byte of each word.
5955       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
5956       break;
5957     case T_BYTE:
5958       evmovdquq(dst, k0, src, true, vec_enc);
5959       break;
5960     default:
5961       fatal("Unsupported type %s", type2name(bt));
5962       break;
5963   }
5964 }
5965 
5966 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5967   if (bt == T_BYTE) {
5968     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
5969       evmovdquq(dst, k0, src, true, vec_enc);
5970     } else {
5971       vmovdqu(dst, src);
5972     }
5973     return;
5974   }
5975   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
5976   // pre-computed shuffle indices.
5977   switch(bt) {
5978     case T_LONG:
5979       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
5980       break;
5981     case T_INT:
5982       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
5983       break;
5984     case T_CHAR:
5985     case T_SHORT:
5986       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
5987       break;
5988     default:
5989       fatal("Unsupported type %s", type2name(bt));
5990       break;
5991   }
5992   vpshufb(dst, src, dst, vec_enc);
5993 }
5994 
5995 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5996                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5997                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
5998   assert(is_integral_type(bt), "");
5999   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
6000   assert(VM_Version::supports_avx512cd(), "");
6001   switch(bt) {
6002     case T_LONG:
6003       evplzcntq(dst, ktmp, src, merge, vec_enc);
6004       break;
6005     case T_INT:
6006       evplzcntd(dst, ktmp, src, merge, vec_enc);
6007       break;
6008     case T_SHORT:
6009       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
6010       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
6011       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
6012       vpunpckhwd(dst, xtmp1, src, vec_enc);
6013       evplzcntd(dst, ktmp, dst, merge, vec_enc);
6014       vpackusdw(dst, xtmp2, dst, vec_enc);
6015       break;
6016     case T_BYTE:
6017       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6018       // accessing the lookup table.
6019       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6020       // accessing the lookup table.
6021       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6022       assert(VM_Version::supports_avx512bw(), "");
6023       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
6024       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
6025       vpand(xtmp2, dst, src, vec_enc);
6026       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6027       vpsrlw(xtmp3, src, 4, vec_enc);
6028       vpand(xtmp3, dst, xtmp3, vec_enc);
6029       vpshufb(dst, xtmp1, xtmp3, vec_enc);
6030       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6031       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
6032       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
6033       break;
6034     default:
6035       fatal("Unsupported type %s", type2name(bt));
6036       break;
6037   }
6038 }
6039 
6040 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6041                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6042   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
6043   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6044   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6045   // accessing the lookup table.
6046   vpand(dst, xtmp2, src, vec_enc);
6047   vpshufb(dst, xtmp1, dst, vec_enc);
6048   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6049   // accessing the lookup table.
6050   vpsrlw(xtmp3, src, 4, vec_enc);
6051   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
6052   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
6053   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6054   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6055   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
6056   vpaddb(dst, dst, xtmp2, vec_enc);
6057   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
6058 }
6059 
6060 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6061                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6062   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6063   // Add zero counts of lower byte and upper byte of a word if
6064   // upper byte holds a zero value.
6065   vpsrlw(xtmp3, src, 8, vec_enc);
6066   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6067   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
6068   vpsllw(xtmp2, dst, 8, vec_enc);
6069   vpaddw(xtmp2, xtmp2, dst, vec_enc);
6070   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6071   vpsrlw(dst, dst, 8, vec_enc);
6072 }
6073 
6074 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6075                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
6076   // Since the IEEE 754 floating point format represents the mantissa in 1.x form,
6077   // the biased exponent can be used to compute the leading zero count as per the
6078   // following formula:
6079   // LZCNT = 31 - (biased_exp - 127)
6080   // Special handling is needed for zero, MAX_INT and negative source values.
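       // For example, src = 8 converts to 8.0f whose biased exponent is 130; the exponent
       // plus one is 4, so LZCNT = 32 - 4 = 28, matching lzcnt(8) for a 32 bit lane.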
6081 
6082   // Broadcast 0xFF
6083   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
6084   vpsrld(xtmp1, xtmp1, 24, vec_enc);
6085 
6086   // Extract biased exponent.
6087   vcvtdq2ps(dst, src, vec_enc);
6088   vpsrld(dst, dst, 23, vec_enc);
6089   vpand(dst, dst, xtmp1, vec_enc);
6090 
6091   // Broadcast 127.
6092   vpsrld(xtmp1, xtmp1, 1, vec_enc);
6093   // Exponent = biased_exp - 127
6094   vpsubd(dst, dst, xtmp1, vec_enc);
6095 
6096   // Exponent = Exponent  + 1
6097   vpsrld(xtmp3, xtmp1, 6, vec_enc);
6098   vpaddd(dst, dst, xtmp3, vec_enc);
6099 
6100   // Replace -ve exponent with zero, exponent is -ve when src
6101   // lane contains a zero value.
6102   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6103   vblendvps(dst, dst, xtmp2, dst, vec_enc);
6104 
6105   // Rematerialize broadcast 32.
6106   vpslld(xtmp1, xtmp3, 5, vec_enc);
6107   // Exponent is 32 if corresponding source lane contains max_int value.
6108   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
6109   // LZCNT = 32 - exponent
6110   vpsubd(dst, xtmp1, dst, vec_enc);
6111 
6112   // Replace LZCNT with a value 1 if corresponding source lane
6113   // contains max_int value.
6114   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
6115 
6116   // Replace biased_exp with 0 if source lane value is less than zero.
6117   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6118   vblendvps(dst, dst, xtmp2, src, vec_enc);
6119 }
6120 
6121 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6122                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6123   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6124   // Add zero counts of lower word and upper word of a double word if
6125   // upper word holds a zero value.
6126   vpsrld(xtmp3, src, 16, vec_enc);
6127   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6128   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
6129   vpslld(xtmp2, dst, 16, vec_enc);
6130   vpaddd(xtmp2, xtmp2, dst, vec_enc);
6131   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6132   vpsrld(dst, dst, 16, vec_enc);
6133   // Add zero counts of lower doubleword and upper doubleword of a
6134   // quadword if upper doubleword holds a zero value.
6135   vpsrlq(xtmp3, src, 32, vec_enc);
6136   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
6137   vpsllq(xtmp2, dst, 32, vec_enc);
6138   vpaddq(xtmp2, xtmp2, dst, vec_enc);
6139   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6140   vpsrlq(dst, dst, 32, vec_enc);
6141 }
6142 
6143 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6144                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6145                                                        Register rtmp, int vec_enc) {
6146   assert(is_integral_type(bt), "unexpected type");
6147   assert(vec_enc < Assembler::AVX_512bit, "");
6148   switch(bt) {
6149     case T_LONG:
6150       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6151       break;
6152     case T_INT:
6153       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6154       break;
6155     case T_SHORT:
6156       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6157       break;
6158     case T_BYTE:
6159       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6160       break;
6161     default:
6162       fatal("Unsupported type %s", type2name(bt));
6163       break;
6164   }
6165 }
6166 
6167 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6168   switch(bt) {
6169     case T_BYTE:
6170       vpsubb(dst, src1, src2, vec_enc);
6171       break;
6172     case T_SHORT:
6173       vpsubw(dst, src1, src2, vec_enc);
6174       break;
6175     case T_INT:
6176       vpsubd(dst, src1, src2, vec_enc);
6177       break;
6178     case T_LONG:
6179       vpsubq(dst, src1, src2, vec_enc);
6180       break;
6181     default:
6182       fatal("Unsupported type %s", type2name(bt));
6183       break;
6184   }
6185 }
6186 
6187 // Trailing zero count computation is based on the leading zero count operation as per
6188 // the following equation. All AVX3 targets support the AVX512CD feature, which offers
6189 // a direct vector instruction to compute the leading zero count.
6190 //      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
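     // For example, for a 32 bit lane with x = 8: (x - 1) & ~x = 0b0111, CLZ(0b0111) = 29 and
     // CTZ = 32 - 29 = 3; for x = 0 the expression is all ones, so CLZ = 0 and CTZ = 32.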
6191 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6192                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6193                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6194   assert(is_integral_type(bt), "");
6195   // xtmp = -1
6196   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6197   // xtmp = xtmp + src
6198   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6199   // xtmp = xtmp & ~src
6200   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6201   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6202   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6203   vpsub(bt, dst, xtmp4, dst, vec_enc);
6204 }
6205 
6206 // Trailing zero count computation for AVX2 targets is based on the popcount operation as per the following equation.
6207 //      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
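     // For example, for a 32 bit lane with x = 8: x | -x = 0xFFFFFFF8, POPC = 29 and
     // CTZ = 32 - 29 = 3; for x = 0 the expression is 0, so POPC = 0 and CTZ = 32.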
6208 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6209                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6210   assert(is_integral_type(bt), "");
6211   // xtmp = 0
6212   vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
6213   // xtmp = 0 - src
6214   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6215   // xtmp = xtmp | src
6216   vpor(xtmp3, xtmp3, src, vec_enc);
6217   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6218   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6219   vpsub(bt, dst, xtmp1, dst, vec_enc);
6220 }
6221 
6222 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6223   Label done;
6224   Label neg_divisor_fastpath;
6225   cmpl(divisor, 0);
6226   jccb(Assembler::less, neg_divisor_fastpath);
6227   xorl(rdx, rdx);
6228   divl(divisor);
6229   jmpb(done);
6230   bind(neg_divisor_fastpath);
6231   // Fastpath for divisor < 0:
6232   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6233   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
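       // With the sign bit of the divisor set its unsigned value is at least 2^31, so the
       // unsigned quotient can only be 0 or 1. For example, 0xFFFFFFFE / 0xFFFFFFFD:
       // 0xFFFFFFFE - 0xFFFFFFFD = 1, ~1 = 0xFFFFFFFE, 0xFFFFFFFE & 0xFFFFFFFE = 0xFFFFFFFE,
       // and >>> 31 yields quotient 1.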
6234   movl(rdx, rax);
6235   subl(rdx, divisor);
6236   if (VM_Version::supports_bmi1()) {
6237     andnl(rax, rdx, rax);
6238   } else {
6239     notl(rdx);
6240     andl(rax, rdx);
6241   }
6242   shrl(rax, 31);
6243   bind(done);
6244 }
6245 
6246 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6247   Label done;
6248   Label neg_divisor_fastpath;
6249   cmpl(divisor, 0);
6250   jccb(Assembler::less, neg_divisor_fastpath);
6251   xorl(rdx, rdx);
6252   divl(divisor);
6253   jmpb(done);
6254   bind(neg_divisor_fastpath);
6255   // Fastpath when divisor < 0:
6256   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6257   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6258   movl(rdx, rax);
6259   subl(rax, divisor);
6260   if (VM_Version::supports_bmi1()) {
6261     andnl(rax, rax, rdx);
6262   } else {
6263     notl(rax);
6264     andl(rax, rdx);
6265   }
6266   sarl(rax, 31);
6267   andl(rax, divisor);
6268   subl(rdx, rax);
6269   bind(done);
6270 }
6271 
6272 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6273   Label done;
6274   Label neg_divisor_fastpath;
6275 
6276   cmpl(divisor, 0);
6277   jccb(Assembler::less, neg_divisor_fastpath);
6278   xorl(rdx, rdx);
6279   divl(divisor);
6280   jmpb(done);
6281   bind(neg_divisor_fastpath);
6282   // Fastpath for divisor < 0:
6283   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6284   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6285   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6286   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6287   movl(rdx, rax);
6288   subl(rax, divisor);
6289   if (VM_Version::supports_bmi1()) {
6290     andnl(rax, rax, rdx);
6291   } else {
6292     notl(rax);
6293     andl(rax, rdx);
6294   }
6295   movl(tmp, rax);
6296   shrl(rax, 31); // quotient
6297   sarl(tmp, 31);
6298   andl(tmp, divisor);
6299   subl(rdx, tmp); // remainder
6300   bind(done);
6301 }
6302 
6303 #ifdef _LP64
6304 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6305                                  XMMRegister xtmp2, Register rtmp) {
6306   if (VM_Version::supports_gfni()) {
6307     // Galois field instruction based bit reversal, as per the following algorithm:
6308     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
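         // The 8x8 bit matrix 0x8040201008040201 reverses the bit order within each byte;
         // the trailing bswap then reverses the byte order, completing the 32 bit reversal.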
6309     mov64(rtmp, 0x8040201008040201L);
6310     movq(xtmp1, src);
6311     movq(xtmp2, rtmp);
6312     gf2p8affineqb(xtmp1, xtmp2, 0);
6313     movq(dst, xtmp1);
6314   } else {
6315     // Swap even and odd numbered bits.
6316     movl(rtmp, src);
6317     andl(rtmp, 0x55555555);
6318     shll(rtmp, 1);
6319     movl(dst, src);
6320     andl(dst, 0xAAAAAAAA);
6321     shrl(dst, 1);
6322     orl(dst, rtmp);
6323 
6324     // Swap LSB and MSB 2 bits of each nibble.
6325     movl(rtmp, dst);
6326     andl(rtmp, 0x33333333);
6327     shll(rtmp, 2);
6328     andl(dst, 0xCCCCCCCC);
6329     shrl(dst, 2);
6330     orl(dst, rtmp);
6331 
6332     // Swap LSB and MSB 4 bits of each byte.
6333     movl(rtmp, dst);
6334     andl(rtmp, 0x0F0F0F0F);
6335     shll(rtmp, 4);
6336     andl(dst, 0xF0F0F0F0);
6337     shrl(dst, 4);
6338     orl(dst, rtmp);
6339   }
6340   bswapl(dst);
6341 }
6342 
6343 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6344                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
6345   if (VM_Version::supports_gfni()) {
6346     // Galois field instruction based bit reversal, as per the following algorithm:
6347     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6348     mov64(rtmp1, 0x8040201008040201L);
6349     movq(xtmp1, src);
6350     movq(xtmp2, rtmp1);
6351     gf2p8affineqb(xtmp1, xtmp2, 0);
6352     movq(dst, xtmp1);
6353   } else {
6354     // Swap even and odd numbered bits.
6355     movq(rtmp1, src);
6356     mov64(rtmp2, 0x5555555555555555L);
6357     andq(rtmp1, rtmp2);
6358     shlq(rtmp1, 1);
6359     movq(dst, src);
6360     notq(rtmp2);
6361     andq(dst, rtmp2);
6362     shrq(dst, 1);
6363     orq(dst, rtmp1);
6364 
6365     // Swap LSB and MSB 2 bits of each nibble.
6366     movq(rtmp1, dst);
6367     mov64(rtmp2, 0x3333333333333333L);
6368     andq(rtmp1, rtmp2);
6369     shlq(rtmp1, 2);
6370     notq(rtmp2);
6371     andq(dst, rtmp2);
6372     shrq(dst, 2);
6373     orq(dst, rtmp1);
6374 
6375     // Swap LSB and MSB 4 bits of each byte.
6376     movq(rtmp1, dst);
6377     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6378     andq(rtmp1, rtmp2);
6379     shlq(rtmp1, 4);
6380     notq(rtmp2);
6381     andq(dst, rtmp2);
6382     shrq(dst, 4);
6383     orq(dst, rtmp1);
6384   }
6385   bswapq(dst);
6386 }
6387 
6388 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6389   Label done;
6390   Label neg_divisor_fastpath;
6391   cmpq(divisor, 0);
6392   jccb(Assembler::less, neg_divisor_fastpath);
6393   xorl(rdx, rdx);
6394   divq(divisor);
6395   jmpb(done);
6396   bind(neg_divisor_fastpath);
6397   // Fastpath for divisor < 0:
6398   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6399   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6400   movq(rdx, rax);
6401   subq(rdx, divisor);
6402   if (VM_Version::supports_bmi1()) {
6403     andnq(rax, rdx, rax);
6404   } else {
6405     notq(rdx);
6406     andq(rax, rdx);
6407   }
6408   shrq(rax, 63);
6409   bind(done);
6410 }
6411 
6412 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6413   Label done;
6414   Label neg_divisor_fastpath;
6415   cmpq(divisor, 0);
6416   jccb(Assembler::less, neg_divisor_fastpath);
6417   xorq(rdx, rdx);
6418   divq(divisor);
6419   jmp(done);
6420   bind(neg_divisor_fastpath);
6421   // Fastpath when divisor < 0:
6422   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6423   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6424   movq(rdx, rax);
6425   subq(rax, divisor);
6426   if (VM_Version::supports_bmi1()) {
6427     andnq(rax, rax, rdx);
6428   } else {
6429     notq(rax);
6430     andq(rax, rdx);
6431   }
6432   sarq(rax, 63);
6433   andq(rax, divisor);
6434   subq(rdx, rax);
6435   bind(done);
6436 }
6437 
6438 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6439   Label done;
6440   Label neg_divisor_fastpath;
6441   cmpq(divisor, 0);
6442   jccb(Assembler::less, neg_divisor_fastpath);
6443   xorq(rdx, rdx);
6444   divq(divisor);
6445   jmp(done);
6446   bind(neg_divisor_fastpath);
6447   // Fastpath for divisor < 0:
6448   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6449   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6450   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6451   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6452   movq(rdx, rax);
6453   subq(rax, divisor);
6454   if (VM_Version::supports_bmi1()) {
6455     andnq(rax, rax, rdx);
6456   } else {
6457     notq(rax);
6458     andq(rax, rdx);
6459   }
6460   movq(tmp, rax);
6461   shrq(rax, 63); // quotient
6462   sarq(tmp, 63);
6463   andq(tmp, divisor);
6464   subq(rdx, tmp); // remainder
6465   bind(done);
6466 }
6467 #endif
6468 
6469 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6470                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6471                                         int vlen_enc) {
6472   assert(VM_Version::supports_avx512bw(), "");
6473   // Byte shuffles are inlane operations and indices are determined using the
6474   // lower 4 bits of each shuffle lane, thus all shuffle indices are
6475   // normalized to the index range 0-15. This makes sure that indices which differ
6476   // by a multiple of 16 are placed at the same relative position within a 128 bit
6477   // lane, i.e. elements corresponding to shuffle indices 16, 32 and 48
6478   // all map to the first byte of their respective 128 bit lanes.
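       // For example, shuffle index 37 falls in the INDEX >= 32 && INDEX < 48 group below, so
       // the third 128 bit lane (source bytes 32-47) is broadcast and vpshufb uses 37 & 0xF = 5
       // to select byte 5 of that lane, i.e. source byte 37 overall.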
6479   movl(rtmp, 16);
6480   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6481 
6482   // Compute a mask for the shuffle vector by comparing indices with the expression INDEX < 16.
6483   // Broadcast the first 128 bit lane across the entire vector, shuffle the vector lanes using the
6484   // original shuffle indices and move the shuffled lanes corresponding to the true
6485   // mask to the destination vector.
6486   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6487   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6488   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6489 
6490   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6491   // and broadcasting second 128 bit lane.
6492   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6493   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6494   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6495   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6496   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6497 
6498   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6499   // and broadcasting third 128 bit lane.
6500   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6501   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6502   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6503   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6504   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6505 
6506   // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
6507   // and broadcasting the fourth 128 bit lane.
6508   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6509   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6510   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6511   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6512   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6513 }
6514 
6515 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6516                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6517   if (vlen_enc == AVX_128bit) {
6518     vpermilps(dst, src, shuffle, vlen_enc);
6519   } else if (bt == T_INT) {
6520     vpermd(dst, shuffle, src, vlen_enc);
6521   } else {
6522     assert(bt == T_FLOAT, "");
6523     vpermps(dst, shuffle, src, vlen_enc);
6524   }
6525 }