1 /*
   2  * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "gc/shared/barrierSet.hpp"
  29 #include "gc/shared/barrierSetAssembler.hpp"
  30 #include "oops/methodData.hpp"
  31 #include "opto/c2_MacroAssembler.hpp"
  32 #include "opto/intrinsicnode.hpp"
  33 #include "opto/output.hpp"
  34 #include "opto/opcodes.hpp"
  35 #include "opto/subnode.hpp"
  36 #include "runtime/objectMonitor.hpp"
  37 #include "runtime/stubRoutines.hpp"
  38 
  39 #ifdef PRODUCT
  40 #define BLOCK_COMMENT(str) /* nothing */
  41 #define STOP(error) stop(error)
  42 #else
  43 #define BLOCK_COMMENT(str) block_comment(str)
  44 #define STOP(error) block_comment(error); stop(error)
  45 #endif
  46 
  47 // C2 compiled method's prolog code.
  48 void C2_MacroAssembler::verified_entry(Compile* C, int sp_inc) {
  49   if (C->clinit_barrier_on_entry()) {
  50     assert(VM_Version::supports_fast_class_init_checks(), "sanity");
  51     assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started");
  52 
  53     Label L_skip_barrier;
  54     Register klass = rscratch1;
  55 
  56     mov_metadata(klass, C->method()->holder()->constant_encoding());
  57     clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
  58 
  59     jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
  60 
  61     bind(L_skip_barrier);
  62   }
  63 
  64   int framesize = C->output()->frame_size_in_bytes();
  65   int bangsize = C->output()->bang_size_in_bytes();
  66   bool fp_mode_24b = false;
  67   int stack_bang_size = C->output()->need_stack_bang(bangsize) ? bangsize : 0;
  68 
  69   // WARNING: Initial instruction MUST be 5 bytes or longer so that
  70   // NativeJump::patch_verified_entry will be able to patch out the entry
  71   // code safely. The push to verify stack depth is ok at 5 bytes,
  72   // the frame allocation can be either 3 or 6 bytes. So if we don't do
  73   // stack bang then we must use the 6 byte frame allocation even if
  74   // we have no frame. :-(
  75   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  76 
  77   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  78   // Remove word for return addr
  79   framesize -= wordSize;
  80   stack_bang_size -= wordSize;
  81 
  82   // Calls to C2R adapters often do not accept exceptional returns.
  83   // We require that their callers must bang for them.  But be careful, because
  84   // some VM calls (such as call site linkage) can use several kilobytes of
  85   // stack.  But the stack safety zone should account for that.
  86   // See bugs 4446381, 4468289, 4497237.
  87   if (stack_bang_size > 0) {
  88     generate_stack_overflow_check(stack_bang_size);
  89 
  90     // We always push rbp so that, on return to the interpreter, rbp will be
  91     // restored correctly and we can correct the stack.
  92     push(rbp);
  93     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  94     if (PreserveFramePointer) {
  95       mov(rbp, rsp);
  96     }
  97     // Remove word for ebp
  98     framesize -= wordSize;
  99 
 100     // Create frame
 101     if (framesize) {
 102       subptr(rsp, framesize);
 103     }
 104   } else {
 105     // Create frame (force generation of a 4 byte immediate value)
 106     subptr_imm32(rsp, framesize);
 107 
 108     // Save RBP register now.
 109     framesize -= wordSize;
 110     movptr(Address(rsp, framesize), rbp);
 111     // Save caller's stack pointer into RBP if the frame pointer is preserved.
 112     if (PreserveFramePointer) {
 113       movptr(rbp, rsp);
 114       if (framesize > 0) {
 115         addptr(rbp, framesize);
 116       }
 117     }
 118   }
 119 
 120   if (C->needs_stack_repair()) {
 121     // Save stack increment just below the saved rbp (also account for fixed framesize and rbp)
 122     assert((sp_inc & (StackAlignmentInBytes-1)) == 0, "stack increment not aligned");
 123     movptr(Address(rsp, framesize - wordSize), sp_inc + framesize + wordSize);
 124   }
 125 
 126   if (VerifyStackAtCalls) { // Magic cookie to verify stack depth
 127     framesize -= wordSize;
 128     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 129   }
 130 
 131 #ifndef _LP64
 132   // If method sets FPU control word do it now
 133   if (fp_mode_24b) {
 134     fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
 135   }
 136   if (UseSSE >= 2 && VerifyFPU) {
 137     verify_FPU(0, "FPU stack must be clean on entry");
 138   }
 139 #endif
 140 
 141 #ifdef ASSERT
 142   if (VerifyStackAtCalls) {
 143     Label L;
 144     push(rax);
 145     mov(rax, rsp);
 146     andptr(rax, StackAlignmentInBytes-1);
 147     cmpptr(rax, StackAlignmentInBytes-wordSize);
 148     pop(rax);
 149     jcc(Assembler::equal, L);
 150     STOP("Stack is not properly aligned!");
 151     bind(L);
 152   }
 153 #endif
 154 }
 155 
 156 void C2_MacroAssembler::entry_barrier() {
 157   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 158 #ifdef _LP64
 159   if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
 160     // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 161     Label dummy_slow_path;
 162     Label dummy_continuation;
 163     Label* slow_path = &dummy_slow_path;
 164     Label* continuation = &dummy_continuation;
 165     if (!Compile::current()->output()->in_scratch_emit_size()) {
 166       // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 167       C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 168       Compile::current()->output()->add_stub(stub);
 169       slow_path = &stub->entry();
 170       continuation = &stub->continuation();
 171     }
 172     bs->nmethod_entry_barrier(this, slow_path, continuation);
 173   }
 174 #else
 175   // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
 176   bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
 177 #endif
 178 }
 179 
 180 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 181   switch (vlen_in_bytes) {
 182     case  4: // fall-through
 183     case  8: // fall-through
 184     case 16: return Assembler::AVX_128bit;
 185     case 32: return Assembler::AVX_256bit;
 186     case 64: return Assembler::AVX_512bit;
 187 
 188     default: {
 189       ShouldNotReachHere();
 190       return Assembler::AVX_NoVec;
 191     }
 192   }
 193 }
 194 
 195 #if INCLUDE_RTM_OPT
 196 
 197 // Update rtm_counters based on abort status
 198 // input: abort_status
 199 //        rtm_counters (RTMLockingCounters*)
 200 // flags are killed
 201 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
 202 
 203   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
 204   if (PrintPreciseRTMLockingStatistics) {
 205     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
 206       Label check_abort;
 207       testl(abort_status, (1<<i));
 208       jccb(Assembler::equal, check_abort);
 209       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
 210       bind(check_abort);
 211     }
 212   }
 213 }
 214 
 215 // Branch if (random & (count-1) != 0), count is 2^n
 216 // tmp, scr and flags are killed
 217 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
 218   assert(tmp == rax, "");
 219   assert(scr == rdx, "");
 220   rdtsc(); // modifies EDX:EAX
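       // The low-order TSC bits serve as a cheap pseudo-random source: the
       // fall-through (no-branch) path is taken on average once per 'count'
       // calls, so the caller's counter update runs at roughly a 1/count rate.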
 221   andptr(tmp, count-1);
 222   jccb(Assembler::notZero, brLabel);
 223 }
 224 
 225 // Perform abort ratio calculation, set no_rtm bit if high ratio
 226 // input:  rtm_counters_Reg (RTMLockingCounters* address)
 227 // tmpReg, rtm_counters_Reg and flags are killed
 228 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
 229                                                     Register rtm_counters_Reg,
 230                                                     RTMLockingCounters* rtm_counters,
 231                                                     Metadata* method_data) {
 232   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
 233 
 234   if (RTMLockingCalculationDelay > 0) {
 235     // Delay calculation
 236     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()));
 237     testptr(tmpReg, tmpReg);
 238     jccb(Assembler::equal, L_done);
 239   }
 240   // Abort ratio calculation only if abort_count > RTMAbortThreshold
 241   //   Aborted transactions = abort_count * 100
 242   //   All transactions = total_count *  RTMTotalCountIncrRate
 243   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
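       //
       // The same decision rule as plain C-style pseudocode, for clarity only
       // (an illustrative sketch -- the real computation is the assembly
       // emitted below; 'mdo_rtm_state' stands for the MDO field updated via
       // MethodData::rtm_state_offset_in_bytes()):
       //
       //   if (abort_count >= RTMAbortThreshold &&
       //       abort_count * 100 >= total_count * RTMTotalCountIncrRate * RTMAbortRatio) {
       //     mdo_rtm_state |= NoRTM;    // abort ratio too high
       //   } else if (total_count >= RTMLockingThreshold / RTMTotalCountIncrRate) {
       //     mdo_rtm_state |= UseRTM;   // "always rtm"
       //   }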
 244 
 245   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
 246   cmpptr(tmpReg, RTMAbortThreshold);
 247   jccb(Assembler::below, L_check_always_rtm2);
 248   imulptr(tmpReg, tmpReg, 100);
 249 
 250   Register scrReg = rtm_counters_Reg;
 251   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 252   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
 253   imulptr(scrReg, scrReg, RTMAbortRatio);
 254   cmpptr(tmpReg, scrReg);
 255   jccb(Assembler::below, L_check_always_rtm1);
 256   if (method_data != nullptr) {
 257     // set rtm_state to "no rtm" in MDO
 258     mov_metadata(tmpReg, method_data);
 259     lock();
 260     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
 261   }
 262   jmpb(L_done);
 263   bind(L_check_always_rtm1);
 264   // Reload RTMLockingCounters* address
 265   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 266   bind(L_check_always_rtm2);
 267   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 268   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
 269   jccb(Assembler::below, L_done);
 270   if (method_data != nullptr) {
 271     // set rtm_state to "always rtm" in MDO
 272     mov_metadata(tmpReg, method_data);
 273     lock();
 274     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
 275   }
 276   bind(L_done);
 277 }
 278 
 279 // Update counters and perform abort ratio calculation
 280 // input:  abort_status_Reg
 281 // rtm_counters_Reg, flags are killed
 282 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
 283                                       Register rtm_counters_Reg,
 284                                       RTMLockingCounters* rtm_counters,
 285                                       Metadata* method_data,
 286                                       bool profile_rtm) {
 287 
 288   assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 289   // update rtm counters based on rax value at abort
 290   // reads abort_status_Reg, updates flags
 291   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 292   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
 293   if (profile_rtm) {
 294     // Save abort status because abort_status_Reg is used by following code.
 295     if (RTMRetryCount > 0) {
 296       push(abort_status_Reg);
 297     }
 298     assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 299     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
 300     // restore abort status
 301     if (RTMRetryCount > 0) {
 302       pop(abort_status_Reg);
 303     }
 304   }
 305 }
 306 
 307 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
 308 // inputs: retry_count_Reg
 309 //       : abort_status_Reg
 310 // output: retry_count_Reg decremented by 1
 311 // flags are killed
 312 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
 313   Label doneRetry;
 314   assert(abort_status_Reg == rax, "");
 315   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
 316   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
 317   // if reason is in 0x6 and retry count != 0 then retry
 318   andptr(abort_status_Reg, 0x6);
 319   jccb(Assembler::zero, doneRetry);
 320   testl(retry_count_Reg, retry_count_Reg);
 321   jccb(Assembler::zero, doneRetry);
 322   pause();
 323   decrementl(retry_count_Reg);
 324   jmp(retryLabel);
 325   bind(doneRetry);
 326 }
 327 
 328 // Spin and retry if lock is busy.
 329 // inputs: box_Reg (monitor address)
 330 //       : retry_count_Reg
 331 // output: retry_count_Reg decremented by 1
 332 //       : clear z flag if retry count exceeded
 333 // tmp_Reg, scr_Reg, flags are killed
 334 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
 335                                                Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
 336   Label SpinLoop, SpinExit, doneRetry;
 337   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 338 
 339   testl(retry_count_Reg, retry_count_Reg);
 340   jccb(Assembler::zero, doneRetry);
 341   decrementl(retry_count_Reg);
 342   movptr(scr_Reg, RTMSpinLoopCount);
 343 
 344   bind(SpinLoop);
 345   pause();
 346   decrementl(scr_Reg);
 347   jccb(Assembler::lessEqual, SpinExit);
 348   movptr(tmp_Reg, Address(box_Reg, owner_offset));
 349   testptr(tmp_Reg, tmp_Reg);
 350   jccb(Assembler::notZero, SpinLoop);
 351 
 352   bind(SpinExit);
 353   jmp(retryLabel);
 354   bind(doneRetry);
 355   incrementl(retry_count_Reg); // clear z flag
 356 }
 357 
 358 // Use RTM for normal stack locks
 359 // Input: objReg (object to lock)
 360 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
 361                                          Register retry_on_abort_count_Reg,
 362                                          RTMLockingCounters* stack_rtm_counters,
 363                                          Metadata* method_data, bool profile_rtm,
 364                                          Label& DONE_LABEL, Label& IsInflated) {
 365   assert(UseRTMForStackLocks, "why call this otherwise?");
 366   assert(tmpReg == rax, "");
 367   assert(scrReg == rdx, "");
 368   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 369 
 370   if (RTMRetryCount > 0) {
 371     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 372     bind(L_rtm_retry);
 373   }
 374   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 375   testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral
 376   jcc(Assembler::notZero, IsInflated);
 377 
 378   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 379     Label L_noincrement;
 380     if (RTMTotalCountIncrRate > 1) {
 381       // tmpReg, scrReg and flags are killed
 382       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 383     }
 384     assert(stack_rtm_counters != nullptr, "should not be null when profiling RTM");
 385     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
 386     bind(L_noincrement);
 387   }
 388   xbegin(L_on_abort);
 389   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
 390   andptr(tmpReg, markWord::lock_mask_in_place);     // look at 2 lock bits
 391   cmpptr(tmpReg, markWord::unlocked_value);         // bits = 01 unlocked
 392   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
 393 
 394   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 395   if (UseRTMXendForLockBusy) {
 396     xend();
 397     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
 398     jmp(L_decrement_retry);
 399   }
 400   else {
 401     xabort(0);
 402   }
 403   bind(L_on_abort);
 404   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 405     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
 406   }
 407   bind(L_decrement_retry);
 408   if (RTMRetryCount > 0) {
 409     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 410     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 411   }
 412 }
 413 
 414 // Use RTM for inflated locks
 415 // inputs: objReg (object to lock)
 416 //         boxReg (on-stack box address (displaced header location) - KILLED)
 417 //         tmpReg (ObjectMonitor address + markWord::monitor_value)
 418 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
 419                                             Register scrReg, Register retry_on_busy_count_Reg,
 420                                             Register retry_on_abort_count_Reg,
 421                                             RTMLockingCounters* rtm_counters,
 422                                             Metadata* method_data, bool profile_rtm,
 423                                             Label& DONE_LABEL) {
 424   assert(UseRTMLocking, "why call this otherwise?");
 425   assert(tmpReg == rax, "");
 426   assert(scrReg == rdx, "");
 427   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 428   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 429 
 430   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 431   movptr(boxReg, tmpReg); // Save ObjectMonitor address
 432 
 433   if (RTMRetryCount > 0) {
 434     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
 435     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 436     bind(L_rtm_retry);
 437   }
 438   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 439     Label L_noincrement;
 440     if (RTMTotalCountIncrRate > 1) {
 441       // tmpReg, scrReg and flags are killed
 442       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 443     }
 444     assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 445     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
 446     bind(L_noincrement);
 447   }
 448   xbegin(L_on_abort);
 449   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 450   movptr(tmpReg, Address(tmpReg, owner_offset));
 451   testptr(tmpReg, tmpReg);
 452   jcc(Assembler::zero, DONE_LABEL);
 453   if (UseRTMXendForLockBusy) {
 454     xend();
 455     jmp(L_decrement_retry);
 456   }
 457   else {
 458     xabort(0);
 459   }
 460   bind(L_on_abort);
 461   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 462   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 463     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
 464   }
 465   if (RTMRetryCount > 0) {
 466     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 467     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 468   }
 469 
 470   movptr(tmpReg, Address(boxReg, owner_offset));
 471   testptr(tmpReg, tmpReg);
 472   jccb(Assembler::notZero, L_decrement_retry);
 473 
 474   // Appears unlocked - try to swing _owner from null to non-null.
 475   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 476 #ifdef _LP64
 477   Register threadReg = r15_thread;
 478 #else
 479   get_thread(scrReg);
 480   Register threadReg = scrReg;
 481 #endif
 482   lock();
 483   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
 484 
 485   if (RTMRetryCount > 0) {
 486     // If the CAS succeeded we are done; otherwise spin and retry.
 487     jccb(Assembler::equal, DONE_LABEL);
 488     bind(L_decrement_retry);
 489     // Spin and retry if lock is busy.
 490     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
 491   }
 492   else {
 493     bind(L_decrement_retry);
 494   }
 495 }
 496 
 497 #endif //  INCLUDE_RTM_OPT
 498 
 499 // fast_lock and fast_unlock used by C2
 500 
 501 // Because the transitions from emitted code to the runtime
 502 // monitorenter/exit helper stubs are so slow it's critical that
 503 // we inline both the stack-locking fast path and the inflated fast path.
 504 //
 505 // See also: cmpFastLock and cmpFastUnlock.
 506 //
 507 // What follows is a specialized inline transliteration of the code
 508 // in enter() and exit(). If we're concerned about I$ bloat another
 509 // option would be to emit TrySlowEnter and TrySlowExit methods
 510 // at startup-time.  These methods would accept arguments as
 511 // (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 512 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 513 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 514 // In practice, however, the # of lock sites is bounded and is usually small.
 515 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
 516 // if the processor uses simple bimodal branch predictors keyed by EIP,
 517 // since the helper routines would be called from multiple synchronization
 518 // sites.
 519 //
 520 // An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
 521 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 522 // to those specialized methods.  That'd give us a mostly platform-independent
 523 // implementation that the JITs could optimize and inline at their pleasure.
 524 // Done correctly, the only time we'd need to cross to native code would be
 525 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 526 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 527 // (b) add explicit barriers or fence operations.
 528 //
 529 // TODO:
 530 //
 531 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 532 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 533 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 534 //    the lock operators would typically be faster than reifying Self.
 535 //
 536 // *  Ideally I'd define the primitives as:
 537 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 538 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 539 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
 540 //    Instead, we're stuck with the rather awkward and brittle register assignments below.
 541 //    Furthermore the register assignments are overconstrained, possibly resulting in
 542 //    sub-optimal code near the synchronization site.
 543 //
 544 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 545 //    Alternately, use a better sp-proximity test.
 546 //
 547 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 548 //    Either one is sufficient to uniquely identify a thread.
 549 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 550 //
 551 // *  Intrinsify notify() and notifyAll() for the common cases where the
 552 //    object is locked by the calling thread but the waitlist is empty.
 553 //    This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 554 //
 555 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 556 //    But beware of excessive branch density on AMD Opterons.
 557 //
 558 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 559 //    or failure of the fast path.  If the fast path fails then we pass
 560 //    control to the slow path, typically in C.  In fast_lock and
 561 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 562 //    will emit a conditional branch immediately after the node.
 563 //    So we have branches to branches and lots of ICC.ZF games.
 564 //    Instead, it might be better to have C2 pass a "FailureLabel"
 565 //    into fast_lock and fast_unlock.  In the case of success, control
 566 //    will drop through the node.  ICC.ZF is undefined at exit.
 567 //    In the case of failure, the node will branch directly to the
 568 //    FailureLabel.
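     //
     // For reference, an illustrative sketch of how the surrounding compiled
     // code consumes that ZF result (the conditional branch itself is emitted
     // by the cmpFastLock / cmpFastUnlock platform nodes, not by this file):
     //
     //   fast_lock(obj, box, rax, scr, ...)   // leaves ICC.ZF = 1 on success
     //   jne   slow_path                      // ZF == 0 -> runtime monitorenter
     //   ...                                  // ZF == 1 -> lock held, fall through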
 569 
 570 
 571 // obj: object to lock
 572 // box: on-stack box address (displaced header location) - KILLED
 573 // rax: tmp -- KILLED
 574 // scr: tmp -- KILLED
 575 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 576                                  Register scrReg, Register cx1Reg, Register cx2Reg,
 577                                  RTMLockingCounters* rtm_counters,
 578                                  RTMLockingCounters* stack_rtm_counters,
 579                                  Metadata* method_data,
 580                                  bool use_rtm, bool profile_rtm) {
 581   // Ensure the register assignments are disjoint
 582   assert(tmpReg == rax, "");
 583 
 584   if (use_rtm) {
 585     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
 586   } else {
 587     assert(cx1Reg == noreg, "");
 588     assert(cx2Reg == noreg, "");
 589     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 590   }
 591 
 592   // Possible cases that we'll encounter in fast_lock
 593   // ------------------------------------------------
 594   // * Inflated
 595   //    -- unlocked
 596   //    -- Locked
 597   //       = by self
 598   //       = by other
 599   // * neutral
 600   // * stack-locked
 601   //    -- by self
 602   //       = sp-proximity test hits
 603   //       = sp-proximity test generates false-negative
 604   //    -- by other
 605   //
 606 
 607   Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;
 608 
 609   if (DiagnoseSyncOnValueBasedClasses != 0) {
 610     load_klass(tmpReg, objReg, scrReg);
 611     movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
 612     testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
 613     jcc(Assembler::notZero, DONE_LABEL);
 614   }
 615 
 616 #if INCLUDE_RTM_OPT
 617   if (UseRTMForStackLocks && use_rtm) {
 618     assert(!UseHeavyMonitors, "+UseHeavyMonitors and +UseRTMForStackLocks are mutually exclusive");
 619     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
 620                       stack_rtm_counters, method_data, profile_rtm,
 621                       DONE_LABEL, IsInflated);
 622   }
 623 #endif // INCLUDE_RTM_OPT
 624 
 625   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 626   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 627   jccb(Assembler::notZero, IsInflated);
 628 
 629   if (!UseHeavyMonitors) {
 630     // Attempt stack-locking ...
 631     orptr (tmpReg, markWord::unlocked_value);
 632     if (EnableValhalla) {
 633       // Mask inline_type bit such that we go to the slow path if object is an inline type
 634       andptr(tmpReg, ~((int) markWord::inline_type_bit_in_place));
 635     }
 636     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 637     lock();
 638     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 639     jcc(Assembler::equal, COUNT);           // Success
 640 
 641     // Recursive locking.
 642     // The object is stack-locked: markword contains stack pointer to BasicLock.
 643     // Locked by current thread if difference with current SP is less than one page.
 644     subptr(tmpReg, rsp);
 645     // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
 646     andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
 647     movptr(Address(boxReg, 0), tmpReg);
 648   } else {
 649     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 650     testptr(objReg, objReg);
 651   }
 652   jmp(DONE_LABEL);
 653 
 654   bind(IsInflated);
 655   // The object is inflated. tmpReg contains the ObjectMonitor* + markWord::monitor_value
 656 
 657 #if INCLUDE_RTM_OPT
 658   // Use the same RTM locking code in 32- and 64-bit VM.
 659   if (use_rtm) {
 660     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
 661                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
 662   } else {
 663 #endif // INCLUDE_RTM_OPT
 664 
 665 #ifndef _LP64
 666   // The object is inflated.
 667 
 668   // boxReg refers to the on-stack BasicLock in the current frame.
 669   // We'd like to write:
 670   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
 671   // This is convenient but results in a ST-before-CAS penalty.  The following CAS suffers
 672   // additional latency as we have another ST in the store buffer that must drain.
 673 
 674   // avoid ST-before-CAS
 675   // register juggle because we need tmpReg for cmpxchgptr below
 676   movptr(scrReg, boxReg);
 677   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 678 
 679   // Optimistic form: consider XORL tmpReg,tmpReg
 680   movptr(tmpReg, NULL_WORD);
 681 
 682   // Appears unlocked - try to swing _owner from null to non-null.
 683   // Ideally, I'd manifest "Self" with get_thread and then attempt
 684   // to CAS the register containing Self into m->Owner.
 685   // But we don't have enough registers, so instead we can either try to CAS
 686   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 687   // we later store "Self" into m->Owner.  Transiently storing a stack address
 688   // (rsp or the address of the box) into  m->owner is harmless.
 689   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 690   lock();
 691   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 692   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 693   // If we weren't able to swing _owner from null to the BasicLock
 694   // then take the slow path.
 695   jccb  (Assembler::notZero, NO_COUNT);
 696   // update _owner from BasicLock to thread
 697   get_thread (scrReg);                    // beware: clobbers ICCs
 698   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
 699   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
 700 
 701   // If the CAS fails we can either retry or pass control to the slow path.
 702   // We use the latter tactic.
 703   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 704   // If the CAS was successful ...
 705   //   Self has acquired the lock
 706   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 707   // Intentional fall-through into DONE_LABEL ...
 708 #else // _LP64
 709   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 710   movq(scrReg, tmpReg);
 711   xorq(tmpReg, tmpReg);
 712   lock();
 713   cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 714   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 715   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 716   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 717   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 718   jccb(Assembler::equal, COUNT);          // CAS above succeeded; propagate ZF = 1 (success)
 719 
 720   cmpptr(r15_thread, rax);                // Check if we are already the owner (recursive lock)
 721   jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
 722   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 723   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 724 #endif // _LP64
 725 #if INCLUDE_RTM_OPT
 726   } // use_rtm()
 727 #endif
 728   bind(DONE_LABEL);
 729 
 730   // ZFlag == 1 count in fast path
 731   // ZFlag == 0 count in slow path
 732   jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0
 733 
 734   bind(COUNT);
 735   // Count monitors in fast path
 736 #ifndef _LP64
 737   get_thread(tmpReg);
 738   incrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
 739 #else // _LP64
 740   incrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 741 #endif
 742 
 743   xorl(tmpReg, tmpReg); // Set ZF == 1
 744 
 745   bind(NO_COUNT);
 746 
 747   // At NO_COUNT the icc ZFlag is set as follows ...
 748   // fast_unlock uses the same protocol.
 749   // ZFlag == 1 -> Success
 750   // ZFlag == 0 -> Failure - force control through the slow path
 751 }
 752 
 753 // obj: object to unlock
 754 // box: box address (displaced header location), killed.  Must be EAX.
 755 // tmp: killed, cannot be obj nor box.
 756 //
 757 // Some commentary on balanced locking:
 758 //
 759 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 760 // Methods that don't have provably balanced locking are forced to run in the
 761 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 762 // The interpreter provides two properties:
 763 // I1:  At return-time the interpreter automatically and quietly unlocks any
 764 //      objects acquired by the current activation (frame).  Recall that the
 765 //      interpreter maintains an on-stack list of locks currently held by
 766 //      a frame.
 767 // I2:  If a method attempts to unlock an object that is not held by the
 768 //      frame, the interpreter throws IMSX.
 769 //
 770 // Let's say A(), which has provably balanced locking, acquires O and then calls B().
 771 // B() doesn't have provably balanced locking so it runs in the interpreter.
 772 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 773 // is still locked by A().
 774 //
 775 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 776 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 777 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 778 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
 779 // Arguably, given that the spec legislates the JNI case as undefined, our implementation
 780 // could reasonably *avoid* checking owner in fast_unlock().
 781 // In the interest of performance we elide the m->Owner==Self check in unlock.
 782 // A perfectly viable alternative is to elide the owner check except when
 783 // Xcheck:jni is enabled.
 784 
 785 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
 786   assert(boxReg == rax, "");
 787   assert_different_registers(objReg, boxReg, tmpReg);
 788 
 789   Label DONE_LABEL, Stacked, COUNT, NO_COUNT;
 790 
 791 #if INCLUDE_RTM_OPT
 792   if (UseRTMForStackLocks && use_rtm) {
 793     assert(!UseHeavyMonitors, "+UseHeavyMonitors and +UseRTMForStackLocks are mutually exclusive");
 794     Label L_regular_unlock;
 795     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
 796     andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
 797     cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
 798     jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
 799     xend();                                                           // otherwise end...
 800     jmp(DONE_LABEL);                                                  // ... and we're done
 801     bind(L_regular_unlock);
 802   }
 803 #endif
 804 
 805   if (!UseHeavyMonitors) {
 806     cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
 807     jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
 808   }
 809   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
 810   if (!UseHeavyMonitors) {
 811     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 812     jccb   (Assembler::zero, Stacked);
 813   }
 814 
 815   // It's inflated.
 816 #if INCLUDE_RTM_OPT
 817   if (use_rtm) {
 818     Label L_regular_inflated_unlock;
 819     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 820     movptr(boxReg, Address(tmpReg, owner_offset));
 821     testptr(boxReg, boxReg);
 822     jccb(Assembler::notZero, L_regular_inflated_unlock);
 823     xend();
 824     jmpb(DONE_LABEL);
 825     bind(L_regular_inflated_unlock);
 826   }
 827 #endif
 828 
 829   // Despite our balanced locking property we still check that m->_owner == Self
 830   // as java routines or native JNI code called by this thread might
 831   // have released the lock.
 832   // Refer to the comments in synchronizer.cpp for how we might encode extra
 833   // state in _succ so we can avoid fetching EntryList|cxq.
 834   //
 835   // If there's no contention try a 1-0 exit.  That is, exit without
 836   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 837   // we detect and recover from the race that the 1-0 exit admits.
 838   //
 839   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 840   // before it STs null into _owner, releasing the lock.  Updates
 841   // to data protected by the critical section must be visible before
 842   // we drop the lock (and thus before any other thread could acquire
 843   // the lock and observe the fields protected by the lock).
 844   // IA32's memory-model is SPO, so STs are ordered with respect to
 845   // each other and there's no need for an explicit barrier (fence).
 846   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
 847 #ifndef _LP64
 848   // Note that we could employ various encoding schemes to reduce
 849   // the number of loads below (currently 4) to just 2 or 3.
 850   // Refer to the comments in synchronizer.cpp.
 851   // In practice the chain of fetches doesn't seem to impact performance, however.
 852   xorptr(boxReg, boxReg);
 853   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 854   jccb  (Assembler::notZero, DONE_LABEL);
 855   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 856   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 857   jccb  (Assembler::notZero, DONE_LABEL);
 858   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 859   jmpb  (DONE_LABEL);
 860 #else // _LP64
 861   // It's inflated
 862   Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath;
 863 
 864   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 865   jccb(Assembler::equal, LNotRecursive);
 866 
 867   // Recursive inflated unlock
 868   decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 869   jmpb(LSuccess);
 870 
 871   bind(LNotRecursive);
 872   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 873   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 874   jccb  (Assembler::notZero, CheckSucc);
 875   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 876   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 877   jmpb  (DONE_LABEL);
 878 
 879   // Try to avoid passing control into the slow_path ...
 880   bind  (CheckSucc);
 881 
 882   // The following optional optimization can be elided if necessary
 883   // Effectively: if (succ == null) goto slow path
 884   // The code reduces the window for a race, however,
 885   // and thus benefits performance.
 886   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 887   jccb  (Assembler::zero, LGoSlowPath);
 888 
 889   xorptr(boxReg, boxReg);
 890   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 891   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 892 
 893   // Memory barrier/fence
 894   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
 895   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
 896   // This is faster on Nehalem and AMD Shanghai/Barcelona.
 897   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
 898   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
 899   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
 900   lock(); addl(Address(rsp, 0), 0);
 901 
 902   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 903   jccb  (Assembler::notZero, LSuccess);
 904 
 905   // Rare inopportune interleaving - race.
 906   // The successor vanished in the small window above.
 907   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
 908   // We need to ensure progress and succession.
 909   // Try to reacquire the lock.
 910   // If that fails then the new owner is responsible for succession and this
 911   // thread needs to take no further action and can exit via the fast path (success).
 912   // If the re-acquire succeeds then pass control into the slow path.
 913   // As implemented, this latter mode is horrible because we generate more
 914   // coherence traffic on the lock *and* artificially extend the critical section
 915   // length by virtue of passing control into the slow path.
 916 
 917   // box is really RAX -- the following CMPXCHG depends on that binding
 918   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
 919   lock();
 920   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 921   // There's no successor so we tried to regrab the lock.
 922   // If that didn't work, then another thread grabbed the
 923   // lock so we're done (and exit was a success).
 924   jccb  (Assembler::notEqual, LSuccess);
 925   // Intentional fall-through into slow path
 926 
 927   bind  (LGoSlowPath);
 928   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 929   jmpb  (DONE_LABEL);
 930 
 931   bind  (LSuccess);
 932   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 933   jmpb  (DONE_LABEL);
 934 
 935 #endif
 936   if (!UseHeavyMonitors) {
 937     bind  (Stacked);
 938     movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 939     lock();
 940     cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 941     // Intentional fall-thru into DONE_LABEL
 942   }
 943   bind(DONE_LABEL);
 944 
 945   // ZFlag == 1 count in fast path
 946   // ZFlag == 0 count in slow path
 947   jccb(Assembler::notZero, NO_COUNT);
 948 
 949   bind(COUNT);
 950   // Count monitors in fast path
 951 #ifndef _LP64
 952   get_thread(tmpReg);
 953   decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
 954 #else // _LP64
 955   decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 956 #endif
 957 
 958   xorl(tmpReg, tmpReg); // Set ZF == 1
 959 
 960   bind(NO_COUNT);
 961 }
 962 
 963 //-------------------------------------------------------------------------------------------
 964 // Generic instruction support for C2 code generation from .ad files
 965 
 966 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 967   if (dst != src) {
 968     movdqu(dst, src);
 969   }
 970   if (opcode == Op_AbsVD) {
 971     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 972   } else {
 973     assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 974     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 975   }
 976 }
 977 
 978 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 979   if (opcode == Op_AbsVD) {
 980     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
 981   } else {
 982     assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 983     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
 984   }
 985 }
 986 
 987 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
 988   if (dst != src) {
 989     movdqu(dst, src);
 990   }
 991   if (opcode == Op_AbsVF) {
 992     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
 993   } else {
 994     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 995     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
 996   }
 997 }
 998 
 999 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
1000   if (opcode == Op_AbsVF) {
1001     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
1002   } else {
1003     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
1004     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
1005   }
1006 }
1007 
1008 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
1009   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1010   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
1011 
1012   if (opcode == Op_MinV) {
1013     if (elem_bt == T_BYTE) {
1014       pminsb(dst, src);
1015     } else if (elem_bt == T_SHORT) {
1016       pminsw(dst, src);
1017     } else if (elem_bt == T_INT) {
1018       pminsd(dst, src);
1019     } else {
1020       assert(elem_bt == T_LONG, "required");
1021       assert(tmp == xmm0, "required");
1022       assert_different_registers(dst, src, tmp);
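           // There is no pminsq outside AVX-512, so emulate a signed 64-bit min:
           // xmm0 = (dst > src) per lane, then blendvpd (implicit xmm0 mask)
           // picks src wherever dst > src, i.e. dst = min(dst, src).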
1023       movdqu(xmm0, dst);
1024       pcmpgtq(xmm0, src);
1025       blendvpd(dst, src);  // xmm0 as mask
1026     }
1027   } else { // opcode == Op_MaxV
1028     if (elem_bt == T_BYTE) {
1029       pmaxsb(dst, src);
1030     } else if (elem_bt == T_SHORT) {
1031       pmaxsw(dst, src);
1032     } else if (elem_bt == T_INT) {
1033       pmaxsd(dst, src);
1034     } else {
1035       assert(elem_bt == T_LONG, "required");
1036       assert(tmp == xmm0, "required");
1037       assert_different_registers(dst, src, tmp);
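           // Same emulation for the signed 64-bit max: xmm0 = (src > dst) per
           // lane, then blendvpd picks src wherever src > dst, i.e. dst = max(dst, src).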
1038       movdqu(xmm0, src);
1039       pcmpgtq(xmm0, dst);
1040       blendvpd(dst, src);  // xmm0 as mask
1041     }
1042   }
1043 }
1044 
1045 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
1046                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
1047                                  int vlen_enc) {
1048   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1049 
1050   if (opcode == Op_MinV) {
1051     if (elem_bt == T_BYTE) {
1052       vpminsb(dst, src1, src2, vlen_enc);
1053     } else if (elem_bt == T_SHORT) {
1054       vpminsw(dst, src1, src2, vlen_enc);
1055     } else if (elem_bt == T_INT) {
1056       vpminsd(dst, src1, src2, vlen_enc);
1057     } else {
1058       assert(elem_bt == T_LONG, "required");
1059       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1060         vpminsq(dst, src1, src2, vlen_enc);
1061       } else {
1062         assert_different_registers(dst, src1, src2);
1063         vpcmpgtq(dst, src1, src2, vlen_enc);
1064         vblendvpd(dst, src1, src2, dst, vlen_enc);
1065       }
1066     }
1067   } else { // opcode == Op_MaxV
1068     if (elem_bt == T_BYTE) {
1069       vpmaxsb(dst, src1, src2, vlen_enc);
1070     } else if (elem_bt == T_SHORT) {
1071       vpmaxsw(dst, src1, src2, vlen_enc);
1072     } else if (elem_bt == T_INT) {
1073       vpmaxsd(dst, src1, src2, vlen_enc);
1074     } else {
1075       assert(elem_bt == T_LONG, "required");
1076       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1077         vpmaxsq(dst, src1, src2, vlen_enc);
1078       } else {
1079         assert_different_registers(dst, src1, src2);
1080         vpcmpgtq(dst, src1, src2, vlen_enc);
1081         vblendvpd(dst, src2, src1, dst, vlen_enc);
1082       }
1083     }
1084   }
1085 }
1086 
1087 // Float/Double min max
1088 
1089 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
1090                                    XMMRegister dst, XMMRegister a, XMMRegister b,
1091                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1092                                    int vlen_enc) {
1093   assert(UseAVX > 0, "required");
1094   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1095          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1096   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1097   assert_different_registers(a, b, tmp, atmp, btmp);
1098 
1099   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1100   bool is_double_word = is_double_word_type(elem_bt);
1101 
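       // The blend/min/cmp/blend sequences below implement Java Math.min/max
       // semantics on top of the asymmetric x86 vminps/vmaxps instructions:
       // the inputs are first swapped per lane, using the sign bit of one
       // operand as the blend selector, so that -0.0 vs +0.0 resolves the Java
       // way, and a final unordered compare re-inserts NaN lanes that the
       // min/max instruction would otherwise have dropped.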
1102   if (!is_double_word && is_min) {
1103     vblendvps(atmp, a, b, a, vlen_enc);
1104     vblendvps(btmp, b, a, a, vlen_enc);
1105     vminps(tmp, atmp, btmp, vlen_enc);
1106     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1107     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
1108   } else if (!is_double_word && !is_min) {
1109     vblendvps(btmp, b, a, b, vlen_enc);
1110     vblendvps(atmp, a, b, b, vlen_enc);
1111     vmaxps(tmp, atmp, btmp, vlen_enc);
1112     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1113     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
1114   } else if (is_double_word && is_min) {
1115     vblendvpd(atmp, a, b, a, vlen_enc);
1116     vblendvpd(btmp, b, a, a, vlen_enc);
1117     vminpd(tmp, atmp, btmp, vlen_enc);
1118     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1119     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
1120   } else {
1121     assert(is_double_word && !is_min, "sanity");
1122     vblendvpd(btmp, b, a, b, vlen_enc);
1123     vblendvpd(atmp, a, b, b, vlen_enc);
1124     vmaxpd(tmp, atmp, btmp, vlen_enc);
1125     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1126     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
1127   }
1128 }
1129 
1130 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1131                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1132                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1133                                     int vlen_enc) {
1134   assert(UseAVX > 2, "required");
1135   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1136          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1137   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1138   assert_different_registers(dst, a, b, atmp, btmp);
1139 
1140   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1141   bool is_double_word = is_double_word_type(elem_bt);
1142   bool merge = true;
1143 
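       // Same approach as vminmax_fp() above, but with AVX-512 mask registers:
       // evpmovd2m/evpmovq2m extracts the per-lane sign bits as the swap mask,
       // and the final masked evmovdqu re-inserts the NaN lanes flagged by
       // evcmpps/evcmppd.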
1144   if (!is_double_word && is_min) {
1145     evpmovd2m(ktmp, a, vlen_enc);
1146     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1147     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1148     vminps(dst, atmp, btmp, vlen_enc);
1149     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1150     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1151   } else if (!is_double_word && !is_min) {
1152     evpmovd2m(ktmp, b, vlen_enc);
1153     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1154     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1155     vmaxps(dst, atmp, btmp, vlen_enc);
1156     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1157     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1158   } else if (is_double_word && is_min) {
1159     evpmovq2m(ktmp, a, vlen_enc);
1160     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1161     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1162     vminpd(dst, atmp, btmp, vlen_enc);
1163     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1164     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1165   } else {
1166     assert(is_double_word && !is_min, "sanity");
1167     evpmovq2m(ktmp, b, vlen_enc);
1168     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1169     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1170     vmaxpd(dst, atmp, btmp, vlen_enc);
1171     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1172     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1173   }
1174 }
1175 
1176 // Float/Double signum
1177 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1178   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1179 
1180   Label DONE_LABEL;
1181 
1182   if (opcode == Op_SignumF) {
1183     assert(UseSSE > 0, "required");
1184     ucomiss(dst, zero);
1185     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1186     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1187     movflt(dst, one);
1188     jcc(Assembler::above, DONE_LABEL);
1189     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1190   } else if (opcode == Op_SignumD) {
1191     assert(UseSSE > 1, "required");
1192     ucomisd(dst, zero);
1193     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1194     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1195     movdbl(dst, one);
1196     jcc(Assembler::above, DONE_LABEL);
1197     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1198   }
1199 
1200   bind(DONE_LABEL);
1201 }
1202 
1203 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1204   if (sign) {
1205     pmovsxbw(dst, src);
1206   } else {
1207     pmovzxbw(dst, src);
1208   }
1209 }
1210 
1211 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1212   if (sign) {
1213     vpmovsxbw(dst, src, vector_len);
1214   } else {
1215     vpmovzxbw(dst, src, vector_len);
1216   }
1217 }
1218 
1219 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1220   if (sign) {
1221     vpmovsxbd(dst, src, vector_len);
1222   } else {
1223     vpmovzxbd(dst, src, vector_len);
1224   }
1225 }
1226 
1227 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1228   if (sign) {
1229     vpmovsxwd(dst, src, vector_len);
1230   } else {
1231     vpmovzxwd(dst, src, vector_len);
1232   }
1233 }
1234 
1235 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1236                                      int shift, int vector_len) {
1237   if (opcode == Op_RotateLeftV) {
1238     if (etype == T_INT) {
1239       evprold(dst, src, shift, vector_len);
1240     } else {
1241       assert(etype == T_LONG, "expected type T_LONG");
1242       evprolq(dst, src, shift, vector_len);
1243     }
1244   } else {
1245     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1246     if (etype == T_INT) {
1247       evprord(dst, src, shift, vector_len);
1248     } else {
1249       assert(etype == T_LONG, "expected type T_LONG");
1250       evprorq(dst, src, shift, vector_len);
1251     }
1252   }
1253 }
1254 
1255 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1256                                      XMMRegister shift, int vector_len) {
1257   if (opcode == Op_RotateLeftV) {
1258     if (etype == T_INT) {
1259       evprolvd(dst, src, shift, vector_len);
1260     } else {
1261       assert(etype == T_LONG, "expected type T_LONG");
1262       evprolvq(dst, src, shift, vector_len);
1263     }
1264   } else {
1265     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1266     if (etype == T_INT) {
1267       evprorvd(dst, src, shift, vector_len);
1268     } else {
1269       assert(etype == T_LONG, "expected type T_LONG");
1270       evprorvq(dst, src, shift, vector_len);
1271     }
1272   }
1273 }
1274 
1275 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1276   if (opcode == Op_RShiftVI) {
1277     psrad(dst, shift);
1278   } else if (opcode == Op_LShiftVI) {
1279     pslld(dst, shift);
1280   } else {
1281     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1282     psrld(dst, shift);
1283   }
1284 }
1285 
1286 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1287   switch (opcode) {
1288     case Op_RShiftVI:  psrad(dst, shift); break;
1289     case Op_LShiftVI:  pslld(dst, shift); break;
1290     case Op_URShiftVI: psrld(dst, shift); break;
1291 
1292     default: assert(false, "%s", NodeClassNames[opcode]);
1293   }
1294 }
1295 
1296 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1297   if (opcode == Op_RShiftVI) {
1298     vpsrad(dst, nds, shift, vector_len);
1299   } else if (opcode == Op_LShiftVI) {
1300     vpslld(dst, nds, shift, vector_len);
1301   } else {
1302     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1303     vpsrld(dst, nds, shift, vector_len);
1304   }
1305 }
1306 
1307 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1308   switch (opcode) {
1309     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1310     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1311     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1312 
1313     default: assert(false, "%s", NodeClassNames[opcode]);
1314   }
1315 }
1316 
1317 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1318   switch (opcode) {
1319     case Op_RShiftVB:  // fall-through
1320     case Op_RShiftVS:  psraw(dst, shift); break;
1321 
1322     case Op_LShiftVB:  // fall-through
1323     case Op_LShiftVS:  psllw(dst, shift);   break;
1324 
1325     case Op_URShiftVS: // fall-through
1326     case Op_URShiftVB: psrlw(dst, shift);  break;
1327 
1328     default: assert(false, "%s", NodeClassNames[opcode]);
1329   }
1330 }
1331 
1332 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1333   switch (opcode) {
1334     case Op_RShiftVB:  // fall-through
1335     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1336 
1337     case Op_LShiftVB:  // fall-through
1338     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1339 
1340     case Op_URShiftVS: // fall-through
1341     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1342 
1343     default: assert(false, "%s", NodeClassNames[opcode]);
1344   }
1345 }
1346 
1347 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1348   switch (opcode) {
1349     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1350     case Op_LShiftVL:  psllq(dst, shift); break;
1351     case Op_URShiftVL: psrlq(dst, shift); break;
1352 
1353     default: assert(false, "%s", NodeClassNames[opcode]);
1354   }
1355 }
1356 
1357 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1358   if (opcode == Op_RShiftVL) {
1359     psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1360   } else if (opcode == Op_LShiftVL) {
1361     psllq(dst, shift);
1362   } else {
1363     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1364     psrlq(dst, shift);
1365   }
1366 }
1367 
1368 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1369   switch (opcode) {
1370     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1371     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1372     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1373 
1374     default: assert(false, "%s", NodeClassNames[opcode]);
1375   }
1376 }
1377 
1378 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1379   if (opcode == Op_RShiftVL) {
1380     evpsraq(dst, nds, shift, vector_len);
1381   } else if (opcode == Op_LShiftVL) {
1382     vpsllq(dst, nds, shift, vector_len);
1383   } else {
1384     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1385     vpsrlq(dst, nds, shift, vector_len);
1386   }
1387 }
1388 
1389 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1390   switch (opcode) {
1391     case Op_RShiftVB:  // fall-through
1392     case Op_RShiftVS:  // fall-through
1393     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1394 
1395     case Op_LShiftVB:  // fall-through
1396     case Op_LShiftVS:  // fall-through
1397     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1398 
1399     case Op_URShiftVB: // fall-through
1400     case Op_URShiftVS: // fall-through
1401     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1402 
1403     default: assert(false, "%s", NodeClassNames[opcode]);
1404   }
1405 }
1406 
1407 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1408   switch (opcode) {
1409     case Op_RShiftVB:  // fall-through
1410     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1411 
1412     case Op_LShiftVB:  // fall-through
1413     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1414 
1415     case Op_URShiftVB: // fall-through
1416     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1417 
1418     default: assert(false, "%s", NodeClassNames[opcode]);
1419   }
1420 }
1421 
1422 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1423   assert(UseAVX >= 2, "required");
1424   switch (opcode) {
1425     case Op_RShiftVL: {
1426       if (UseAVX > 2) {
1427         assert(tmp == xnoreg, "not used");
1428         if (!VM_Version::supports_avx512vl()) {
1429           vlen_enc = Assembler::AVX_512bit;
1430         }
1431         evpsravq(dst, src, shift, vlen_enc);
1432       } else {
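             // Pre-AVX-512 has no 64-bit arithmetic right shift; emulate it as
             // ((x >>> s) ^ t) - t with t = (sign_mask >>> s), which re-extends the sign bit.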
1433         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1434         vpsrlvq(dst, src, shift, vlen_enc);
1435         vpsrlvq(tmp, tmp, shift, vlen_enc);
1436         vpxor(dst, dst, tmp, vlen_enc);
1437         vpsubq(dst, dst, tmp, vlen_enc);
1438       }
1439       break;
1440     }
1441     case Op_LShiftVL: {
1442       assert(tmp == xnoreg, "not used");
1443       vpsllvq(dst, src, shift, vlen_enc);
1444       break;
1445     }
1446     case Op_URShiftVL: {
1447       assert(tmp == xnoreg, "not used");
1448       vpsrlvq(dst, src, shift, vlen_enc);
1449       break;
1450     }
1451     default: assert(false, "%s", NodeClassNames[opcode]);
1452   }
1453 }
1454 
1455 // Variable shift src by shift using vtmp as a TEMP, giving a word result in dst
1456 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1457   assert(opcode == Op_LShiftVB ||
1458          opcode == Op_RShiftVB ||
1459          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1460   bool sign = (opcode != Op_URShiftVB);
1461   assert(vector_len == 0, "required");
1462   vextendbd(sign, dst, src, 1);
1463   vpmovzxbd(vtmp, shift, 1);
1464   varshiftd(opcode, dst, dst, vtmp, 1);
1465   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1466   vextracti128_high(vtmp, dst);
1467   vpackusdw(dst, dst, vtmp, 0);
1468 }
1469 
1470 // Variable shift src by shift using vtmp as a TEMP, giving a byte result in dst
1471 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1472   assert(opcode == Op_LShiftVB ||
1473          opcode == Op_RShiftVB ||
1474          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1475   bool sign = (opcode != Op_URShiftVB);
1476   int ext_vector_len = vector_len + 1;
1477   vextendbw(sign, dst, src, ext_vector_len);
1478   vpmovzxbw(vtmp, shift, ext_vector_len);
1479   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1480   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1481   if (vector_len == 0) {
1482     vextracti128_high(vtmp, dst);
1483     vpackuswb(dst, dst, vtmp, vector_len);
1484   } else {
1485     vextracti64x4_high(vtmp, dst);
1486     vpackuswb(dst, dst, vtmp, vector_len);
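         // vpackuswb packs independently within each 128-bit lane, so restore the
         // cross-lane element order with the vpermq below.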
1487     vpermq(dst, dst, 0xD8, vector_len);
1488   }
1489 }
1490 
1491 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1492   switch(typ) {
1493     case T_BYTE:
1494       pinsrb(dst, val, idx);
1495       break;
1496     case T_SHORT:
1497       pinsrw(dst, val, idx);
1498       break;
1499     case T_INT:
1500       pinsrd(dst, val, idx);
1501       break;
1502     case T_LONG:
1503       pinsrq(dst, val, idx);
1504       break;
1505     default:
1506       assert(false,"Should not reach here.");
1507       break;
1508   }
1509 }
1510 
1511 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1512   switch(typ) {
1513     case T_BYTE:
1514       vpinsrb(dst, src, val, idx);
1515       break;
1516     case T_SHORT:
1517       vpinsrw(dst, src, val, idx);
1518       break;
1519     case T_INT:
1520       vpinsrd(dst, src, val, idx);
1521       break;
1522     case T_LONG:
1523       vpinsrq(dst, src, val, idx);
1524       break;
1525     default:
1526       assert(false,"Should not reach here.");
1527       break;
1528   }
1529 }
1530 
1531 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1532   switch(typ) {
1533     case T_INT:
1534       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1535       break;
1536     case T_FLOAT:
1537       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1538       break;
1539     case T_LONG:
1540       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1541       break;
1542     case T_DOUBLE:
1543       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1544       break;
1545     default:
1546       assert(false,"Should not reach here.");
1547       break;
1548   }
1549 }
1550 
1551 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1552   switch(typ) {
1553     case T_INT:
1554       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1555       break;
1556     case T_FLOAT:
1557       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1558       break;
1559     case T_LONG:
1560       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1561       break;
1562     case T_DOUBLE:
1563       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1564       break;
1565     default:
1566       assert(false,"Should not reach here.");
1567       break;
1568   }
1569 }
1570 
1571 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1572   switch(typ) {
1573     case T_INT:
1574       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1575       break;
1576     case T_FLOAT:
1577       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1578       break;
1579     case T_LONG:
1580       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1581       break;
1582     case T_DOUBLE:
1583       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1584       break;
1585     default:
1586       assert(false,"Should not reach here.");
1587       break;
1588   }
1589 }
1590 
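     // Expand a boolean byte vector into an all-ones/all-zeros element mask. Assuming the
     // usual 0/1 byte encoding, 0 - x yields 0x00 or 0xFF per byte, which is then
     // sign-extended to the element width.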
1591 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1592   if (vlen_in_bytes <= 16) {
1593     pxor (dst, dst);
1594     psubb(dst, src);
1595     switch (elem_bt) {
1596       case T_BYTE:   /* nothing to do */ break;
1597       case T_SHORT:  pmovsxbw(dst, dst); break;
1598       case T_INT:    pmovsxbd(dst, dst); break;
1599       case T_FLOAT:  pmovsxbd(dst, dst); break;
1600       case T_LONG:   pmovsxbq(dst, dst); break;
1601       case T_DOUBLE: pmovsxbq(dst, dst); break;
1602 
1603       default: assert(false, "%s", type2name(elem_bt));
1604     }
1605   } else {
1606     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1607     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1608 
1609     vpxor (dst, dst, dst, vlen_enc);
1610     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1611 
1612     switch (elem_bt) {
1613       case T_BYTE:   /* nothing to do */            break;
1614       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1615       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1616       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1617       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1618       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1619 
1620       default: assert(false, "%s", type2name(elem_bt));
1621     }
1622   }
1623 }
1624 
1625 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1626   if (novlbwdq) {
1627     vpmovsxbd(xtmp, src, vlen_enc);
1628     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1629             Assembler::eq, true, vlen_enc, noreg);
1630   } else {
1631     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1632     vpsubb(xtmp, xtmp, src, vlen_enc);
1633     evpmovb2m(dst, xtmp, vlen_enc);
1634   }
1635 }
1636 
1637 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) {
1638   switch (vlen_in_bytes) {
1639     case 4:  movdl(dst, src);   break;
1640     case 8:  movq(dst, src);    break;
1641     case 16: movdqu(dst, src);  break;
1642     case 32: vmovdqu(dst, src); break;
1643     case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1644     default: ShouldNotReachHere();
1645   }
1646 }
1647 
1648 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1649   assert(rscratch != noreg || always_reachable(src), "missing");
1650 
1651   if (reachable(src)) {
1652     load_vector(dst, as_Address(src), vlen_in_bytes);
1653   } else {
1654     lea(rscratch, src);
1655     load_vector(dst, Address(rscratch, 0), vlen_in_bytes);
1656   }
1657 }
1658 
1659 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1660   int vlen_enc = vector_length_encoding(vlen);
1661   if (VM_Version::supports_avx()) {
1662     if (bt == T_LONG) {
1663       if (VM_Version::supports_avx2()) {
1664         vpbroadcastq(dst, src, vlen_enc);
1665       } else {
1666         vmovddup(dst, src, vlen_enc);
1667       }
1668     } else if (bt == T_DOUBLE) {
1669       if (vlen_enc != Assembler::AVX_128bit) {
1670         vbroadcastsd(dst, src, vlen_enc, noreg);
1671       } else {
1672         vmovddup(dst, src, vlen_enc);
1673       }
1674     } else {
1675       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1676         vpbroadcastd(dst, src, vlen_enc);
1677       } else {
1678         vbroadcastss(dst, src, vlen_enc);
1679       }
1680     }
1681   } else if (VM_Version::supports_sse3()) {
1682     movddup(dst, src);
1683   } else {
1684     movq(dst, src);
1685     if (vlen == 16) {
1686       punpcklqdq(dst, dst);
1687     }
1688   }
1689 }
1690 
1691 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1692   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
1693   int offset = exact_log2(type2aelembytes(bt)) << 6;
1694   if (is_floating_point_type(bt)) {
1695     offset += 128;
1696   }
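       // For example, T_INT gives (exact_log2(4) << 6) = 128 and T_FLOAT gives 128 + 128 = 256,
       // matching the B/S/I/L/F/D layout at offsets 0/64/128/192/256/320.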
1697   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1698   load_vector(dst, addr, vlen_in_bytes);
1699 }
1700 
1701 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
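     //
     // General scheme: the wider variants peel off the upper half of the vector and reduce
     // the problem to a narrower one, until a 128-bit tail folds the remaining elements into
     // the incoming accumulator (src1 for the integer variants, dst for the FP variants).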
1702 
1703 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1704   int vector_len = Assembler::AVX_128bit;
1705 
1706   switch (opcode) {
1707     case Op_AndReductionV:  pand(dst, src); break;
1708     case Op_OrReductionV:   por (dst, src); break;
1709     case Op_XorReductionV:  pxor(dst, src); break;
1710     case Op_MinReductionV:
1711       switch (typ) {
1712         case T_BYTE:        pminsb(dst, src); break;
1713         case T_SHORT:       pminsw(dst, src); break;
1714         case T_INT:         pminsd(dst, src); break;
1715         case T_LONG:        assert(UseAVX > 2, "required");
1716                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1717         default:            assert(false, "wrong type");
1718       }
1719       break;
1720     case Op_MaxReductionV:
1721       switch (typ) {
1722         case T_BYTE:        pmaxsb(dst, src); break;
1723         case T_SHORT:       pmaxsw(dst, src); break;
1724         case T_INT:         pmaxsd(dst, src); break;
1725         case T_LONG:        assert(UseAVX > 2, "required");
1726                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1727         default:            assert(false, "wrong type");
1728       }
1729       break;
1730     case Op_AddReductionVF: addss(dst, src); break;
1731     case Op_AddReductionVD: addsd(dst, src); break;
1732     case Op_AddReductionVI:
1733       switch (typ) {
1734         case T_BYTE:        paddb(dst, src); break;
1735         case T_SHORT:       paddw(dst, src); break;
1736         case T_INT:         paddd(dst, src); break;
1737         default:            assert(false, "wrong type");
1738       }
1739       break;
1740     case Op_AddReductionVL: paddq(dst, src); break;
1741     case Op_MulReductionVF: mulss(dst, src); break;
1742     case Op_MulReductionVD: mulsd(dst, src); break;
1743     case Op_MulReductionVI:
1744       switch (typ) {
1745         case T_SHORT:       pmullw(dst, src); break;
1746         case T_INT:         pmulld(dst, src); break;
1747         default:            assert(false, "wrong type");
1748       }
1749       break;
1750     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1751                             evpmullq(dst, dst, src, vector_len); break;
1752     default:                assert(false, "wrong opcode");
1753   }
1754 }
1755 
1756 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1757   int vector_len = Assembler::AVX_256bit;
1758 
1759   switch (opcode) {
1760     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1761     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1762     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1763     case Op_MinReductionV:
1764       switch (typ) {
1765         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1766         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1767         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1768         case T_LONG:        assert(UseAVX > 2, "required");
1769                             vpminsq(dst, src1, src2, vector_len); break;
1770         default:            assert(false, "wrong type");
1771       }
1772       break;
1773     case Op_MaxReductionV:
1774       switch (typ) {
1775         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1776         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1777         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1778         case T_LONG:        assert(UseAVX > 2, "required");
1779                             vpmaxsq(dst, src1, src2, vector_len); break;
1780         default:            assert(false, "wrong type");
1781       }
1782       break;
1783     case Op_AddReductionVI:
1784       switch (typ) {
1785         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1786         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1787         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1788         default:            assert(false, "wrong type");
1789       }
1790       break;
1791     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1792     case Op_MulReductionVI:
1793       switch (typ) {
1794         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1795         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1796         default:            assert(false, "wrong type");
1797       }
1798       break;
1799     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
1800     default:                assert(false, "wrong opcode");
1801   }
1802 }
1803 
1804 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1805                                   XMMRegister dst, XMMRegister src,
1806                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1807   switch (opcode) {
1808     case Op_AddReductionVF:
1809     case Op_MulReductionVF:
1810       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1811       break;
1812 
1813     case Op_AddReductionVD:
1814     case Op_MulReductionVD:
1815       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1816       break;
1817 
1818     default: assert(false, "wrong opcode");
1819   }
1820 }
1821 
1822 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1823                              Register dst, Register src1, XMMRegister src2,
1824                              XMMRegister vtmp1, XMMRegister vtmp2) {
1825   switch (vlen) {
1826     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1827     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1828     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1829     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1830 
1831     default: assert(false, "wrong vector length");
1832   }
1833 }
1834 
1835 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1836                              Register dst, Register src1, XMMRegister src2,
1837                              XMMRegister vtmp1, XMMRegister vtmp2) {
1838   switch (vlen) {
1839     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1840     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1841     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1842     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1843 
1844     default: assert(false, "wrong vector length");
1845   }
1846 }
1847 
1848 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1849                              Register dst, Register src1, XMMRegister src2,
1850                              XMMRegister vtmp1, XMMRegister vtmp2) {
1851   switch (vlen) {
1852     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1853     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1854     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1855     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1856 
1857     default: assert(false, "wrong vector length");
1858   }
1859 }
1860 
1861 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1862                              Register dst, Register src1, XMMRegister src2,
1863                              XMMRegister vtmp1, XMMRegister vtmp2) {
1864   switch (vlen) {
1865     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1866     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1867     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1868     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1869 
1870     default: assert(false, "wrong vector length");
1871   }
1872 }
1873 
1874 #ifdef _LP64
1875 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1876                              Register dst, Register src1, XMMRegister src2,
1877                              XMMRegister vtmp1, XMMRegister vtmp2) {
1878   switch (vlen) {
1879     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1880     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1881     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1882 
1883     default: assert(false, "wrong vector length");
1884   }
1885 }
1886 #endif // _LP64
1887 
1888 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1889   switch (vlen) {
1890     case 2:
1891       assert(vtmp2 == xnoreg, "");
1892       reduce2F(opcode, dst, src, vtmp1);
1893       break;
1894     case 4:
1895       assert(vtmp2 == xnoreg, "");
1896       reduce4F(opcode, dst, src, vtmp1);
1897       break;
1898     case 8:
1899       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1900       break;
1901     case 16:
1902       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1903       break;
1904     default: assert(false, "wrong vector length");
1905   }
1906 }
1907 
1908 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1909   switch (vlen) {
1910     case 2:
1911       assert(vtmp2 == xnoreg, "");
1912       reduce2D(opcode, dst, src, vtmp1);
1913       break;
1914     case 4:
1915       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1916       break;
1917     case 8:
1918       reduce8D(opcode, dst, src, vtmp1, vtmp2);
1919       break;
1920     default: assert(false, "wrong vector length");
1921   }
1922 }
1923 
1924 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1925   if (opcode == Op_AddReductionVI) {
1926     if (vtmp1 != src2) {
1927       movdqu(vtmp1, src2);
1928     }
1929     phaddd(vtmp1, vtmp1);
1930   } else {
1931     pshufd(vtmp1, src2, 0x1);
1932     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1933   }
1934   movdl(vtmp2, src1);
1935   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1936   movdl(dst, vtmp1);
1937 }
1938 
1939 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1940   if (opcode == Op_AddReductionVI) {
1941     if (vtmp1 != src2) {
1942       movdqu(vtmp1, src2);
1943     }
1944     phaddd(vtmp1, src2);
1945     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1946   } else {
1947     pshufd(vtmp2, src2, 0xE);
1948     reduce_operation_128(T_INT, opcode, vtmp2, src2);
1949     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1950   }
1951 }
1952 
1953 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1954   if (opcode == Op_AddReductionVI) {
1955     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
1956     vextracti128_high(vtmp2, vtmp1);
1957     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
1958     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1959   } else {
1960     vextracti128_high(vtmp1, src2);
1961     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1962     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1963   }
1964 }
1965 
1966 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1967   vextracti64x4_high(vtmp2, src2);
1968   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
1969   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1970 }
1971 
1972 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1973   pshufd(vtmp2, src2, 0x1);
1974   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1975   movdqu(vtmp1, vtmp2);
1976   psrldq(vtmp1, 2);
1977   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1978   movdqu(vtmp2, vtmp1);
1979   psrldq(vtmp2, 1);
1980   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1981   movdl(vtmp2, src1);
1982   pmovsxbd(vtmp1, vtmp1);
1983   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1984   pextrb(dst, vtmp1, 0x0);
1985   movsbl(dst, dst);
1986 }
1987 
1988 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1989   pshufd(vtmp1, src2, 0xE);
1990   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
1991   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1992 }
1993 
1994 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1995   vextracti128_high(vtmp2, src2);
1996   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1997   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1998 }
1999 
2000 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2001   vextracti64x4_high(vtmp1, src2);
2002   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2003   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2004 }
2005 
2006 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2007   pmovsxbw(vtmp2, src2);
2008   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2009 }
2010 
2011 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2012   if (UseAVX > 1) {
2013     int vector_len = Assembler::AVX_256bit;
2014     vpmovsxbw(vtmp1, src2, vector_len);
2015     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2016   } else {
2017     pmovsxbw(vtmp2, src2);
2018     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2019     pshufd(vtmp2, src2, 0xE);   // bring the upper 8 bytes of src2 into the low half
2020     pmovsxbw(vtmp2, vtmp2);     // sign-extend them to 8 words
2021     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2022   }
2023 }
2024 
2025 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2026   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2027     int vector_len = Assembler::AVX_512bit;
2028     vpmovsxbw(vtmp1, src2, vector_len);
2029     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2030   } else {
2031     assert(UseAVX >= 2,"Should not reach here.");
2032     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2033     vextracti128_high(vtmp2, src2);
2034     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2035   }
2036 }
2037 
2038 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2039   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2040   vextracti64x4_high(vtmp2, src2);
2041   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2042 }
2043 
2044 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2045   if (opcode == Op_AddReductionVI) {
2046     if (vtmp1 != src2) {
2047       movdqu(vtmp1, src2);
2048     }
2049     phaddw(vtmp1, vtmp1);
2050     phaddw(vtmp1, vtmp1);
2051   } else {
2052     pshufd(vtmp2, src2, 0x1);
2053     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2054     movdqu(vtmp1, vtmp2);
2055     psrldq(vtmp1, 2);
2056     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2057   }
2058   movdl(vtmp2, src1);
2059   pmovsxwd(vtmp1, vtmp1);
2060   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2061   pextrw(dst, vtmp1, 0x0);
2062   movswl(dst, dst);
2063 }
2064 
2065 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2066   if (opcode == Op_AddReductionVI) {
2067     if (vtmp1 != src2) {
2068       movdqu(vtmp1, src2);
2069     }
2070     phaddw(vtmp1, src2);
2071   } else {
2072     pshufd(vtmp1, src2, 0xE);
2073     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2074   }
2075   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2076 }
2077 
2078 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2079   if (opcode == Op_AddReductionVI) {
2080     int vector_len = Assembler::AVX_256bit;
2081     vphaddw(vtmp2, src2, src2, vector_len);
2082     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2083   } else {
2084     vextracti128_high(vtmp2, src2);
2085     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2086   }
2087   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2088 }
2089 
2090 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2091   int vector_len = Assembler::AVX_256bit;
2092   vextracti64x4_high(vtmp1, src2);
2093   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2094   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2095 }
2096 
2097 #ifdef _LP64
2098 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2099   pshufd(vtmp2, src2, 0xE);
2100   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2101   movdq(vtmp1, src1);
2102   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2103   movdq(dst, vtmp1);
2104 }
2105 
2106 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2107   vextracti128_high(vtmp1, src2);
2108   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2109   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2110 }
2111 
2112 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2113   vextracti64x4_high(vtmp2, src2);
2114   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2115   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2116 }
2117 
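     // Build a k-mask with the low 'len' bits set (e.g. len == 5 gives 0b11111): bzhiq
     // clears every bit at position >= len in the all-ones pattern.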
2118 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
2119   mov64(temp, -1L);
2120   bzhiq(temp, temp, len);
2121   kmovql(dst, temp);
2122 }
2123 #endif // _LP64
2124 
2125 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2126   reduce_operation_128(T_FLOAT, opcode, dst, src);
2127   pshufd(vtmp, src, 0x1);
2128   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2129 }
2130 
2131 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2132   reduce2F(opcode, dst, src, vtmp);
2133   pshufd(vtmp, src, 0x2);
2134   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2135   pshufd(vtmp, src, 0x3);
2136   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2137 }
2138 
2139 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2140   reduce4F(opcode, dst, src, vtmp2);
2141   vextractf128_high(vtmp2, src);
2142   reduce4F(opcode, dst, vtmp2, vtmp1);
2143 }
2144 
2145 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2146   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2147   vextracti64x4_high(vtmp1, src);
2148   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2149 }
2150 
2151 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2152   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2153   pshufd(vtmp, src, 0xE);
2154   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2155 }
2156 
2157 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2158   reduce2D(opcode, dst, src, vtmp2);
2159   vextractf128_high(vtmp2, src);
2160   reduce2D(opcode, dst, vtmp2, vtmp1);
2161 }
2162 
2163 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2164   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2165   vextracti64x4_high(vtmp1, src);
2166   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2167 }
2168 
2169 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2170   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2171 }
2172 
2173 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2174   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2175 }
2176 
2177 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2178                                  int vec_enc) {
2179   switch(elem_bt) {
2180     case T_INT:
2181     case T_FLOAT:
2182       vmaskmovps(dst, src, mask, vec_enc);
2183       break;
2184     case T_LONG:
2185     case T_DOUBLE:
2186       vmaskmovpd(dst, src, mask, vec_enc);
2187       break;
2188     default:
2189       fatal("Unsupported type %s", type2name(elem_bt));
2190       break;
2191   }
2192 }
2193 
2194 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2195                                  int vec_enc) {
2196   switch(elem_bt) {
2197     case T_INT:
2198     case T_FLOAT:
2199       vmaskmovps(dst, src, mask, vec_enc);
2200       break;
2201     case T_LONG:
2202     case T_DOUBLE:
2203       vmaskmovpd(dst, src, mask, vec_enc);
2204       break;
2205     default:
2206       fatal("Unsupported type %s", type2name(elem_bt));
2207       break;
2208   }
2209 }
2210 
2211 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2212                                           XMMRegister dst, XMMRegister src,
2213                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2214                                           XMMRegister xmm_0, XMMRegister xmm_1) {
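       // vpermilps selectors for the in-lane folding steps: 0x1 brings the neighbouring
       // float into element 0, 0xE (14) brings the upper 64 bits of the lane down.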
2215   const int permconst[] = {1, 14};
2216   XMMRegister wsrc = src;
2217   XMMRegister wdst = xmm_0;
2218   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2219 
2220   int vlen_enc = Assembler::AVX_128bit;
2221   if (vlen == 16) {
2222     vlen_enc = Assembler::AVX_256bit;
2223   }
2224 
2225   for (int i = log2(vlen) - 1; i >=0; i--) {
2226     if (i == 0 && !is_dst_valid) {
2227       wdst = dst;
2228     }
2229     if (i == 3) {
2230       vextracti64x4_high(wtmp, wsrc);
2231     } else if (i == 2) {
2232       vextracti128_high(wtmp, wsrc);
2233     } else { // i = [0,1]
2234       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2235     }
2236     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2237     wsrc = wdst;
2238     vlen_enc = Assembler::AVX_128bit;
2239   }
2240   if (is_dst_valid) {
2241     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2242   }
2243 }
2244 
2245 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2246                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2247                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2248   XMMRegister wsrc = src;
2249   XMMRegister wdst = xmm_0;
2250   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2251   int vlen_enc = Assembler::AVX_128bit;
2252   if (vlen == 8) {
2253     vlen_enc = Assembler::AVX_256bit;
2254   }
2255   for (int i = log2(vlen) - 1; i >=0; i--) {
2256     if (i == 0 && !is_dst_valid) {
2257       wdst = dst;
2258     }
2259     if (i == 1) {
2260       vextracti128_high(wtmp, wsrc);
2261     } else if (i == 2) {
2262       vextracti64x4_high(wtmp, wsrc);
2263     } else {
2264       assert(i == 0, "%d", i);
2265       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2266     }
2267     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2268     wsrc = wdst;
2269     vlen_enc = Assembler::AVX_128bit;
2270   }
2271   if (is_dst_valid) {
2272     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2273   }
2274 }
2275 
2276 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2277   switch (bt) {
2278     case T_BYTE:  pextrb(dst, src, idx); break;
2279     case T_SHORT: pextrw(dst, src, idx); break;
2280     case T_INT:   pextrd(dst, src, idx); break;
2281     case T_LONG:  pextrq(dst, src, idx); break;
2282 
2283     default:
2284       assert(false,"Should not reach here.");
2285       break;
2286   }
2287 }
2288 
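     // Returns the XMM register holding the 128-bit lane that contains 'elemindex':
     // lane 0 is read directly from 'src', higher lanes are first extracted into 'dst'.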
2289 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2290   int esize =  type2aelembytes(typ);
2291   int elem_per_lane = 16/esize;
2292   int lane = elemindex / elem_per_lane;
2293   int eindex = elemindex % elem_per_lane;
2294 
2295   if (lane >= 2) {
2296     assert(UseAVX > 2, "required");
2297     vextractf32x4(dst, src, lane & 3);
2298     return dst;
2299   } else if (lane > 0) {
2300     assert(UseAVX > 0, "required");
2301     vextractf128(dst, src, lane);
2302     return dst;
2303   } else {
2304     return src;
2305   }
2306 }
2307 
2308 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2309   if (typ == T_BYTE) {
2310     movsbl(dst, dst);
2311   } else if (typ == T_SHORT) {
2312     movswl(dst, dst);
2313   }
2314 }
2315 
2316 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2317   int esize =  type2aelembytes(typ);
2318   int elem_per_lane = 16/esize;
2319   int eindex = elemindex % elem_per_lane;
2320   assert(is_integral_type(typ),"required");
2321 
2322   if (eindex == 0) {
2323     if (typ == T_LONG) {
2324       movq(dst, src);
2325     } else {
2326       movdl(dst, src);
2327       movsxl(typ, dst);
2328     }
2329   } else {
2330     extract(typ, dst, src, eindex);
2331     movsxl(typ, dst);
2332   }
2333 }
2334 
2335 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2336   int esize =  type2aelembytes(typ);
2337   int elem_per_lane = 16/esize;
2338   int eindex = elemindex % elem_per_lane;
2339   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2340 
2341   if (eindex == 0) {
2342     movq(dst, src);
2343   } else {
2344     if (typ == T_FLOAT) {
2345       if (UseAVX == 0) {
2346         movdqu(dst, src);
2347         shufps(dst, dst, eindex);
2348       } else {
2349         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2350       }
2351     } else {
2352       if (UseAVX == 0) {
2353         movdqu(dst, src);
2354         psrldq(dst, eindex*esize);
2355       } else {
2356         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2357       }
2358       movq(dst, dst);
2359     }
2360   }
2361   // Zero upper bits
2362   if (typ == T_FLOAT) {
2363     if (UseAVX == 0) {
2364       assert(vtmp != xnoreg, "required.");
2365       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2366       pand(dst, vtmp);
2367     } else {
2368       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2369     }
2370   }
2371 }
2372 
2373 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2374   switch(typ) {
2375     case T_BYTE:
2376     case T_BOOLEAN:
2377       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2378       break;
2379     case T_SHORT:
2380     case T_CHAR:
2381       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2382       break;
2383     case T_INT:
2384     case T_FLOAT:
2385       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2386       break;
2387     case T_LONG:
2388     case T_DOUBLE:
2389       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2390       break;
2391     default:
2392       assert(false,"Should not reach here.");
2393       break;
2394   }
2395 }
2396 
2397 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2398   assert(rscratch != noreg || always_reachable(src2), "missing");
2399 
2400   switch(typ) {
2401     case T_BOOLEAN:
2402     case T_BYTE:
2403       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2404       break;
2405     case T_CHAR:
2406     case T_SHORT:
2407       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2408       break;
2409     case T_INT:
2410     case T_FLOAT:
2411       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2412       break;
2413     case T_LONG:
2414     case T_DOUBLE:
2415       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2416       break;
2417     default:
2418       assert(false,"Should not reach here.");
2419       break;
2420   }
2421 }
2422 
2423 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2424   switch(typ) {
2425     case T_BYTE:
2426       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2427       break;
2428     case T_SHORT:
2429       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2430       break;
2431     case T_INT:
2432     case T_FLOAT:
2433       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2434       break;
2435     case T_LONG:
2436     case T_DOUBLE:
2437       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2438       break;
2439     default:
2440       assert(false,"Should not reach here.");
2441       break;
2442   }
2443 }
2444 
2445 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2446   assert(vlen_in_bytes <= 32, "");
2447   int esize = type2aelembytes(bt);
2448   if (vlen_in_bytes == 32) {
2449     assert(vtmp == xnoreg, "required.");
2450     if (esize >= 4) {
2451       vtestps(src1, src2, AVX_256bit);
2452     } else {
2453       vptest(src1, src2, AVX_256bit);
2454     }
2455     return;
2456   }
2457   if (vlen_in_bytes < 16) {
2458     // Duplicate the lower part to fill the whole register;
2459     // there is no need to do so for src2.
2460     assert(vtmp != xnoreg, "required");
2461     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2462     pshufd(vtmp, src1, shuffle_imm);
2463   } else {
2464     assert(vtmp == xnoreg, "required");
2465     vtmp = src1;
2466   }
2467   if (esize >= 4 && VM_Version::supports_avx()) {
2468     vtestps(vtmp, src2, AVX_128bit);
2469   } else {
2470     ptest(vtmp, src2);
2471   }
2472 }
2473 
2474 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2475   assert(UseAVX >= 2, "required");
2476 #ifdef ASSERT
2477   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2478   bool is_bw_supported = VM_Version::supports_avx512bw();
2479   if (is_bw && !is_bw_supported) {
2480     assert(vlen_enc != Assembler::AVX_512bit, "required");
2481     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2482            "XMM register should be 0-15");
2483   }
2484 #endif // ASSERT
2485   switch (elem_bt) {
2486     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2487     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2488     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2489     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2490     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2491     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2492     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2493   }
2494 }
2495 
2496 #ifdef _LP64
2497 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2498   assert(UseAVX >= 2, "required");
2499   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2500   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2501   if ((UseAVX > 2) &&
2502       (!is_bw || VM_Version::supports_avx512bw()) &&
2503       (!is_vl || VM_Version::supports_avx512vl())) {
2504     switch (elem_bt) {
2505       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2506       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2507       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2508       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2509       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2510     }
2511   } else {
2512     assert(vlen_enc != Assembler::AVX_512bit, "required");
2513     assert((dst->encoding() < 16),"XMM register should be 0-15");
2514     switch (elem_bt) {
2515       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2516       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2517       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2518       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2519       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2520       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2521       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2522     }
2523   }
2524 }
2525 #endif
2526 
2527 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2528   switch (to_elem_bt) {
2529     case T_SHORT:
2530       vpmovsxbw(dst, src, vlen_enc);
2531       break;
2532     case T_INT:
2533       vpmovsxbd(dst, src, vlen_enc);
2534       break;
2535     case T_FLOAT:
2536       vpmovsxbd(dst, src, vlen_enc);
2537       vcvtdq2ps(dst, dst, vlen_enc);
2538       break;
2539     case T_LONG:
2540       vpmovsxbq(dst, src, vlen_enc);
2541       break;
2542     case T_DOUBLE: {
2543       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2544       vpmovsxbd(dst, src, mid_vlen_enc);
2545       vcvtdq2pd(dst, dst, vlen_enc);
2546       break;
2547     }
2548     default:
2549       fatal("Unsupported type %s", type2name(to_elem_bt));
2550       break;
2551   }
2552 }
2553 
2554 //-------------------------------------------------------------------------------------------
2555 
2556 // IndexOf for constant substrings with size >= 8 chars
2557 // which don't need to be loaded through the stack.
2558 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2559                                          Register cnt1, Register cnt2,
2560                                          int int_cnt2,  Register result,
2561                                          XMMRegister vec, Register tmp,
2562                                          int ae) {
2563   ShortBranchVerifier sbv(this);
2564   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2565   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2566 
2567   // This method uses the pcmpestri instruction with bound registers
2568   //   inputs:
2569   //     xmm - substring
2570   //     rax - substring length (elements count)
2571   //     mem - scanned string
2572   //     rdx - string length (elements count)
2573   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2574   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2575   //   outputs:
2576   //     rcx - matched index in string
2577   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
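       // Flag usage after pcmpestri below: CF==1 means some element matched, with the
       // matched index left in tmp (rcx); OF==1 means the match starts at element 0.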
2578   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2579   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2580   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2581   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2582 
2583   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2584         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2585         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2586 
2587   // Note, inline_string_indexOf() generates checks:
2588   // if (substr.count > string.count) return -1;
2589   // if (substr.count == 0) return 0;
2590   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2591 
2592   // Load substring.
2593   if (ae == StrIntrinsicNode::UL) {
2594     pmovzxbw(vec, Address(str2, 0));
2595   } else {
2596     movdqu(vec, Address(str2, 0));
2597   }
2598   movl(cnt2, int_cnt2);
2599   movptr(result, str1); // string addr
2600 
2601   if (int_cnt2 > stride) {
2602     jmpb(SCAN_TO_SUBSTR);
2603 
2604     // Reload substr for rescan; this code
2605     // is executed only for large substrings (> 8 chars).
2606     bind(RELOAD_SUBSTR);
2607     if (ae == StrIntrinsicNode::UL) {
2608       pmovzxbw(vec, Address(str2, 0));
2609     } else {
2610       movdqu(vec, Address(str2, 0));
2611     }
2612     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2613 
2614     bind(RELOAD_STR);
2615     // We came here after the beginning of the substring was
2616     // matched but the rest of it was not, so we need to search
2617     // again. Start from the next element after the previous match.
2618 
2619     // cnt2 is the number of remaining substring elements and
2620     // cnt1 is the number of remaining string elements when the compare failed.
2621     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2622     subl(cnt1, cnt2);
2623     addl(cnt1, int_cnt2);
2624     movl(cnt2, int_cnt2); // Now restore cnt2
2625 
2626     decrementl(cnt1);     // Shift to next element
2627     cmpl(cnt1, cnt2);
2628     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2629 
2630     addptr(result, (1<<scale1));
2631 
2632   } // (int_cnt2 > 8)
2633 
2634   // Scan string for start of substr in 16-byte vectors
2635   bind(SCAN_TO_SUBSTR);
2636   pcmpestri(vec, Address(result, 0), mode);
2637   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2638   subl(cnt1, stride);
2639   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2640   cmpl(cnt1, cnt2);
2641   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2642   addptr(result, 16);
2643   jmpb(SCAN_TO_SUBSTR);
2644 
2645   // Found a potential substr
2646   bind(FOUND_CANDIDATE);
2647   // Matched whole vector if first element matched (tmp(rcx) == 0).
2648   if (int_cnt2 == stride) {
2649     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2650   } else { // int_cnt2 > 8
2651     jccb(Assembler::overflow, FOUND_SUBSTR);
2652   }
2653   // After pcmpestri tmp(rcx) contains matched element index
2654   // Compute start addr of substr
2655   lea(result, Address(result, tmp, scale1));
2656 
2657   // Make sure string is still long enough
2658   subl(cnt1, tmp);
2659   cmpl(cnt1, cnt2);
2660   if (int_cnt2 == stride) {
2661     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2662   } else { // int_cnt2 > 8
2663     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2664   }
2665   // Left less than substring.
2666 
2667   bind(RET_NOT_FOUND);
2668   movl(result, -1);
2669   jmp(EXIT);
2670 
2671   if (int_cnt2 > stride) {
2672     // This code is optimized for the case when the whole substring
2673     // is matched once its head is matched.
2674     bind(MATCH_SUBSTR_HEAD);
2675     pcmpestri(vec, Address(result, 0), mode);
2676     // Reload only string if does not match
2677     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2678 
2679     Label CONT_SCAN_SUBSTR;
2680     // Compare the rest of substring (> 8 chars).
2681     bind(FOUND_SUBSTR);
2682     // First 8 chars are already matched.
2683     negptr(cnt2);
2684     addptr(cnt2, stride);
2685 
2686     bind(SCAN_SUBSTR);
2687     subl(cnt1, stride);
2688     cmpl(cnt2, -stride); // Do not read beyond substring
2689     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2690     // Back-up strings to avoid reading beyond substring:
2691     // cnt1 = cnt1 - cnt2 + 8
2692     addl(cnt1, cnt2); // cnt2 is negative
2693     addl(cnt1, stride);
2694     movl(cnt2, stride); negptr(cnt2);
2695     bind(CONT_SCAN_SUBSTR);
2696     if (int_cnt2 < (int)G) {
2697       int tail_off1 = int_cnt2<<scale1;
2698       int tail_off2 = int_cnt2<<scale2;
2699       if (ae == StrIntrinsicNode::UL) {
2700         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2701       } else {
2702         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2703       }
2704       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2705     } else {
2706       // calculate index in register to avoid integer overflow (int_cnt2*2)
2707       movl(tmp, int_cnt2);
2708       addptr(tmp, cnt2);
2709       if (ae == StrIntrinsicNode::UL) {
2710         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2711       } else {
2712         movdqu(vec, Address(str2, tmp, scale2, 0));
2713       }
2714       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2715     }
2716     // Need to reload strings pointers if not matched whole vector
2717     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2718     addptr(cnt2, stride);
2719     jcc(Assembler::negative, SCAN_SUBSTR);
2720     // Fall through if found full substring
2721 
2722   } // (int_cnt2 > 8)
2723 
2724   bind(RET_FOUND);
2725   // Found result if we matched full small substring.
2726   // Compute substr offset
2727   subptr(result, str1);
2728   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2729     shrl(result, 1); // index
2730   }
2731   bind(EXIT);
2732 
2733 } // string_indexofC8
2734 
2735 // Small strings are loaded through the stack if they cross a page boundary.
2736 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2737                                        Register cnt1, Register cnt2,
2738                                        int int_cnt2,  Register result,
2739                                        XMMRegister vec, Register tmp,
2740                                        int ae) {
2741   ShortBranchVerifier sbv(this);
2742   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2743   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2744 
2745   //
2746   // int_cnt2 is length of small (< 8 chars) constant substring
2747   // or (-1) for non constant substring in which case its length
2748   // is in cnt2 register.
2749   //
2750   // Note, inline_string_indexOf() generates checks:
2751   // if (substr.count > string.count) return -1;
2752   // if (substr.count == 0) return 0;
2753   //
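       // Roughly equivalent Java-level loop (an explanatory sketch, not the
       // exact library source; names are illustrative):
       //
       //   static int indexOf(char[] str, int strLen, char[] sub, int subLen) {
       //     for (int i = 0; i <= strLen - subLen; i++) {
       //       int j = 0;
       //       while (j < subLen && str[i + j] == sub[j]) j++;
       //       if (j == subLen) return i;   // whole substring matched at i
       //     }
       //     return -1;                     // not found
       //   }
       //
       // The code below vectorizes this scan with pcmpestri, 16 bytes at a time.
       //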
2754   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2755   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2756   // This method uses the pcmpestri instruction with bound registers
2757   //   inputs:
2758   //     xmm - substring
2759   //     rax - substring length (elements count)
2760   //     mem - scanned string
2761   //     rdx - string length (elements count)
2762   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2763   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2764   //   outputs:
2765   //     rcx - matched index in string
2766   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2767   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2768   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2769   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2770 
2771   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2772         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2773         FOUND_CANDIDATE;
2774 
2775   { //========================================================
2776     // We don't know where these strings are located
2777     // and we can't read beyond them. Load them through the stack.
2778     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2779 
2780     movptr(tmp, rsp); // save old SP
2781 
2782     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2783       if (int_cnt2 == (1>>scale2)) { // One byte
2784         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2785         load_unsigned_byte(result, Address(str2, 0));
2786         movdl(vec, result); // move 32 bits
2787       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2788         // The generic 16-byte tail load below needs header+data >= 16 bytes,
             // but in a 32-bit VM the 12-byte header + 3 bytes = 15. Instead, load
             // 4 bytes ending at the data end and shift out the header byte.
2789         movl(result, Address(str2, -1));
2790         shrl(result, 8);
2791         movdl(vec, result); // move 32 bits
2792       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2793         load_unsigned_short(result, Address(str2, 0));
2794         movdl(vec, result); // move 32 bits
2795       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2796         movdl(vec, Address(str2, 0)); // move 32 bits
2797       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2798         movq(vec, Address(str2, 0));  // move 64 bits
2799       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
2800         // Array header size is 12 bytes in 32-bit VM
2801         // + 6 bytes for 3 chars == 18 bytes,
2802         // enough space to load vec and shift.
2803         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2804         if (ae == StrIntrinsicNode::UL) {
2805           int tail_off = int_cnt2-8;
2806           pmovzxbw(vec, Address(str2, tail_off));
2807           psrldq(vec, -2*tail_off);
2808         }
2809         else {
2810           int tail_off = int_cnt2*(1<<scale2);
2811           movdqu(vec, Address(str2, tail_off-16));
2812           psrldq(vec, 16-tail_off);
2813         }
2814       }
2815     } else { // not constant substring
2816       cmpl(cnt2, stride);
2817       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2818 
2819       // We can read beyond the string if str+16 does not cross a page boundary
2820       // since heaps are aligned and mapped by pages.
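           // Worked example (explanatory, assuming 4K pages): if str2 & 0xfff
           // is 0xff8, a 16-byte load from str2 would touch the next page,
           // which may be unmapped; such strings are copied to the stack below.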
2821       assert(os::vm_page_size() < (int)G, "default page should be small");
2822       movl(result, str2); // We need only low 32 bits
2823       andl(result, ((int)os::vm_page_size()-1));
2824       cmpl(result, ((int)os::vm_page_size()-16));
2825       jccb(Assembler::belowEqual, CHECK_STR);
2826 
2827       // Move small strings to the stack so that 16 bytes can be loaded into vec.
2828       subptr(rsp, 16);
2829       int stk_offset = wordSize-(1<<scale2);
2830       push(cnt2);
2831 
2832       bind(COPY_SUBSTR);
2833       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2834         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2835         movb(Address(rsp, cnt2, scale2, stk_offset), result);
2836       } else if (ae == StrIntrinsicNode::UU) {
2837         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2838         movw(Address(rsp, cnt2, scale2, stk_offset), result);
2839       }
2840       decrement(cnt2);
2841       jccb(Assembler::notZero, COPY_SUBSTR);
2842 
2843       pop(cnt2);
2844       movptr(str2, rsp);  // New substring address
2845     } // non constant
2846 
2847     bind(CHECK_STR);
2848     cmpl(cnt1, stride);
2849     jccb(Assembler::aboveEqual, BIG_STRINGS);
2850 
2851     // Check cross page boundary.
2852     movl(result, str1); // We need only low 32 bits
2853     andl(result, ((int)os::vm_page_size()-1));
2854     cmpl(result, ((int)os::vm_page_size()-16));
2855     jccb(Assembler::belowEqual, BIG_STRINGS);
2856 
2857     subptr(rsp, 16);
2858     int stk_offset = -(1<<scale1);
2859     if (int_cnt2 < 0) { // not constant
2860       push(cnt2);
2861       stk_offset += wordSize;
2862     }
2863     movl(cnt2, cnt1);
2864 
2865     bind(COPY_STR);
2866     if (ae == StrIntrinsicNode::LL) {
2867       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
2868       movb(Address(rsp, cnt2, scale1, stk_offset), result);
2869     } else {
2870       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
2871       movw(Address(rsp, cnt2, scale1, stk_offset), result);
2872     }
2873     decrement(cnt2);
2874     jccb(Assembler::notZero, COPY_STR);
2875 
2876     if (int_cnt2 < 0) { // not constant
2877       pop(cnt2);
2878     }
2879     movptr(str1, rsp);  // New string address
2880 
2881     bind(BIG_STRINGS);
2882     // Load substring.
2883     if (int_cnt2 < 0) { // -1
2884       if (ae == StrIntrinsicNode::UL) {
2885         pmovzxbw(vec, Address(str2, 0));
2886       } else {
2887         movdqu(vec, Address(str2, 0));
2888       }
2889       push(cnt2);       // substr count
2890       push(str2);       // substr addr
2891       push(str1);       // string addr
2892     } else {
2893       // Small (< 8 chars) constant substrings are loaded already.
2894       movl(cnt2, int_cnt2);
2895     }
2896     push(tmp);  // original SP
2897 
2898   } // Finished loading
2899 
2900   //========================================================
2901   // Start search
2902   //
2903 
2904   movptr(result, str1); // string addr
2905 
2906   if (int_cnt2  < 0) {  // Only for non constant substring
2907     jmpb(SCAN_TO_SUBSTR);
2908 
2909     // SP saved at sp+0
2910     // String saved at sp+1*wordSize
2911     // Substr saved at sp+2*wordSize
2912     // Substr count saved at sp+3*wordSize
2913 
2914     // Reload substr for rescan; this code is
2915     // executed only for large substrings (> 8 chars)
2916     bind(RELOAD_SUBSTR);
2917     movptr(str2, Address(rsp, 2*wordSize));
2918     movl(cnt2, Address(rsp, 3*wordSize));
2919     if (ae == StrIntrinsicNode::UL) {
2920       pmovzxbw(vec, Address(str2, 0));
2921     } else {
2922       movdqu(vec, Address(str2, 0));
2923     }
2924     // We came here after the beginning of the substring was
2925     // matched but the rest of it was not, so we need to search
2926     // again. Start from the next element after the previous match.
2927     subptr(str1, result); // Restore counter
2928     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2929       shrl(str1, 1);
2930     }
2931     addl(cnt1, str1);
2932     decrementl(cnt1);   // Shift to next element
2933     cmpl(cnt1, cnt2);
2934     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2935 
2936     addptr(result, (1<<scale1));
2937   } // non constant
2938 
2939   // Scan string for start of substr in 16-byte vectors
2940   bind(SCAN_TO_SUBSTR);
2941   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2942   pcmpestri(vec, Address(result, 0), mode);
2943   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2944   subl(cnt1, stride);
2945   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2946   cmpl(cnt1, cnt2);
2947   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2948   addptr(result, 16);
2949 
2950   bind(ADJUST_STR);
2951   cmpl(cnt1, stride); // Do not read beyond string
2952   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2953   // Back-up string to avoid reading beyond string.
2954   lea(result, Address(result, cnt1, scale1, -16));
2955   movl(cnt1, stride);
2956   jmpb(SCAN_TO_SUBSTR);
2957 
2958   // Found a potential substr
2959   bind(FOUND_CANDIDATE);
2960   // After pcmpestri tmp(rcx) contains matched element index
2961 
2962   // Make sure string is still long enough
2963   subl(cnt1, tmp);
2964   cmpl(cnt1, cnt2);
2965   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
2966   // Left less than substring.
2967 
2968   bind(RET_NOT_FOUND);
2969   movl(result, -1);
2970   jmp(CLEANUP);
2971 
2972   bind(FOUND_SUBSTR);
2973   // Compute start addr of substr
2974   lea(result, Address(result, tmp, scale1));
2975   if (int_cnt2 > 0) { // Constant substring
2976     // Repeat search for small substring (< 8 chars)
2977     // from new point without reloading substring.
2978     // Have to check that we don't read beyond string.
2979     cmpl(tmp, stride-int_cnt2);
2980     jccb(Assembler::greater, ADJUST_STR);
2981     // Fall through if matched whole substring.
2982   } else { // non constant
2983     assert(int_cnt2 == -1, "should be != 0");
2984 
2985     addl(tmp, cnt2);
2986     // Found result if we matched whole substring.
2987     cmpl(tmp, stride);
2988     jcc(Assembler::lessEqual, RET_FOUND);
2989 
2990     // Repeat search for small substring (<= 8 chars)
2991     // from new point 'str1' without reloading substring.
2992     cmpl(cnt2, stride);
2993     // Have to check that we don't read beyond string.
2994     jccb(Assembler::lessEqual, ADJUST_STR);
2995 
2996     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
2997     // Compare the rest of substring (> 8 chars).
2998     movptr(str1, result);
2999 
3000     cmpl(tmp, cnt2);
3001     // First 8 chars are already matched.
3002     jccb(Assembler::equal, CHECK_NEXT);
3003 
3004     bind(SCAN_SUBSTR);
3005     pcmpestri(vec, Address(str1, 0), mode);
3006     // Need to reload strings pointers if not matched whole vector
3007     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3008 
3009     bind(CHECK_NEXT);
3010     subl(cnt2, stride);
3011     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3012     addptr(str1, 16);
3013     if (ae == StrIntrinsicNode::UL) {
3014       addptr(str2, 8);
3015     } else {
3016       addptr(str2, 16);
3017     }
3018     subl(cnt1, stride);
3019     cmpl(cnt2, stride); // Do not read beyond substring
3020     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3021     // Back-up strings to avoid reading beyond substring.
3022 
3023     if (ae == StrIntrinsicNode::UL) {
3024       lea(str2, Address(str2, cnt2, scale2, -8));
3025       lea(str1, Address(str1, cnt2, scale1, -16));
3026     } else {
3027       lea(str2, Address(str2, cnt2, scale2, -16));
3028       lea(str1, Address(str1, cnt2, scale1, -16));
3029     }
3030     subl(cnt1, cnt2);
3031     movl(cnt2, stride);
3032     addl(cnt1, stride);
3033     bind(CONT_SCAN_SUBSTR);
3034     if (ae == StrIntrinsicNode::UL) {
3035       pmovzxbw(vec, Address(str2, 0));
3036     } else {
3037       movdqu(vec, Address(str2, 0));
3038     }
3039     jmp(SCAN_SUBSTR);
3040 
3041     bind(RET_FOUND_LONG);
3042     movptr(str1, Address(rsp, wordSize));
3043   } // non constant
3044 
3045   bind(RET_FOUND);
3046   // Compute substr offset
3047   subptr(result, str1);
3048   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3049     shrl(result, 1); // index
3050   }
3051   bind(CLEANUP);
3052   pop(rsp); // restore SP
3053 
3054 } // string_indexof
3055 
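     // Find the first occurrence of a UTF-16 char in a char array segment.
     // Roughly equivalent Java-level loop (an explanatory sketch, not the exact
     // library source; names are illustrative):
     //
     //   static int indexOfChar(char[] value, char ch, int count) {
     //     for (int i = 0; i < count; i++) {
     //       if (value[i] == ch) return i;
     //     }
     //     return -1;
     //   }
     //
     // The generated code scans 16 and then 8 chars at a time with vpcmpeqw /
     // pcmpeqw before falling back to a scalar tail loop.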
3056 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3057                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3058   ShortBranchVerifier sbv(this);
3059   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3060 
3061   int stride = 8;
3062 
3063   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3064         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3065         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3066         FOUND_SEQ_CHAR, DONE_LABEL;
3067 
3068   movptr(result, str1);
3069   if (UseAVX >= 2) {
3070     cmpl(cnt1, stride);
3071     jcc(Assembler::less, SCAN_TO_CHAR);
3072     cmpl(cnt1, 2*stride);
3073     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3074     movdl(vec1, ch);
3075     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3076     vpxor(vec2, vec2);
3077     movl(tmp, cnt1);
3078     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3079     andl(cnt1,0x0000000F);  //tail count (in chars)
3080 
3081     bind(SCAN_TO_16_CHAR_LOOP);
3082     vmovdqu(vec3, Address(result, 0));
3083     vpcmpeqw(vec3, vec3, vec1, 1);
3084     vptest(vec2, vec3);
3085     jcc(Assembler::carryClear, FOUND_CHAR);
3086     addptr(result, 32);
3087     subl(tmp, 2*stride);
3088     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3089     jmp(SCAN_TO_8_CHAR);
3090     bind(SCAN_TO_8_CHAR_INIT);
3091     movdl(vec1, ch);
3092     pshuflw(vec1, vec1, 0x00);
3093     pshufd(vec1, vec1, 0);
3094     pxor(vec2, vec2);
3095   }
3096   bind(SCAN_TO_8_CHAR);
3097   cmpl(cnt1, stride);
3098   jcc(Assembler::less, SCAN_TO_CHAR);
3099   if (UseAVX < 2) {
3100     movdl(vec1, ch);
3101     pshuflw(vec1, vec1, 0x00);
3102     pshufd(vec1, vec1, 0);
3103     pxor(vec2, vec2);
3104   }
3105   movl(tmp, cnt1);
3106   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3107   andl(cnt1,0x00000007);  //tail count (in chars)
3108 
3109   bind(SCAN_TO_8_CHAR_LOOP);
3110   movdqu(vec3, Address(result, 0));
3111   pcmpeqw(vec3, vec1);
3112   ptest(vec2, vec3);
3113   jcc(Assembler::carryClear, FOUND_CHAR);
3114   addptr(result, 16);
3115   subl(tmp, stride);
3116   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3117   bind(SCAN_TO_CHAR);
3118   testl(cnt1, cnt1);
3119   jcc(Assembler::zero, RET_NOT_FOUND);
3120   bind(SCAN_TO_CHAR_LOOP);
3121   load_unsigned_short(tmp, Address(result, 0));
3122   cmpl(ch, tmp);
3123   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3124   addptr(result, 2);
3125   subl(cnt1, 1);
3126   jccb(Assembler::zero, RET_NOT_FOUND);
3127   jmp(SCAN_TO_CHAR_LOOP);
3128 
3129   bind(RET_NOT_FOUND);
3130   movl(result, -1);
3131   jmpb(DONE_LABEL);
3132 
3133   bind(FOUND_CHAR);
3134   if (UseAVX >= 2) {
3135     vpmovmskb(tmp, vec3);
3136   } else {
3137     pmovmskb(tmp, vec3);
3138   }
3139   bsfl(ch, tmp);
3140   addptr(result, ch);
3141 
3142   bind(FOUND_SEQ_CHAR);
3143   subptr(result, str1);
3144   shrl(result, 1);
3145 
3146   bind(DONE_LABEL);
3147 } // string_indexof_char
3148 
3149 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3150                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3151   ShortBranchVerifier sbv(this);
3152   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3153 
3154   int stride = 16;
3155 
3156   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3157         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3158         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3159         FOUND_SEQ_CHAR, DONE_LABEL;
3160 
3161   movptr(result, str1);
3162   if (UseAVX >= 2) {
3163     cmpl(cnt1, stride);
3164     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3165     cmpl(cnt1, stride*2);
3166     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3167     movdl(vec1, ch);
3168     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3169     vpxor(vec2, vec2);
3170     movl(tmp, cnt1);
3171     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3172     andl(cnt1,0x0000001F);  //tail count (in chars)
3173 
3174     bind(SCAN_TO_32_CHAR_LOOP);
3175     vmovdqu(vec3, Address(result, 0));
3176     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3177     vptest(vec2, vec3);
3178     jcc(Assembler::carryClear, FOUND_CHAR);
3179     addptr(result, 32);
3180     subl(tmp, stride*2);
3181     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3182     jmp(SCAN_TO_16_CHAR);
3183 
3184     bind(SCAN_TO_16_CHAR_INIT);
3185     movdl(vec1, ch);
3186     pxor(vec2, vec2);
3187     pshufb(vec1, vec2);
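         // Explanatory note: pshufb with an all-zero shuffle mask (vec2) copies
         // byte 0 of vec1 into every byte lane, i.e. it broadcasts the search
         // byte across the 16-byte vector.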
3188   }
3189 
3190   bind(SCAN_TO_16_CHAR);
3191   cmpl(cnt1, stride);
3192   jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left
3193   if (UseAVX < 2) {
3194     movdl(vec1, ch);
3195     pxor(vec2, vec2);
3196     pshufb(vec1, vec2);
3197   }
3198   movl(tmp, cnt1);
3199   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3200   andl(cnt1,0x0000000F);  //tail count (in bytes)
3201 
3202   bind(SCAN_TO_16_CHAR_LOOP);
3203   movdqu(vec3, Address(result, 0));
3204   pcmpeqb(vec3, vec1);
3205   ptest(vec2, vec3);
3206   jcc(Assembler::carryClear, FOUND_CHAR);
3207   addptr(result, 16);
3208   subl(tmp, stride);
3209   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...
3210 
3211   bind(SCAN_TO_CHAR_INIT);
3212   testl(cnt1, cnt1);
3213   jcc(Assembler::zero, RET_NOT_FOUND);
3214   bind(SCAN_TO_CHAR_LOOP);
3215   load_unsigned_byte(tmp, Address(result, 0));
3216   cmpl(ch, tmp);
3217   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3218   addptr(result, 1);
3219   subl(cnt1, 1);
3220   jccb(Assembler::zero, RET_NOT_FOUND);
3221   jmp(SCAN_TO_CHAR_LOOP);
3222 
3223   bind(RET_NOT_FOUND);
3224   movl(result, -1);
3225   jmpb(DONE_LABEL);
3226 
3227   bind(FOUND_CHAR);
3228   if (UseAVX >= 2) {
3229     vpmovmskb(tmp, vec3);
3230   } else {
3231     pmovmskb(tmp, vec3);
3232   }
3233   bsfl(ch, tmp);
3234   addptr(result, ch);
3235 
3236   bind(FOUND_SEQ_CHAR);
3237   subptr(result, str1);
3238 
3239   bind(DONE_LABEL);
3240 } // stringL_indexof_char
3241 
3242 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3243   switch (eltype) {
3244   case T_BOOLEAN: return sizeof(jboolean);
3245   case T_BYTE:  return sizeof(jbyte);
3246   case T_SHORT: return sizeof(jshort);
3247   case T_CHAR:  return sizeof(jchar);
3248   case T_INT:   return sizeof(jint);
3249   default:
3250     ShouldNotReachHere();
3251     return -1;
3252   }
3253 }
3254 
3255 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3256   switch (eltype) {
3257   // T_BOOLEAN used as surrogate for unsigned byte
3258   case T_BOOLEAN: movzbl(dst, src);   break;
3259   case T_BYTE:    movsbl(dst, src);   break;
3260   case T_SHORT:   movswl(dst, src);   break;
3261   case T_CHAR:    movzwl(dst, src);   break;
3262   case T_INT:     movl(dst, src);     break;
3263   default:
3264     ShouldNotReachHere();
3265   }
3266 }
3267 
3268 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3269   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3270 }
3271 
3272 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3273   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3274 }
3275 
3276 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3277   const int vlen = Assembler::AVX_256bit;
3278   switch (eltype) {
3279   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3280   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3281   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3282   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3283   case T_INT:
3284     // do nothing
3285     break;
3286   default:
3287     ShouldNotReachHere();
3288   }
3289 }
3290 
3291 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3292                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3293                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3294                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3295                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3296                                         BasicType eltype) {
3297   ShortBranchVerifier sbv(this);
3298   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3299   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3300   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3301 
3302   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3303         SHORT_UNROLLED_LOOP_EXIT,
3304         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3305         UNROLLED_VECTOR_LOOP_BEGIN,
3306         END;
3307   switch (eltype) {
3308   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3309   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3310   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3311   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3312   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3313   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3314   }
3315 
3316   // For "renaming" for readibility of the code
3317   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3318                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3319                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3320 
3321   const int elsize = arrays_hashcode_elsize(eltype);
3322 
3323   /*
3324     if (cnt1 >= 2) {
3325       if (cnt1 >= 32) {
3326         UNROLLED VECTOR LOOP
3327       }
3328       UNROLLED SCALAR LOOP
3329     }
3330     SINGLE SCALAR
3331    */
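       // The value computed is the standard Java polynomial hash (explanatory
       // note, added for clarity):
       //
       //   int h = result;              // incoming initial value
       //   for (int i = 0; i < cnt1; i++) {
       //     h = 31 * h + a[i];
       //   }
       //
       // The vector loop below processes 32 elements per iteration: it keeps
       // four 8-lane partial sums, scales the running result by a broadcast
       // constant (vnext, effectively 31^32 per 32-element step), then weights
       // the lanes by the matching powers of 31 (vcoef) and reduces at the end.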
3332 
3333   cmpl(cnt1, 32);
3334   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3335 
3336   // cnt1 >= 32 && generate_vectorized_loop
3337   xorl(index, index);
3338 
3339   // vresult = IntVector.zero(I256);
3340   for (int idx = 0; idx < 4; idx++) {
3341     vpxor(vresult[idx], vresult[idx]);
3342   }
3343   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3344   Register bound = tmp2;
3345   Register next = tmp3;
3346   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3347   movl(next, Address(tmp2, 0));
3348   movdl(vnext, next);
3349   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3350 
3351   // index = 0;
3352   // bound = cnt1 & ~(32 - 1);
3353   movl(bound, cnt1);
3354   andl(bound, ~(32 - 1));
3355   // for (; index < bound; index += 32) {
3356   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3357   // result *= next;
3358   imull(result, next);
3359   // Loop fission pays the cost of fetching from memory up front, so that
3360   // OOO execution can hopefully do a better job of prefetching.
3361   for (int idx = 0; idx < 4; idx++) {
3362     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3363   }
3364   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3365   for (int idx = 0; idx < 4; idx++) {
3366     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3367     arrays_hashcode_elvcast(vtmp[idx], eltype);
3368     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3369   }
3370   // index += 32;
3371   addl(index, 32);
3372   // index < bound;
3373   cmpl(index, bound);
3374   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3375   // }
3376 
3377   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3378   subl(cnt1, bound);
3379   // release bound
3380 
3381   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3382   for (int idx = 0; idx < 4; idx++) {
3383     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3384     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3385     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3386   }
3387   // result += vresult.reduceLanes(ADD);
3388   for (int idx = 0; idx < 4; idx++) {
3389     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3390   }
3391 
3392   // } else if (cnt1 < 32) {
3393 
3394   bind(SHORT_UNROLLED_BEGIN);
3395   // int i = 1;
3396   movl(index, 1);
3397   cmpl(index, cnt1);
3398   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3399 
3400   // for (; i < cnt1 ; i += 2) {
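       // Each iteration folds two elements into the hash (explanatory note):
       //   result = result*31*31 + a[i-1]*31 + a[i]
       // where 961 == 31*31 and (x << 5) - x == 31*x in the code below.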
3401   bind(SHORT_UNROLLED_LOOP_BEGIN);
3402   movl(tmp3, 961);
3403   imull(result, tmp3);
3404   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3405   movl(tmp3, tmp2);
3406   shll(tmp3, 5);
3407   subl(tmp3, tmp2);
3408   addl(result, tmp3);
3409   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3410   addl(result, tmp3);
3411   addl(index, 2);
3412   cmpl(index, cnt1);
3413   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3414 
3415   // }
3416   // if (i >= cnt1) {
3417   bind(SHORT_UNROLLED_LOOP_EXIT);
3418   jccb(Assembler::greater, END);
3419   movl(tmp2, result);
3420   shll(result, 5);
3421   subl(result, tmp2);
3422   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3423   addl(result, tmp3);
3424   // }
3425   bind(END);
3426 
3427   BLOCK_COMMENT("} // arrays_hashcode");
3428 
3429 } // arrays_hashcode
3430 
3431 // helper function for string_compare
3432 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3433                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3434                                            Address::ScaleFactor scale2, Register index, int ae) {
3435   if (ae == StrIntrinsicNode::LL) {
3436     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3437     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3438   } else if (ae == StrIntrinsicNode::UU) {
3439     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3440     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3441   } else {
3442     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3443     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3444   }
3445 }
3446 
3447 // Compare strings, used for char[] and byte[].
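     // Roughly equivalent Java-level logic (an explanatory sketch, not the
     // exact library source; names are illustrative):
     //
     //   static int compare(char[] s1, int len1, char[] s2, int len2) {
     //     int min = Math.min(len1, len2);
     //     for (int i = 0; i < min; i++) {
     //       if (s1[i] != s2[i]) return s1[i] - s2[i];
     //     }
     //     return len1 - len2;
     //   }
     //
     // ae selects the operand encodings (LL, UU, LU, UL); for UL the final
     // result is negated at DONE_LABEL below.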
3448 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3449                                        Register cnt1, Register cnt2, Register result,
3450                                        XMMRegister vec1, int ae, KRegister mask) {
3451   ShortBranchVerifier sbv(this);
3452   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3453   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3454   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3455   int stride2x2 = 0x40;
3456   Address::ScaleFactor scale = Address::no_scale;
3457   Address::ScaleFactor scale1 = Address::no_scale;
3458   Address::ScaleFactor scale2 = Address::no_scale;
3459 
3460   if (ae != StrIntrinsicNode::LL) {
3461     stride2x2 = 0x20;
3462   }
3463 
3464   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3465     shrl(cnt2, 1);
3466   }
3467   // Compute the minimum of the string lengths and the
3468   // difference of the string lengths (stack).
3469   // Do the conditional move stuff
3470   movl(result, cnt1);
3471   subl(cnt1, cnt2);
3472   push(cnt1);
3473   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3474 
3475   // Is the minimum length zero?
3476   testl(cnt2, cnt2);
3477   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3478   if (ae == StrIntrinsicNode::LL) {
3479     // Load first bytes
3480     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3481     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3482   } else if (ae == StrIntrinsicNode::UU) {
3483     // Load first characters
3484     load_unsigned_short(result, Address(str1, 0));
3485     load_unsigned_short(cnt1, Address(str2, 0));
3486   } else {
3487     load_unsigned_byte(result, Address(str1, 0));
3488     load_unsigned_short(cnt1, Address(str2, 0));
3489   }
3490   subl(result, cnt1);
3491   jcc(Assembler::notZero,  POP_LABEL);
3492 
3493   if (ae == StrIntrinsicNode::UU) {
3494     // Divide length by 2 to get number of chars
3495     shrl(cnt2, 1);
3496   }
3497   cmpl(cnt2, 1);
3498   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3499 
3500   // Check if the strings start at the same location and set up scale and stride
3501   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3502     cmpptr(str1, str2);
3503     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3504     if (ae == StrIntrinsicNode::LL) {
3505       scale = Address::times_1;
3506       stride = 16;
3507     } else {
3508       scale = Address::times_2;
3509       stride = 8;
3510     }
3511   } else {
3512     scale1 = Address::times_1;
3513     scale2 = Address::times_2;
3514     // scale not used
3515     stride = 8;
3516   }
3517 
3518   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3519     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3520     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3521     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3522     Label COMPARE_TAIL_LONG;
3523     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3524 
3525     int pcmpmask = 0x19;
3526     if (ae == StrIntrinsicNode::LL) {
3527       pcmpmask &= ~0x01;
3528     }
3529 
3530     // Set up to compare 16-char (32-byte) vectors,
3531     // starting from the first character again because it has an aligned address.
3532     if (ae == StrIntrinsicNode::LL) {
3533       stride2 = 32;
3534     } else {
3535       stride2 = 16;
3536     }
3537     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3538       adr_stride = stride << scale;
3539     } else {
3540       adr_stride1 = 8;  //stride << scale1;
3541       adr_stride2 = 16; //stride << scale2;
3542     }
3543 
3544     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3545     // rax and rdx are used by pcmpestri as elements counters
3546     movl(result, cnt2);
3547     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3548     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3549 
3550     // fast path : compare first 2 8-char vectors.
3551     bind(COMPARE_16_CHARS);
3552     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3553       movdqu(vec1, Address(str1, 0));
3554     } else {
3555       pmovzxbw(vec1, Address(str1, 0));
3556     }
3557     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3558     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3559 
3560     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3561       movdqu(vec1, Address(str1, adr_stride));
3562       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3563     } else {
3564       pmovzxbw(vec1, Address(str1, adr_stride1));
3565       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3566     }
3567     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3568     addl(cnt1, stride);
3569 
3570     // Compare the characters at index in cnt1
3571     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3572     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3573     subl(result, cnt2);
3574     jmp(POP_LABEL);
3575 
3576     // Set up the registers to start the vector comparison loop
3577     bind(COMPARE_WIDE_VECTORS);
3578     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3579       lea(str1, Address(str1, result, scale));
3580       lea(str2, Address(str2, result, scale));
3581     } else {
3582       lea(str1, Address(str1, result, scale1));
3583       lea(str2, Address(str2, result, scale2));
3584     }
3585     subl(result, stride2);
3586     subl(cnt2, stride2);
3587     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3588     negptr(result);
3589 
3590     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
3591     bind(COMPARE_WIDE_VECTORS_LOOP);
3592 
3593 #ifdef _LP64
3594     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3595       cmpl(cnt2, stride2x2);
3596       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3597       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3598       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3599 
3600       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3601       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3602         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3603         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3604       } else {
3605         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3606         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3607       }
3608       kortestql(mask, mask);
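           // Explanatory note: kortestql sets CF only when the OR of its mask
           // operands is all ones, so CF==0 (aboveEqual) means at least one
           // byte pair in this 64-byte chunk differed.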
3609       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3610       addptr(result, stride2x2);  // update since we already compared at this addr
3611       subl(cnt2, stride2x2);      // and sub the size too
3612       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3613 
3614       vpxor(vec1, vec1);
3615       jmpb(COMPARE_WIDE_TAIL);
3616     }//if (VM_Version::supports_avx512vlbw())
3617 #endif // _LP64
3618 
3619 
3620     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3621     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3622       vmovdqu(vec1, Address(str1, result, scale));
3623       vpxor(vec1, Address(str2, result, scale));
3624     } else {
3625       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3626       vpxor(vec1, Address(str2, result, scale2));
3627     }
3628     vptest(vec1, vec1);
3629     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3630     addptr(result, stride2);
3631     subl(cnt2, stride2);
3632     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3633     // clean upper bits of YMM registers
3634     vpxor(vec1, vec1);
3635 
3636     // compare wide vectors tail
3637     bind(COMPARE_WIDE_TAIL);
3638     testptr(result, result);
3639     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3640 
3641     movl(result, stride2);
3642     movl(cnt2, result);
3643     negptr(result);
3644     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3645 
3646     // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3647     bind(VECTOR_NOT_EQUAL);
3648     // clean upper bits of YMM registers
3649     vpxor(vec1, vec1);
3650     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3651       lea(str1, Address(str1, result, scale));
3652       lea(str2, Address(str2, result, scale));
3653     } else {
3654       lea(str1, Address(str1, result, scale1));
3655       lea(str2, Address(str2, result, scale2));
3656     }
3657     jmp(COMPARE_16_CHARS);
3658 
3659     // Compare tail chars, length between 1 and 15 chars
3660     bind(COMPARE_TAIL_LONG);
3661     movl(cnt2, result);
3662     cmpl(cnt2, stride);
3663     jcc(Assembler::less, COMPARE_SMALL_STR);
3664 
3665     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3666       movdqu(vec1, Address(str1, 0));
3667     } else {
3668       pmovzxbw(vec1, Address(str1, 0));
3669     }
3670     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3671     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3672     subptr(cnt2, stride);
3673     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3674     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3675       lea(str1, Address(str1, result, scale));
3676       lea(str2, Address(str2, result, scale));
3677     } else {
3678       lea(str1, Address(str1, result, scale1));
3679       lea(str2, Address(str2, result, scale2));
3680     }
3681     negptr(cnt2);
3682     jmpb(WHILE_HEAD_LABEL);
3683 
3684     bind(COMPARE_SMALL_STR);
3685   } else if (UseSSE42Intrinsics) {
3686     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3687     int pcmpmask = 0x19;
3688     // Set up to compare 8-char (16-byte) vectors,
3689     // starting from the first character again because it has an aligned address.
3690     movl(result, cnt2);
3691     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3692     if (ae == StrIntrinsicNode::LL) {
3693       pcmpmask &= ~0x01;
3694     }
3695     jcc(Assembler::zero, COMPARE_TAIL);
3696     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3697       lea(str1, Address(str1, result, scale));
3698       lea(str2, Address(str2, result, scale));
3699     } else {
3700       lea(str1, Address(str1, result, scale1));
3701       lea(str2, Address(str2, result, scale2));
3702     }
3703     negptr(result);
3704 
3705     // pcmpestri
3706     //   inputs:
3707     //     vec1- substring
3708     //     rax - negative string length (elements count)
3709     //     mem - scanned string
3710     //     rdx - string length (elements count)
3711     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3712     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3713     //   outputs:
3714     //     rcx - first mismatched element index
3715     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3716 
3717     bind(COMPARE_WIDE_VECTORS);
3718     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3719       movdqu(vec1, Address(str1, result, scale));
3720       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3721     } else {
3722       pmovzxbw(vec1, Address(str1, result, scale1));
3723       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3724     }
3725     // After pcmpestri cnt1(rcx) contains mismatched element index
3726 
3727     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3728     addptr(result, stride);
3729     subptr(cnt2, stride);
3730     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3731 
3732     // compare wide vectors tail
3733     testptr(result, result);
3734     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3735 
3736     movl(cnt2, stride);
3737     movl(result, stride);
3738     negptr(result);
3739     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3740       movdqu(vec1, Address(str1, result, scale));
3741       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3742     } else {
3743       pmovzxbw(vec1, Address(str1, result, scale1));
3744       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3745     }
3746     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3747 
3748     // Mismatched characters in the vectors
3749     bind(VECTOR_NOT_EQUAL);
3750     addptr(cnt1, result);
3751     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3752     subl(result, cnt2);
3753     jmpb(POP_LABEL);
3754 
3755     bind(COMPARE_TAIL); // limit is zero
3756     movl(cnt2, result);
3757     // Fallthru to tail compare
3758   }
3759   // Shift str2 and str1 to the end of the arrays, negate min
3760   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3761     lea(str1, Address(str1, cnt2, scale));
3762     lea(str2, Address(str2, cnt2, scale));
3763   } else {
3764     lea(str1, Address(str1, cnt2, scale1));
3765     lea(str2, Address(str2, cnt2, scale2));
3766   }
3767   decrementl(cnt2);  // first character was compared already
3768   negptr(cnt2);
3769 
3770   // Compare the rest of the elements
3771   bind(WHILE_HEAD_LABEL);
3772   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3773   subl(result, cnt1);
3774   jccb(Assembler::notZero, POP_LABEL);
3775   increment(cnt2);
3776   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3777 
3778   // Strings are equal up to min length.  Return the length difference.
3779   bind(LENGTH_DIFF_LABEL);
3780   pop(result);
3781   if (ae == StrIntrinsicNode::UU) {
3782     // Divide diff by 2 to get number of chars
3783     sarl(result, 1);
3784   }
3785   jmpb(DONE_LABEL);
3786 
3787 #ifdef _LP64
3788   if (VM_Version::supports_avx512vlbw()) {
3789 
3790     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3791 
3792     kmovql(cnt1, mask);
3793     notq(cnt1);
3794     bsfq(cnt2, cnt1);
3795     if (ae != StrIntrinsicNode::LL) {
3796       // Divide diff by 2 to get number of chars
3797       sarl(cnt2, 1);
3798     }
3799     addq(result, cnt2);
3800     if (ae == StrIntrinsicNode::LL) {
3801       load_unsigned_byte(cnt1, Address(str2, result));
3802       load_unsigned_byte(result, Address(str1, result));
3803     } else if (ae == StrIntrinsicNode::UU) {
3804       load_unsigned_short(cnt1, Address(str2, result, scale));
3805       load_unsigned_short(result, Address(str1, result, scale));
3806     } else {
3807       load_unsigned_short(cnt1, Address(str2, result, scale2));
3808       load_unsigned_byte(result, Address(str1, result, scale1));
3809     }
3810     subl(result, cnt1);
3811     jmpb(POP_LABEL);
3812   }//if (VM_Version::supports_avx512vlbw())
3813 #endif // _LP64
3814 
3815   // Discard the stored length difference
3816   bind(POP_LABEL);
3817   pop(cnt1);
3818 
3819   // That's it
3820   bind(DONE_LABEL);
3821   if(ae == StrIntrinsicNode::UL) {
3822     negl(result);
3823   }
3824 
3825 }
3826 
3827 // Search for Non-ASCII character (Negative byte value) in a byte array,
3828 // return the index of the first such character, otherwise the length
3829 // of the array segment searched.
3830 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3831 //   @IntrinsicCandidate
3832 //   public static int countPositives(byte[] ba, int off, int len) {
3833 //     for (int i = off; i < off + len; i++) {
3834 //       if (ba[i] < 0) {
3835 //         return i - off;
3836 //       }
3837 //     }
3838 //     return len;
3839 //   }
3840 void C2_MacroAssembler::count_positives(Register ary1, Register len,
3841   Register result, Register tmp1,
3842   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
3843   // rsi: byte array
3844   // rcx: len
3845   // rax: result
3846   ShortBranchVerifier sbv(this);
3847   assert_different_registers(ary1, len, result, tmp1);
3848   assert_different_registers(vec1, vec2);
3849   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
3850 
3851   movl(result, len); // copy
3852   // len == 0
3853   testl(len, len);
3854   jcc(Assembler::zero, DONE);
3855 
3856   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
3857     VM_Version::supports_avx512vlbw() &&
3858     VM_Version::supports_bmi2()) {
3859 
3860     Label test_64_loop, test_tail, BREAK_LOOP;
3861     Register tmp3_aliased = len;
3862 
3863     movl(tmp1, len);
3864     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
3865 
3866     andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
3867     andl(len, ~(64 - 1));    // vector count (in chars)
3868     jccb(Assembler::zero, test_tail);
3869 
3870     lea(ary1, Address(ary1, len, Address::times_1));
3871     negptr(len);
3872 
3873     bind(test_64_loop);
3874     // Check whether our 64 elements of size byte contain negatives
3875     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
3876     kortestql(mask1, mask1);
3877     jcc(Assembler::notZero, BREAK_LOOP);
3878 
3879     addptr(len, 64);
3880     jccb(Assembler::notZero, test_64_loop);
3881 
3882     bind(test_tail);
3883     // bail out when there is nothing to be done
3884     testl(tmp1, -1);
3885     jcc(Assembler::zero, DONE);
3886 
3887     // Build a mask with the low tmp1 bits set, i.e. ~(~0 << tmp1) (composed differently in the 32-bit case below)
3888 #ifdef _LP64
3889     mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
3890     shlxq(tmp3_aliased, tmp3_aliased, tmp1);
3891     notq(tmp3_aliased);
3892     kmovql(mask2, tmp3_aliased);
3893 #else
3894     Label k_init;
3895     jmp(k_init);
3896 
3897     // We cannot load 64 bits from a general purpose register here, so we
3898     // emit the data needed to compose the mask into the instruction stream:
3899     // a 64-byte wide series of the values 0..63, which is later compared
3900     // against the tail count held in the tmp1 register.
3901     // The result is a k register with the low tmp1 bits set,
3902     // counting from the least significant bit.
3903     address tmp = pc();
3904     emit_int64(0x0706050403020100);
3905     emit_int64(0x0F0E0D0C0B0A0908);
3906     emit_int64(0x1716151413121110);
3907     emit_int64(0x1F1E1D1C1B1A1918);
3908     emit_int64(0x2726252423222120);
3909     emit_int64(0x2F2E2D2C2B2A2928);
3910     emit_int64(0x3736353433323130);
3911     emit_int64(0x3F3E3D3C3B3A3938);
3912 
3913     bind(k_init);
3914     lea(len, InternalAddress(tmp));
3915     // create mask to test for negative byte inside a vector
3916     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
3917     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
3918 
3919 #endif
3920     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
3921     ktestq(mask1, mask2);
3922     jcc(Assembler::zero, DONE);
3923 
3924     bind(BREAK_LOOP);
3925     // At least one byte in the last 64 bytes is negative.
3926     // Set up to look at the last 64 bytes as if they were a tail
3927     lea(ary1, Address(ary1, len, Address::times_1));
3928     addptr(result, len);
3929     // Ignore the very last byte: if all others are positive,
3930     // it must be negative, so we can skip right to the 2+1 byte
3931     // end comparison at this point
3932     orl(result, 63);
3933     movl(len, 63);
3934     // Fallthru to tail compare
3935   } else {
3936 
3937     if (UseAVX >= 2 && UseSSE >= 2) {
3938       // With AVX2, use 32-byte vector compare
3939       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
3940 
3941       // Compare 32-byte vectors
3942       testl(len, 0xffffffe0);   // vector count (in bytes)
3943       jccb(Assembler::zero, TAIL_START);
3944 
3945       andl(len, 0xffffffe0);
3946       lea(ary1, Address(ary1, len, Address::times_1));
3947       negptr(len);
3948 
3949       movl(tmp1, 0x80808080);   // create mask to test for negative (non-ASCII) bytes in vector
3950       movdl(vec2, tmp1);
3951       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
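           // Explanatory note: a byte is negative exactly when its top bit is
           // set, so vptest of the data against 0x80.. bytes is non-zero iff
           // this 32-byte chunk contains a negative byte (e.g. 0xC3 & 0x80 != 0).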
3952 
3953       bind(COMPARE_WIDE_VECTORS);
3954       vmovdqu(vec1, Address(ary1, len, Address::times_1));
3955       vptest(vec1, vec2);
3956       jccb(Assembler::notZero, BREAK_LOOP);
3957       addptr(len, 32);
3958       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3959 
3960       testl(result, 0x0000001f);   // any bytes remaining?
3961       jcc(Assembler::zero, DONE);
3962 
3963       // Quick test using the already prepared vector mask
3964       movl(len, result);
3965       andl(len, 0x0000001f);
3966       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
3967       vptest(vec1, vec2);
3968       jcc(Assembler::zero, DONE);
3969       // There are zeros, jump to the tail to determine exactly where
3970       jmpb(TAIL_START);
3971 
3972       bind(BREAK_LOOP);
3973       // At least one byte in the last 32-byte vector is negative.
3974       // Set up to look at the last 32 bytes as if they were a tail
3975       lea(ary1, Address(ary1, len, Address::times_1));
3976       addptr(result, len);
3977       // Ignore the very last byte: if all others are positive,
3978       // it must be negative, so we can skip right to the 2+1 byte
3979       // end comparison at this point
3980       orl(result, 31);
3981       movl(len, 31);
3982       // Fallthru to tail compare
3983     } else if (UseSSE42Intrinsics) {
3984       // With SSE4.2, use double quad vector compare
3985       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
3986 
3987       // Compare 16-byte vectors
3988       testl(len, 0xfffffff0);   // vector count (in bytes)
3989       jcc(Assembler::zero, TAIL_START);
3990 
3991       andl(len, 0xfffffff0);
3992       lea(ary1, Address(ary1, len, Address::times_1));
3993       negptr(len);
3994 
3995       movl(tmp1, 0x80808080);
3996       movdl(vec2, tmp1);
3997       pshufd(vec2, vec2, 0);
3998 
3999       bind(COMPARE_WIDE_VECTORS);
4000       movdqu(vec1, Address(ary1, len, Address::times_1));
4001       ptest(vec1, vec2);
4002       jccb(Assembler::notZero, BREAK_LOOP);
4003       addptr(len, 16);
4004       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4005 
4006       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4007       jcc(Assembler::zero, DONE);
4008 
4009       // Quick test using the already prepared vector mask
4010       movl(len, result);
4011       andl(len, 0x0000000f);   // tail count (in bytes)
4012       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4013       ptest(vec1, vec2);
4014       jcc(Assembler::zero, DONE);
4015       jmpb(TAIL_START);
4016 
4017       bind(BREAK_LOOP);
4018       // At least one byte in the last 16-byte vector is negative.
4019       // Set up and look at the last 16 bytes as if they were a tail
4020       lea(ary1, Address(ary1, len, Address::times_1));
4021       addptr(result, len);
4022       // Ignore the very last byte: if all others are positive,
4023       // it must be negative, so we can skip right to the 2+1 byte
4024       // end comparison at this point
4025       orl(result, 15);
4026       movl(len, 15);
4027       // Fallthru to tail compare
4028     }
4029   }
4030 
4031   bind(TAIL_START);
4032   // Compare 4-byte vectors
4033   andl(len, 0xfffffffc); // vector count (in bytes)
4034   jccb(Assembler::zero, COMPARE_CHAR);
4035 
4036   lea(ary1, Address(ary1, len, Address::times_1));
4037   negptr(len);
4038 
4039   bind(COMPARE_VECTORS);
4040   movl(tmp1, Address(ary1, len, Address::times_1));
4041   andl(tmp1, 0x80808080);
4042   jccb(Assembler::notZero, TAIL_ADJUST);
4043   addptr(len, 4);
4044   jccb(Assembler::notZero, COMPARE_VECTORS);
4045 
4046   // Compare trailing char (final 2-3 bytes), if any
4047   bind(COMPARE_CHAR);
4048 
4049   testl(result, 0x2);   // tail  char
4050   jccb(Assembler::zero, COMPARE_BYTE);
4051   load_unsigned_short(tmp1, Address(ary1, 0));
4052   andl(tmp1, 0x00008080);
4053   jccb(Assembler::notZero, CHAR_ADJUST);
4054   lea(ary1, Address(ary1, 2));
4055 
4056   bind(COMPARE_BYTE);
4057   testl(result, 0x1);   // tail  byte
4058   jccb(Assembler::zero, DONE);
4059   load_unsigned_byte(tmp1, Address(ary1, 0));
4060   testl(tmp1, 0x00000080);
4061   jccb(Assembler::zero, DONE);
4062   subptr(result, 1);
4063   jmpb(DONE);
4064 
4065   bind(TAIL_ADJUST);
4066   // there are negative bits in the last 4 byte block.
4067   // Adjust result and check the next three bytes
4068   addptr(result, len);
4069   orl(result, 3);
4070   lea(ary1, Address(ary1, len, Address::times_1));
4071   jmpb(COMPARE_CHAR);
4072 
4073   bind(CHAR_ADJUST);
4074   // We are looking at a char + optional byte tail, and found that one
4075   // of the bytes in the char is negative. Adjust the result, check the
4076   // first byte and readjust if needed.
4077   andl(result, 0xfffffffc);
4078   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4079   jccb(Assembler::notZero, DONE);
4080   addptr(result, 1);
4081 
4082   // That's it
4083   bind(DONE);
4084   if (UseAVX >= 2 && UseSSE >= 2) {
4085     // clean upper bits of YMM registers
4086     vpxor(vec1, vec1);
4087     vpxor(vec2, vec2);
4088   }
4089 }
4090 
4091 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
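     // Roughly equivalent Java-level logic for the array case (an explanatory
     // sketch, not the exact library source):
     //
     //   static boolean equals(byte[] a, byte[] b) {
     //     if (a == b) return true;
     //     if (a == null || b == null) return false;
     //     if (a.length != b.length) return false;
     //     for (int i = 0; i < a.length; i++) {
     //       if (a[i] != b[i]) return false;
     //     }
     //     return true;
     //   }
     //
     // is_array_equ selects whether the identity/null/length checks are emitted;
     // the element loop itself is vectorized (64/32 bytes at a time) below.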
4092 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4093                                       Register limit, Register result, Register chr,
4094                                       XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
4095   ShortBranchVerifier sbv(this);
4096   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4097 
4098   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4099   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4100 
4101   if (is_array_equ) {
4102     // Check the input args
4103     cmpoop(ary1, ary2);
4104     jcc(Assembler::equal, TRUE_LABEL);
4105 
4106     // Need additional checks for arrays_equals.
4107     testptr(ary1, ary1);
4108     jcc(Assembler::zero, FALSE_LABEL);
4109     testptr(ary2, ary2);
4110     jcc(Assembler::zero, FALSE_LABEL);
4111 
4112     // Check the lengths
4113     movl(limit, Address(ary1, length_offset));
4114     cmpl(limit, Address(ary2, length_offset));
4115     jcc(Assembler::notEqual, FALSE_LABEL);
4116   }
4117 
4118   // count == 0
4119   testl(limit, limit);
4120   jcc(Assembler::zero, TRUE_LABEL);
4121 
4122   if (is_array_equ) {
4123     // Load array address
4124     lea(ary1, Address(ary1, base_offset));
4125     lea(ary2, Address(ary2, base_offset));
4126   }
4127 
4128   if (is_array_equ && is_char) {
4129     // arrays_equals when used for char[].
4130     shll(limit, 1);      // byte count != 0
4131   }
4132   movl(result, limit); // copy
4133 
4134   if (UseAVX >= 2) {
4135     // With AVX2, use 32-byte vector compare
4136     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4137 
4138     // Compare 32-byte vectors
4139     andl(result, 0x0000001f);  //   tail count (in bytes)
4140     andl(limit, 0xffffffe0);   // vector count (in bytes)
4141     jcc(Assembler::zero, COMPARE_TAIL);
4142 
4143     lea(ary1, Address(ary1, limit, Address::times_1));
4144     lea(ary2, Address(ary2, limit, Address::times_1));
4145     negptr(limit);
4146 
4147 #ifdef _LP64
4148     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4149       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4150 
4151       cmpl(limit, -64);
4152       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4153 
4154       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4155 
4156       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4157       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4158       kortestql(mask, mask);
4159       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4160       addptr(limit, 64);  // update since we already compared at this addr
4161       cmpl(limit, -64);
4162       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4163 
4164       // At this point we may still need to compare -limit+result bytes.
4165       // We could execute the next two instructions and just continue via the non-wide path:
4166       //  cmpl(limit, 0);
4167       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
4168       // But since we stopped at the points ary{1,2}+limit which are
4169       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4170       // (|limit| <= 32 and result < 32),
4171       // we may just compare the last 64 bytes.
4172       //
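           // Worked example (illustrative only): for 100-byte arrays the tail count is
           // result = 100 & 0x1f = 4 and the wide loop above covers bytes [0, 64),
           // leaving limit == -32. The 64-byte compare below then reads
           // ary{1,2} + result = base + 96 + (4 - 64), i.e. bytes [36, 100),
           // overlapping the range already verified by the loop.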
4173       addptr(result, -64);   // it is safe, because we just came from this area
4174       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4175       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4176       kortestql(mask, mask);
4177       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4178 
4179       jmp(TRUE_LABEL);
4180 
4181       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4182 
4183     }//if (VM_Version::supports_avx512vlbw())
4184 #endif //_LP64
4185     bind(COMPARE_WIDE_VECTORS);
4186     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
4187     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4188     vpxor(vec1, vec2);
4189 
4190     vptest(vec1, vec1);
4191     jcc(Assembler::notZero, FALSE_LABEL);
4192     addptr(limit, 32);
4193     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4194 
4195     testl(result, result);
4196     jcc(Assembler::zero, TRUE_LABEL);
4197 
4198     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
4199     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4200     vpxor(vec1, vec2);
4201 
4202     vptest(vec1, vec1);
4203     jccb(Assembler::notZero, FALSE_LABEL);
4204     jmpb(TRUE_LABEL);
4205 
4206     bind(COMPARE_TAIL); // limit is zero
4207     movl(limit, result);
4208     // Fallthru to tail compare
4209   } else if (UseSSE42Intrinsics) {
4210     // With SSE4.2, use double quad vector compare
4211     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4212 
4213     // Compare 16-byte vectors
4214     andl(result, 0x0000000f);  //   tail count (in bytes)
4215     andl(limit, 0xfffffff0);   // vector count (in bytes)
4216     jcc(Assembler::zero, COMPARE_TAIL);
4217 
4218     lea(ary1, Address(ary1, limit, Address::times_1));
4219     lea(ary2, Address(ary2, limit, Address::times_1));
4220     negptr(limit);
4221 
4222     bind(COMPARE_WIDE_VECTORS);
4223     movdqu(vec1, Address(ary1, limit, Address::times_1));
4224     movdqu(vec2, Address(ary2, limit, Address::times_1));
4225     pxor(vec1, vec2);
4226 
4227     ptest(vec1, vec1);
4228     jcc(Assembler::notZero, FALSE_LABEL);
4229     addptr(limit, 16);
4230     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4231 
4232     testl(result, result);
4233     jcc(Assembler::zero, TRUE_LABEL);
4234 
4235     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4236     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4237     pxor(vec1, vec2);
4238 
4239     ptest(vec1, vec1);
4240     jccb(Assembler::notZero, FALSE_LABEL);
4241     jmpb(TRUE_LABEL);
4242 
4243     bind(COMPARE_TAIL); // limit is zero
4244     movl(limit, result);
4245     // Fallthru to tail compare
4246   }
4247 
4248   // Compare 4-byte vectors
4249   andl(limit, 0xfffffffc); // vector count (in bytes)
4250   jccb(Assembler::zero, COMPARE_CHAR);
4251 
4252   lea(ary1, Address(ary1, limit, Address::times_1));
4253   lea(ary2, Address(ary2, limit, Address::times_1));
4254   negptr(limit);
4255 
4256   bind(COMPARE_VECTORS);
4257   movl(chr, Address(ary1, limit, Address::times_1));
4258   cmpl(chr, Address(ary2, limit, Address::times_1));
4259   jccb(Assembler::notEqual, FALSE_LABEL);
4260   addptr(limit, 4);
4261   jcc(Assembler::notZero, COMPARE_VECTORS);
4262 
4263   // Compare trailing char (final 2 bytes), if any
4264   bind(COMPARE_CHAR);
4265   testl(result, 0x2);   // tail  char
4266   jccb(Assembler::zero, COMPARE_BYTE);
4267   load_unsigned_short(chr, Address(ary1, 0));
4268   load_unsigned_short(limit, Address(ary2, 0));
4269   cmpl(chr, limit);
4270   jccb(Assembler::notEqual, FALSE_LABEL);
4271 
4272   if (is_array_equ && is_char) {
4273     bind(COMPARE_BYTE);
4274   } else {
4275     lea(ary1, Address(ary1, 2));
4276     lea(ary2, Address(ary2, 2));
4277 
4278     bind(COMPARE_BYTE);
4279     testl(result, 0x1);   // tail  byte
4280     jccb(Assembler::zero, TRUE_LABEL);
4281     load_unsigned_byte(chr, Address(ary1, 0));
4282     load_unsigned_byte(limit, Address(ary2, 0));
4283     cmpl(chr, limit);
4284     jccb(Assembler::notEqual, FALSE_LABEL);
4285   }
4286   bind(TRUE_LABEL);
4287   movl(result, 1);   // return true
4288   jmpb(DONE);
4289 
4290   bind(FALSE_LABEL);
4291   xorl(result, result); // return false
4292 
4293   // That's it
4294   bind(DONE);
4295   if (UseAVX >= 2) {
4296     // clean upper bits of YMM registers
4297     vpxor(vec1, vec1);
4298     vpxor(vec2, vec2);
4299   }
4300 }
4301 
4302 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4303                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4304   switch(ideal_opc) {
4305     case Op_LShiftVS:
4306       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4307     case Op_LShiftVI:
4308       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4309     case Op_LShiftVL:
4310       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4311     case Op_RShiftVS:
4312       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4313     case Op_RShiftVI:
4314       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4315     case Op_RShiftVL:
4316       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4317     case Op_URShiftVS:
4318       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4319     case Op_URShiftVI:
4320       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4321     case Op_URShiftVL:
4322       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4323     case Op_RotateRightV:
4324       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4325     case Op_RotateLeftV:
4326       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4327     default:
4328       fatal("Unsupported masked operation"); break;
4329   }
4330 }
4331 
4332 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4333                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4334                                     bool is_varshift) {
4335   switch (ideal_opc) {
4336     case Op_AddVB:
4337       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4338     case Op_AddVS:
4339       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4340     case Op_AddVI:
4341       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4342     case Op_AddVL:
4343       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4344     case Op_AddVF:
4345       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4346     case Op_AddVD:
4347       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4348     case Op_SubVB:
4349       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4350     case Op_SubVS:
4351       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4352     case Op_SubVI:
4353       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4354     case Op_SubVL:
4355       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4356     case Op_SubVF:
4357       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4358     case Op_SubVD:
4359       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4360     case Op_MulVS:
4361       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4362     case Op_MulVI:
4363       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4364     case Op_MulVL:
4365       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4366     case Op_MulVF:
4367       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4368     case Op_MulVD:
4369       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4370     case Op_DivVF:
4371       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4372     case Op_DivVD:
4373       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4374     case Op_SqrtVF:
4375       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4376     case Op_SqrtVD:
4377       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4378     case Op_AbsVB:
4379       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4380     case Op_AbsVS:
4381       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4382     case Op_AbsVI:
4383       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4384     case Op_AbsVL:
4385       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4386     case Op_FmaVF:
4387       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4388     case Op_FmaVD:
4389       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4390     case Op_VectorRearrange:
4391       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4392     case Op_LShiftVS:
4393       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4394     case Op_LShiftVI:
4395       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4396     case Op_LShiftVL:
4397       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4398     case Op_RShiftVS:
4399       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4400     case Op_RShiftVI:
4401       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4402     case Op_RShiftVL:
4403       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4404     case Op_URShiftVS:
4405       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4406     case Op_URShiftVI:
4407       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4408     case Op_URShiftVL:
4409       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4410     case Op_RotateLeftV:
4411       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4412     case Op_RotateRightV:
4413       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4414     case Op_MaxV:
4415       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4416     case Op_MinV:
4417       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4418     case Op_XorV:
4419       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4420     case Op_OrV:
4421       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4422     case Op_AndV:
4423       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4424     default:
4425       fatal("Unsupported masked operation"); break;
4426   }
4427 }
4428 
4429 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4430                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4431   switch (ideal_opc) {
4432     case Op_AddVB:
4433       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4434     case Op_AddVS:
4435       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4436     case Op_AddVI:
4437       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4438     case Op_AddVL:
4439       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4440     case Op_AddVF:
4441       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4442     case Op_AddVD:
4443       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4444     case Op_SubVB:
4445       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4446     case Op_SubVS:
4447       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4448     case Op_SubVI:
4449       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4450     case Op_SubVL:
4451       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4452     case Op_SubVF:
4453       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4454     case Op_SubVD:
4455       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4456     case Op_MulVS:
4457       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4458     case Op_MulVI:
4459       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4460     case Op_MulVL:
4461       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4462     case Op_MulVF:
4463       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4464     case Op_MulVD:
4465       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4466     case Op_DivVF:
4467       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4468     case Op_DivVD:
4469       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4470     case Op_FmaVF:
4471       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4472     case Op_FmaVD:
4473       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4474     case Op_MaxV:
4475       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4476     case Op_MinV:
4477       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4478     case Op_XorV:
4479       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4480     case Op_OrV:
4481       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4482     case Op_AndV:
4483       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4484     default:
4485       fatal("Unsupported masked operation"); break;
4486   }
4487 }
4488 
4489 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4490                                   KRegister src1, KRegister src2) {
4491   BasicType etype = T_ILLEGAL;
4492   switch(mask_len) {
4493     case 2:
4494     case 4:
4495     case 8:  etype = T_BYTE; break;
4496     case 16: etype = T_SHORT; break;
4497     case 32: etype = T_INT; break;
4498     case 64: etype = T_LONG; break;
4499     default: fatal("Unsupported type"); break;
4500   }
4501   assert(etype != T_ILLEGAL, "");
4502   switch(ideal_opc) {
4503     case Op_AndVMask:
4504       kand(etype, dst, src1, src2); break;
4505     case Op_OrVMask:
4506       kor(etype, dst, src1, src2); break;
4507     case Op_XorVMask:
4508       kxor(etype, dst, src1, src2); break;
4509     default:
4510       fatal("Unsupported masked operation"); break;
4511   }
4512 }
4513 
4514 /*
4515  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4516  * If src is NaN, the result is 0.
4517  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4518  * the result is equal to the value of Integer.MIN_VALUE.
4519  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4520  * the result is equal to the value of Integer.MAX_VALUE.
4521  */
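     /*
      * Illustrative scalar equivalent of these semantics (a sketch only, not part of
      * the generated code; the helper name is hypothetical):
      *
      *   #include <stdint.h>
      *   static int32_t f2i_saturating(float f) {
      *     if (f != f)              return 0;          // NaN maps to 0
      *     if (f <= -2147483648.0f) return INT32_MIN;  // -Inf and underflow
      *     if (f >=  2147483648.0f) return INT32_MAX;  // +Inf and overflow
      *     return (int32_t)f;                          // in-range values truncate
      *   }
      */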
4522 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4523                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4524                                                                    Register rscratch, AddressLiteral float_sign_flip,
4525                                                                    int vec_enc) {
4526   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4527   Label done;
4528   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4529   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4530   vptest(xtmp2, xtmp2, vec_enc);
4531   jccb(Assembler::equal, done);
4532 
4533   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4534   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4535 
4536   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4537   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4538   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4539 
4540   // Recompute the mask for the remaining special values.
4541   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4542   // Extract SRC values corresponding to TRUE mask lanes.
4543   vpand(xtmp4, xtmp2, src, vec_enc);
4544   // Flip the mask bits so that the MSB of MASK lanes corresponding to +ve special
4545   // values is set.
4546   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4547 
4548   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4549   bind(done);
4550 }
4551 
4552 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4553                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4554                                                                     Register rscratch, AddressLiteral float_sign_flip,
4555                                                                     int vec_enc) {
4556   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4557   Label done;
4558   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4559   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4560   kortestwl(ktmp1, ktmp1);
4561   jccb(Assembler::equal, done);
4562 
4563   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4564   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4565   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4566 
4567   kxorwl(ktmp1, ktmp1, ktmp2);
4568   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4569   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4570   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4571   bind(done);
4572 }
4573 
4574 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4575                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4576                                                                      Register rscratch, AddressLiteral double_sign_flip,
4577                                                                      int vec_enc) {
4578   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4579 
4580   Label done;
4581   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4582   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
4583   kortestwl(ktmp1, ktmp1);
4584   jccb(Assembler::equal, done);
4585 
4586   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4587   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4588   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4589 
4590   kxorwl(ktmp1, ktmp1, ktmp2);
4591   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4592   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4593   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4594   bind(done);
4595 }
4596 
4597 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4598                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4599                                                                      Register rscratch, AddressLiteral float_sign_flip,
4600                                                                      int vec_enc) {
4601   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4602   Label done;
4603   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4604   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4605   kortestwl(ktmp1, ktmp1);
4606   jccb(Assembler::equal, done);
4607 
4608   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4609   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4610   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4611 
4612   kxorwl(ktmp1, ktmp1, ktmp2);
4613   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4614   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4615   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4616   bind(done);
4617 }
4618 
4619 /*
4620  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4621  * If src is NaN, the result is 0.
4622  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
4623  * the result is equal to the value of Long.MIN_VALUE.
4624  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
4625  * the result is equal to the value of Long.MAX_VALUE.
4626  */
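     /*
      * Illustrative scalar equivalent (a sketch only, not part of the generated code;
      * the helper name is hypothetical):
      *
      *   #include <stdint.h>
      *   static int64_t d2l_saturating(double d) {
      *     if (d != d)                      return 0;          // NaN maps to 0
      *     if (d <= -9223372036854775808.0) return INT64_MIN;  // -Inf and underflow
      *     if (d >=  9223372036854775808.0) return INT64_MAX;  // +Inf and overflow
      *     return (int64_t)d;                                  // in-range values truncate
      *   }
      */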
4627 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4628                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4629                                                                       Register rscratch, AddressLiteral double_sign_flip,
4630                                                                       int vec_enc) {
4631   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4632 
4633   Label done;
4634   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4635   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
4636   kortestwl(ktmp1, ktmp1);
4637   jccb(Assembler::equal, done);
4638 
4639   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4640   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4641   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4642 
4643   kxorwl(ktmp1, ktmp1, ktmp2);
4644   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4645   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4646   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4647   bind(done);
4648 }
4649 
4650 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
4651                                                              XMMRegister xtmp, int index, int vec_enc) {
4652    assert(vec_enc < Assembler::AVX_512bit, "");
4653    if (vec_enc == Assembler::AVX_256bit) {
4654      vextractf128_high(xtmp, src);
4655      vshufps(dst, src, xtmp, index, vec_enc);
4656    } else {
4657      vshufps(dst, src, zero, index, vec_enc);
4658    }
4659 }
4660 
4661 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4662                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
4663                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
4664   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4665 
4666   Label done;
4667   // Compare the destination lanes with float_sign_flip
4668   // value to get mask for all special values.
4669   movdqu(xtmp1, float_sign_flip, rscratch);
4670   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
4671   ptest(xtmp2, xtmp2);
4672   jccb(Assembler::equal, done);
4673 
4674   // Flip float_sign_flip to get max integer value.
4675   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
4676   pxor(xtmp1, xtmp4);
4677 
4678   // Set destination lanes corresponding to unordered source lanes to zero.
4679   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
4680   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
4681 
4682   // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
4683   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
4684   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
4685 
4686   // Recompute the mask for the remaining special values.
4687   pxor(xtmp2, xtmp3);
4688   // Extract mask corresponding to non-negative source lanes.
4689   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
4690 
4691   // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
4692   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
4693   pand(xtmp3, xtmp2);
4694 
4695   // Replace destination lanes holding the special value (0x80000000) with max int
4696   // if the corresponding source lane holds a +ve value.
4697   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
4698   bind(done);
4699 }
4700 
4701 
4702 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
4703                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
4704   switch(to_elem_bt) {
4705     case T_SHORT:
4706       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
4707       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
4708       vpackusdw(dst, dst, zero, vec_enc);
4709       if (vec_enc == Assembler::AVX_256bit) {
4710         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
4711       }
4712       break;
4713     case  T_BYTE:
4714       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
4715       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
4716       vpackusdw(dst, dst, zero, vec_enc);
4717       if (vec_enc == Assembler::AVX_256bit) {
4718         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
4719       }
4720       vpackuswb(dst, dst, zero, vec_enc);
4721       break;
4722     default: assert(false, "%s", type2name(to_elem_bt));
4723   }
4724 }
4725 
4726 /*
4727  * Algorithm for vector D2L and F2I conversions:-
4728  * a) Perform vector D2L/F2I cast.
4729  * b) Choose the fast path if none of the result vector lanes contains the 0x80000000 value;
4730  *    a lane holding 0x80000000 signifies that the source value could be any of the special
4731  *    floating point values (NaN, -Inf, Inf, Max, -Min).
4732  * c) Set destination to zero if source is NaN value.
4733  * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value.
4734  */
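     /*
      * Scalar sketch of steps b)-d) (illustrative only; `raw` stands for the per-lane
      * result of step a) and `src` for the corresponding source lane):
      *
      *   int32_t lane = raw;                        // 0x80000000 for every special input
      *   if (lane == INT32_MIN) {                   // b) only the sentinel needs fixing up
      *     if (src != src)      lane = 0;           // c) NaN source -> 0
      *     else if (src > 0.0f) lane = INT32_MAX;   // d) +ve source -> MaxInt
      *   }                                          //    -ve special sources keep MIN_VALUE
      */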
4735 
4736 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4737                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4738                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
4739   int to_elem_sz = type2aelembytes(to_elem_bt);
4740   assert(to_elem_sz <= 4, "");
4741   vcvttps2dq(dst, src, vec_enc);
4742   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
4743   if (to_elem_sz < 4) {
4744     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4745     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
4746   }
4747 }
4748 
4749 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4750                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
4751                                             Register rscratch, int vec_enc) {
4752   int to_elem_sz = type2aelembytes(to_elem_bt);
4753   assert(to_elem_sz <= 4, "");
4754   vcvttps2dq(dst, src, vec_enc);
4755   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
4756   switch(to_elem_bt) {
4757     case T_INT:
4758       break;
4759     case T_SHORT:
4760       evpmovdw(dst, dst, vec_enc);
4761       break;
4762     case T_BYTE:
4763       evpmovdb(dst, dst, vec_enc);
4764       break;
4765     default: assert(false, "%s", type2name(to_elem_bt));
4766   }
4767 }
4768 
4769 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4770                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
4771                                             Register rscratch, int vec_enc) {
4772   evcvttps2qq(dst, src, vec_enc);
4773   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
4774 }
4775 
4776 // Handling for downcasting from double to integer or sub-word types on AVX2.
4777 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4778                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
4779                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
4780   int to_elem_sz = type2aelembytes(to_elem_bt);
4781   assert(to_elem_sz < 8, "");
4782   vcvttpd2dq(dst, src, vec_enc);
4783   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
4784                                               float_sign_flip, vec_enc);
4785   if (to_elem_sz < 4) {
4786     // xtmp4 holds all zero lanes.
4787     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
4788   }
4789 }
4790 
4791 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
4792                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
4793                                             KRegister ktmp2, AddressLiteral sign_flip,
4794                                             Register rscratch, int vec_enc) {
4795   if (VM_Version::supports_avx512dq()) {
4796     evcvttpd2qq(dst, src, vec_enc);
4797     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
4798     switch(to_elem_bt) {
4799       case T_LONG:
4800         break;
4801       case T_INT:
4802         evpmovsqd(dst, dst, vec_enc);
4803         break;
4804       case T_SHORT:
4805         evpmovsqd(dst, dst, vec_enc);
4806         evpmovdw(dst, dst, vec_enc);
4807         break;
4808       case T_BYTE:
4809         evpmovsqd(dst, dst, vec_enc);
4810         evpmovdb(dst, dst, vec_enc);
4811         break;
4812       default: assert(false, "%s", type2name(to_elem_bt));
4813     }
4814   } else {
4815     assert(type2aelembytes(to_elem_bt) <= 4, "");
4816     vcvttpd2dq(dst, src, vec_enc);
4817     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
4818     switch(to_elem_bt) {
4819       case T_INT:
4820         break;
4821       case T_SHORT:
4822         evpmovdw(dst, dst, vec_enc);
4823         break;
4824       case T_BYTE:
4825         evpmovdb(dst, dst, vec_enc);
4826         break;
4827       default: assert(false, "%s", type2name(to_elem_bt));
4828     }
4829   }
4830 }
4831 
4832 #ifdef _LP64
4833 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
4834                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4835                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
4836   // Perform the floor(val+0.5) operation under the influence of MXCSR.RC mode round-towards -inf,
4837   // and re-instate the original MXCSR.RC mode after that.
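       // For example, with RC = round-towards -inf: round(2.3) -> floor(2.8) -> 2,
       // round(2.5) -> floor(3.0) -> 3 and round(-2.5) -> floor(-2.0) -> -2,
       // matching the round-half-up behaviour of Math.round.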
4838   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4839 
4840   mov64(tmp, julong_cast(0.5L));
4841   evpbroadcastq(xtmp1, tmp, vec_enc);
4842   vaddpd(xtmp1, src , xtmp1, vec_enc);
4843   evcvtpd2qq(dst, xtmp1, vec_enc);
4844   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
4845                                                 double_sign_flip, vec_enc);
4846 
4847   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4848 }
4849 
4850 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
4851                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4852                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
4853   // Perform the floor(val+0.5) operation under the influence of MXCSR.RC mode round-towards -inf,
4854   // and re-instate the original MXCSR.RC mode after that.
4855   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4856 
4857   movl(tmp, jint_cast(0.5));
4858   movq(xtmp1, tmp);
4859   vbroadcastss(xtmp1, xtmp1, vec_enc);
4860   vaddps(xtmp1, src , xtmp1, vec_enc);
4861   vcvtps2dq(dst, xtmp1, vec_enc);
4862   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
4863                                               float_sign_flip, vec_enc);
4864 
4865   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4866 }
4867 
4868 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
4869                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4870                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
4871   // Perform the floor(val+0.5) operation under the influence of MXCSR.RC mode round-towards -inf,
4872   // and re-instate the original MXCSR.RC mode after that.
4873   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4874 
4875   movl(tmp, jint_cast(0.5));
4876   movq(xtmp1, tmp);
4877   vbroadcastss(xtmp1, xtmp1, vec_enc);
4878   vaddps(xtmp1, src , xtmp1, vec_enc);
4879   vcvtps2dq(dst, xtmp1, vec_enc);
4880   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
4881 
4882   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4883 }
4884 #endif // _LP64
4885 
4886 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
4887                                              BasicType from_elem_bt, BasicType to_elem_bt) {
4888   switch (from_elem_bt) {
4889     case T_BYTE:
4890       switch (to_elem_bt) {
4891         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
4892         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
4893         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
4894         default: ShouldNotReachHere();
4895       }
4896       break;
4897     case T_SHORT:
4898       switch (to_elem_bt) {
4899         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
4900         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
4901         default: ShouldNotReachHere();
4902       }
4903       break;
4904     case T_INT:
4905       assert(to_elem_bt == T_LONG, "");
4906       vpmovzxdq(dst, src, vlen_enc);
4907       break;
4908     default:
4909       ShouldNotReachHere();
4910   }
4911 }
4912 
4913 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
4914                                            BasicType from_elem_bt, BasicType to_elem_bt) {
4915   switch (from_elem_bt) {
4916     case T_BYTE:
4917       switch (to_elem_bt) {
4918         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
4919         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
4920         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
4921         default: ShouldNotReachHere();
4922       }
4923       break;
4924     case T_SHORT:
4925       switch (to_elem_bt) {
4926         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
4927         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
4928         default: ShouldNotReachHere();
4929       }
4930       break;
4931     case T_INT:
4932       assert(to_elem_bt == T_LONG, "");
4933       vpmovsxdq(dst, src, vlen_enc);
4934       break;
4935     default:
4936       ShouldNotReachHere();
4937   }
4938 }
4939 
4940 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
4941                                          BasicType dst_bt, BasicType src_bt, int vlen) {
4942   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
4943   assert(vlen_enc != AVX_512bit, "");
4944 
4945   int dst_bt_size = type2aelembytes(dst_bt);
4946   int src_bt_size = type2aelembytes(src_bt);
4947   if (dst_bt_size > src_bt_size) {
4948     switch (dst_bt_size / src_bt_size) {
4949       case 2: vpmovsxbw(dst, src, vlen_enc); break;
4950       case 4: vpmovsxbd(dst, src, vlen_enc); break;
4951       case 8: vpmovsxbq(dst, src, vlen_enc); break;
4952       default: ShouldNotReachHere();
4953     }
4954   } else {
4955     assert(dst_bt_size < src_bt_size, "");
4956     switch (src_bt_size / dst_bt_size) {
4957       case 2: {
4958         if (vlen_enc == AVX_128bit) {
4959           vpacksswb(dst, src, src, vlen_enc);
4960         } else {
4961           vpacksswb(dst, src, src, vlen_enc);
4962           vpermq(dst, dst, 0x08, vlen_enc);
4963         }
4964         break;
4965       }
4966       case 4: {
4967         if (vlen_enc == AVX_128bit) {
4968           vpackssdw(dst, src, src, vlen_enc);
4969           vpacksswb(dst, dst, dst, vlen_enc);
4970         } else {
4971           vpackssdw(dst, src, src, vlen_enc);
4972           vpermq(dst, dst, 0x08, vlen_enc);
4973           vpacksswb(dst, dst, dst, AVX_128bit);
4974         }
4975         break;
4976       }
4977       case 8: {
4978         if (vlen_enc == AVX_128bit) {
4979           vpshufd(dst, src, 0x08, vlen_enc);
4980           vpackssdw(dst, dst, dst, vlen_enc);
4981           vpacksswb(dst, dst, dst, vlen_enc);
4982         } else {
4983           vpshufd(dst, src, 0x08, vlen_enc);
4984           vpermq(dst, dst, 0x08, vlen_enc);
4985           vpackssdw(dst, dst, dst, AVX_128bit);
4986           vpacksswb(dst, dst, dst, AVX_128bit);
4987         }
4988         break;
4989       }
4990       default: ShouldNotReachHere();
4991     }
4992   }
4993 }
4994 
4995 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
4996                                    bool merge, BasicType bt, int vlen_enc) {
4997   if (bt == T_INT) {
4998     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
4999   } else {
5000     assert(bt == T_LONG, "");
5001     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5002   }
5003 }
5004 
5005 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5006                                    bool merge, BasicType bt, int vlen_enc) {
5007   if (bt == T_INT) {
5008     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5009   } else {
5010     assert(bt == T_LONG, "");
5011     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5012   }
5013 }
5014 
5015 #ifdef _LP64
5016 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5017                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5018                                                int vec_enc) {
5019   int index = 0;
5020   int vindex = 0;
5021   mov64(rtmp1, 0x0101010101010101L);
5022   pdepq(rtmp1, src, rtmp1);
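       // pdep deposits bit i of src into bit 8*i of rtmp1, expanding the low 8 mask
       // bits into eight 0x00/0x01 bytes.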
5023   if (mask_len > 8) {
5024     movq(rtmp2, src);
5025     vpxor(xtmp, xtmp, xtmp, vec_enc);
5026     movq(xtmp, rtmp1);
5027   }
5028   movq(dst, rtmp1);
5029 
5030   mask_len -= 8;
5031   while (mask_len > 0) {
5032     assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
5033     index++;
5034     if ((index % 2) == 0) {
5035       pxor(xtmp, xtmp);
5036     }
5037     mov64(rtmp1, 0x0101010101010101L);
5038     shrq(rtmp2, 8);
5039     pdepq(rtmp1, rtmp2, rtmp1);
5040     pinsrq(xtmp, rtmp1, index % 2);
5041     vindex = index / 2;
5042     if (vindex) {
5043       // Write the entire 16-byte vector once both 64-bit
5044       // lanes are updated, to save redundant instructions.
5045       if (index % 2) {
5046         vinsertf128(dst, dst, xtmp, vindex);
5047       }
5048     } else {
5049       vmovdqu(dst, xtmp);
5050     }
5051     mask_len -= 8;
5052   }
5053 }
5054 
5055 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5056   switch(opc) {
5057     case Op_VectorMaskTrueCount:
5058       popcntq(dst, tmp);
5059       break;
5060     case Op_VectorMaskLastTrue:
5061       if (VM_Version::supports_lzcnt()) {
5062         lzcntq(tmp, tmp);
5063         movl(dst, 63);
5064         subl(dst, tmp);
5065       } else {
5066         movl(dst, -1);
5067         bsrq(tmp, tmp);
5068         cmov32(Assembler::notZero, dst, tmp);
5069       }
5070       break;
5071     case Op_VectorMaskFirstTrue:
5072       if (VM_Version::supports_bmi1()) {
5073         if (masklen < 32) {
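               // Set a sentinel bit at index masklen so that tzcnt returns masklen
               // (i.e. "not found") when no mask bit is set.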
5074           orl(tmp, 1 << masklen);
5075           tzcntl(dst, tmp);
5076         } else if (masklen == 32) {
5077           tzcntl(dst, tmp);
5078         } else {
5079           assert(masklen == 64, "");
5080           tzcntq(dst, tmp);
5081         }
5082       } else {
5083         if (masklen < 32) {
5084           orl(tmp, 1 << masklen);
5085           bsfl(dst, tmp);
5086         } else {
5087           assert(masklen == 32 || masklen == 64, "");
5088           movl(dst, masklen);
5089           if (masklen == 32)  {
5090             bsfl(tmp, tmp);
5091           } else {
5092             bsfq(tmp, tmp);
5093           }
5094           cmov32(Assembler::notZero, dst, tmp);
5095         }
5096       }
5097       break;
5098     case Op_VectorMaskToLong:
5099       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5100       break;
5101     default: assert(false, "Unhandled mask operation");
5102   }
5103 }
5104 
5105 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5106                                               int masklen, int masksize, int vec_enc) {
5107   assert(VM_Version::supports_popcnt(), "");
5108 
5109   if(VM_Version::supports_avx512bw()) {
5110     kmovql(tmp, mask);
5111   } else {
5112     assert(masklen <= 16, "");
5113     kmovwl(tmp, mask);
5114   }
5115 
5116   // A mask generated out of partial vector comparison/replicate/mask manipulation
5117   // operations needs to be clipped.
5118   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5119     andq(tmp, (1 << masklen) - 1);
5120   }
5121 
5122   vector_mask_operation_helper(opc, dst, tmp, masklen);
5123 }
5124 
5125 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5126                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5127   assert(vec_enc == AVX_128bit && VM_Version::supports_avx() ||
5128          vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4), "");
5129   assert(VM_Version::supports_popcnt(), "");
5130 
5131   bool need_clip = false;
5132   switch(bt) {
5133     case T_BOOLEAN:
5134       // While masks of other types contain 0 or -1, boolean masks contain lane values of 0 or 1
5135       vpxor(xtmp, xtmp, xtmp, vec_enc);
5136       vpsubb(xtmp, xtmp, mask, vec_enc);
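           // 0 - 1 == 0xFF, so true lanes now have their byte sign bit set for
           // vpmovmskb to collect below.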
5137       vpmovmskb(tmp, xtmp, vec_enc);
5138       need_clip = masklen < 16;
5139       break;
5140     case T_BYTE:
5141       vpmovmskb(tmp, mask, vec_enc);
5142       need_clip = masklen < 16;
5143       break;
5144     case T_SHORT:
5145       vpacksswb(xtmp, mask, mask, vec_enc);
5146       if (masklen >= 16) {
5147         vpermpd(xtmp, xtmp, 8, vec_enc);
5148       }
5149       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5150       need_clip = masklen < 16;
5151       break;
5152     case T_INT:
5153     case T_FLOAT:
5154       vmovmskps(tmp, mask, vec_enc);
5155       need_clip = masklen < 4;
5156       break;
5157     case T_LONG:
5158     case T_DOUBLE:
5159       vmovmskpd(tmp, mask, vec_enc);
5160       need_clip = masklen < 2;
5161       break;
5162     default: assert(false, "Unhandled type, %s", type2name(bt));
5163   }
5164 
5165   // A mask generated out of partial vector comparison/replicate/mask manipulation
5166   // operations needs to be clipped.
5167   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5168     // need_clip implies masklen < 32
5169     andq(tmp, (1 << masklen) - 1);
5170   }
5171 
5172   vector_mask_operation_helper(opc, dst, tmp, masklen);
5173 }
5174 
5175 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5176                                              Register rtmp2, int mask_len) {
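       // Compress the mask: pext extracts one bit of the all-ones value for every set
       // bit of the (clipped) source mask, leaving popcount(src) contiguous set bits
       // at the bottom of rtmp2.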
5177   kmov(rtmp1, src);
5178   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5179   mov64(rtmp2, -1L);
5180   pextq(rtmp2, rtmp2, rtmp1);
5181   kmov(dst, rtmp2);
5182 }
5183 
5184 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5185                                                bool merge, BasicType bt, int vec_enc) {
5186   if (opcode == Op_CompressV) {
5187     switch(bt) {
5188     case T_BYTE:
5189       evpcompressb(dst, mask, src, merge, vec_enc);
5190       break;
5191     case T_CHAR:
5192     case T_SHORT:
5193       evpcompressw(dst, mask, src, merge, vec_enc);
5194       break;
5195     case T_INT:
5196       evpcompressd(dst, mask, src, merge, vec_enc);
5197       break;
5198     case T_FLOAT:
5199       evcompressps(dst, mask, src, merge, vec_enc);
5200       break;
5201     case T_LONG:
5202       evpcompressq(dst, mask, src, merge, vec_enc);
5203       break;
5204     case T_DOUBLE:
5205       evcompresspd(dst, mask, src, merge, vec_enc);
5206       break;
5207     default:
5208       fatal("Unsupported type %s", type2name(bt));
5209       break;
5210     }
5211   } else {
5212     assert(opcode == Op_ExpandV, "");
5213     switch(bt) {
5214     case T_BYTE:
5215       evpexpandb(dst, mask, src, merge, vec_enc);
5216       break;
5217     case T_CHAR:
5218     case T_SHORT:
5219       evpexpandw(dst, mask, src, merge, vec_enc);
5220       break;
5221     case T_INT:
5222       evpexpandd(dst, mask, src, merge, vec_enc);
5223       break;
5224     case T_FLOAT:
5225       evexpandps(dst, mask, src, merge, vec_enc);
5226       break;
5227     case T_LONG:
5228       evpexpandq(dst, mask, src, merge, vec_enc);
5229       break;
5230     case T_DOUBLE:
5231       evexpandpd(dst, mask, src, merge, vec_enc);
5232       break;
5233     default:
5234       fatal("Unsupported type %s", type2name(bt));
5235       break;
5236     }
5237   }
5238 }
5239 #endif
5240 
5241 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5242                                            KRegister ktmp1, int vec_enc) {
5243   if (opcode == Op_SignumVD) {
5244     vsubpd(dst, zero, one, vec_enc);
5245     // if src < 0 ? -1 : 1
5246     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5247     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
5248     // if src == NaN, -0.0 or 0.0 return src.
5249     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5250     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5251   } else {
5252     assert(opcode == Op_SignumVF, "");
5253     vsubps(dst, zero, one, vec_enc);
5254     // if src < 0 ? -1 : 1
5255     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5256     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
5257     // if src == NaN, -0.0 or 0.0 return src.
5258     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5259     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5260   }
5261 }
5262 
5263 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5264                                           XMMRegister xtmp1, int vec_enc) {
5265   if (opcode == Op_SignumVD) {
5266     vsubpd(dst, zero, one, vec_enc);
5267     // if src < 0 ? -1 : 1
5268     vblendvpd(dst, one, dst, src, vec_enc);
5269     // if src == NaN, -0.0 or 0.0 return src.
5270     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5271     vblendvpd(dst, dst, src, xtmp1, vec_enc);
5272   } else {
5273     assert(opcode == Op_SignumVF, "");
5274     vsubps(dst, zero, one, vec_enc);
5275     // if src < 0 ? -1 : 1
5276     vblendvps(dst, one, dst, src, vec_enc);
5277     // if src == NaN, -0.0 or 0.0 return src.
5278     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5279     vblendvps(dst, dst, src, xtmp1, vec_enc);
5280   }
5281 }
5282 
5283 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5284   if (VM_Version::supports_avx512bw()) {
5285     if (mask_len > 32) {
5286       kmovql(dst, src);
5287     } else {
5288       kmovdl(dst, src);
5289       if (mask_len != 32) {
5290         kshiftrdl(dst, dst, 32 - mask_len);
5291       }
5292     }
5293   } else {
5294     assert(mask_len <= 16, "");
5295     kmovwl(dst, src);
5296     if (mask_len != 16) {
5297       kshiftrwl(dst, dst, 16 - mask_len);
5298     }
5299   }
5300 }
5301 
5302 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5303   int lane_size = type2aelembytes(bt);
5304   bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
5305   if ((is_LP64 || lane_size < 8) &&
5306       ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5307        (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
5308     movptr(rtmp, imm32);
5309     switch(lane_size) {
5310       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5311       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5312       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5313       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5314       default : fatal("Unsupported lane size %d", lane_size);
5315       break;
5316     }
5317   } else {
5318     movptr(rtmp, imm32);
5319     LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
5320     switch(lane_size) {
5321       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5322       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5323       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5324       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5325       default : fatal("Unsupported lane size %d", lane_size);
5326       break;
5327     }
5328   }
5329 }
5330 
5331 //
5332 // Following is a lookup-table-based popcount computation algorithm:
5333 //       Index   Bit set count
5334 //     [ 0000 ->   0,
5335 //       0001 ->   1,
5336 //       0010 ->   1,
5337 //       0011 ->   2,
5338 //       0100 ->   1,
5339 //       0101 ->   2,
5340 //       0110 ->   2,
5341 //       0111 ->   3,
5342 //       1000 ->   1,
5343 //       1001 ->   2,
5344 //       1010 ->   2,
5345 //       1011 ->   3,
5346 //       1100 ->   2,
5347 //       1101 ->   3,
     //       1110 ->   3,
5348 //       1111 ->   4 ]
5349 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5350 //     shuffle indices for lookup table access.
5351 //  b. Right shift each byte of vector lane by 4 positions.
5352 //  c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
5353 //     shuffle indices for lookup table access.
5354 //  d. Add the bitset count of upper and lower 4 bits of each byte.
5355 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
5356 //     count of all the bytes of a quadword.
5357 //  f. Perform step e. for upper 128bit vector lane.
5358 //  g. Pack the bitset count of quadwords back to double word.
5359 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
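     //
     //  Illustrative scalar equivalent of steps a.-d. (a sketch only, not used by the
     //  vector code; the helper name is hypothetical):
     //
     //    static inline int popcount_byte_lut(unsigned char b) {
     //      static const unsigned char lut[16] = {0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4};
     //      return lut[b & 0x0F] + lut[b >> 4];
     //    }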
5360 
5361 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5362                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5363   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5364   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5365   vpsrlw(dst, src, 4, vec_enc);
5366   vpand(dst, dst, xtmp1, vec_enc);
5367   vpand(xtmp1, src, xtmp1, vec_enc);
5368   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5369   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5370   vpshufb(dst, xtmp2, dst, vec_enc);
5371   vpaddb(dst, dst, xtmp1, vec_enc);
5372 }
5373 
5374 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5375                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5376   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5377   // Following code is as per steps e,f,g and h of above algorithm.
5378   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5379   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5380   vpsadbw(dst, dst, xtmp2, vec_enc);
5381   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5382   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5383   vpackuswb(dst, xtmp1, dst, vec_enc);
5384 }
5385 
5386 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5387                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5388   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5389   // Add the popcount of upper and lower bytes of word.
5390   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5391   vpsrlw(dst, xtmp1, 8, vec_enc);
5392   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5393   vpaddw(dst, dst, xtmp1, vec_enc);
5394 }
5395 
5396 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5397                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5398   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5399   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5400   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5401 }
5402 
5403 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5404                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5405   switch(bt) {
5406     case T_LONG:
5407       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5408       break;
5409     case T_INT:
5410       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5411       break;
5412     case T_CHAR:
5413     case T_SHORT:
5414       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5415       break;
5416     case T_BYTE:
5417     case T_BOOLEAN:
5418       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5419       break;
5420     default:
5421       fatal("Unsupported type %s", type2name(bt));
5422       break;
5423   }
5424 }
5425 
5426 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5427                                                       KRegister mask, bool merge, int vec_enc) {
5428   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5429   switch(bt) {
5430     case T_LONG:
5431       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5432       evpopcntq(dst, mask, src, merge, vec_enc);
5433       break;
5434     case T_INT:
5435       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5436       evpopcntd(dst, mask, src, merge, vec_enc);
5437       break;
5438     case T_CHAR:
5439     case T_SHORT:
5440       assert(VM_Version::supports_avx512_bitalg(), "");
5441       evpopcntw(dst, mask, src, merge, vec_enc);
5442       break;
5443     case T_BYTE:
5444     case T_BOOLEAN:
5445       assert(VM_Version::supports_avx512_bitalg(), "");
5446       evpopcntb(dst, mask, src, merge, vec_enc);
5447       break;
5448     default:
5449       fatal("Unsupported type %s", type2name(bt));
5450       break;
5451   }
5452 }
5453 
5454 #ifndef _LP64
5455 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
5456   assert(VM_Version::supports_avx512bw(), "");
5457   kmovdl(tmp, src);
5458   kunpckdql(dst, tmp, tmp);
5459 }
5460 #endif
5461 
// The bit reversal algorithm first reverses the bits of each byte, followed by
// a byte level reversal for multi-byte primitive types (short/int/long).
// A lookup table access yields the reversed bit sequence corresponding to each
// 4 bit value, so the reversed bit sequence of a byte is obtained by swapping
// the reversed bit sequences of its upper and lower nibbles.
// (A scalar sketch of the per-byte step follows below.)
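//
// A minimal scalar sketch of the per-byte step (illustrative only, not part of the
// generated code); reverse_nibble_lut[n] holds the 4 bit value n with its bits reversed:
//
//   static inline uint8_t reverse_byte(uint8_t b) {
//     static const uint8_t reverse_nibble_lut[16] =
//         {0x0, 0x8, 0x4, 0xC, 0x2, 0xA, 0x6, 0xE, 0x1, 0x9, 0x5, 0xD, 0x3, 0xB, 0x7, 0xF};
//     return (uint8_t)((reverse_nibble_lut[b & 0x0F] << 4) | reverse_nibble_lut[b >> 4]);
//   }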
5468 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5469                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
5470   if (VM_Version::supports_avx512vlbw()) {
5471 
5472     // Get the reverse bit sequence of lower nibble of each byte.
5473     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5474     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5475     evpandq(dst, xtmp2, src, vec_enc);
5476     vpshufb(dst, xtmp1, dst, vec_enc);
5477     vpsllq(dst, dst, 4, vec_enc);
5478 
5479     // Get the reverse bit sequence of upper nibble of each byte.
5480     vpandn(xtmp2, xtmp2, src, vec_enc);
5481     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5482     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5483 
5484     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5485     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5486     evporq(xtmp2, dst, xtmp2, vec_enc);
5487     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5488 
  } else if (vec_enc == Assembler::AVX_512bit) {
5490     // Shift based bit reversal.
5491     assert(bt == T_LONG || bt == T_INT, "");
5492 
5493     // Swap lower and upper nibble of each byte.
5494     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
5495 
5496     // Swap two least and most significant bits of each nibble.
5497     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
5498 
5499     // Swap adjacent pair of bits.
5500     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5501     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
5502 
5503     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5504     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
5505   } else {
5506     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
5507     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5508 
5509     // Get the reverse bit sequence of lower nibble of each byte.
5510     vpand(dst, xtmp2, src, vec_enc);
5511     vpshufb(dst, xtmp1, dst, vec_enc);
5512     vpsllq(dst, dst, 4, vec_enc);
5513 
5514     // Get the reverse bit sequence of upper nibble of each byte.
5515     vpandn(xtmp2, xtmp2, src, vec_enc);
5516     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5517     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5518 
5519     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5520     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5521     vpor(xtmp2, dst, xtmp2, vec_enc);
5522     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5523   }
5524 }
5525 
5526 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
5527                                                 XMMRegister xtmp, Register rscratch) {
5528   assert(VM_Version::supports_gfni(), "");
5529   assert(rscratch != noreg || always_reachable(mask), "missing");
5530 
  // Galois field instruction based bit reversal, following the algorithm described at:
5532   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5533   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
5534   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
5535   vector_reverse_byte(bt, dst, xtmp, vec_enc);
5536 }
5537 
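// Swaps adjacent nbits-wide fields of src: the fields selected by bitmask (at all current
// call sites, the lower field of each pair) are shifted left by nbits, the remaining
// fields are shifted right by nbits, and the results are ORed together, i.e.
//   dst = ((src & bitmask) << nbits) | ((src & ~bitmask) >> nbits)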
5538 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
5539                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
5540   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
5541   evpandq(dst, xtmp1, src, vec_enc);
5542   vpsllq(dst, dst, nbits, vec_enc);
5543   vpandn(xtmp1, xtmp1, src, vec_enc);
5544   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
5545   evporq(dst, dst, xtmp1, vec_enc);
5546 }
5547 
5548 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5549                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5550   // Shift based bit reversal.
5551   assert(VM_Version::supports_evex(), "");
5552   switch(bt) {
5553     case T_LONG:
5554       // Swap upper and lower double word of each quad word.
5555       evprorq(xtmp1, k0, src, 32, true, vec_enc);
5556       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
5557       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5558       break;
5559     case T_INT:
5560       // Swap upper and lower word of each double word.
5561       evprord(xtmp1, k0, src, 16, true, vec_enc);
5562       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5563       break;
5564     case T_CHAR:
5565     case T_SHORT:
5566       // Swap upper and lower byte of each word.
5567       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
5568       break;
5569     case T_BYTE:
5570       evmovdquq(dst, k0, src, true, vec_enc);
5571       break;
5572     default:
5573       fatal("Unsupported type %s", type2name(bt));
5574       break;
5575   }
5576 }
5577 
5578 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5579   if (bt == T_BYTE) {
5580     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
5581       evmovdquq(dst, k0, src, true, vec_enc);
5582     } else {
5583       vmovdqu(dst, src);
5584     }
5585     return;
5586   }
5587   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
5588   // pre-computed shuffle indices.
5589   switch(bt) {
5590     case T_LONG:
5591       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
5592       break;
5593     case T_INT:
5594       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
5595       break;
5596     case T_CHAR:
5597     case T_SHORT:
5598       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
5599       break;
5600     default:
5601       fatal("Unsupported type %s", type2name(bt));
5602       break;
5603   }
5604   vpshufb(dst, src, dst, vec_enc);
5605 }
5606 
5607 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5608                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5609                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
5610   assert(is_integral_type(bt), "");
5611   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5612   assert(VM_Version::supports_avx512cd(), "");
5613   switch(bt) {
5614     case T_LONG:
5615       evplzcntq(dst, ktmp, src, merge, vec_enc);
5616       break;
5617     case T_INT:
5618       evplzcntd(dst, ktmp, src, merge, vec_enc);
5619       break;
5620     case T_SHORT:
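      // Interleave each word with 0xFFFF in the lower half of a double word so that a
      // double word lzcnt yields the word lzcnt (16 for a zero word); do this for the
      // low and high word halves and pack the double word counts back into words.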
5621       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
5622       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
5623       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
5624       vpunpckhwd(dst, xtmp1, src, vec_enc);
5625       evplzcntd(dst, ktmp, dst, merge, vec_enc);
5626       vpackusdw(dst, xtmp2, dst, vec_enc);
5627       break;
5628     case T_BYTE:
5629       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
5630       // accessing the lookup table.
5631       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
5632       // accessing the lookup table.
5633       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
5634       assert(VM_Version::supports_avx512bw(), "");
5635       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
5636       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
5637       vpand(xtmp2, dst, src, vec_enc);
5638       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5639       vpsrlw(xtmp3, src, 4, vec_enc);
5640       vpand(xtmp3, dst, xtmp3, vec_enc);
5641       vpshufb(dst, xtmp1, xtmp3, vec_enc);
5642       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
5643       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
5644       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
5645       break;
5646     default:
5647       fatal("Unsupported type %s", type2name(bt));
5648       break;
5649   }
5650 }
5651 
5652 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5653                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5654   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
5655   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5656   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
5657   // accessing the lookup table.
5658   vpand(dst, xtmp2, src, vec_enc);
5659   vpshufb(dst, xtmp1, dst, vec_enc);
5660   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
5661   // accessing the lookup table.
5662   vpsrlw(xtmp3, src, 4, vec_enc);
5663   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
5664   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
5665   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
5666   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
5667   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
5668   vpaddb(dst, dst, xtmp2, vec_enc);
5669   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
5670 }
5671 
5672 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5673                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5674   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5675   // Add zero counts of lower byte and upper byte of a word if
5676   // upper byte holds a zero value.
5677   vpsrlw(xtmp3, src, 8, vec_enc);
5678   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
5679   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
5680   vpsllw(xtmp2, dst, 8, vec_enc);
5681   vpaddw(xtmp2, xtmp2, dst, vec_enc);
5682   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5683   vpsrlw(dst, dst, 8, vec_enc);
5684 }
5685 
5686 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5687                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // Since the IEEE 754 floating point format represents the mantissa in 1.x form,
  // the biased exponent of the float conversion can be used to compute the leading
  // zero count as per the following formula (see the scalar sketch below):
  // LZCNT = 31 - (biased_exp - 127)
  // Special handling is needed for zero, max_int and negative source values.
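  //
  // A minimal scalar sketch of the core idea for a strictly positive int x whose float
  // conversion does not round up to the next power of two (illustrative only):
  //
  //   float f = (float) x;
  //   uint32_t bits;
  //   memcpy(&bits, &f, sizeof(bits));        // raw IEEE 754 bit pattern of f
  //   int biased_exp = (bits >> 23) & 0xFF;
  //   int lzcnt      = 31 - (biased_exp - 127);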
5693 
5694   // Broadcast 0xFF
5695   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
5696   vpsrld(xtmp1, xtmp1, 24, vec_enc);
5697 
5698   // Extract biased exponent.
5699   vcvtdq2ps(dst, src, vec_enc);
5700   vpsrld(dst, dst, 23, vec_enc);
5701   vpand(dst, dst, xtmp1, vec_enc);
5702 
5703   // Broadcast 127.
5704   vpsrld(xtmp1, xtmp1, 1, vec_enc);
5705   // Exponent = biased_exp - 127
5706   vpsubd(dst, dst, xtmp1, vec_enc);
5707 
5708   // Exponent = Exponent  + 1
5709   vpsrld(xtmp3, xtmp1, 6, vec_enc);
5710   vpaddd(dst, dst, xtmp3, vec_enc);
5711 
5712   // Replace -ve exponent with zero, exponent is -ve when src
5713   // lane contains a zero value.
5714   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5715   vblendvps(dst, dst, xtmp2, dst, vec_enc);
5716 
5717   // Rematerialize broadcast 32.
5718   vpslld(xtmp1, xtmp3, 5, vec_enc);
5719   // Exponent is 32 if corresponding source lane contains max_int value.
5720   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
5721   // LZCNT = 32 - exponent
5722   vpsubd(dst, xtmp1, dst, vec_enc);
5723 
5724   // Replace LZCNT with a value 1 if corresponding source lane
5725   // contains max_int value.
5726   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
5727 
  // Replace LZCNT with 0 if the source lane value is negative (MSB set, LZCNT = 0).
5729   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5730   vblendvps(dst, dst, xtmp2, src, vec_enc);
5731 }
5732 
5733 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5734                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5735   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5736   // Add zero counts of lower word and upper word of a double word if
5737   // upper word holds a zero value.
5738   vpsrld(xtmp3, src, 16, vec_enc);
5739   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
5740   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
5741   vpslld(xtmp2, dst, 16, vec_enc);
5742   vpaddd(xtmp2, xtmp2, dst, vec_enc);
5743   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5744   vpsrld(dst, dst, 16, vec_enc);
5745   // Add zero counts of lower doubleword and upper doubleword of a
5746   // quadword if upper doubleword holds a zero value.
5747   vpsrlq(xtmp3, src, 32, vec_enc);
5748   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
5749   vpsllq(xtmp2, dst, 32, vec_enc);
5750   vpaddq(xtmp2, xtmp2, dst, vec_enc);
5751   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5752   vpsrlq(dst, dst, 32, vec_enc);
5753 }
5754 
5755 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
5756                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5757                                                        Register rtmp, int vec_enc) {
5758   assert(is_integral_type(bt), "unexpected type");
5759   assert(vec_enc < Assembler::AVX_512bit, "");
5760   switch(bt) {
5761     case T_LONG:
5762       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5763       break;
5764     case T_INT:
5765       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
5766       break;
5767     case T_SHORT:
5768       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5769       break;
5770     case T_BYTE:
5771       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5772       break;
5773     default:
5774       fatal("Unsupported type %s", type2name(bt));
5775       break;
5776   }
5777 }
5778 
5779 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
5780   switch(bt) {
5781     case T_BYTE:
5782       vpsubb(dst, src1, src2, vec_enc);
5783       break;
5784     case T_SHORT:
5785       vpsubw(dst, src1, src2, vec_enc);
5786       break;
5787     case T_INT:
5788       vpsubd(dst, src1, src2, vec_enc);
5789       break;
5790     case T_LONG:
5791       vpsubq(dst, src1, src2, vec_enc);
5792       break;
5793     default:
5794       fatal("Unsupported type %s", type2name(bt));
5795       break;
5796   }
5797 }
5798 
// Trailing zero count computation is based on the leading zero count operation as per
// the following equation. All AVX3 targets support the AVX512CD feature, which offers
// a direct vector instruction to compute the leading zero count.
//      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
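// For example, with an 8 bit lane holding x = 0b01011000:
//   (x - 1) & ~x = 0b01010111 & 0b10100111 = 0b00000111, CLZ = 5, so CTZ = 8 - 5 = 3.
// For x == 0 the masked value is all ones, CLZ = 0, and CTZ = PRIM_TYPE_WIDTH.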
5803 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5804                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5805                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
5806   assert(is_integral_type(bt), "");
5807   // xtmp = -1
5808   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
5809   // xtmp = xtmp + src
5810   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
5811   // xtmp = xtmp & ~src
5812   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
5813   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
5814   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
5815   vpsub(bt, dst, xtmp4, dst, vec_enc);
5816 }
5817 
// Trailing zero count computation for AVX2 targets is based on the popcount operation
// as per the following equation.
//      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
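// For example, with an 8 bit lane holding x = 0b01011000:
//   x | -x = 0b01011000 | 0b10101000 = 0b11111000, POPC = 5, so CTZ = 8 - 5 = 3.
// For x == 0, x | -x is zero, POPC = 0, and CTZ = PRIM_TYPE_WIDTH.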
5820 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5821                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5822   assert(is_integral_type(bt), "");
5823   // xtmp = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
5825   // xtmp = 0 - src
5826   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
5827   // xtmp = xtmp | src
5828   vpor(xtmp3, xtmp3, src, vec_enc);
5829   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
5830   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
5831   vpsub(bt, dst, xtmp1, dst, vec_enc);
5832 }
5833 
5834 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
5835   Label done;
5836   Label neg_divisor_fastpath;
5837   cmpl(divisor, 0);
5838   jccb(Assembler::less, neg_divisor_fastpath);
5839   xorl(rdx, rdx);
5840   divl(divisor);
5841   jmpb(done);
5842   bind(neg_divisor_fastpath);
5843   // Fastpath for divisor < 0:
5844   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
5845   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
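  //
  // When the divisor has its sign bit set, it is >= 2^31 as an unsigned value, so the
  // unsigned quotient can only be 0 or 1; it is 1 exactly when dividend >= divisor
  // (unsigned), which is what the expression above evaluates to.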
5846   movl(rdx, rax);
5847   subl(rdx, divisor);
5848   if (VM_Version::supports_bmi1()) {
5849     andnl(rax, rdx, rax);
5850   } else {
5851     notl(rdx);
5852     andl(rax, rdx);
5853   }
5854   shrl(rax, 31);
5855   bind(done);
5856 }
5857 
5858 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
5859   Label done;
5860   Label neg_divisor_fastpath;
5861   cmpl(divisor, 0);
5862   jccb(Assembler::less, neg_divisor_fastpath);
5863   xorl(rdx, rdx);
5864   divl(divisor);
5865   jmpb(done);
5866   bind(neg_divisor_fastpath);
5867   // Fastpath when divisor < 0:
5868   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
5869   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
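  //
  // As in udivI, the unsigned quotient here is 0 or 1, so the remainder is either the
  // dividend itself or dividend - divisor; the arithmetic shift and mask below select
  // between the two without a multiply.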
5870   movl(rdx, rax);
5871   subl(rax, divisor);
5872   if (VM_Version::supports_bmi1()) {
5873     andnl(rax, rax, rdx);
5874   } else {
5875     notl(rax);
5876     andl(rax, rdx);
5877   }
5878   sarl(rax, 31);
5879   andl(rax, divisor);
5880   subl(rdx, rax);
5881   bind(done);
5882 }
5883 
5884 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
5885   Label done;
5886   Label neg_divisor_fastpath;
5887 
5888   cmpl(divisor, 0);
5889   jccb(Assembler::less, neg_divisor_fastpath);
5890   xorl(rdx, rdx);
5891   divl(divisor);
5892   jmpb(done);
5893   bind(neg_divisor_fastpath);
5894   // Fastpath for divisor < 0:
5895   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
5896   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
5897   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
5898   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
5899   movl(rdx, rax);
5900   subl(rax, divisor);
5901   if (VM_Version::supports_bmi1()) {
5902     andnl(rax, rax, rdx);
5903   } else {
5904     notl(rax);
5905     andl(rax, rdx);
5906   }
5907   movl(tmp, rax);
5908   shrl(rax, 31); // quotient
5909   sarl(tmp, 31);
5910   andl(tmp, divisor);
5911   subl(rdx, tmp); // remainder
5912   bind(done);
5913 }
5914 
5915 #ifdef _LP64
5916 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
5917                                  XMMRegister xtmp2, Register rtmp) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, following the algorithm described at:
5920     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5921     mov64(rtmp, 0x8040201008040201L);
5922     movq(xtmp1, src);
5923     movq(xtmp2, rtmp);
5924     gf2p8affineqb(xtmp1, xtmp2, 0);
5925     movq(dst, xtmp1);
5926   } else {
5927     // Swap even and odd numbered bits.
5928     movl(rtmp, src);
5929     andl(rtmp, 0x55555555);
5930     shll(rtmp, 1);
5931     movl(dst, src);
5932     andl(dst, 0xAAAAAAAA);
5933     shrl(dst, 1);
5934     orl(dst, rtmp);
5935 
5936     // Swap LSB and MSB 2 bits of each nibble.
5937     movl(rtmp, dst);
5938     andl(rtmp, 0x33333333);
5939     shll(rtmp, 2);
5940     andl(dst, 0xCCCCCCCC);
5941     shrl(dst, 2);
5942     orl(dst, rtmp);
5943 
5944     // Swap LSB and MSB 4 bits of each byte.
5945     movl(rtmp, dst);
5946     andl(rtmp, 0x0F0F0F0F);
5947     shll(rtmp, 4);
5948     andl(dst, 0xF0F0F0F0);
5949     shrl(dst, 4);
5950     orl(dst, rtmp);
5951   }
5952   bswapl(dst);
5953 }
5954 
5955 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
5956                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, following the algorithm described at:
5959     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5960     mov64(rtmp1, 0x8040201008040201L);
5961     movq(xtmp1, src);
5962     movq(xtmp2, rtmp1);
5963     gf2p8affineqb(xtmp1, xtmp2, 0);
5964     movq(dst, xtmp1);
5965   } else {
5966     // Swap even and odd numbered bits.
5967     movq(rtmp1, src);
5968     mov64(rtmp2, 0x5555555555555555L);
5969     andq(rtmp1, rtmp2);
5970     shlq(rtmp1, 1);
5971     movq(dst, src);
5972     notq(rtmp2);
5973     andq(dst, rtmp2);
5974     shrq(dst, 1);
5975     orq(dst, rtmp1);
5976 
5977     // Swap LSB and MSB 2 bits of each nibble.
5978     movq(rtmp1, dst);
5979     mov64(rtmp2, 0x3333333333333333L);
5980     andq(rtmp1, rtmp2);
5981     shlq(rtmp1, 2);
5982     notq(rtmp2);
5983     andq(dst, rtmp2);
5984     shrq(dst, 2);
5985     orq(dst, rtmp1);
5986 
5987     // Swap LSB and MSB 4 bits of each byte.
5988     movq(rtmp1, dst);
5989     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
5990     andq(rtmp1, rtmp2);
5991     shlq(rtmp1, 4);
5992     notq(rtmp2);
5993     andq(dst, rtmp2);
5994     shrq(dst, 4);
5995     orq(dst, rtmp1);
5996   }
5997   bswapq(dst);
5998 }
5999 
6000 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6001   Label done;
6002   Label neg_divisor_fastpath;
6003   cmpq(divisor, 0);
6004   jccb(Assembler::less, neg_divisor_fastpath);
6005   xorl(rdx, rdx);
6006   divq(divisor);
6007   jmpb(done);
6008   bind(neg_divisor_fastpath);
6009   // Fastpath for divisor < 0:
6010   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6011   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6012   movq(rdx, rax);
6013   subq(rdx, divisor);
6014   if (VM_Version::supports_bmi1()) {
6015     andnq(rax, rdx, rax);
6016   } else {
6017     notq(rdx);
6018     andq(rax, rdx);
6019   }
6020   shrq(rax, 63);
6021   bind(done);
6022 }
6023 
6024 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6025   Label done;
6026   Label neg_divisor_fastpath;
6027   cmpq(divisor, 0);
6028   jccb(Assembler::less, neg_divisor_fastpath);
6029   xorq(rdx, rdx);
6030   divq(divisor);
6031   jmp(done);
6032   bind(neg_divisor_fastpath);
6033   // Fastpath when divisor < 0:
6034   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6035   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6036   movq(rdx, rax);
6037   subq(rax, divisor);
6038   if (VM_Version::supports_bmi1()) {
6039     andnq(rax, rax, rdx);
6040   } else {
6041     notq(rax);
6042     andq(rax, rdx);
6043   }
6044   sarq(rax, 63);
6045   andq(rax, divisor);
6046   subq(rdx, rax);
6047   bind(done);
6048 }
6049 
6050 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6051   Label done;
6052   Label neg_divisor_fastpath;
6053   cmpq(divisor, 0);
6054   jccb(Assembler::less, neg_divisor_fastpath);
6055   xorq(rdx, rdx);
6056   divq(divisor);
6057   jmp(done);
6058   bind(neg_divisor_fastpath);
6059   // Fastpath for divisor < 0:
6060   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6061   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6062   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6063   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6064   movq(rdx, rax);
6065   subq(rax, divisor);
6066   if (VM_Version::supports_bmi1()) {
6067     andnq(rax, rax, rdx);
6068   } else {
6069     notq(rax);
6070     andq(rax, rdx);
6071   }
6072   movq(tmp, rax);
6073   shrq(rax, 63); // quotient
6074   sarq(tmp, 63);
6075   andq(tmp, divisor);
6076   subq(rdx, tmp); // remainder
6077   bind(done);
6078 }
6079 #endif
6080 
6081 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6082                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6083                                         int vlen_enc) {
6084   assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and indices are determined using the
  // lower 4 bits of each shuffle lane, thus all shuffle indices are normalized
  // to the index range 0-15. This ensures that indices which are congruent
  // modulo 16 select the same relative position within a 128 bit lane, e.g.
  // shuffle indices 16, 32 and 48 all select the 0th byte of their respective
  // 128 bit lanes.
6091   movl(rtmp, 16);
6092   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6093 
6094   // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
6095   // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
6096   // original shuffle indices and move the shuffled lanes corresponding to true
6097   // mask to destination vector.
6098   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6099   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6100   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6101 
6102   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6103   // and broadcasting second 128 bit lane.
6104   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6105   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6106   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6107   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6108   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6109 
6110   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6111   // and broadcasting third 128 bit lane.
6112   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6113   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6114   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6115   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6116   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6117 
6118   // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting fourth 128 bit lane.
6120   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6121   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6122   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6123   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6124   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6125 }
6126 
6127 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6128                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6129   if (vlen_enc == AVX_128bit) {
6130     vpermilps(dst, src, shuffle, vlen_enc);
6131   } else if (bt == T_INT) {
6132     vpermd(dst, shuffle, src, vlen_enc);
6133   } else {
6134     assert(bt == T_FLOAT, "");
6135     vpermps(dst, shuffle, src, vlen_enc);
6136   }
6137 }