1 /*
   2  * Copyright (c) 2020, 2022, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "gc/shared/barrierSet.hpp"
  29 #include "gc/shared/barrierSetAssembler.hpp"
  30 #include "oops/methodData.hpp"
  31 #include "opto/c2_MacroAssembler.hpp"
  32 #include "opto/intrinsicnode.hpp"
  33 #include "opto/output.hpp"
  34 #include "opto/opcodes.hpp"
  35 #include "opto/subnode.hpp"
  36 #include "runtime/objectMonitor.hpp"
  37 #include "runtime/stubRoutines.hpp"
  38 
  39 #ifdef PRODUCT
  40 #define BLOCK_COMMENT(str) /* nothing */
  41 #define STOP(error) stop(error)
  42 #else
  43 #define BLOCK_COMMENT(str) block_comment(str)
  44 #define STOP(error) block_comment(error); stop(error)
  45 #endif
  46 
  47 // C2 compiled method's prolog code.
  48 void C2_MacroAssembler::verified_entry(Compile* C, int sp_inc) {
  49   int framesize = C->output()->frame_size_in_bytes();
  50   int bangsize = C->output()->bang_size_in_bytes();
  51   bool fp_mode_24b = false;
  52   int stack_bang_size = C->output()->need_stack_bang(bangsize) ? bangsize : 0;
  53 
  54   // WARNING: Initial instruction MUST be 5 bytes or longer so that
  55   // NativeJump::patch_verified_entry will be able to patch out the entry
  56   // code safely. The push to verify stack depth is ok at 5 bytes,
  57   // the frame allocation can be either 3 or 6 bytes. So if we don't do
  58   // stack bang then we must use the 6 byte frame allocation even if
  59   // we have no frame. :-(
  60   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  61 
  62   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  63   // Remove word for return addr
  64   framesize -= wordSize;
  65   stack_bang_size -= wordSize;
  66 
  67   // Calls to C2R adapters often do not accept exceptional returns.
  68   // We require that their callers must bang for them.  But be careful, because
  69   // some VM calls (such as call site linkage) can use several kilobytes of
  70   // stack.  But the stack safety zone should account for that.
  71   // See bugs 4446381, 4468289, 4497237.
  72   if (stack_bang_size > 0) {
  73     generate_stack_overflow_check(stack_bang_size);
  74 
    // We always push rbp so that on return to the interpreter rbp will be
    // restored correctly and we can correct the stack.
  77     push(rbp);
  78     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  79     if (PreserveFramePointer) {
  80       mov(rbp, rsp);
  81     }
  82     // Remove word for ebp
  83     framesize -= wordSize;
  84 
  85     // Create frame
  86     if (framesize) {
  87       subptr(rsp, framesize);
  88     }
  89   } else {
  90     // Create frame (force generation of a 4 byte immediate value)
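    // A 32-bit immediate keeps this first instruction at least 5 bytes long,
    // which satisfies the patching constraint described in the WARNING above.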
  91     subptr_imm32(rsp, framesize);
  92 
  93     // Save RBP register now.
  94     framesize -= wordSize;
  95     movptr(Address(rsp, framesize), rbp);
  96     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  97     if (PreserveFramePointer) {
  98       movptr(rbp, rsp);
  99       if (framesize > 0) {
 100         addptr(rbp, framesize);
 101       }
 102     }
 103   }
 104 
 105   if (C->needs_stack_repair()) {
 106     // Save stack increment just below the saved rbp (also account for fixed framesize and rbp)
 107     assert((sp_inc & (StackAlignmentInBytes-1)) == 0, "stack increment not aligned");
 108     movptr(Address(rsp, framesize - wordSize), sp_inc + framesize + wordSize);
 109   }
 110 
 111   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
 112     framesize -= wordSize;
 113     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 114   }
 115 
 116 #ifndef _LP64
 117   // If method sets FPU control word do it now
 118   if (fp_mode_24b) {
 119     fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
 120   }
 121   if (UseSSE >= 2 && VerifyFPU) {
 122     verify_FPU(0, "FPU stack must be clean on entry");
 123   }
 124 #endif
 125 
 126 #ifdef ASSERT
 127   if (VerifyStackAtCalls) {
 128     Label L;
 129     push(rax);
 130     mov(rax, rsp);
 131     andptr(rax, StackAlignmentInBytes-1);
 132     cmpptr(rax, StackAlignmentInBytes-wordSize);
 133     pop(rax);
 134     jcc(Assembler::equal, L);
 135     STOP("Stack is not properly aligned!");
 136     bind(L);
 137   }
 138 #endif
 139 }
 140 
 141 void C2_MacroAssembler::emit_entry_barrier_stub(C2EntryBarrierStub* stub) {
 142   bind(stub->slow_path());
 143   call(RuntimeAddress(StubRoutines::x86::method_entry_barrier()));
 144   jmp(stub->continuation(), false /* maybe_short */);
 145 }
 146 
 147 int C2_MacroAssembler::entry_barrier_stub_size() {
 148   return 10;
 149 }
 150 
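// Note: 4- and 8-byte vectors are encoded with the 128-bit (XMM) vector length
// below, since XMM is the narrowest SIMD register width available.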
 151 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 152   switch (vlen_in_bytes) {
 153     case  4: // fall-through
 154     case  8: // fall-through
 155     case 16: return Assembler::AVX_128bit;
 156     case 32: return Assembler::AVX_256bit;
 157     case 64: return Assembler::AVX_512bit;
 158 
 159     default: {
 160       ShouldNotReachHere();
 161       return Assembler::AVX_NoVec;
 162     }
 163   }
 164 }
 165 
 166 #if INCLUDE_RTM_OPT
 167 
 168 // Update rtm_counters based on abort status
 169 // input: abort_status
 170 //        rtm_counters (RTMLockingCounters*)
 171 // flags are killed
 172 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
 173 
 174   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
 175   if (PrintPreciseRTMLockingStatistics) {
 176     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
 177       Label check_abort;
 178       testl(abort_status, (1<<i));
 179       jccb(Assembler::equal, check_abort);
 180       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
 181       bind(check_abort);
 182     }
 183   }
 184 }
 185 
 186 // Branch if (random & (count-1) != 0), count is 2^n
 187 // tmp, scr and flags are killed
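// The low-order bits of the time-stamp counter serve as a cheap pseudo-random
// source here: on average only one in 'count' executions falls through to the
// code following the branch, which is used to sample RTM statistics.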
 188 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
 189   assert(tmp == rax, "");
 190   assert(scr == rdx, "");
 191   rdtsc(); // modifies EDX:EAX
 192   andptr(tmp, count-1);
 193   jccb(Assembler::notZero, brLabel);
 194 }
 195 
 196 // Perform abort ratio calculation, set no_rtm bit if high ratio
 197 // input:  rtm_counters_Reg (RTMLockingCounters* address)
 198 // tmpReg, rtm_counters_Reg and flags are killed
 199 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
 200                                                     Register rtm_counters_Reg,
 201                                                     RTMLockingCounters* rtm_counters,
 202                                                     Metadata* method_data) {
 203   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
 204 
 205   if (RTMLockingCalculationDelay > 0) {
 206     // Delay calculation
 207     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()));
 208     testptr(tmpReg, tmpReg);
 209     jccb(Assembler::equal, L_done);
 210   }
 211   // Abort ratio calculation only if abort_count > RTMAbortThreshold
 212   //   Aborted transactions = abort_count * 100
 213   //   All transactions = total_count *  RTMTotalCountIncrRate
 214   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
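  //   Illustrative example (not the flag defaults): with RTMAbortRatio=50,
  //   RTMTotalCountIncrRate=1, abort_count=1000 and total_count=1500 we compare
  //   1000*100 = 100000 against 1500*1*50 = 75000, so the no_rtm bit is set.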
 215 
 216   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
 217   cmpptr(tmpReg, RTMAbortThreshold);
 218   jccb(Assembler::below, L_check_always_rtm2);
 219   imulptr(tmpReg, tmpReg, 100);
 220 
 221   Register scrReg = rtm_counters_Reg;
 222   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 223   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
 224   imulptr(scrReg, scrReg, RTMAbortRatio);
 225   cmpptr(tmpReg, scrReg);
 226   jccb(Assembler::below, L_check_always_rtm1);
 227   if (method_data != NULL) {
 228     // set rtm_state to "no rtm" in MDO
 229     mov_metadata(tmpReg, method_data);
 230     lock();
 231     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
 232   }
 233   jmpb(L_done);
 234   bind(L_check_always_rtm1);
 235   // Reload RTMLockingCounters* address
 236   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 237   bind(L_check_always_rtm2);
 238   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 239   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
 240   jccb(Assembler::below, L_done);
 241   if (method_data != NULL) {
 242     // set rtm_state to "always rtm" in MDO
 243     mov_metadata(tmpReg, method_data);
 244     lock();
 245     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
 246   }
 247   bind(L_done);
 248 }
 249 
 250 // Update counters and perform abort ratio calculation
 251 // input:  abort_status_Reg
 252 // rtm_counters_Reg, flags are killed
 253 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
 254                                       Register rtm_counters_Reg,
 255                                       RTMLockingCounters* rtm_counters,
 256                                       Metadata* method_data,
 257                                       bool profile_rtm) {
 258 
 259   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 260   // update rtm counters based on rax value at abort
 261   // reads abort_status_Reg, updates flags
 262   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 263   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
 264   if (profile_rtm) {
 265     // Save abort status because abort_status_Reg is used by following code.
 266     if (RTMRetryCount > 0) {
 267       push(abort_status_Reg);
 268     }
 269     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 270     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
 271     // restore abort status
 272     if (RTMRetryCount > 0) {
 273       pop(abort_status_Reg);
 274     }
 275   }
 276 }
 277 
 278 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
 279 // inputs: retry_count_Reg
 280 //       : abort_status_Reg
 281 // output: retry_count_Reg decremented by 1
 282 // flags are killed
 283 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
 284   Label doneRetry;
 285   assert(abort_status_Reg == rax, "");
 286   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
 287   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
 288   // if reason is in 0x6 and retry count != 0 then retry
 289   andptr(abort_status_Reg, 0x6);
 290   jccb(Assembler::zero, doneRetry);
 291   testl(retry_count_Reg, retry_count_Reg);
 292   jccb(Assembler::zero, doneRetry);
 293   pause();
 294   decrementl(retry_count_Reg);
 295   jmp(retryLabel);
 296   bind(doneRetry);
 297 }
 298 
 299 // Spin and retry if lock is busy,
 300 // inputs: box_Reg (monitor address)
 301 //       : retry_count_Reg
 302 // output: retry_count_Reg decremented by 1
 303 //       : clear z flag if retry count exceeded
 304 // tmp_Reg, scr_Reg, flags are killed
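// Spins for at most RTMSpinLoopCount iterations, exiting early once the owner
// field is observed to be zero, and then branches back to retryLabel.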
 305 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
 306                                                Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
 307   Label SpinLoop, SpinExit, doneRetry;
 308   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 309 
 310   testl(retry_count_Reg, retry_count_Reg);
 311   jccb(Assembler::zero, doneRetry);
 312   decrementl(retry_count_Reg);
 313   movptr(scr_Reg, RTMSpinLoopCount);
 314 
 315   bind(SpinLoop);
 316   pause();
 317   decrementl(scr_Reg);
 318   jccb(Assembler::lessEqual, SpinExit);
 319   movptr(tmp_Reg, Address(box_Reg, owner_offset));
 320   testptr(tmp_Reg, tmp_Reg);
 321   jccb(Assembler::notZero, SpinLoop);
 322 
 323   bind(SpinExit);
 324   jmp(retryLabel);
 325   bind(doneRetry);
 326   incrementl(retry_count_Reg); // clear z flag
 327 }
 328 
 329 // Use RTM for normal stack locks
 330 // Input: objReg (object to lock)
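// On a successful xbegin the lock word is merely read and verified to be
// unlocked: the stack lock is elided rather than acquired, and a conflicting
// write to the object's header will abort the transaction.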
 331 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
 332                                          Register retry_on_abort_count_Reg,
 333                                          RTMLockingCounters* stack_rtm_counters,
 334                                          Metadata* method_data, bool profile_rtm,
 335                                          Label& DONE_LABEL, Label& IsInflated) {
 336   assert(UseRTMForStackLocks, "why call this otherwise?");
 337   assert(tmpReg == rax, "");
 338   assert(scrReg == rdx, "");
 339   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 340 
 341   if (RTMRetryCount > 0) {
 342     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 343     bind(L_rtm_retry);
 344   }
 345   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 346   testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral
 347   jcc(Assembler::notZero, IsInflated);
 348 
 349   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 350     Label L_noincrement;
 351     if (RTMTotalCountIncrRate > 1) {
 352       // tmpReg, scrReg and flags are killed
 353       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 354     }
 355     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
 356     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
 357     bind(L_noincrement);
 358   }
 359   xbegin(L_on_abort);
 360   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
 361   andptr(tmpReg, markWord::lock_mask_in_place);     // look at 2 lock bits
 362   cmpptr(tmpReg, markWord::unlocked_value);         // bits = 01 unlocked
 363   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
 364 
 365   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 366   if (UseRTMXendForLockBusy) {
 367     xend();
 368     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
 369     jmp(L_decrement_retry);
 370   }
 371   else {
 372     xabort(0);
 373   }
 374   bind(L_on_abort);
 375   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 376     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
 377   }
 378   bind(L_decrement_retry);
 379   if (RTMRetryCount > 0) {
 380     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 381     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 382   }
 383 }
 384 
 385 // Use RTM for inflating locks
 386 // inputs: objReg (object to lock)
 387 //         boxReg (on-stack box address (displaced header location) - KILLED)
 388 //         tmpReg (ObjectMonitor address + markWord::monitor_value)
 389 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
 390                                             Register scrReg, Register retry_on_busy_count_Reg,
 391                                             Register retry_on_abort_count_Reg,
 392                                             RTMLockingCounters* rtm_counters,
 393                                             Metadata* method_data, bool profile_rtm,
 394                                             Label& DONE_LABEL) {
 395   assert(UseRTMLocking, "why call this otherwise?");
 396   assert(tmpReg == rax, "");
 397   assert(scrReg == rdx, "");
 398   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 399   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 400 
 401   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 402   movptr(boxReg, tmpReg); // Save ObjectMonitor address
 403 
 404   if (RTMRetryCount > 0) {
 405     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
 406     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 407     bind(L_rtm_retry);
 408   }
 409   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 410     Label L_noincrement;
 411     if (RTMTotalCountIncrRate > 1) {
 412       // tmpReg, scrReg and flags are killed
 413       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 414     }
 415     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 416     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
 417     bind(L_noincrement);
 418   }
 419   xbegin(L_on_abort);
 420   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 421   movptr(tmpReg, Address(tmpReg, owner_offset));
 422   testptr(tmpReg, tmpReg);
 423   jcc(Assembler::zero, DONE_LABEL);
 424   if (UseRTMXendForLockBusy) {
 425     xend();
 426     jmp(L_decrement_retry);
 427   }
 428   else {
 429     xabort(0);
 430   }
 431   bind(L_on_abort);
 432   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 433   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 434     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
 435   }
 436   if (RTMRetryCount > 0) {
 437     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 438     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 439   }
 440 
 441   movptr(tmpReg, Address(boxReg, owner_offset)) ;
 442   testptr(tmpReg, tmpReg) ;
 443   jccb(Assembler::notZero, L_decrement_retry) ;
 444 
 445   // Appears unlocked - try to swing _owner from null to non-null.
 446   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 447 #ifdef _LP64
 448   Register threadReg = r15_thread;
 449 #else
 450   get_thread(scrReg);
 451   Register threadReg = scrReg;
 452 #endif
 453   lock();
 454   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
 455 
 456   if (RTMRetryCount > 0) {
 457     // success done else retry
 458     jccb(Assembler::equal, DONE_LABEL) ;
 459     bind(L_decrement_retry);
 460     // Spin and retry if lock is busy.
 461     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
 462   }
 463   else {
 464     bind(L_decrement_retry);
 465   }
 466 }
 467 
 468 #endif //  INCLUDE_RTM_OPT
 469 
 470 // fast_lock and fast_unlock used by C2
 471 
 472 // Because the transitions from emitted code to the runtime
 473 // monitorenter/exit helper stubs are so slow it's critical that
 474 // we inline both the stack-locking fast path and the inflated fast path.
 475 //
 476 // See also: cmpFastLock and cmpFastUnlock.
 477 //
 478 // What follows is a specialized inline transliteration of the code
 479 // in enter() and exit(). If we're concerned about I$ bloat another
 480 // option would be to emit TrySlowEnter and TrySlowExit methods
 481 // at startup-time.  These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 483 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 484 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 485 // In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
 490 //
 491 // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
 492 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 493 // to those specialized methods.  That'd give us a mostly platform-independent
 494 // implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross into native code would be
 496 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 497 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 498 // (b) explicit barriers or fence operations.
 499 //
 500 // TODO:
 501 //
 502 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 503 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 504 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 505 //    the lock operators would typically be faster than reifying Self.
 506 //
 507 // *  Ideally I'd define the primitives as:
 508 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 509 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 510 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
 512 //    Furthermore the register assignments are overconstrained, possibly resulting in
 513 //    sub-optimal code near the synchronization site.
 514 //
 515 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 516 //    Alternately, use a better sp-proximity test.
 517 //
 518 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 519 //    Either one is sufficient to uniquely identify a thread.
 520 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 521 //
 522 // *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty.
//    This avoids the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
 525 //
 526 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 527 //    But beware of excessive branch density on AMD Opterons.
 528 //
 529 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 530 //    or failure of the fast path.  If the fast path fails then we pass
 531 //    control to the slow path, typically in C.  In fast_lock and
 532 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 533 //    will emit a conditional branch immediately after the node.
 534 //    So we have branches to branches and lots of ICC.ZF games.
 535 //    Instead, it might be better to have C2 pass a "FailureLabel"
 536 //    into fast_lock and fast_unlock.  In the case of success, control
 537 //    will drop through the node.  ICC.ZF is undefined at exit.
 538 //    In the case of failure, the node will branch directly to the
 539 //    FailureLabel
 540 
 541 
 542 // obj: object to lock
 543 // box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
 545 // scr: tmp -- KILLED
 546 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 547                                  Register scrReg, Register cx1Reg, Register cx2Reg,
 548                                  RTMLockingCounters* rtm_counters,
 549                                  RTMLockingCounters* stack_rtm_counters,
 550                                  Metadata* method_data,
 551                                  bool use_rtm, bool profile_rtm) {
 552   // Ensure the register assignments are disjoint
 553   assert(tmpReg == rax, "");
 554 
 555   if (use_rtm) {
 556     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
 557   } else {
 558     assert(cx2Reg == noreg, "");
 559     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 560   }
 561 
 562   // Possible cases that we'll encounter in fast_lock
 563   // ------------------------------------------------
 564   // * Inflated
 565   //    -- unlocked
 566   //    -- Locked
 567   //       = by self
 568   //       = by other
 569   // * neutral
 570   // * stack-locked
 571   //    -- by self
 572   //       = sp-proximity test hits
 573   //       = sp-proximity test generates false-negative
 574   //    -- by other
 575   //
 576 
 577   Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;
 578 
 579   if (DiagnoseSyncOnValueBasedClasses != 0) {
 580     load_klass(tmpReg, objReg, cx1Reg);
 581     movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
 582     testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
 583     jcc(Assembler::notZero, DONE_LABEL);
 584   }
 585 
 586 #if INCLUDE_RTM_OPT
 587   if (UseRTMForStackLocks && use_rtm) {
 588     assert(!UseHeavyMonitors, "+UseHeavyMonitors and +UseRTMForStackLocks are mutually exclusive");
 589     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
 590                       stack_rtm_counters, method_data, profile_rtm,
 591                       DONE_LABEL, IsInflated);
 592   }
 593 #endif // INCLUDE_RTM_OPT
 594 
 595   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 596   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 597   jccb(Assembler::notZero, IsInflated);
 598 
 599   if (!UseHeavyMonitors) {
 600     // Attempt stack-locking ...
 601     orptr (tmpReg, markWord::unlocked_value);
 602     if (EnableValhalla) {
 603       // Mask inline_type bit such that we go to the slow path if object is an inline type
 604       andptr(tmpReg, ~((int) markWord::inline_type_bit_in_place));
 605     }
 606     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 607     lock();
 608     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 609     jcc(Assembler::equal, COUNT);           // Success
 610 
 611     // Recursive locking.
 612     // The object is stack-locked: markword contains stack pointer to BasicLock.
 613     // Locked by current thread if difference with current SP is less than one page.
 614     subptr(tmpReg, rsp);
    // The next instruction sets ZFlag == 1 (success) if the difference is less than one page.
 616     andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
 617     movptr(Address(boxReg, 0), tmpReg);
 618   } else {
 619     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 620     testptr(objReg, objReg);
 621   }
 622   jmp(DONE_LABEL);
 623 
 624   bind(IsInflated);
 625   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
 626 
 627 #if INCLUDE_RTM_OPT
 628   // Use the same RTM locking code in 32- and 64-bit VM.
 629   if (use_rtm) {
 630     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
 631                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
 632   } else {
 633 #endif // INCLUDE_RTM_OPT
 634 
 635 #ifndef _LP64
 636   // The object is inflated.
 637 
 638   // boxReg refers to the on-stack BasicLock in the current frame.
 639   // We'd like to write:
 640   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
  // This is convenient but results in an ST-before-CAS penalty.  The following CAS suffers
 642   // additional latency as we have another ST in the store buffer that must drain.
 643 
 644   // avoid ST-before-CAS
 645   // register juggle because we need tmpReg for cmpxchgptr below
 646   movptr(scrReg, boxReg);
 647   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 648 
 649   // Optimistic form: consider XORL tmpReg,tmpReg
 650   movptr(tmpReg, NULL_WORD);
 651 
 652   // Appears unlocked - try to swing _owner from null to non-null.
 653   // Ideally, I'd manifest "Self" with get_thread and then attempt
 654   // to CAS the register containing Self into m->Owner.
 655   // But we don't have enough registers, so instead we can either try to CAS
 656   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 657   // we later store "Self" into m->Owner.  Transiently storing a stack address
 658   // (rsp or the address of the box) into  m->owner is harmless.
 659   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 660   lock();
 661   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 662   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 663   // If we weren't able to swing _owner from NULL to the BasicLock
 664   // then take the slow path.
 665   jccb  (Assembler::notZero, NO_COUNT);
 666   // update _owner from BasicLock to thread
 667   get_thread (scrReg);                    // beware: clobbers ICCs
 668   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
 669   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
 670 
 671   // If the CAS fails we can either retry or pass control to the slow path.
 672   // We use the latter tactic.
 673   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 674   // If the CAS was successful ...
 675   //   Self has acquired the lock
 676   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 677   // Intentional fall-through into DONE_LABEL ...
 678 #else // _LP64
 679   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 680   movq(scrReg, tmpReg);
 681   xorq(tmpReg, tmpReg);
 682   lock();
 683   cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 684   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 685   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 686   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 687   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 688   jccb(Assembler::equal, COUNT);          // CAS above succeeded; propagate ZF = 1 (success)
 689 
 690   cmpptr(r15_thread, rax);                // Check if we are already the owner (recursive lock)
 691   jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
 692   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 693   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 694 #endif // _LP64
 695 #if INCLUDE_RTM_OPT
 696   } // use_rtm()
 697 #endif
 698   // DONE_LABEL is a hot target - we'd really like to place it at the
 699   // start of cache line by padding with NOPs.
 700   // See the AMD and Intel software optimization manuals for the
 701   // most efficient "long" NOP encodings.
 702   // Unfortunately none of our alignment mechanisms suffice.
 703   bind(DONE_LABEL);
 704 
 705   // ZFlag == 1 count in fast path
 706   // ZFlag == 0 count in slow path
 707   jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0
 708 
 709   bind(COUNT);
 710   // Count monitors in fast path
 711 #ifndef _LP64
 712   get_thread(tmpReg);
 713   incrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
 714 #else // _LP64
 715   incrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 716 #endif
 717 
 718   xorl(tmpReg, tmpReg); // Set ZF == 1
 719 
 720   bind(NO_COUNT);
 721 
 722   // At NO_COUNT the icc ZFlag is set as follows ...
 723   // fast_unlock uses the same protocol.
 724   // ZFlag == 1 -> Success
 725   // ZFlag == 0 -> Failure - force control through the slow path
 726 }
 727 
 728 // obj: object to unlock
 729 // box: box address (displaced header location), killed.  Must be EAX.
 730 // tmp: killed, cannot be obj nor box.
 731 //
 732 // Some commentary on balanced locking:
 733 //
 734 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 735 // Methods that don't have provably balanced locking are forced to run in the
 736 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 737 // The interpreter provides two properties:
 738 // I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired in the current activation (frame).  Recall that the
 740 //      interpreter maintains an on-stack list of locks currently held by
 741 //      a frame.
 742 // I2:  If a method attempts to unlock an object that is not held by the
//      frame, the interpreter throws IMSX.
 744 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
 746 // B() doesn't have provably balanced locking so it runs in the interpreter.
 747 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 748 // is still locked by A().
 749 //
 750 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 751 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 752 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 753 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
 754 // Arguably given that the spec legislates the JNI case as undefined our implementation
 755 // could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
 757 // A perfectly viable alternative is to elide the owner check except when
 758 // Xcheck:jni is enabled.
 759 
 760 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
 761   assert(boxReg == rax, "");
 762   assert_different_registers(objReg, boxReg, tmpReg);
 763 
 764   Label DONE_LABEL, Stacked, CheckSucc, COUNT, NO_COUNT;
 765 
 766 #if INCLUDE_RTM_OPT
 767   if (UseRTMForStackLocks && use_rtm) {
 768     assert(!UseHeavyMonitors, "+UseHeavyMonitors and +UseRTMForStackLocks are mutually exclusive");
 769     Label L_regular_unlock;
 770     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
 771     andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
 772     cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
 773     jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
 774     xend();                                                           // otherwise end...
 775     jmp(DONE_LABEL);                                                  // ... and we're done
 776     bind(L_regular_unlock);
 777   }
 778 #endif
 779 
 780   if (!UseHeavyMonitors) {
 781     cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
 782     jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
 783   }
 784   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
 785   if (!UseHeavyMonitors) {
 786     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 787     jccb   (Assembler::zero, Stacked);
 788   }
 789 
 790   // It's inflated.
 791 #if INCLUDE_RTM_OPT
 792   if (use_rtm) {
 793     Label L_regular_inflated_unlock;
 794     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 795     movptr(boxReg, Address(tmpReg, owner_offset));
 796     testptr(boxReg, boxReg);
 797     jccb(Assembler::notZero, L_regular_inflated_unlock);
 798     xend();
 799     jmpb(DONE_LABEL);
 800     bind(L_regular_inflated_unlock);
 801   }
 802 #endif
 803 
 804   // Despite our balanced locking property we still check that m->_owner == Self
 805   // as java routines or native JNI code called by this thread might
 806   // have released the lock.
 807   // Refer to the comments in synchronizer.cpp for how we might encode extra
 808   // state in _succ so we can avoid fetching EntryList|cxq.
 809   //
 810   // If there's no contention try a 1-0 exit.  That is, exit without
 811   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 812   // we detect and recover from the race that the 1-0 exit admits.
 813   //
 814   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 815   // before it STs null into _owner, releasing the lock.  Updates
 816   // to data protected by the critical section must be visible before
 817   // we drop the lock (and thus before any other thread could acquire
 818   // the lock and observe the fields protected by the lock).
 819   // IA32's memory-model is SPO, so STs are ordered with respect to
 820   // each other and there's no need for an explicit barrier (fence).
 821   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
 822 #ifndef _LP64
 823   get_thread (boxReg);
 824 
 825   // Note that we could employ various encoding schemes to reduce
 826   // the number of loads below (currently 4) to just 2 or 3.
 827   // Refer to the comments in synchronizer.cpp.
 828   // In practice the chain of fetches doesn't seem to impact performance, however.
 829   xorptr(boxReg, boxReg);
 830   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 831   jccb  (Assembler::notZero, DONE_LABEL);
 832   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 833   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 834   jccb  (Assembler::notZero, CheckSucc);
 835   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 836   jmpb  (DONE_LABEL);
 837 
 838   bind (Stacked);
 839   // It's not inflated and it's not recursively stack-locked.
 840   // It must be stack-locked.
 841   // Try to reset the header to displaced header.
 842   // The "box" value on the stack is stable, so we can reload
 843   // and be assured we observe the same value as above.
 844   movptr(tmpReg, Address(boxReg, 0));
 845   lock();
 846   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
  // Intentional fall-through into DONE_LABEL
 848 
 849   // DONE_LABEL is a hot target - we'd really like to place it at the
 850   // start of cache line by padding with NOPs.
 851   // See the AMD and Intel software optimization manuals for the
 852   // most efficient "long" NOP encodings.
 853   // Unfortunately none of our alignment mechanisms suffice.
 854   bind (CheckSucc);
 855 #else // _LP64
 856   // It's inflated
 857   Label LNotRecursive, LSuccess, LGoSlowPath;
 858 
 859   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 860   jccb(Assembler::equal, LNotRecursive);
 861 
 862   // Recursive inflated unlock
 863   decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 864   jmpb(LSuccess);
 865 
 866   bind(LNotRecursive);
 867   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 868   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 869   jccb  (Assembler::notZero, CheckSucc);
 870   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 871   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 872   jmpb  (DONE_LABEL);
 873 
 874   // Try to avoid passing control into the slow_path ...
 875   bind  (CheckSucc);
 876 
 877   // The following optional optimization can be elided if necessary
 878   // Effectively: if (succ == null) goto slow path
 879   // The code reduces the window for a race, however,
 880   // and thus benefits performance.
 881   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 882   jccb  (Assembler::zero, LGoSlowPath);
 883 
 884   xorptr(boxReg, boxReg);
 885   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 886   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 887 
 888   // Memory barrier/fence
 889   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
 890   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
 891   // This is faster on Nehalem and AMD Shanghai/Barcelona.
 892   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
 893   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
 894   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
 895   lock(); addl(Address(rsp, 0), 0);
 896 
 897   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 898   jccb  (Assembler::notZero, LSuccess);
 899 
 900   // Rare inopportune interleaving - race.
 901   // The successor vanished in the small window above.
 902   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
 903   // We need to ensure progress and succession.
 904   // Try to reacquire the lock.
 905   // If that fails then the new owner is responsible for succession and this
 906   // thread needs to take no further action and can exit via the fast path (success).
 907   // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generate more
  // coherence traffic on the lock *and* artificially extend the critical section
  // length by virtue of passing control into the slow path.
 911 
 912   // box is really RAX -- the following CMPXCHG depends on that binding
 913   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
 914   lock();
 915   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 916   // There's no successor so we tried to regrab the lock.
 917   // If that didn't work, then another thread grabbed the
 918   // lock so we're done (and exit was a success).
 919   jccb  (Assembler::notEqual, LSuccess);
 920   // Intentional fall-through into slow path
 921 
 922   bind  (LGoSlowPath);
 923   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 924   jmpb  (DONE_LABEL);
 925 
 926   bind  (LSuccess);
 927   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 928   jmpb  (DONE_LABEL);
 929 
 930   if (!UseHeavyMonitors) {
 931     bind  (Stacked);
 932     movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 933     lock();
 934     cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 935   }
 936 #endif
 937   bind(DONE_LABEL);
 938 
 939   // ZFlag == 1 count in fast path
 940   // ZFlag == 0 count in slow path
 941   jccb(Assembler::notZero, NO_COUNT);
 942 
 943   bind(COUNT);
 944   // Count monitors in fast path
 945 #ifndef _LP64
 946   get_thread(tmpReg);
 947   decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
 948 #else // _LP64
 949   decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 950 #endif
 951 
 952   xorl(tmpReg, tmpReg); // Set ZF == 1
 953 
 954   bind(NO_COUNT);
 955 }
 956 
 957 //-------------------------------------------------------------------------------------------
 958 // Generic instructions support for use in .ad files C2 code generation
 959 
 960 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 961   if (dst != src) {
 962     movdqu(dst, src);
 963   }
 964   if (opcode == Op_AbsVD) {
 965     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 966   } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 968     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 969   }
 970 }
 971 
 972 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 973   if (opcode == Op_AbsVD) {
 974     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
 975   } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 977     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
 978   }
 979 }
 980 
 981 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
 982   if (dst != src) {
 983     movdqu(dst, src);
 984   }
 985   if (opcode == Op_AbsVF) {
 986     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
 987   } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 989     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
 990   }
 991 }
 992 
 993 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 994   if (opcode == Op_AbsVF) {
 995     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
 996   } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 998     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
 999   }
1000 }
1001 
1002 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
1003   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1004   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
1005 
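  // Note: there is no packed min/max for 64-bit integers before AVX-512, so the
  // T_LONG cases below emulate it with a signed compare (pcmpgtq) plus blendvpd,
  // which implicitly uses xmm0 as the blend mask - hence the tmp == xmm0 requirement.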
1006   if (opcode == Op_MinV) {
1007     if (elem_bt == T_BYTE) {
1008       pminsb(dst, src);
1009     } else if (elem_bt == T_SHORT) {
1010       pminsw(dst, src);
1011     } else if (elem_bt == T_INT) {
1012       pminsd(dst, src);
1013     } else {
1014       assert(elem_bt == T_LONG, "required");
1015       assert(tmp == xmm0, "required");
1016       assert_different_registers(dst, src, tmp);
1017       movdqu(xmm0, dst);
1018       pcmpgtq(xmm0, src);
1019       blendvpd(dst, src);  // xmm0 as mask
1020     }
1021   } else { // opcode == Op_MaxV
1022     if (elem_bt == T_BYTE) {
1023       pmaxsb(dst, src);
1024     } else if (elem_bt == T_SHORT) {
1025       pmaxsw(dst, src);
1026     } else if (elem_bt == T_INT) {
1027       pmaxsd(dst, src);
1028     } else {
1029       assert(elem_bt == T_LONG, "required");
1030       assert(tmp == xmm0, "required");
1031       assert_different_registers(dst, src, tmp);
1032       movdqu(xmm0, src);
1033       pcmpgtq(xmm0, dst);
1034       blendvpd(dst, src);  // xmm0 as mask
1035     }
1036   }
1037 }
1038 
1039 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
1040                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
1041                                  int vlen_enc) {
1042   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1043 
1044   if (opcode == Op_MinV) {
1045     if (elem_bt == T_BYTE) {
1046       vpminsb(dst, src1, src2, vlen_enc);
1047     } else if (elem_bt == T_SHORT) {
1048       vpminsw(dst, src1, src2, vlen_enc);
1049     } else if (elem_bt == T_INT) {
1050       vpminsd(dst, src1, src2, vlen_enc);
1051     } else {
1052       assert(elem_bt == T_LONG, "required");
1053       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1054         vpminsq(dst, src1, src2, vlen_enc);
1055       } else {
1056         assert_different_registers(dst, src1, src2);
1057         vpcmpgtq(dst, src1, src2, vlen_enc);
1058         vblendvpd(dst, src1, src2, dst, vlen_enc);
1059       }
1060     }
1061   } else { // opcode == Op_MaxV
1062     if (elem_bt == T_BYTE) {
1063       vpmaxsb(dst, src1, src2, vlen_enc);
1064     } else if (elem_bt == T_SHORT) {
1065       vpmaxsw(dst, src1, src2, vlen_enc);
1066     } else if (elem_bt == T_INT) {
1067       vpmaxsd(dst, src1, src2, vlen_enc);
1068     } else {
1069       assert(elem_bt == T_LONG, "required");
1070       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1071         vpmaxsq(dst, src1, src2, vlen_enc);
1072       } else {
1073         assert_different_registers(dst, src1, src2);
1074         vpcmpgtq(dst, src1, src2, vlen_enc);
1075         vblendvpd(dst, src2, src1, dst, vlen_enc);
1076       }
1077     }
1078   }
1079 }
1080 
1081 // Float/Double min max
1082 
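// The leading blends below route the inputs through atmp/btmp based on the sign
// of 'a' (for min) or 'b' (for max), so that vminps/vminpd and vmaxps/vmaxpd
// (which return their second operand for equal-valued or unordered inputs)
// produce the IEEE-754 answer for -0.0 vs +0.0; the final unordered compare
// then propagates a NaN input into dst.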
1083 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
1084                                    XMMRegister dst, XMMRegister a, XMMRegister b,
1085                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1086                                    int vlen_enc) {
1087   assert(UseAVX > 0, "required");
1088   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1089          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1090   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1091   assert_different_registers(a, b, tmp, atmp, btmp);
1092 
1093   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1094   bool is_double_word = is_double_word_type(elem_bt);
1095 
1096   if (!is_double_word && is_min) {
1097     vblendvps(atmp, a, b, a, vlen_enc);
1098     vblendvps(btmp, b, a, a, vlen_enc);
1099     vminps(tmp, atmp, btmp, vlen_enc);
1100     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1101     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
1102   } else if (!is_double_word && !is_min) {
1103     vblendvps(btmp, b, a, b, vlen_enc);
1104     vblendvps(atmp, a, b, b, vlen_enc);
1105     vmaxps(tmp, atmp, btmp, vlen_enc);
1106     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1107     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
1108   } else if (is_double_word && is_min) {
1109     vblendvpd(atmp, a, b, a, vlen_enc);
1110     vblendvpd(btmp, b, a, a, vlen_enc);
1111     vminpd(tmp, atmp, btmp, vlen_enc);
1112     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1113     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
1114   } else {
1115     assert(is_double_word && !is_min, "sanity");
1116     vblendvpd(btmp, b, a, b, vlen_enc);
1117     vblendvpd(atmp, a, b, b, vlen_enc);
1118     vmaxpd(tmp, atmp, btmp, vlen_enc);
1119     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1120     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
1121   }
1122 }
1123 
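// AVX-512 variant of the above: evpmovd2m/evpmovq2m extract the per-lane sign
// bits into a mask register, which then drives the operand-ordering blends and
// the final NaN merge instead of an xmm blend mask.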
1124 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1125                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1126                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1127                                     int vlen_enc) {
1128   assert(UseAVX > 2, "required");
1129   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1130          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1131   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1132   assert_different_registers(dst, a, b, atmp, btmp);
1133 
1134   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1135   bool is_double_word = is_double_word_type(elem_bt);
1136   bool merge = true;
1137 
1138   if (!is_double_word && is_min) {
1139     evpmovd2m(ktmp, a, vlen_enc);
1140     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1141     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1142     vminps(dst, atmp, btmp, vlen_enc);
1143     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1144     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1145   } else if (!is_double_word && !is_min) {
1146     evpmovd2m(ktmp, b, vlen_enc);
1147     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1148     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1149     vmaxps(dst, atmp, btmp, vlen_enc);
1150     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1151     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1152   } else if (is_double_word && is_min) {
1153     evpmovq2m(ktmp, a, vlen_enc);
1154     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1155     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1156     vminpd(dst, atmp, btmp, vlen_enc);
1157     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1158     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1159   } else {
1160     assert(is_double_word && !is_min, "sanity");
1161     evpmovq2m(ktmp, b, vlen_enc);
1162     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1163     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1164     vmaxpd(dst, atmp, btmp, vlen_enc);
1165     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1166     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1167   }
1168 }
1169 
1170 // Float/Double signum
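// dst holds the argument and receives the result: +/-0.0 and NaN are returned
// unchanged via the early exits, otherwise dst is loaded with 1.0 and its sign
// is flipped for a negative argument (the flags set by the compare against zero
// survive the movflt/movdbl, which do not modify EFLAGS).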
1171 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1172   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1173 
1174   Label DONE_LABEL;
1175 
1176   if (opcode == Op_SignumF) {
1177     assert(UseSSE > 0, "required");
1178     ucomiss(dst, zero);
1179     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1180     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1181     movflt(dst, one);
1182     jcc(Assembler::above, DONE_LABEL);
1183     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1184   } else if (opcode == Op_SignumD) {
1185     assert(UseSSE > 1, "required");
1186     ucomisd(dst, zero);
1187     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1188     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1189     movdbl(dst, one);
1190     jcc(Assembler::above, DONE_LABEL);
1191     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1192   }
1193 
1194   bind(DONE_LABEL);
1195 }
1196 
1197 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1198   if (sign) {
1199     pmovsxbw(dst, src);
1200   } else {
1201     pmovzxbw(dst, src);
1202   }
1203 }
1204 
1205 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1206   if (sign) {
1207     vpmovsxbw(dst, src, vector_len);
1208   } else {
1209     vpmovzxbw(dst, src, vector_len);
1210   }
1211 }
1212 
1213 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1214   if (sign) {
1215     vpmovsxbd(dst, src, vector_len);
1216   } else {
1217     vpmovzxbd(dst, src, vector_len);
1218   }
1219 }
1220 
1221 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1222   if (sign) {
1223     vpmovsxwd(dst, src, vector_len);
1224   } else {
1225     vpmovzxwd(dst, src, vector_len);
1226   }
1227 }
1228 
1229 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1230                                      int shift, int vector_len) {
1231   if (opcode == Op_RotateLeftV) {
1232     if (etype == T_INT) {
1233       evprold(dst, src, shift, vector_len);
1234     } else {
1235       assert(etype == T_LONG, "expected type T_LONG");
1236       evprolq(dst, src, shift, vector_len);
1237     }
1238   } else {
1239     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1240     if (etype == T_INT) {
1241       evprord(dst, src, shift, vector_len);
1242     } else {
1243       assert(etype == T_LONG, "expected type T_LONG");
1244       evprorq(dst, src, shift, vector_len);
1245     }
1246   }
1247 }
1248 
1249 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1250                                      XMMRegister shift, int vector_len) {
1251   if (opcode == Op_RotateLeftV) {
1252     if (etype == T_INT) {
1253       evprolvd(dst, src, shift, vector_len);
1254     } else {
1255       assert(etype == T_LONG, "expected type T_LONG");
1256       evprolvq(dst, src, shift, vector_len);
1257     }
1258   } else {
1259     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1260     if (etype == T_INT) {
1261       evprorvd(dst, src, shift, vector_len);
1262     } else {
1263       assert(etype == T_LONG, "expected type T_LONG");
1264       evprorvq(dst, src, shift, vector_len);
1265     }
1266   }
1267 }
1268 
1269 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1270   if (opcode == Op_RShiftVI) {
1271     psrad(dst, shift);
1272   } else if (opcode == Op_LShiftVI) {
1273     pslld(dst, shift);
1274   } else {
1275     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1276     psrld(dst, shift);
1277   }
1278 }
1279 
1280 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1281   switch (opcode) {
1282     case Op_RShiftVI:  psrad(dst, shift); break;
1283     case Op_LShiftVI:  pslld(dst, shift); break;
1284     case Op_URShiftVI: psrld(dst, shift); break;
1285 
1286     default: assert(false, "%s", NodeClassNames[opcode]);
1287   }
1288 }
1289 
1290 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1291   if (opcode == Op_RShiftVI) {
1292     vpsrad(dst, nds, shift, vector_len);
1293   } else if (opcode == Op_LShiftVI) {
1294     vpslld(dst, nds, shift, vector_len);
1295   } else {
1296     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1297     vpsrld(dst, nds, shift, vector_len);
1298   }
1299 }
1300 
1301 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1302   switch (opcode) {
1303     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1304     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1305     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1306 
1307     default: assert(false, "%s", NodeClassNames[opcode]);
1308   }
1309 }
1310 
1311 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1312   switch (opcode) {
1313     case Op_RShiftVB:  // fall-through
1314     case Op_RShiftVS:  psraw(dst, shift); break;
1315 
1316     case Op_LShiftVB:  // fall-through
1317     case Op_LShiftVS:  psllw(dst, shift);   break;
1318 
1319     case Op_URShiftVS: // fall-through
1320     case Op_URShiftVB: psrlw(dst, shift);  break;
1321 
1322     default: assert(false, "%s", NodeClassNames[opcode]);
1323   }
1324 }
1325 
1326 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1327   switch (opcode) {
1328     case Op_RShiftVB:  // fall-through
1329     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1330 
1331     case Op_LShiftVB:  // fall-through
1332     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1333 
1334     case Op_URShiftVS: // fall-through
1335     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1336 
1337     default: assert(false, "%s", NodeClassNames[opcode]);
1338   }
1339 }
1340 
1341 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1342   switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1344     case Op_LShiftVL:  psllq(dst, shift); break;
1345     case Op_URShiftVL: psrlq(dst, shift); break;
1346 
1347     default: assert(false, "%s", NodeClassNames[opcode]);
1348   }
1349 }
1350 
1351 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1352   if (opcode == Op_RShiftVL) {
    psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1354   } else if (opcode == Op_LShiftVL) {
1355     psllq(dst, shift);
1356   } else {
1357     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1358     psrlq(dst, shift);
1359   }
1360 }
1361 
1362 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1363   switch (opcode) {
1364     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1365     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1366     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1367 
1368     default: assert(false, "%s", NodeClassNames[opcode]);
1369   }
1370 }
1371 
1372 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1373   if (opcode == Op_RShiftVL) {
1374     evpsraq(dst, nds, shift, vector_len);
1375   } else if (opcode == Op_LShiftVL) {
1376     vpsllq(dst, nds, shift, vector_len);
1377   } else {
1378     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1379     vpsrlq(dst, nds, shift, vector_len);
1380   }
1381 }
1382 
1383 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1384   switch (opcode) {
1385     case Op_RShiftVB:  // fall-through
1386     case Op_RShiftVS:  // fall-through
1387     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1388 
1389     case Op_LShiftVB:  // fall-through
1390     case Op_LShiftVS:  // fall-through
1391     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1392 
1393     case Op_URShiftVB: // fall-through
1394     case Op_URShiftVS: // fall-through
1395     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1396 
1397     default: assert(false, "%s", NodeClassNames[opcode]);
1398   }
1399 }
1400 
1401 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1402   switch (opcode) {
1403     case Op_RShiftVB:  // fall-through
1404     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1405 
1406     case Op_LShiftVB:  // fall-through
1407     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1408 
1409     case Op_URShiftVB: // fall-through
1410     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1411 
1412     default: assert(false, "%s", NodeClassNames[opcode]);
1413   }
1414 }
1415 
1416 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1417   assert(UseAVX >= 2, "required");
1418   switch (opcode) {
1419     case Op_RShiftVL: {
1420       if (UseAVX > 2) {
1421         assert(tmp == xnoreg, "not used");
1422         if (!VM_Version::supports_avx512vl()) {
1423           vlen_enc = Assembler::AVX_512bit;
1424         }
1425         evpsravq(dst, src, shift, vlen_enc);
1426       } else {
1427         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1428         vpsrlvq(dst, src, shift, vlen_enc);
1429         vpsrlvq(tmp, tmp, shift, vlen_enc);
1430         vpxor(dst, dst, tmp, vlen_enc);
1431         vpsubq(dst, dst, tmp, vlen_enc);
1432       }
1433       break;
1434     }
1435     case Op_LShiftVL: {
1436       assert(tmp == xnoreg, "not used");
1437       vpsllvq(dst, src, shift, vlen_enc);
1438       break;
1439     }
1440     case Op_URShiftVL: {
1441       assert(tmp == xnoreg, "not used");
1442       vpsrlvq(dst, src, shift, vlen_enc);
1443       break;
1444     }
1445     default: assert(false, "%s", NodeClassNames[opcode]);
1446   }
1447 }
1448 
// Variable shift src by shift using vtmp as a TEMP, giving a word result in dst
1450 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1451   assert(opcode == Op_LShiftVB ||
1452          opcode == Op_RShiftVB ||
1453          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1454   bool sign = (opcode != Op_URShiftVB);
1455   assert(vector_len == 0, "required");
1456   vextendbd(sign, dst, src, 1);
1457   vpmovzxbd(vtmp, shift, 1);
1458   varshiftd(opcode, dst, dst, vtmp, 1);
1459   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1460   vextracti128_high(vtmp, dst);
1461   vpackusdw(dst, dst, vtmp, 0);
1462 }
1463 
// Variable shift src by shift using vtmp as a TEMP, giving a byte result in dst
1465 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1466   assert(opcode == Op_LShiftVB ||
1467          opcode == Op_RShiftVB ||
1468          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1469   bool sign = (opcode != Op_URShiftVB);
1470   int ext_vector_len = vector_len + 1;
1471   vextendbw(sign, dst, src, ext_vector_len);
1472   vpmovzxbw(vtmp, shift, ext_vector_len);
1473   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1474   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1475   if (vector_len == 0) {
1476     vextracti128_high(vtmp, dst);
1477     vpackuswb(dst, dst, vtmp, vector_len);
1478   } else {
1479     vextracti64x4_high(vtmp, dst);
1480     vpackuswb(dst, dst, vtmp, vector_len);
1481     vpermq(dst, dst, 0xD8, vector_len);
1482   }
1483 }
1484 
1485 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1486   switch(typ) {
1487     case T_BYTE:
1488       pinsrb(dst, val, idx);
1489       break;
1490     case T_SHORT:
1491       pinsrw(dst, val, idx);
1492       break;
1493     case T_INT:
1494       pinsrd(dst, val, idx);
1495       break;
1496     case T_LONG:
1497       pinsrq(dst, val, idx);
1498       break;
1499     default:
1500       assert(false,"Should not reach here.");
1501       break;
1502   }
1503 }
1504 
1505 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1506   switch(typ) {
1507     case T_BYTE:
1508       vpinsrb(dst, src, val, idx);
1509       break;
1510     case T_SHORT:
1511       vpinsrw(dst, src, val, idx);
1512       break;
1513     case T_INT:
1514       vpinsrd(dst, src, val, idx);
1515       break;
1516     case T_LONG:
1517       vpinsrq(dst, src, val, idx);
1518       break;
1519     default:
1520       assert(false,"Should not reach here.");
1521       break;
1522   }
1523 }
1524 
1525 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1526   switch(typ) {
1527     case T_INT:
1528       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1529       break;
1530     case T_FLOAT:
1531       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1532       break;
1533     case T_LONG:
1534       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1535       break;
1536     case T_DOUBLE:
1537       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1538       break;
1539     default:
1540       assert(false,"Should not reach here.");
1541       break;
1542   }
1543 }
1544 
1545 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1546   switch(typ) {
1547     case T_INT:
1548       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1549       break;
1550     case T_FLOAT:
1551       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1552       break;
1553     case T_LONG:
1554       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1555       break;
1556     case T_DOUBLE:
1557       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1558       break;
1559     default:
1560       assert(false,"Should not reach here.");
1561       break;
1562   }
1563 }
1564 
1565 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1566   switch(typ) {
1567     case T_INT:
1568       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1569       break;
1570     case T_FLOAT:
1571       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1572       break;
1573     case T_LONG:
1574       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1575       break;
1576     case T_DOUBLE:
1577       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1578       break;
1579     default:
1580       assert(false,"Should not reach here.");
1581       break;
1582   }
1583 }
1584 
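// Turn a vector of byte-sized booleans (0/1) in src into a vector mask in dst:
// each element becomes all-ones (0 - 1 == -1) or all-zeros, sign-extended to
// the width of elem_bt.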
1585 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1586   if (vlen_in_bytes <= 16) {
1587     pxor (dst, dst);
1588     psubb(dst, src);
1589     switch (elem_bt) {
1590       case T_BYTE:   /* nothing to do */ break;
1591       case T_SHORT:  pmovsxbw(dst, dst); break;
1592       case T_INT:    pmovsxbd(dst, dst); break;
1593       case T_FLOAT:  pmovsxbd(dst, dst); break;
1594       case T_LONG:   pmovsxbq(dst, dst); break;
1595       case T_DOUBLE: pmovsxbq(dst, dst); break;
1596 
1597       default: assert(false, "%s", type2name(elem_bt));
1598     }
1599   } else {
1600     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1601     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1602 
1603     vpxor (dst, dst, dst, vlen_enc);
1604     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1605 
1606     switch (elem_bt) {
1607       case T_BYTE:   /* nothing to do */            break;
1608       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1609       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1610       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1611       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1612       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1613 
1614       default: assert(false, "%s", type2name(elem_bt));
1615     }
1616   }
1617 }
1618 
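// Same as above, but the result is an opmask register. When the required
// AVX-512 VL/BW/DQ support is missing (novlbwdq), the booleans are widened to
// dwords and compared against the stub's mask-bit pattern instead of going
// through evpmovb2m.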
1619 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1620   if (novlbwdq) {
1621     vpmovsxbd(xtmp, src, vlen_enc);
1622     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1623             Assembler::eq, true, vlen_enc, noreg);
1624   } else {
1625     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1626     vpsubb(xtmp, xtmp, src, vlen_enc);
1627     evpmovb2m(dst, xtmp, vlen_enc);
1628   }
1629 }
1630 
1631 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) {
1632   switch (vlen_in_bytes) {
1633     case 4:  movdl(dst, src);   break;
1634     case 8:  movq(dst, src);    break;
1635     case 16: movdqu(dst, src);  break;
1636     case 32: vmovdqu(dst, src); break;
1637     case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1638     default: ShouldNotReachHere();
1639   }
1640 }
1641 
1642 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1643   assert(rscratch != noreg || always_reachable(src), "missing");
1644 
1645   if (reachable(src)) {
1646     load_vector(dst, as_Address(src), vlen_in_bytes);
1647   } else {
1648     lea(rscratch, src);
1649     load_vector(dst, Address(rscratch, 0), vlen_in_bytes);
1650   }
1651 }
1652 
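// Broadcast a scalar constant at src to every element of dst, choosing the
// broadcast instruction based on the element type and the available CPU
// features.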
1653 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1654   int vlen_enc = vector_length_encoding(vlen);
1655   if (VM_Version::supports_avx()) {
1656     if (bt == T_LONG) {
1657       if (VM_Version::supports_avx2()) {
1658         vpbroadcastq(dst, src, vlen_enc);
1659       } else {
1660         vmovddup(dst, src, vlen_enc);
1661       }
1662     } else if (bt == T_DOUBLE) {
1663       if (vlen_enc != Assembler::AVX_128bit) {
1664         vbroadcastsd(dst, src, vlen_enc, noreg);
1665       } else {
1666         vmovddup(dst, src, vlen_enc);
1667       }
1668     } else {
1669       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1670         vpbroadcastd(dst, src, vlen_enc);
1671       } else {
1672         vbroadcastss(dst, src, vlen_enc);
1673       }
1674     }
1675   } else if (VM_Version::supports_sse3()) {
1676     movddup(dst, src);
1677   } else {
1678     movq(dst, src);
1679     if (vlen == 16) {
1680       punpcklqdq(dst, dst);
1681     }
1682   }
1683 }
1684 
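// Load the first vlen_in_bytes bytes of the iota index table (0, 1, 2, ...)
// into dst.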
1685 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes) {
1686   ExternalAddress addr(StubRoutines::x86::vector_iota_indices());
1687   if (vlen_in_bytes <= 4) {
1688     movdl(dst, addr);
1689   } else if (vlen_in_bytes == 8) {
1690     movq(dst, addr);
1691   } else if (vlen_in_bytes == 16) {
1692     movdqu(dst, addr, noreg);
1693   } else if (vlen_in_bytes == 32) {
1694     vmovdqu(dst, addr, noreg);
1695   } else {
1696     assert(vlen_in_bytes == 64, "%d", vlen_in_bytes);
1697     evmovdqub(dst, k0, addr, false /*merge*/, Assembler::AVX_512bit, noreg);
1698   }
1699 }
1700 
1701 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1702 
1703 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1704   int vector_len = Assembler::AVX_128bit;
1705 
1706   switch (opcode) {
1707     case Op_AndReductionV:  pand(dst, src); break;
1708     case Op_OrReductionV:   por (dst, src); break;
1709     case Op_XorReductionV:  pxor(dst, src); break;
1710     case Op_MinReductionV:
1711       switch (typ) {
1712         case T_BYTE:        pminsb(dst, src); break;
1713         case T_SHORT:       pminsw(dst, src); break;
1714         case T_INT:         pminsd(dst, src); break;
1715         case T_LONG:        assert(UseAVX > 2, "required");
1716                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1717         default:            assert(false, "wrong type");
1718       }
1719       break;
1720     case Op_MaxReductionV:
1721       switch (typ) {
1722         case T_BYTE:        pmaxsb(dst, src); break;
1723         case T_SHORT:       pmaxsw(dst, src); break;
1724         case T_INT:         pmaxsd(dst, src); break;
1725         case T_LONG:        assert(UseAVX > 2, "required");
1726                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1727         default:            assert(false, "wrong type");
1728       }
1729       break;
1730     case Op_AddReductionVF: addss(dst, src); break;
1731     case Op_AddReductionVD: addsd(dst, src); break;
1732     case Op_AddReductionVI:
1733       switch (typ) {
1734         case T_BYTE:        paddb(dst, src); break;
1735         case T_SHORT:       paddw(dst, src); break;
1736         case T_INT:         paddd(dst, src); break;
1737         default:            assert(false, "wrong type");
1738       }
1739       break;
1740     case Op_AddReductionVL: paddq(dst, src); break;
1741     case Op_MulReductionVF: mulss(dst, src); break;
1742     case Op_MulReductionVD: mulsd(dst, src); break;
1743     case Op_MulReductionVI:
1744       switch (typ) {
1745         case T_SHORT:       pmullw(dst, src); break;
1746         case T_INT:         pmulld(dst, src); break;
1747         default:            assert(false, "wrong type");
1748       }
1749       break;
1750     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1751                             vpmullq(dst, dst, src, vector_len); break;
1752     default:                assert(false, "wrong opcode");
1753   }
1754 }
1755 
1756 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1757   int vector_len = Assembler::AVX_256bit;
1758 
1759   switch (opcode) {
1760     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1761     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1762     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1763     case Op_MinReductionV:
1764       switch (typ) {
1765         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1766         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1767         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1768         case T_LONG:        assert(UseAVX > 2, "required");
1769                             vpminsq(dst, src1, src2, vector_len); break;
1770         default:            assert(false, "wrong type");
1771       }
1772       break;
1773     case Op_MaxReductionV:
1774       switch (typ) {
1775         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1776         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1777         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1778         case T_LONG:        assert(UseAVX > 2, "required");
1779                             vpmaxsq(dst, src1, src2, vector_len); break;
1780         default:            assert(false, "wrong type");
1781       }
1782       break;
1783     case Op_AddReductionVI:
1784       switch (typ) {
1785         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1786         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1787         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1788         default:            assert(false, "wrong type");
1789       }
1790       break;
1791     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1792     case Op_MulReductionVI:
1793       switch (typ) {
1794         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1795         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1796         default:            assert(false, "wrong type");
1797       }
1798       break;
1799     case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;
1800     default:                assert(false, "wrong opcode");
1801   }
1802 }
1803 
1804 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1805                                   XMMRegister dst, XMMRegister src,
1806                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1807   switch (opcode) {
1808     case Op_AddReductionVF:
1809     case Op_MulReductionVF:
1810       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1811       break;
1812 
1813     case Op_AddReductionVD:
1814     case Op_MulReductionVD:
1815       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1816       break;
1817 
1818     default: assert(false, "wrong opcode");
1819   }
1820 }
1821 
1822 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1823                              Register dst, Register src1, XMMRegister src2,
1824                              XMMRegister vtmp1, XMMRegister vtmp2) {
1825   switch (vlen) {
1826     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1827     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1828     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1829     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1830 
1831     default: assert(false, "wrong vector length");
1832   }
1833 }
1834 
1835 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1836                              Register dst, Register src1, XMMRegister src2,
1837                              XMMRegister vtmp1, XMMRegister vtmp2) {
1838   switch (vlen) {
1839     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1840     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1841     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1842     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1843 
1844     default: assert(false, "wrong vector length");
1845   }
1846 }
1847 
1848 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1849                              Register dst, Register src1, XMMRegister src2,
1850                              XMMRegister vtmp1, XMMRegister vtmp2) {
1851   switch (vlen) {
1852     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1853     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1854     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1855     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1856 
1857     default: assert(false, "wrong vector length");
1858   }
1859 }
1860 
1861 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1862                              Register dst, Register src1, XMMRegister src2,
1863                              XMMRegister vtmp1, XMMRegister vtmp2) {
1864   switch (vlen) {
1865     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1866     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1867     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1868     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1869 
1870     default: assert(false, "wrong vector length");
1871   }
1872 }
1873 
1874 #ifdef _LP64
1875 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1876                              Register dst, Register src1, XMMRegister src2,
1877                              XMMRegister vtmp1, XMMRegister vtmp2) {
1878   switch (vlen) {
1879     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1880     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1881     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1882 
1883     default: assert(false, "wrong vector length");
1884   }
1885 }
1886 #endif // _LP64
1887 
1888 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1889   switch (vlen) {
1890     case 2:
1891       assert(vtmp2 == xnoreg, "");
1892       reduce2F(opcode, dst, src, vtmp1);
1893       break;
1894     case 4:
1895       assert(vtmp2 == xnoreg, "");
1896       reduce4F(opcode, dst, src, vtmp1);
1897       break;
1898     case 8:
1899       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1900       break;
1901     case 16:
1902       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1903       break;
1904     default: assert(false, "wrong vector length");
1905   }
1906 }
1907 
1908 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1909   switch (vlen) {
1910     case 2:
1911       assert(vtmp2 == xnoreg, "");
1912       reduce2D(opcode, dst, src, vtmp1);
1913       break;
1914     case 4:
1915       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1916       break;
1917     case 8:
1918       reduce8D(opcode, dst, src, vtmp1, vtmp2);
1919       break;
1920     default: assert(false, "wrong vector length");
1921   }
1922 }
1923 
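// Combine the two int lanes of src2 (horizontal add for Op_AddReductionVI,
// shuffle + op otherwise), fold in the scalar src1, and leave the result in dst.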
1924 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1925   if (opcode == Op_AddReductionVI) {
1926     if (vtmp1 != src2) {
1927       movdqu(vtmp1, src2);
1928     }
1929     phaddd(vtmp1, vtmp1);
1930   } else {
1931     pshufd(vtmp1, src2, 0x1);
1932     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1933   }
1934   movdl(vtmp2, src1);
1935   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1936   movdl(dst, vtmp1);
1937 }
1938 
1939 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1940   if (opcode == Op_AddReductionVI) {
1941     if (vtmp1 != src2) {
1942       movdqu(vtmp1, src2);
1943     }
1944     phaddd(vtmp1, src2);
1945     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1946   } else {
1947     pshufd(vtmp2, src2, 0xE);
1948     reduce_operation_128(T_INT, opcode, vtmp2, src2);
1949     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1950   }
1951 }
1952 
1953 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1954   if (opcode == Op_AddReductionVI) {
1955     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
1956     vextracti128_high(vtmp2, vtmp1);
1957     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
1958     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1959   } else {
1960     vextracti128_high(vtmp1, src2);
1961     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1962     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1963   }
1964 }
1965 
1966 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1967   vextracti64x4_high(vtmp2, src2);
1968   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
1969   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1970 }
1971 
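// Byte reductions: fold src2 in halves down to a single byte, then sign-extend
// to int so the scalar src1 can be folded in; the result is sign-extended into dst.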
1972 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1973   pshufd(vtmp2, src2, 0x1);
1974   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1975   movdqu(vtmp1, vtmp2);
1976   psrldq(vtmp1, 2);
1977   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1978   movdqu(vtmp2, vtmp1);
1979   psrldq(vtmp2, 1);
1980   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1981   movdl(vtmp2, src1);
1982   pmovsxbd(vtmp1, vtmp1);
1983   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1984   pextrb(dst, vtmp1, 0x0);
1985   movsbl(dst, dst);
1986 }
1987 
1988 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1989   pshufd(vtmp1, src2, 0xE);
1990   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
1991   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1992 }
1993 
1994 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1995   vextracti128_high(vtmp2, src2);
1996   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1997   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1998 }
1999 
2000 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2001   vextracti64x4_high(vtmp1, src2);
2002   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2003   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2004 }
2005 
2006 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2007   pmovsxbw(vtmp2, src2);
2008   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2009 }
2010 
2011 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2012   if (UseAVX > 1) {
2013     int vector_len = Assembler::AVX_256bit;
2014     vpmovsxbw(vtmp1, src2, vector_len);
2015     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2016   } else {
2017     pmovsxbw(vtmp2, src2);
2018     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
    pshufd(vtmp2, src2, 0xE);  // bring the upper 8 bytes down
    pmovsxbw(vtmp2, vtmp2);    // and widen them for the second pass
2021     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2022   }
2023 }
2024 
2025 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2026   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2027     int vector_len = Assembler::AVX_512bit;
2028     vpmovsxbw(vtmp1, src2, vector_len);
2029     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2030   } else {
2031     assert(UseAVX >= 2,"Should not reach here.");
2032     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2033     vextracti128_high(vtmp2, src2);
2034     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2035   }
2036 }
2037 
2038 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2039   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2040   vextracti64x4_high(vtmp2, src2);
2041   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2042 }
2043 
2044 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2045   if (opcode == Op_AddReductionVI) {
2046     if (vtmp1 != src2) {
2047       movdqu(vtmp1, src2);
2048     }
2049     phaddw(vtmp1, vtmp1);
2050     phaddw(vtmp1, vtmp1);
2051   } else {
2052     pshufd(vtmp2, src2, 0x1);
2053     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2054     movdqu(vtmp1, vtmp2);
2055     psrldq(vtmp1, 2);
2056     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2057   }
2058   movdl(vtmp2, src1);
2059   pmovsxwd(vtmp1, vtmp1);
2060   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2061   pextrw(dst, vtmp1, 0x0);
2062   movswl(dst, dst);
2063 }
2064 
2065 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2066   if (opcode == Op_AddReductionVI) {
2067     if (vtmp1 != src2) {
2068       movdqu(vtmp1, src2);
2069     }
2070     phaddw(vtmp1, src2);
2071   } else {
2072     pshufd(vtmp1, src2, 0xE);
2073     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2074   }
2075   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2076 }
2077 
2078 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2079   if (opcode == Op_AddReductionVI) {
2080     int vector_len = Assembler::AVX_256bit;
2081     vphaddw(vtmp2, src2, src2, vector_len);
2082     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2083   } else {
2084     vextracti128_high(vtmp2, src2);
2085     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2086   }
2087   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2088 }
2089 
2090 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2091   int vector_len = Assembler::AVX_256bit;
2092   vextracti64x4_high(vtmp1, src2);
2093   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2094   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2095 }
2096 
2097 #ifdef _LP64
2098 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2099   pshufd(vtmp2, src2, 0xE);
2100   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2101   movdq(vtmp1, src1);
2102   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2103   movdq(dst, vtmp1);
2104 }
2105 
2106 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2107   vextracti128_high(vtmp1, src2);
2108   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2109   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2110 }
2111 
2112 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2113   vextracti64x4_high(vtmp2, src2);
2114   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2115   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2116 }
2117 
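// Build a mask in dst with the low 'len' bits set: temp = -1, then BZHI clears
// every bit at position >= len.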
2118 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
2119   mov64(temp, -1L);
2120   bzhiq(temp, temp, len);
2121   kmovql(dst, temp);
2122 }
2123 #endif // _LP64
2124 
2125 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2126   reduce_operation_128(T_FLOAT, opcode, dst, src);
2127   pshufd(vtmp, src, 0x1);
2128   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2129 }
2130 
2131 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2132   reduce2F(opcode, dst, src, vtmp);
2133   pshufd(vtmp, src, 0x2);
2134   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2135   pshufd(vtmp, src, 0x3);
2136   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2137 }
2138 
2139 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2140   reduce4F(opcode, dst, src, vtmp2);
2141   vextractf128_high(vtmp2, src);
2142   reduce4F(opcode, dst, vtmp2, vtmp1);
2143 }
2144 
2145 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2146   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2147   vextracti64x4_high(vtmp1, src);
2148   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2149 }
2150 
2151 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2152   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2153   pshufd(vtmp, src, 0xE);
2154   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2155 }
2156 
2157 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2158   reduce2D(opcode, dst, src, vtmp2);
2159   vextractf128_high(vtmp2, src);
2160   reduce2D(opcode, dst, vtmp2, vtmp1);
2161 }
2162 
2163 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2164   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2165   vextracti64x4_high(vtmp1, src);
2166   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2167 }
2168 
2169 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2170   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2171 }
2172 
2173 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2174   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2175 }
2176 
2177 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2178                                  int vec_enc) {
2179   switch(elem_bt) {
2180     case T_INT:
2181     case T_FLOAT:
2182       vmaskmovps(dst, src, mask, vec_enc);
2183       break;
2184     case T_LONG:
2185     case T_DOUBLE:
2186       vmaskmovpd(dst, src, mask, vec_enc);
2187       break;
2188     default:
2189       fatal("Unsupported type %s", type2name(elem_bt));
2190       break;
2191   }
2192 }
2193 
2194 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2195                                  int vec_enc) {
2196   switch(elem_bt) {
2197     case T_INT:
2198     case T_FLOAT:
2199       vmaskmovps(dst, src, mask, vec_enc);
2200       break;
2201     case T_LONG:
2202     case T_DOUBLE:
2203       vmaskmovpd(dst, src, mask, vec_enc);
2204       break;
2205     default:
2206       fatal("Unsupported type %s", type2name(elem_bt));
2207       break;
2208   }
2209 }
2210 
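// Min/max reduction over float lanes: log2(vlen) folding steps, each bringing
// the upper half (or a permuted neighbor) of the working vector alongside and
// combining it via vminmax_fp. If is_dst_valid, the incoming value in dst is
// folded into the result as a final step.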
2211 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2212                                           XMMRegister dst, XMMRegister src,
2213                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2214                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2215   int permconst[] = {1, 14};
2216   XMMRegister wsrc = src;
2217   XMMRegister wdst = xmm_0;
2218   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2219 
2220   int vlen_enc = Assembler::AVX_128bit;
2221   if (vlen == 16) {
2222     vlen_enc = Assembler::AVX_256bit;
2223   }
2224 
2225   for (int i = log2(vlen) - 1; i >=0; i--) {
2226     if (i == 0 && !is_dst_valid) {
2227       wdst = dst;
2228     }
2229     if (i == 3) {
2230       vextracti64x4_high(wtmp, wsrc);
2231     } else if (i == 2) {
2232       vextracti128_high(wtmp, wsrc);
2233     } else { // i = [0,1]
2234       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2235     }
2236     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2237     wsrc = wdst;
2238     vlen_enc = Assembler::AVX_128bit;
2239   }
2240   if (is_dst_valid) {
2241     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2242   }
2243 }
2244 
2245 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2246                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2247                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2248   XMMRegister wsrc = src;
2249   XMMRegister wdst = xmm_0;
2250   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2251   int vlen_enc = Assembler::AVX_128bit;
2252   if (vlen == 8) {
2253     vlen_enc = Assembler::AVX_256bit;
2254   }
2255   for (int i = log2(vlen) - 1; i >=0; i--) {
2256     if (i == 0 && !is_dst_valid) {
2257       wdst = dst;
2258     }
2259     if (i == 1) {
2260       vextracti128_high(wtmp, wsrc);
2261     } else if (i == 2) {
2262       vextracti64x4_high(wtmp, wsrc);
2263     } else {
2264       assert(i == 0, "%d", i);
2265       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2266     }
2267     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2268     wsrc = wdst;
2269     vlen_enc = Assembler::AVX_128bit;
2270   }
2271   if (is_dst_valid) {
2272     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2273   }
2274 }
2275 
2276 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2277   switch (bt) {
2278     case T_BYTE:  pextrb(dst, src, idx); break;
2279     case T_SHORT: pextrw(dst, src, idx); break;
2280     case T_INT:   pextrd(dst, src, idx); break;
2281     case T_LONG:  pextrq(dst, src, idx); break;
2282 
2283     default:
2284       assert(false,"Should not reach here.");
2285       break;
2286   }
2287 }
2288 
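// Return the register holding the 128-bit lane that contains elemindex:
// lane 0 is src itself, higher lanes are extracted into dst first.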
2289 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2290   int esize =  type2aelembytes(typ);
2291   int elem_per_lane = 16/esize;
2292   int lane = elemindex / elem_per_lane;
2293   int eindex = elemindex % elem_per_lane;
2294 
2295   if (lane >= 2) {
2296     assert(UseAVX > 2, "required");
2297     vextractf32x4(dst, src, lane & 3);
2298     return dst;
2299   } else if (lane > 0) {
2300     assert(UseAVX > 0, "required");
2301     vextractf128(dst, src, lane);
2302     return dst;
2303   } else {
2304     return src;
2305   }
2306 }
2307 
2308 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2309   int esize =  type2aelembytes(typ);
2310   int elem_per_lane = 16/esize;
2311   int eindex = elemindex % elem_per_lane;
2312   assert(is_integral_type(typ),"required");
2313 
2314   if (eindex == 0) {
2315     if (typ == T_LONG) {
2316       movq(dst, src);
2317     } else {
2318       movdl(dst, src);
2319       if (typ == T_BYTE)
2320         movsbl(dst, dst);
2321       else if (typ == T_SHORT)
2322         movswl(dst, dst);
2323     }
2324   } else {
2325     extract(typ, dst, src, eindex);
2326   }
2327 }
2328 
2329 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2330   int esize =  type2aelembytes(typ);
2331   int elem_per_lane = 16/esize;
2332   int eindex = elemindex % elem_per_lane;
2333   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2334 
2335   if (eindex == 0) {
2336     movq(dst, src);
2337   } else {
2338     if (typ == T_FLOAT) {
2339       if (UseAVX == 0) {
2340         movdqu(dst, src);
2341         shufps(dst, dst, eindex);
2342       } else {
2343         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2344       }
2345     } else {
2346       if (UseAVX == 0) {
2347         movdqu(dst, src);
2348         psrldq(dst, eindex*esize);
2349       } else {
2350         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2351       }
2352       movq(dst, dst);
2353     }
2354   }
2355   // Zero upper bits
2356   if (typ == T_FLOAT) {
2357     if (UseAVX == 0) {
2358       assert(vtmp != xnoreg, "required.");
2359       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2360       pand(dst, vtmp);
2361     } else {
2362       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2363     }
2364   }
2365 }
2366 
2367 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2368   switch(typ) {
2369     case T_BYTE:
2370     case T_BOOLEAN:
2371       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2372       break;
2373     case T_SHORT:
2374     case T_CHAR:
2375       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2376       break;
2377     case T_INT:
2378     case T_FLOAT:
2379       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2380       break;
2381     case T_LONG:
2382     case T_DOUBLE:
2383       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2384       break;
2385     default:
2386       assert(false,"Should not reach here.");
2387       break;
2388   }
2389 }
2390 
2391 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2392   assert(rscratch != noreg || always_reachable(src2), "missing");
2393 
2394   switch(typ) {
2395     case T_BOOLEAN:
2396     case T_BYTE:
2397       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2398       break;
2399     case T_CHAR:
2400     case T_SHORT:
2401       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2402       break;
2403     case T_INT:
2404     case T_FLOAT:
2405       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2406       break;
2407     case T_LONG:
2408     case T_DOUBLE:
2409       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2410       break;
2411     default:
2412       assert(false,"Should not reach here.");
2413       break;
2414   }
2415 }
2416 
2417 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2418   switch(typ) {
2419     case T_BYTE:
2420       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2421       break;
2422     case T_SHORT:
2423       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2424       break;
2425     case T_INT:
2426     case T_FLOAT:
2427       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2428       break;
2429     case T_LONG:
2430     case T_DOUBLE:
2431       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2432       break;
2433     default:
2434       assert(false,"Should not reach here.");
2435       break;
2436   }
2437 }
2438 
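// Vector test for BoolTest::ne/overflow. Vectors shorter than 16 bytes have
// their low elements replicated across the 128-bit lane so stale upper bits
// cannot affect ptest; 512-bit vectors are compared with evpcmpeqb and tested
// through an opmask register.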
2439 void C2_MacroAssembler::vectortest(int bt, int vlen, XMMRegister src1, XMMRegister src2,
2440                                    XMMRegister vtmp1, XMMRegister vtmp2, KRegister mask) {
2441   switch(vlen) {
2442     case 4:
2443       assert(vtmp1 != xnoreg, "required.");
2444       // Broadcast lower 32 bits to 128 bits before ptest
2445       pshufd(vtmp1, src1, 0x0);
2446       if (bt == BoolTest::overflow) {
2447         assert(vtmp2 != xnoreg, "required.");
2448         pshufd(vtmp2, src2, 0x0);
2449       } else {
2450         assert(vtmp2 == xnoreg, "required.");
2451         vtmp2 = src2;
2452       }
2453       ptest(vtmp1, vtmp2);
2454      break;
2455     case 8:
2456       assert(vtmp1 != xnoreg, "required.");
2457       // Broadcast lower 64 bits to 128 bits before ptest
2458       pshufd(vtmp1, src1, 0x4);
2459       if (bt == BoolTest::overflow) {
2460         assert(vtmp2 != xnoreg, "required.");
2461         pshufd(vtmp2, src2, 0x4);
2462       } else {
2463         assert(vtmp2 == xnoreg, "required.");
2464         vtmp2 = src2;
2465       }
2466       ptest(vtmp1, vtmp2);
2467      break;
2468     case 16:
2469       assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2470       ptest(src1, src2);
2471       break;
2472     case 32:
2473       assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2474       vptest(src1, src2, Assembler::AVX_256bit);
2475       break;
2476     case 64:
2477       {
2478         assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2479         evpcmpeqb(mask, src1, src2, Assembler::AVX_512bit);
2480         if (bt == BoolTest::ne) {
2481           ktestql(mask, mask);
2482         } else {
2483           assert(bt == BoolTest::overflow, "required");
2484           kortestql(mask, mask);
2485         }
2486       }
2487       break;
2488     default:
2489       assert(false,"Should not reach here.");
2490       break;
2491   }
2492 }
2493 
2494 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2495   assert(UseAVX >= 2, "required");
2496 #ifdef ASSERT
2497   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2498   bool is_bw_supported = VM_Version::supports_avx512bw();
2499   if (is_bw && !is_bw_supported) {
2500     assert(vlen_enc != Assembler::AVX_512bit, "required");
2501     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2502            "XMM register should be 0-15");
2503   }
2504 #endif // ASSERT
2505   switch (elem_bt) {
2506     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2507     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2508     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2509     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2510     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2511     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2512     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2513   }
2514 }
2515 
2516 #ifdef _LP64
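// Broadcast the scalar in GPR src to every element of dst. With AVX-512 (and
// the required BW/VL support) the GPR-source EVEX broadcasts are used directly;
// otherwise the value is first moved into an XMM register and broadcast from there.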
2517 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2518   assert(UseAVX >= 2, "required");
2519   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2520   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2521   if ((UseAVX > 2) &&
2522       (!is_bw || VM_Version::supports_avx512bw()) &&
2523       (!is_vl || VM_Version::supports_avx512vl())) {
2524     switch (elem_bt) {
2525       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2526       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2527       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2528       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2529       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2530     }
2531   } else {
2532     assert(vlen_enc != Assembler::AVX_512bit, "required");
2533     assert((dst->encoding() < 16),"XMM register should be 0-15");
2534     switch (elem_bt) {
2535       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2536       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2537       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2538       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2539       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2540       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2541       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2542     }
2543   }
2544 }
2545 #endif
2546 
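// Sign-extend byte vector elements to the requested element type, converting
// through int for T_FLOAT/T_DOUBLE.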
2547 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2548   switch (to_elem_bt) {
2549     case T_SHORT:
2550       vpmovsxbw(dst, src, vlen_enc);
2551       break;
2552     case T_INT:
2553       vpmovsxbd(dst, src, vlen_enc);
2554       break;
2555     case T_FLOAT:
2556       vpmovsxbd(dst, src, vlen_enc);
2557       vcvtdq2ps(dst, dst, vlen_enc);
2558       break;
2559     case T_LONG:
2560       vpmovsxbq(dst, src, vlen_enc);
2561       break;
2562     case T_DOUBLE: {
2563       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2564       vpmovsxbd(dst, src, mid_vlen_enc);
2565       vcvtdq2pd(dst, dst, vlen_enc);
2566       break;
2567     }
2568     default:
2569       fatal("Unsupported type %s", type2name(to_elem_bt));
2570       break;
2571   }
2572 }
2573 
2574 //-------------------------------------------------------------------------------------------
2575 
2576 // IndexOf for constant substrings with size >= 8 chars
2577 // which don't need to be loaded through stack.
2578 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2579                                          Register cnt1, Register cnt2,
2580                                          int int_cnt2,  Register result,
2581                                          XMMRegister vec, Register tmp,
2582                                          int ae) {
2583   ShortBranchVerifier sbv(this);
2584   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2585   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2586 
2587   // This method uses the pcmpestri instruction with bound registers
2588   //   inputs:
2589   //     xmm - substring
2590   //     rax - substring length (elements count)
2591   //     mem - scanned string
2592   //     rdx - string length (elements count)
2593   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2594   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2595   //   outputs:
2596   //     rcx - matched index in string
2597   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2598   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2599   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2600   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2601   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2602 
2603   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2604         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2605         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2606 
2607   // Note, inline_string_indexOf() generates checks:
2608   // if (substr.count > string.count) return -1;
2609   // if (substr.count == 0) return 0;
2610   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2611 
2612   // Load substring.
2613   if (ae == StrIntrinsicNode::UL) {
2614     pmovzxbw(vec, Address(str2, 0));
2615   } else {
2616     movdqu(vec, Address(str2, 0));
2617   }
2618   movl(cnt2, int_cnt2);
2619   movptr(result, str1); // string addr
2620 
2621   if (int_cnt2 > stride) {
2622     jmpb(SCAN_TO_SUBSTR);
2623 
2624     // Reload substr for rescan, this code
2625     // is executed only for large substrings (> 8 chars)
2626     bind(RELOAD_SUBSTR);
2627     if (ae == StrIntrinsicNode::UL) {
2628       pmovzxbw(vec, Address(str2, 0));
2629     } else {
2630       movdqu(vec, Address(str2, 0));
2631     }
2632     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2633 
2634     bind(RELOAD_STR);
2635     // We came here after the beginning of the substring was
    // matched but the rest of it was not, so we need to search
2637     // again. Start from the next element after the previous match.
2638 
    // cnt2 is the number of remaining substring elements and
    // cnt1 is the number of remaining string elements when the compare failed.
2641     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2642     subl(cnt1, cnt2);
2643     addl(cnt1, int_cnt2);
2644     movl(cnt2, int_cnt2); // Now restore cnt2
2645 
2646     decrementl(cnt1);     // Shift to next element
2647     cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2649 
2650     addptr(result, (1<<scale1));
2651 
2652   } // (int_cnt2 > 8)
2653 
2654   // Scan string for start of substr in 16-byte vectors
2655   bind(SCAN_TO_SUBSTR);
2656   pcmpestri(vec, Address(result, 0), mode);
2657   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2658   subl(cnt1, stride);
2659   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2660   cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2662   addptr(result, 16);
2663   jmpb(SCAN_TO_SUBSTR);
2664 
2665   // Found a potential substr
2666   bind(FOUND_CANDIDATE);
2667   // Matched whole vector if first element matched (tmp(rcx) == 0).
2668   if (int_cnt2 == stride) {
2669     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2670   } else { // int_cnt2 > 8
2671     jccb(Assembler::overflow, FOUND_SUBSTR);
2672   }
2673   // After pcmpestri tmp(rcx) contains matched element index
2674   // Compute start addr of substr
2675   lea(result, Address(result, tmp, scale1));
2676 
2677   // Make sure string is still long enough
2678   subl(cnt1, tmp);
2679   cmpl(cnt1, cnt2);
2680   if (int_cnt2 == stride) {
2681     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2682   } else { // int_cnt2 > 8
2683     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2684   }
2685   // Left less than substring.
2686 
2687   bind(RET_NOT_FOUND);
2688   movl(result, -1);
2689   jmp(EXIT);
2690 
2691   if (int_cnt2 > stride) {
2692     // This code is optimized for the case when whole substring
2693     // is matched if its head is matched.
2694     bind(MATCH_SUBSTR_HEAD);
2695     pcmpestri(vec, Address(result, 0), mode);
2696     // Reload only the string if it does not match
2697     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2698 
2699     Label CONT_SCAN_SUBSTR;
2700     // Compare the rest of substring (> 8 chars).
2701     bind(FOUND_SUBSTR);
2702     // First 8 chars are already matched.
2703     negptr(cnt2);
2704     addptr(cnt2, stride);
2705 
2706     bind(SCAN_SUBSTR);
2707     subl(cnt1, stride);
2708     cmpl(cnt2, -stride); // Do not read beyond substring
2709     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2710     // Back-up strings to avoid reading beyond substring:
2711     // cnt1 = cnt1 - cnt2 + 8
2712     addl(cnt1, cnt2); // cnt2 is negative
2713     addl(cnt1, stride);
2714     movl(cnt2, stride); negptr(cnt2);
2715     bind(CONT_SCAN_SUBSTR);
2716     if (int_cnt2 < (int)G) {
2717       int tail_off1 = int_cnt2<<scale1;
2718       int tail_off2 = int_cnt2<<scale2;
2719       if (ae == StrIntrinsicNode::UL) {
2720         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2721       } else {
2722         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2723       }
2724       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2725     } else {
2726       // calculate index in register to avoid integer overflow (int_cnt2*2)
2727       movl(tmp, int_cnt2);
2728       addptr(tmp, cnt2);
2729       if (ae == StrIntrinsicNode::UL) {
2730         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2731       } else {
2732         movdqu(vec, Address(str2, tmp, scale2, 0));
2733       }
2734       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2735     }
2736     // Need to reload the string pointers if the whole vector did not match
2737     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2738     addptr(cnt2, stride);
2739     jcc(Assembler::negative, SCAN_SUBSTR);
2740     // Fall through if found full substring
2741 
2742   } // (int_cnt2 > 8)
2743 
2744   bind(RET_FOUND);
2745   // Found result if we matched full small substring.
2746   // Compute substr offset
2747   subptr(result, str1);
2748   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2749     shrl(result, 1); // index
2750   }
2751   bind(EXIT);
2752 
2753 } // string_indexofC8
2754 
2755 // Small strings are loaded through the stack if they cross a page boundary.
2756 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2757                                        Register cnt1, Register cnt2,
2758                                        int int_cnt2,  Register result,
2759                                        XMMRegister vec, Register tmp,
2760                                        int ae) {
2761   ShortBranchVerifier sbv(this);
2762   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2763   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2764 
2765   //
2766   // int_cnt2 is the length of a small (< 8 chars) constant substring
2767   // or (-1) for a non-constant substring, in which case its length
2768   // is in the cnt2 register.
2769   //
2770   // Note, inline_string_indexOf() generates checks:
2771   // if (substr.count > string.count) return -1;
2772   // if (substr.count == 0) return 0;
2773   //
2774   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2775   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2776   // This method uses the pcmpestri instruction with bound registers
2777   //   inputs:
2778   //     xmm - substring
2779   //     rax - substring length (elements count)
2780   //     mem - scanned string
2781   //     rdx - string length (elements count)
2782   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2783   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2784   //   outputs:
2785   //     rcx - matched index in string
2786   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2787   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2788   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2789   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2790 
2791   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2792         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2793         FOUND_CANDIDATE;
2794 
2795   { //========================================================
2796     // We don't know where these strings are located
2797     // and we can't read beyond them. Load them through the stack.
2798     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2799 
2800     movptr(tmp, rsp); // save old SP
2801 
2802     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2803       if (int_cnt2 == (1>>scale2)) { // One byte
2804         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2805         load_unsigned_byte(result, Address(str2, 0));
2806         movdl(vec, result); // move 32 bits
2807       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2808         // Not enough header space in 32-bit VM: 12+3 = 15.
2809         movl(result, Address(str2, -1));
2810         shrl(result, 8);
2811         movdl(vec, result); // move 32 bits
2812       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2813         load_unsigned_short(result, Address(str2, 0));
2814         movdl(vec, result); // move 32 bits
2815       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2816         movdl(vec, Address(str2, 0)); // move 32 bits
2817       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2818         movq(vec, Address(str2, 0));  // move 64 bits
2819       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
2820         // Array header size is 12 bytes in 32-bit VM
2821         // + 6 bytes for 3 chars == 18 bytes,
2822         // enough space to load vec and shift.
2823         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2824         if (ae == StrIntrinsicNode::UL) {
2825           int tail_off = int_cnt2-8;
2826           pmovzxbw(vec, Address(str2, tail_off));
2827           psrldq(vec, -2*tail_off);
2828         }
2829         else {
2830           int tail_off = int_cnt2*(1<<scale2);
2831           movdqu(vec, Address(str2, tail_off-16));
2832           psrldq(vec, 16-tail_off);
2833         }
2834       }
2835     } else { // not constant substring
2836       cmpl(cnt2, stride);
2837       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2838 
2839       // We can read beyond the string if str+16 does not cross a page boundary
2840       // since heaps are aligned and mapped by pages.
2841       assert(os::vm_page_size() < (int)G, "default page should be small");
2842       movl(result, str2); // We need only low 32 bits
2843       andl(result, (os::vm_page_size()-1));
2844       cmpl(result, (os::vm_page_size()-16));
2845       jccb(Assembler::belowEqual, CHECK_STR);
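      // A worked example of the check above (assuming a 4096-byte page):
      // (str2 & 0xFFF) <= 0xFF0 means str2's page offset is at most
      // page_size - 16, so the full 16-byte vector load from str2 stays
      // within str2's page and cannot fault, even if it reads past the
      // end of the substring. Otherwise the substring is copied to the
      // stack below.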
2846 
2847       // Move small strings to the stack to allow loading 16 bytes into vec.
2848       subptr(rsp, 16);
2849       int stk_offset = wordSize-(1<<scale2);
2850       push(cnt2);
2851 
2852       bind(COPY_SUBSTR);
2853       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2854         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2855         movb(Address(rsp, cnt2, scale2, stk_offset), result);
2856       } else if (ae == StrIntrinsicNode::UU) {
2857         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2858         movw(Address(rsp, cnt2, scale2, stk_offset), result);
2859       }
2860       decrement(cnt2);
2861       jccb(Assembler::notZero, COPY_SUBSTR);
2862 
2863       pop(cnt2);
2864       movptr(str2, rsp);  // New substring address
2865     } // non constant
2866 
2867     bind(CHECK_STR);
2868     cmpl(cnt1, stride);
2869     jccb(Assembler::aboveEqual, BIG_STRINGS);
2870 
2871     // Check cross page boundary.
2872     movl(result, str1); // We need only low 32 bits
2873     andl(result, (os::vm_page_size()-1));
2874     cmpl(result, (os::vm_page_size()-16));
2875     jccb(Assembler::belowEqual, BIG_STRINGS);
2876 
2877     subptr(rsp, 16);
2878     int stk_offset = -(1<<scale1);
2879     if (int_cnt2 < 0) { // not constant
2880       push(cnt2);
2881       stk_offset += wordSize;
2882     }
2883     movl(cnt2, cnt1);
2884 
2885     bind(COPY_STR);
2886     if (ae == StrIntrinsicNode::LL) {
2887       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
2888       movb(Address(rsp, cnt2, scale1, stk_offset), result);
2889     } else {
2890       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
2891       movw(Address(rsp, cnt2, scale1, stk_offset), result);
2892     }
2893     decrement(cnt2);
2894     jccb(Assembler::notZero, COPY_STR);
2895 
2896     if (int_cnt2 < 0) { // not constant
2897       pop(cnt2);
2898     }
2899     movptr(str1, rsp);  // New string address
2900 
2901     bind(BIG_STRINGS);
2902     // Load substring.
2903     if (int_cnt2 < 0) { // -1
2904       if (ae == StrIntrinsicNode::UL) {
2905         pmovzxbw(vec, Address(str2, 0));
2906       } else {
2907         movdqu(vec, Address(str2, 0));
2908       }
2909       push(cnt2);       // substr count
2910       push(str2);       // substr addr
2911       push(str1);       // string addr
2912     } else {
2913       // Small (< 8 chars) constant substrings are loaded already.
2914       movl(cnt2, int_cnt2);
2915     }
2916     push(tmp);  // original SP
2917 
2918   } // Finished loading
2919 
2920   //========================================================
2921   // Start search
2922   //
2923 
2924   movptr(result, str1); // string addr
2925 
2926   if (int_cnt2 < 0) {  // Only for a non-constant substring
2927     jmpb(SCAN_TO_SUBSTR);
2928 
2929     // SP saved at sp+0
2930     // String saved at sp+1*wordSize
2931     // Substr saved at sp+2*wordSize
2932     // Substr count saved at sp+3*wordSize
2933 
2934     // Reload substr for rescan; this code
2935     // is executed only for large substrings (> 8 chars).
2936     bind(RELOAD_SUBSTR);
2937     movptr(str2, Address(rsp, 2*wordSize));
2938     movl(cnt2, Address(rsp, 3*wordSize));
2939     if (ae == StrIntrinsicNode::UL) {
2940       pmovzxbw(vec, Address(str2, 0));
2941     } else {
2942       movdqu(vec, Address(str2, 0));
2943     }
2944     // We came here after the beginning of the substring was
2945     // matched but the rest of it was not, so we need to search
2946     // again. Start from the next element after the previous match.
2947     subptr(str1, result); // Restore counter
2948     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2949       shrl(str1, 1);
2950     }
2951     addl(cnt1, str1);
2952     decrementl(cnt1);   // Shift to next element
2953     cmpl(cnt1, cnt2);
2954     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2955 
2956     addptr(result, (1<<scale1));
2957   } // non constant
2958 
2959   // Scan string for start of substr in 16-byte vectors
2960   bind(SCAN_TO_SUBSTR);
2961   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2962   pcmpestri(vec, Address(result, 0), mode);
2963   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2964   subl(cnt1, stride);
2965   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2966   cmpl(cnt1, cnt2);
2967   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2968   addptr(result, 16);
2969 
2970   bind(ADJUST_STR);
2971   cmpl(cnt1, stride); // Do not read beyond string
2972   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2973   // Back-up string to avoid reading beyond string.
2974   lea(result, Address(result, cnt1, scale1, -16));
2975   movl(cnt1, stride);
2976   jmpb(SCAN_TO_SUBSTR);
2977 
2978   // Found a potential substr
2979   bind(FOUND_CANDIDATE);
2980   // After pcmpestri tmp(rcx) contains matched element index
2981 
2982   // Make sure string is still long enough
2983   subl(cnt1, tmp);
2984   cmpl(cnt1, cnt2);
2985   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
2986   // Left less than substring.
2987 
2988   bind(RET_NOT_FOUND);
2989   movl(result, -1);
2990   jmp(CLEANUP);
2991 
2992   bind(FOUND_SUBSTR);
2993   // Compute start addr of substr
2994   lea(result, Address(result, tmp, scale1));
2995   if (int_cnt2 > 0) { // Constant substring
2996     // Repeat search for small substring (< 8 chars)
2997     // from new point without reloading substring.
2998     // Have to check that we don't read beyond string.
2999     cmpl(tmp, stride-int_cnt2);
3000     jccb(Assembler::greater, ADJUST_STR);
3001     // Fall through if matched whole substring.
3002   } else { // non constant
3003     assert(int_cnt2 == -1, "should be != 0");
3004 
3005     addl(tmp, cnt2);
3006     // Found result if we matched whole substring.
3007     cmpl(tmp, stride);
3008     jcc(Assembler::lessEqual, RET_FOUND);
3009 
3010     // Repeat search for small substring (<= 8 chars)
3011     // from new point 'str1' without reloading substring.
3012     cmpl(cnt2, stride);
3013     // Have to check that we don't read beyond string.
3014     jccb(Assembler::lessEqual, ADJUST_STR);
3015 
3016     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3017     // Compare the rest of substring (> 8 chars).
3018     movptr(str1, result);
3019 
3020     cmpl(tmp, cnt2);
3021     // First 8 chars are already matched.
3022     jccb(Assembler::equal, CHECK_NEXT);
3023 
3024     bind(SCAN_SUBSTR);
3025     pcmpestri(vec, Address(str1, 0), mode);
3026     // Need to reload the string pointers if the whole vector did not match
3027     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3028 
3029     bind(CHECK_NEXT);
3030     subl(cnt2, stride);
3031     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3032     addptr(str1, 16);
3033     if (ae == StrIntrinsicNode::UL) {
3034       addptr(str2, 8);
3035     } else {
3036       addptr(str2, 16);
3037     }
3038     subl(cnt1, stride);
3039     cmpl(cnt2, stride); // Do not read beyond substring
3040     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3041     // Back-up strings to avoid reading beyond substring.
3042 
3043     if (ae == StrIntrinsicNode::UL) {
3044       lea(str2, Address(str2, cnt2, scale2, -8));
3045       lea(str1, Address(str1, cnt2, scale1, -16));
3046     } else {
3047       lea(str2, Address(str2, cnt2, scale2, -16));
3048       lea(str1, Address(str1, cnt2, scale1, -16));
3049     }
3050     subl(cnt1, cnt2);
3051     movl(cnt2, stride);
3052     addl(cnt1, stride);
3053     bind(CONT_SCAN_SUBSTR);
3054     if (ae == StrIntrinsicNode::UL) {
3055       pmovzxbw(vec, Address(str2, 0));
3056     } else {
3057       movdqu(vec, Address(str2, 0));
3058     }
3059     jmp(SCAN_SUBSTR);
3060 
3061     bind(RET_FOUND_LONG);
3062     movptr(str1, Address(rsp, wordSize));
3063   } // non constant
3064 
3065   bind(RET_FOUND);
3066   // Compute substr offset
3067   subptr(result, str1);
3068   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3069     shrl(result, 1); // index
3070   }
3071   bind(CLEANUP);
3072   pop(rsp); // restore SP
3073 
3074 } // string_indexof
3075 
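// Search for a single UTF-16 char in a char-encoded string and return its
// index, or -1 if it is not present. A reference sketch (not the actual
// library source) of the operation this stub implements:
//   static int indexOfChar(char[] value, int ch, int len) {
//     for (int i = 0; i < len; i++) {
//       if (value[i] == ch) {
//         return i;
//       }
//     }
//     return -1;
//   }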
3076 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3077                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3078   ShortBranchVerifier sbv(this);
3079   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3080 
3081   int stride = 8;
3082 
3083   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3084         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3085         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3086         FOUND_SEQ_CHAR, DONE_LABEL;
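  // Search strategy: broadcast 'ch' into a vector and compare 16 chars
  // (AVX2) or 8 chars (SSE) per iteration, then handle the tail one char at
  // a time. vec2 stays all-zero so that, per the PTEST/VPTEST semantics
  // (CF := ((src AND NOT dst) == 0)), ptest(vec2, vec3) leaves CF clear
  // exactly when the comparison result vec3 has at least one set bit,
  // i.e. when the char was found in the current vector.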
3087 
3088   movptr(result, str1);
3089   if (UseAVX >= 2) {
3090     cmpl(cnt1, stride);
3091     jcc(Assembler::less, SCAN_TO_CHAR);
3092     cmpl(cnt1, 2*stride);
3093     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3094     movdl(vec1, ch);
3095     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3096     vpxor(vec2, vec2);
3097     movl(tmp, cnt1);
3098     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3099     andl(cnt1,0x0000000F);  //tail count (in chars)
3100 
3101     bind(SCAN_TO_16_CHAR_LOOP);
3102     vmovdqu(vec3, Address(result, 0));
3103     vpcmpeqw(vec3, vec3, vec1, 1);
3104     vptest(vec2, vec3);
3105     jcc(Assembler::carryClear, FOUND_CHAR);
3106     addptr(result, 32);
3107     subl(tmp, 2*stride);
3108     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3109     jmp(SCAN_TO_8_CHAR);
3110     bind(SCAN_TO_8_CHAR_INIT);
3111     movdl(vec1, ch);
3112     pshuflw(vec1, vec1, 0x00);
3113     pshufd(vec1, vec1, 0);
3114     pxor(vec2, vec2);
3115   }
3116   bind(SCAN_TO_8_CHAR);
3117   cmpl(cnt1, stride);
3118   jcc(Assembler::less, SCAN_TO_CHAR);
3119   if (UseAVX < 2) {
3120     movdl(vec1, ch);
3121     pshuflw(vec1, vec1, 0x00);
3122     pshufd(vec1, vec1, 0);
3123     pxor(vec2, vec2);
3124   }
3125   movl(tmp, cnt1);
3126   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3127   andl(cnt1,0x00000007);  //tail count (in chars)
3128 
3129   bind(SCAN_TO_8_CHAR_LOOP);
3130   movdqu(vec3, Address(result, 0));
3131   pcmpeqw(vec3, vec1);
3132   ptest(vec2, vec3);
3133   jcc(Assembler::carryClear, FOUND_CHAR);
3134   addptr(result, 16);
3135   subl(tmp, stride);
3136   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3137   bind(SCAN_TO_CHAR);
3138   testl(cnt1, cnt1);
3139   jcc(Assembler::zero, RET_NOT_FOUND);
3140   bind(SCAN_TO_CHAR_LOOP);
3141   load_unsigned_short(tmp, Address(result, 0));
3142   cmpl(ch, tmp);
3143   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3144   addptr(result, 2);
3145   subl(cnt1, 1);
3146   jccb(Assembler::zero, RET_NOT_FOUND);
3147   jmp(SCAN_TO_CHAR_LOOP);
3148 
3149   bind(RET_NOT_FOUND);
3150   movl(result, -1);
3151   jmpb(DONE_LABEL);
3152 
3153   bind(FOUND_CHAR);
3154   if (UseAVX >= 2) {
3155     vpmovmskb(tmp, vec3);
3156   } else {
3157     pmovmskb(tmp, vec3);
3158   }
3159   bsfl(ch, tmp);
3160   addptr(result, ch);
3161 
3162   bind(FOUND_SEQ_CHAR);
3163   subptr(result, str1);
3164   shrl(result, 1);
3165 
3166   bind(DONE_LABEL);
3167 } // string_indexof_char
3168 
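// Latin-1 variant of the char search above: find a single byte value in a
// byte-encoded string. A reference sketch (not the actual library source):
//   static int indexOfLatin1Char(byte[] value, int ch, int len) {
//     for (int i = 0; i < len; i++) {
//       if ((value[i] & 0xff) == ch) {
//         return i;
//       }
//     }
//     return -1;
//   }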
3169 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3170                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3171   ShortBranchVerifier sbv(this);
3172   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3173 
3174   int stride = 16;
3175 
3176   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3177         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3178         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3179         FOUND_SEQ_CHAR, DONE_LABEL;
3180 
3181   movptr(result, str1);
3182   if (UseAVX >= 2) {
3183     cmpl(cnt1, stride);
3184     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3185     cmpl(cnt1, stride*2);
3186     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3187     movdl(vec1, ch);
3188     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3189     vpxor(vec2, vec2);
3190     movl(tmp, cnt1);
3191     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3192     andl(cnt1,0x0000001F);  //tail count (in chars)
3193 
3194     bind(SCAN_TO_32_CHAR_LOOP);
3195     vmovdqu(vec3, Address(result, 0));
3196     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3197     vptest(vec2, vec3);
3198     jcc(Assembler::carryClear, FOUND_CHAR);
3199     addptr(result, 32);
3200     subl(tmp, stride*2);
3201     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3202     jmp(SCAN_TO_16_CHAR);
3203 
3204     bind(SCAN_TO_16_CHAR_INIT);
3205     movdl(vec1, ch);
3206     pxor(vec2, vec2);
3207     pshufb(vec1, vec2);
3208   }
3209 
3210   bind(SCAN_TO_16_CHAR);
3211   cmpl(cnt1, stride);
3212   jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
3213   if (UseAVX < 2) {
3214     movdl(vec1, ch);
3215     pxor(vec2, vec2);
3216     pshufb(vec1, vec2);
3217   }
3218   movl(tmp, cnt1);
3219   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3220   andl(cnt1,0x0000000F);  //tail count (in bytes)
3221 
3222   bind(SCAN_TO_16_CHAR_LOOP);
3223   movdqu(vec3, Address(result, 0));
3224   pcmpeqb(vec3, vec1);
3225   ptest(vec2, vec3);
3226   jcc(Assembler::carryClear, FOUND_CHAR);
3227   addptr(result, 16);
3228   subl(tmp, stride);
3229   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); // last 16 items...
3230 
3231   bind(SCAN_TO_CHAR_INIT);
3232   testl(cnt1, cnt1);
3233   jcc(Assembler::zero, RET_NOT_FOUND);
3234   bind(SCAN_TO_CHAR_LOOP);
3235   load_unsigned_byte(tmp, Address(result, 0));
3236   cmpl(ch, tmp);
3237   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3238   addptr(result, 1);
3239   subl(cnt1, 1);
3240   jccb(Assembler::zero, RET_NOT_FOUND);
3241   jmp(SCAN_TO_CHAR_LOOP);
3242 
3243   bind(RET_NOT_FOUND);
3244   movl(result, -1);
3245   jmpb(DONE_LABEL);
3246 
3247   bind(FOUND_CHAR);
3248   if (UseAVX >= 2) {
3249     vpmovmskb(tmp, vec3);
3250   } else {
3251     pmovmskb(tmp, vec3);
3252   }
3253   bsfl(ch, tmp);
3254   addptr(result, ch);
3255 
3256   bind(FOUND_SEQ_CHAR);
3257   subptr(result, str1);
3258 
3259   bind(DONE_LABEL);
3260 } // stringL_indexof_char
3261 
3262 // helper function for string_compare
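// Loads one element from each string at 'index', zero-extended to 32 bits.
// For the mixed LU/UL encodings, str1 is assumed to be the byte-encoded
// (Latin-1) operand addressed with scale1 and str2 the char-encoded (UTF-16)
// operand addressed with scale2; callers are expected to arrange the
// operands accordingly.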
3263 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3264                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3265                                            Address::ScaleFactor scale2, Register index, int ae) {
3266   if (ae == StrIntrinsicNode::LL) {
3267     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3268     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3269   } else if (ae == StrIntrinsicNode::UU) {
3270     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3271     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3272   } else {
3273     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3274     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3275   }
3276 }
3277 
3278 // Compare strings, used for char[] and byte[].
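// A reference sketch (not the actual library source) of the result this stub
// produces, in the style of String.compareTo: compare up to min(len1, len2)
// elements and return the difference of the first mismatching pair, or the
// length difference if one string is a prefix of the other:
//   static int compare(char[] s1, int len1, char[] s2, int len2) {
//     int min = Math.min(len1, len2);
//     for (int i = 0; i < min; i++) {
//       if (s1[i] != s2[i]) {
//         return s1[i] - s2[i];
//       }
//     }
//     return len1 - len2;
//   }
// For mixed encodings (LU/UL) the byte-encoded operand is zero-extended to
// chars before comparing; the UL case negates the result at the end
// (see DONE_LABEL).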
3279 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3280                                        Register cnt1, Register cnt2, Register result,
3281                                        XMMRegister vec1, int ae, KRegister mask) {
3282   ShortBranchVerifier sbv(this);
3283   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3284   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3285   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3286   int stride2x2 = 0x40;
3287   Address::ScaleFactor scale = Address::no_scale;
3288   Address::ScaleFactor scale1 = Address::no_scale;
3289   Address::ScaleFactor scale2 = Address::no_scale;
3290 
3291   if (ae != StrIntrinsicNode::LL) {
3292     stride2x2 = 0x20;
3293   }
3294 
3295   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3296     shrl(cnt2, 1);
3297   }
3298   // Compute the minimum of the string lengths, and push the
3299   // difference of the string lengths onto the stack.
3300   // A conditional move selects the minimum below.
3301   movl(result, cnt1);
3302   subl(cnt1, cnt2);
3303   push(cnt1);
3304   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3305 
3306   // Is the minimum length zero?
3307   testl(cnt2, cnt2);
3308   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3309   if (ae == StrIntrinsicNode::LL) {
3310     // Load first bytes
3311     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3312     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3313   } else if (ae == StrIntrinsicNode::UU) {
3314     // Load first characters
3315     load_unsigned_short(result, Address(str1, 0));
3316     load_unsigned_short(cnt1, Address(str2, 0));
3317   } else {
3318     load_unsigned_byte(result, Address(str1, 0));
3319     load_unsigned_short(cnt1, Address(str2, 0));
3320   }
3321   subl(result, cnt1);
3322   jcc(Assembler::notZero,  POP_LABEL);
3323 
3324   if (ae == StrIntrinsicNode::UU) {
3325     // Divide length by 2 to get number of chars
3326     shrl(cnt2, 1);
3327   }
3328   cmpl(cnt2, 1);
3329   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3330 
3331   // Check if the strings start at the same location and setup scale and stride
3332   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3333     cmpptr(str1, str2);
3334     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3335     if (ae == StrIntrinsicNode::LL) {
3336       scale = Address::times_1;
3337       stride = 16;
3338     } else {
3339       scale = Address::times_2;
3340       stride = 8;
3341     }
3342   } else {
3343     scale1 = Address::times_1;
3344     scale2 = Address::times_2;
3345     // scale not used
3346     stride = 8;
3347   }
3348 
3349   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3350     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3351     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3352     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3353     Label COMPARE_TAIL_LONG;
3354     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3355 
3356     int pcmpmask = 0x19;
3357     if (ae == StrIntrinsicNode::LL) {
3358       pcmpmask &= ~0x01;
3359     }
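    // pcmpmask decodes as follows (per the SSE4.2 PCMPESTRI imm8 encoding):
    //   bits 1:0 = 01 unsigned words (cleared to 00, unsigned bytes, for LL),
    //   bits 3:2 = 10 "equal each" (element-wise string compare),
    //   bits 5:4 = 01 negative polarity, so rcx reports the first mismatch.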
3360 
3361     // Setup to compare 16-chars (32-bytes) vectors,
3362     // start from first character again because it has aligned address.
3363     if (ae == StrIntrinsicNode::LL) {
3364       stride2 = 32;
3365     } else {
3366       stride2 = 16;
3367     }
3368     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3369       adr_stride = stride << scale;
3370     } else {
3371       adr_stride1 = 8;  //stride << scale1;
3372       adr_stride2 = 16; //stride << scale2;
3373     }
3374 
3375     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3376     // rax and rdx are used by pcmpestri as element counters
3377     movl(result, cnt2);
3378     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3379     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3380 
3381     // Fast path: compare the first two 8-char vectors.
3382     bind(COMPARE_16_CHARS);
3383     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3384       movdqu(vec1, Address(str1, 0));
3385     } else {
3386       pmovzxbw(vec1, Address(str1, 0));
3387     }
3388     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3389     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3390 
3391     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3392       movdqu(vec1, Address(str1, adr_stride));
3393       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3394     } else {
3395       pmovzxbw(vec1, Address(str1, adr_stride1));
3396       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3397     }
3398     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3399     addl(cnt1, stride);
3400 
3401     // Compare the characters at index in cnt1
3402     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3403     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3404     subl(result, cnt2);
3405     jmp(POP_LABEL);
3406 
3407     // Setup the registers to start vector comparison loop
3408     bind(COMPARE_WIDE_VECTORS);
3409     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3410       lea(str1, Address(str1, result, scale));
3411       lea(str2, Address(str2, result, scale));
3412     } else {
3413       lea(str1, Address(str1, result, scale1));
3414       lea(str2, Address(str2, result, scale2));
3415     }
3416     subl(result, stride2);
3417     subl(cnt2, stride2);
3418     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3419     negptr(result);
3420 
3421     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
3422     bind(COMPARE_WIDE_VECTORS_LOOP);
3423 
3424 #ifdef _LP64
3425     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3426       cmpl(cnt2, stride2x2);
3427       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3428       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3429       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3430 
3431       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3432       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3433         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3434         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // mask == 11..11 if operands are equal, otherwise mask has some 0 bits
3435       } else {
3436         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3437         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // mask == 11..11 if operands are equal, otherwise mask has some 0 bits
3438       }
3439       kortestql(mask, mask);
3440       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3441       addptr(result, stride2x2);  // update since we already compared at this addr
3442       subl(cnt2, stride2x2);      // and sub the size too
3443       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3444 
3445       vpxor(vec1, vec1);
3446       jmpb(COMPARE_WIDE_TAIL);
3447     }//if (VM_Version::supports_avx512vlbw())
3448 #endif // _LP64
3449 
3450 
3451     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3452     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3453       vmovdqu(vec1, Address(str1, result, scale));
3454       vpxor(vec1, Address(str2, result, scale));
3455     } else {
3456       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3457       vpxor(vec1, Address(str2, result, scale2));
3458     }
3459     vptest(vec1, vec1);
3460     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3461     addptr(result, stride2);
3462     subl(cnt2, stride2);
3463     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3464     // clean upper bits of YMM registers
3465     vpxor(vec1, vec1);
3466 
3467     // compare wide vectors tail
3468     bind(COMPARE_WIDE_TAIL);
3469     testptr(result, result);
3470     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3471 
3472     movl(result, stride2);
3473     movl(cnt2, result);
3474     negptr(result);
3475     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3476 
3477     // Identify the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3478     bind(VECTOR_NOT_EQUAL);
3479     // clean upper bits of YMM registers
3480     vpxor(vec1, vec1);
3481     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3482       lea(str1, Address(str1, result, scale));
3483       lea(str2, Address(str2, result, scale));
3484     } else {
3485       lea(str1, Address(str1, result, scale1));
3486       lea(str2, Address(str2, result, scale2));
3487     }
3488     jmp(COMPARE_16_CHARS);
3489 
3490     // Compare tail chars, length between 1 and 15 chars
3491     bind(COMPARE_TAIL_LONG);
3492     movl(cnt2, result);
3493     cmpl(cnt2, stride);
3494     jcc(Assembler::less, COMPARE_SMALL_STR);
3495 
3496     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3497       movdqu(vec1, Address(str1, 0));
3498     } else {
3499       pmovzxbw(vec1, Address(str1, 0));
3500     }
3501     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3502     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3503     subptr(cnt2, stride);
3504     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3505     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3506       lea(str1, Address(str1, result, scale));
3507       lea(str2, Address(str2, result, scale));
3508     } else {
3509       lea(str1, Address(str1, result, scale1));
3510       lea(str2, Address(str2, result, scale2));
3511     }
3512     negptr(cnt2);
3513     jmpb(WHILE_HEAD_LABEL);
3514 
3515     bind(COMPARE_SMALL_STR);
3516   } else if (UseSSE42Intrinsics) {
3517     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3518     int pcmpmask = 0x19;
3519     // Setup to compare 8-char (16-byte) vectors,
3520     // start from first character again because it has aligned address.
3521     movl(result, cnt2);
3522     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3523     if (ae == StrIntrinsicNode::LL) {
3524       pcmpmask &= ~0x01;
3525     }
3526     jcc(Assembler::zero, COMPARE_TAIL);
3527     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3528       lea(str1, Address(str1, result, scale));
3529       lea(str2, Address(str2, result, scale));
3530     } else {
3531       lea(str1, Address(str1, result, scale1));
3532       lea(str2, Address(str2, result, scale2));
3533     }
3534     negptr(result);
3535 
3536     // pcmpestri
3537     //   inputs:
3538     //     vec1- substring
3539     //     rax - negative string length (elements count)
3540     //     mem - scanned string
3541     //     rdx - string length (elements count)
3542     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3543     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3544     //   outputs:
3545     //     rcx - first mismatched element index
3546     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3547 
3548     bind(COMPARE_WIDE_VECTORS);
3549     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3550       movdqu(vec1, Address(str1, result, scale));
3551       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3552     } else {
3553       pmovzxbw(vec1, Address(str1, result, scale1));
3554       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3555     }
3556     // After pcmpestri cnt1(rcx) contains mismatched element index
3557 
3558     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3559     addptr(result, stride);
3560     subptr(cnt2, stride);
3561     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3562 
3563     // compare wide vectors tail
3564     testptr(result, result);
3565     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3566 
3567     movl(cnt2, stride);
3568     movl(result, stride);
3569     negptr(result);
3570     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3571       movdqu(vec1, Address(str1, result, scale));
3572       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3573     } else {
3574       pmovzxbw(vec1, Address(str1, result, scale1));
3575       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3576     }
3577     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3578 
3579     // Mismatched characters in the vectors
3580     bind(VECTOR_NOT_EQUAL);
3581     addptr(cnt1, result);
3582     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3583     subl(result, cnt2);
3584     jmpb(POP_LABEL);
3585 
3586     bind(COMPARE_TAIL); // limit is zero
3587     movl(cnt2, result);
3588     // Fallthru to tail compare
3589   }
3590   // Shift str2 and str1 to the end of the arrays, negate min
3591   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3592     lea(str1, Address(str1, cnt2, scale));
3593     lea(str2, Address(str2, cnt2, scale));
3594   } else {
3595     lea(str1, Address(str1, cnt2, scale1));
3596     lea(str2, Address(str2, cnt2, scale2));
3597   }
3598   decrementl(cnt2);  // first character was compared already
3599   negptr(cnt2);
3600 
3601   // Compare the rest of the elements
3602   bind(WHILE_HEAD_LABEL);
3603   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3604   subl(result, cnt1);
3605   jccb(Assembler::notZero, POP_LABEL);
3606   increment(cnt2);
3607   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3608 
3609   // Strings are equal up to min length.  Return the length difference.
3610   bind(LENGTH_DIFF_LABEL);
3611   pop(result);
3612   if (ae == StrIntrinsicNode::UU) {
3613     // Divide diff by 2 to get number of chars
3614     sarl(result, 1);
3615   }
3616   jmpb(DONE_LABEL);
3617 
3618 #ifdef _LP64
3619   if (VM_Version::supports_avx512vlbw()) {
3620 
3621     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3622 
3623     kmovql(cnt1, mask);
3624     notq(cnt1);
3625     bsfq(cnt2, cnt1);
3626     if (ae != StrIntrinsicNode::LL) {
3627       // Divide diff by 2 to get number of chars
3628       sarl(cnt2, 1);
3629     }
3630     addq(result, cnt2);
3631     if (ae == StrIntrinsicNode::LL) {
3632       load_unsigned_byte(cnt1, Address(str2, result));
3633       load_unsigned_byte(result, Address(str1, result));
3634     } else if (ae == StrIntrinsicNode::UU) {
3635       load_unsigned_short(cnt1, Address(str2, result, scale));
3636       load_unsigned_short(result, Address(str1, result, scale));
3637     } else {
3638       load_unsigned_short(cnt1, Address(str2, result, scale2));
3639       load_unsigned_byte(result, Address(str1, result, scale1));
3640     }
3641     subl(result, cnt1);
3642     jmpb(POP_LABEL);
3643   }//if (VM_Version::supports_avx512vlbw())
3644 #endif // _LP64
3645 
3646   // Discard the stored length difference
3647   bind(POP_LABEL);
3648   pop(cnt1);
3649 
3650   // That's it
3651   bind(DONE_LABEL);
3652   if(ae == StrIntrinsicNode::UL) {
3653     negl(result);
3654   }
3655 
3656 }
3657 
3658 // Search for a non-ASCII character (negative byte value) in a byte array and
3659 // return the index of the first such character, otherwise the length
3660 // of the array segment searched.
3661 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3662 //   @IntrinsicCandidate
3663 //   public static int countPositives(byte[] ba, int off, int len) {
3664 //     for (int i = off; i < off + len; i++) {
3665 //       if (ba[i] < 0) {
3666 //         return i - off;
3667 //       }
3668 //     }
3669 //     return len;
3670 //   }
3671 void C2_MacroAssembler::count_positives(Register ary1, Register len,
3672   Register result, Register tmp1,
3673   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
3674   // rsi: byte array
3675   // rcx: len
3676   // rax: result
3677   ShortBranchVerifier sbv(this);
3678   assert_different_registers(ary1, len, result, tmp1);
3679   assert_different_registers(vec1, vec2);
3680   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
3681 
3682   movl(result, len); // copy
3683   // len == 0
3684   testl(len, len);
3685   jcc(Assembler::zero, DONE);
3686 
3687   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
3688     VM_Version::supports_avx512vlbw() &&
3689     VM_Version::supports_bmi2()) {
3690 
3691     Label test_64_loop, test_tail, BREAK_LOOP;
3692     Register tmp3_aliased = len;
3693 
3694     movl(tmp1, len);
3695     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
3696 
3697     andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
3698     andl(len, ~(64 - 1));    // vector count (in chars)
3699     jccb(Assembler::zero, test_tail);
3700 
3701     lea(ary1, Address(ary1, len, Address::times_1));
3702     negptr(len);
3703 
3704     bind(test_64_loop);
3705     // Check whether our 64 elements of size byte contain negatives
3706     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
3707     kortestql(mask1, mask1);
3708     jcc(Assembler::notZero, BREAK_LOOP);
3709 
3710     addptr(len, 64);
3711     jccb(Assembler::notZero, test_64_loop);
3712 
3713     bind(test_tail);
3714     // bail out when there is nothing to be done
3715     testl(tmp1, -1);
3716     jcc(Assembler::zero, DONE);
3717 
3718     // ~(~0 << len) applied up to two times (for 32-bit scenario)
3719 #ifdef _LP64
3720     mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
3721     shlxq(tmp3_aliased, tmp3_aliased, tmp1);
3722     notq(tmp3_aliased);
3723     kmovql(mask2, tmp3_aliased);
3724 #else
3725     Label k_init;
3726     jmp(k_init);
3727 
3728     // We cannot read 64 bits at once from a general-purpose register, so we move
3729     // the data required to compose 64 ones into the instruction stream.
3730     // We emit a 64-byte-wide series of elements 0..63 which is later used as a
3731     // compare target together with the tail count contained in the tmp1 register.
3732     // The result is a k register holding tmp1 consecutive 1 bits,
3733     // counting from the least significant bit.
3734     address tmp = pc();
3735     emit_int64(0x0706050403020100);
3736     emit_int64(0x0F0E0D0C0B0A0908);
3737     emit_int64(0x1716151413121110);
3738     emit_int64(0x1F1E1D1C1B1A1918);
3739     emit_int64(0x2726252423222120);
3740     emit_int64(0x2F2E2D2C2B2A2928);
3741     emit_int64(0x3736353433323130);
3742     emit_int64(0x3F3E3D3C3B3A3938);
3743 
3744     bind(k_init);
3745     lea(len, InternalAddress(tmp));
3746     // create mask to test for negative byte inside a vector
3747     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
3748     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
3749 
3750 #endif
3751     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
3752     ktestq(mask1, mask2);
3753     jcc(Assembler::zero, DONE);
3754 
3755     bind(BREAK_LOOP);
3756     // At least one byte in the last 64 bytes is negative.
3757     // Set up to look at the last 64 bytes as if they were a tail
3758     lea(ary1, Address(ary1, len, Address::times_1));
3759     addptr(result, len);
3760     // Ignore the very last byte: if all others are positive,
3761     // it must be negative, so we can skip right to the 2+1 byte
3762     // end comparison at this point
3763     orl(result, 63);
3764     movl(len, 63);
3765     // Fallthru to tail compare
3766   } else {
3767 
3768     if (UseAVX >= 2 && UseSSE >= 2) {
3769       // With AVX2, use 32-byte vector compare
3770       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
3771 
3772       // Compare 32-byte vectors
3773       testl(len, 0xffffffe0);   // vector count (in bytes)
3774       jccb(Assembler::zero, TAIL_START);
3775 
3776       andl(len, 0xffffffe0);
3777       lea(ary1, Address(ary1, len, Address::times_1));
3778       negptr(len);
3779 
3780       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
3781       movdl(vec2, tmp1);
3782       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
3783 
3784       bind(COMPARE_WIDE_VECTORS);
3785       vmovdqu(vec1, Address(ary1, len, Address::times_1));
3786       vptest(vec1, vec2);
3787       jccb(Assembler::notZero, BREAK_LOOP);
3788       addptr(len, 32);
3789       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3790 
3791       testl(result, 0x0000001f);   // any bytes remaining?
3792       jcc(Assembler::zero, DONE);
3793 
3794       // Quick test using the already prepared vector mask
3795       movl(len, result);
3796       andl(len, 0x0000001f);
3797       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
3798       vptest(vec1, vec2);
3799       jcc(Assembler::zero, DONE);
3800       // There are zeros, jump to the tail to determine exactly where
3801       jmpb(TAIL_START);
3802 
3803       bind(BREAK_LOOP);
3804       // At least one byte in the last 32-byte vector is negative.
3805       // Set up to look at the last 32 bytes as if they were a tail
3806       lea(ary1, Address(ary1, len, Address::times_1));
3807       addptr(result, len);
3808       // Ignore the very last byte: if all others are positive,
3809       // it must be negative, so we can skip right to the 2+1 byte
3810       // end comparison at this point
3811       orl(result, 31);
3812       movl(len, 31);
3813       // Fallthru to tail compare
3814     } else if (UseSSE42Intrinsics) {
3815       // With SSE4.2, use double quad vector compare
3816       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
3817 
3818       // Compare 16-byte vectors
3819       testl(len, 0xfffffff0);   // vector count (in bytes)
3820       jcc(Assembler::zero, TAIL_START);
3821 
3822       andl(len, 0xfffffff0);
3823       lea(ary1, Address(ary1, len, Address::times_1));
3824       negptr(len);
3825 
3826       movl(tmp1, 0x80808080);
3827       movdl(vec2, tmp1);
3828       pshufd(vec2, vec2, 0);
3829 
3830       bind(COMPARE_WIDE_VECTORS);
3831       movdqu(vec1, Address(ary1, len, Address::times_1));
3832       ptest(vec1, vec2);
3833       jccb(Assembler::notZero, BREAK_LOOP);
3834       addptr(len, 16);
3835       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3836 
3837       testl(result, 0x0000000f); // len is zero, any bytes remaining?
3838       jcc(Assembler::zero, DONE);
3839 
3840       // Quick test using the already prepared vector mask
3841       movl(len, result);
3842       andl(len, 0x0000000f);   // tail count (in bytes)
3843       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
3844       ptest(vec1, vec2);
3845       jcc(Assembler::zero, DONE);
3846       jmpb(TAIL_START);
3847 
3848       bind(BREAK_LOOP);
3849       // At least one byte in the last 16-byte vector is negative.
3850       // Set up and look at the last 16 bytes as if they were a tail
3851       lea(ary1, Address(ary1, len, Address::times_1));
3852       addptr(result, len);
3853       // Ignore the very last byte: if all others are positive,
3854       // it must be negative, so we can skip right to the 2+1 byte
3855       // end comparison at this point
3856       orl(result, 15);
3857       movl(len, 15);
3858       // Fallthru to tail compare
3859     }
3860   }
3861 
3862   bind(TAIL_START);
3863   // Compare 4-byte vectors
3864   andl(len, 0xfffffffc); // vector count (in bytes)
3865   jccb(Assembler::zero, COMPARE_CHAR);
3866 
3867   lea(ary1, Address(ary1, len, Address::times_1));
3868   negptr(len);
3869 
3870   bind(COMPARE_VECTORS);
3871   movl(tmp1, Address(ary1, len, Address::times_1));
3872   andl(tmp1, 0x80808080);
3873   jccb(Assembler::notZero, TAIL_ADJUST);
3874   addptr(len, 4);
3875   jccb(Assembler::notZero, COMPARE_VECTORS);
3876 
3877   // Compare trailing char (final 2-3 bytes), if any
3878   bind(COMPARE_CHAR);
3879 
3880   testl(result, 0x2);   // tail  char
3881   jccb(Assembler::zero, COMPARE_BYTE);
3882   load_unsigned_short(tmp1, Address(ary1, 0));
3883   andl(tmp1, 0x00008080);
3884   jccb(Assembler::notZero, CHAR_ADJUST);
3885   lea(ary1, Address(ary1, 2));
3886 
3887   bind(COMPARE_BYTE);
3888   testl(result, 0x1);   // tail  byte
3889   jccb(Assembler::zero, DONE);
3890   load_unsigned_byte(tmp1, Address(ary1, 0));
3891   testl(tmp1, 0x00000080);
3892   jccb(Assembler::zero, DONE);
3893   subptr(result, 1);
3894   jmpb(DONE);
3895 
3896   bind(TAIL_ADJUST);
3897   // there are negative bits in the last 4 byte block.
3898   // Adjust result and check the next three bytes
3899   addptr(result, len);
3900   orl(result, 3);
3901   lea(ary1, Address(ary1, len, Address::times_1));
3902   jmpb(COMPARE_CHAR);
3903 
3904   bind(CHAR_ADJUST);
3905   // We are looking at a char + optional byte tail, and found that one
3906   // of the bytes in the char is negative. Adjust the result, check the
3907   // first byte and readjust if needed.
3908   andl(result, 0xfffffffc);
3909   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
3910   jccb(Assembler::notZero, DONE);
3911   addptr(result, 1);
3912 
3913   // That's it
3914   bind(DONE);
3915   if (UseAVX >= 2 && UseSSE >= 2) {
3916     // clean upper bits of YMM registers
3917     vpxor(vec1, vec1);
3918     vpxor(vec2, vec2);
3919   }
3920 }
3921 
3922 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
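// A reference sketch (not the actual library source) of the array case
// (is_array_equ == true); the substring case skips the null/length checks
// and compares 'limit' elements starting at the given addresses:
//   static boolean equals(byte[] a, byte[] b) {
//     if (a == b) return true;
//     if (a == null || b == null) return false;
//     if (a.length != b.length) return false;
//     for (int i = 0; i < a.length; i++) {
//       if (a[i] != b[i]) return false;
//     }
//     return true;
//   }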
3923 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
3924                                       Register limit, Register result, Register chr,
3925                                       XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
3926   ShortBranchVerifier sbv(this);
3927   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
3928 
3929   int length_offset  = arrayOopDesc::length_offset_in_bytes();
3930   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
3931 
3932   if (is_array_equ) {
3933     // Check the input args
3934     cmpoop(ary1, ary2);
3935     jcc(Assembler::equal, TRUE_LABEL);
3936 
3937     // Need additional checks for arrays_equals.
3938     testptr(ary1, ary1);
3939     jcc(Assembler::zero, FALSE_LABEL);
3940     testptr(ary2, ary2);
3941     jcc(Assembler::zero, FALSE_LABEL);
3942 
3943     // Check the lengths
3944     movl(limit, Address(ary1, length_offset));
3945     cmpl(limit, Address(ary2, length_offset));
3946     jcc(Assembler::notEqual, FALSE_LABEL);
3947   }
3948 
3949   // count == 0
3950   testl(limit, limit);
3951   jcc(Assembler::zero, TRUE_LABEL);
3952 
3953   if (is_array_equ) {
3954     // Load array address
3955     lea(ary1, Address(ary1, base_offset));
3956     lea(ary2, Address(ary2, base_offset));
3957   }
3958 
3959   if (is_array_equ && is_char) {
3960     // arrays_equals when used for char[].
3961     shll(limit, 1);      // convert char count to byte count (still != 0)
3962   }
3963   movl(result, limit); // copy
3964 
3965   if (UseAVX >= 2) {
3966     // With AVX2, use 32-byte vector compare
3967     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3968 
3969     // Compare 32-byte vectors
3970     andl(result, 0x0000001f);  //   tail count (in bytes)
3971     andl(limit, 0xffffffe0);   // vector count (in bytes)
3972     jcc(Assembler::zero, COMPARE_TAIL);
3973 
3974     lea(ary1, Address(ary1, limit, Address::times_1));
3975     lea(ary2, Address(ary2, limit, Address::times_1));
3976     negptr(limit);
3977 
3978 #ifdef _LP64
3979     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3980       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
3981 
3982       cmpl(limit, -64);
3983       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3984 
3985       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3986 
3987       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
3988       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
3989       kortestql(mask, mask);
3990       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
3991       addptr(limit, 64);  // update since we already compared at this addr
3992       cmpl(limit, -64);
3993       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3994 
3995       // At this point we may still need to compare -limit+result bytes.
3996       // We could execute the next two instructions and just continue via the non-wide path:
3997       //  cmpl(limit, 0);
3998       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
3999       // But since we stopped at the points ary{1,2}+limit which are
4000       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4001       // (|limit| <= 32 and result < 32),
4002       // we may just compare the last 64 bytes.
4003       //
4004       addptr(result, -64);   // it is safe because we just came from this area
4005       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4006       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4007       kortestql(mask, mask);
4008       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4009 
4010       jmp(TRUE_LABEL);
4011 
4012       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4013 
4014     }//if (VM_Version::supports_avx512vlbw())
4015 #endif //_LP64
4016     bind(COMPARE_WIDE_VECTORS);
4017     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
4018     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4019     vpxor(vec1, vec2);
4020 
4021     vptest(vec1, vec1);
4022     jcc(Assembler::notZero, FALSE_LABEL);
4023     addptr(limit, 32);
4024     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4025 
4026     testl(result, result);
4027     jcc(Assembler::zero, TRUE_LABEL);
4028 
4029     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
4030     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4031     vpxor(vec1, vec2);
4032 
4033     vptest(vec1, vec1);
4034     jccb(Assembler::notZero, FALSE_LABEL);
4035     jmpb(TRUE_LABEL);
4036 
4037     bind(COMPARE_TAIL); // limit is zero
4038     movl(limit, result);
4039     // Fallthru to tail compare
4040   } else if (UseSSE42Intrinsics) {
4041     // With SSE4.2, use double quad vector compare
4042     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4043 
4044     // Compare 16-byte vectors
4045     andl(result, 0x0000000f);  //   tail count (in bytes)
4046     andl(limit, 0xfffffff0);   // vector count (in bytes)
4047     jcc(Assembler::zero, COMPARE_TAIL);
4048 
4049     lea(ary1, Address(ary1, limit, Address::times_1));
4050     lea(ary2, Address(ary2, limit, Address::times_1));
4051     negptr(limit);
4052 
4053     bind(COMPARE_WIDE_VECTORS);
4054     movdqu(vec1, Address(ary1, limit, Address::times_1));
4055     movdqu(vec2, Address(ary2, limit, Address::times_1));
4056     pxor(vec1, vec2);
4057 
4058     ptest(vec1, vec1);
4059     jcc(Assembler::notZero, FALSE_LABEL);
4060     addptr(limit, 16);
4061     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4062 
4063     testl(result, result);
4064     jcc(Assembler::zero, TRUE_LABEL);
4065 
4066     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4067     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4068     pxor(vec1, vec2);
4069 
4070     ptest(vec1, vec1);
4071     jccb(Assembler::notZero, FALSE_LABEL);
4072     jmpb(TRUE_LABEL);
4073 
4074     bind(COMPARE_TAIL); // limit is zero
4075     movl(limit, result);
4076     // Fallthru to tail compare
4077   }
4078 
4079   // Compare 4-byte vectors
4080   andl(limit, 0xfffffffc); // vector count (in bytes)
4081   jccb(Assembler::zero, COMPARE_CHAR);
4082 
4083   lea(ary1, Address(ary1, limit, Address::times_1));
4084   lea(ary2, Address(ary2, limit, Address::times_1));
4085   negptr(limit);
4086 
4087   bind(COMPARE_VECTORS);
4088   movl(chr, Address(ary1, limit, Address::times_1));
4089   cmpl(chr, Address(ary2, limit, Address::times_1));
4090   jccb(Assembler::notEqual, FALSE_LABEL);
4091   addptr(limit, 4);
4092   jcc(Assembler::notZero, COMPARE_VECTORS);
4093 
4094   // Compare trailing char (final 2 bytes), if any
4095   bind(COMPARE_CHAR);
4096   testl(result, 0x2);   // tail  char
4097   jccb(Assembler::zero, COMPARE_BYTE);
4098   load_unsigned_short(chr, Address(ary1, 0));
4099   load_unsigned_short(limit, Address(ary2, 0));
4100   cmpl(chr, limit);
4101   jccb(Assembler::notEqual, FALSE_LABEL);
4102 
4103   if (is_array_equ && is_char) {
4104     bind(COMPARE_BYTE);
4105   } else {
4106     lea(ary1, Address(ary1, 2));
4107     lea(ary2, Address(ary2, 2));
4108 
4109     bind(COMPARE_BYTE);
4110     testl(result, 0x1);   // tail  byte
4111     jccb(Assembler::zero, TRUE_LABEL);
4112     load_unsigned_byte(chr, Address(ary1, 0));
4113     load_unsigned_byte(limit, Address(ary2, 0));
4114     cmpl(chr, limit);
4115     jccb(Assembler::notEqual, FALSE_LABEL);
4116   }
4117   bind(TRUE_LABEL);
4118   movl(result, 1);   // return true
4119   jmpb(DONE);
4120 
4121   bind(FALSE_LABEL);
4122   xorl(result, result); // return false
4123 
4124   // That's it
4125   bind(DONE);
4126   if (UseAVX >= 2) {
4127     // clean upper bits of YMM registers
4128     vpxor(vec1, vec1);
4129     vpxor(vec2, vec2);
4130   }
4131 }
4132 
4133 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4134                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4135   switch(ideal_opc) {
4136     case Op_LShiftVS:
4137       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4138     case Op_LShiftVI:
4139       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4140     case Op_LShiftVL:
4141       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4142     case Op_RShiftVS:
4143       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4144     case Op_RShiftVI:
4145       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4146     case Op_RShiftVL:
4147       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4148     case Op_URShiftVS:
4149       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4150     case Op_URShiftVI:
4151       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4152     case Op_URShiftVL:
4153       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4154     case Op_RotateRightV:
4155       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4156     case Op_RotateLeftV:
4157       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4158     default:
4159       fatal("Unsupported masked operation"); break;
4160   }
4161 }
4162 
4163 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4164                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4165                                     bool is_varshift) {
4166   switch (ideal_opc) {
4167     case Op_AddVB:
4168       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4169     case Op_AddVS:
4170       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4171     case Op_AddVI:
4172       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4173     case Op_AddVL:
4174       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4175     case Op_AddVF:
4176       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4177     case Op_AddVD:
4178       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4179     case Op_SubVB:
4180       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4181     case Op_SubVS:
4182       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4183     case Op_SubVI:
4184       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4185     case Op_SubVL:
4186       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4187     case Op_SubVF:
4188       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4189     case Op_SubVD:
4190       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4191     case Op_MulVS:
4192       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4193     case Op_MulVI:
4194       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4195     case Op_MulVL:
4196       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4197     case Op_MulVF:
4198       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4199     case Op_MulVD:
4200       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4201     case Op_DivVF:
4202       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4203     case Op_DivVD:
4204       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4205     case Op_SqrtVF:
4206       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4207     case Op_SqrtVD:
4208       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4209     case Op_AbsVB:
4210       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4211     case Op_AbsVS:
4212       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4213     case Op_AbsVI:
4214       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4215     case Op_AbsVL:
4216       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4217     case Op_FmaVF:
4218       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4219     case Op_FmaVD:
4220       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4221     case Op_VectorRearrange:
4222       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4223     case Op_LShiftVS:
4224       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4225     case Op_LShiftVI:
4226       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4227     case Op_LShiftVL:
4228       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4229     case Op_RShiftVS:
4230       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4231     case Op_RShiftVI:
4232       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4233     case Op_RShiftVL:
4234       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4235     case Op_URShiftVS:
4236       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4237     case Op_URShiftVI:
4238       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4239     case Op_URShiftVL:
4240       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4241     case Op_RotateLeftV:
4242       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4243     case Op_RotateRightV:
4244       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4245     case Op_MaxV:
4246       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4247     case Op_MinV:
4248       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4249     case Op_XorV:
4250       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4251     case Op_OrV:
4252       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4253     case Op_AndV:
4254       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4255     default:
4256       fatal("Unsupported masked operation"); break;
4257   }
4258 }
4259 
4260 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4261                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4262   switch (ideal_opc) {
4263     case Op_AddVB:
4264       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4265     case Op_AddVS:
4266       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4267     case Op_AddVI:
4268       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4269     case Op_AddVL:
4270       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4271     case Op_AddVF:
4272       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4273     case Op_AddVD:
4274       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4275     case Op_SubVB:
4276       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4277     case Op_SubVS:
4278       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4279     case Op_SubVI:
4280       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4281     case Op_SubVL:
4282       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4283     case Op_SubVF:
4284       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4285     case Op_SubVD:
4286       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4287     case Op_MulVS:
4288       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4289     case Op_MulVI:
4290       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4291     case Op_MulVL:
4292       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4293     case Op_MulVF:
4294       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4295     case Op_MulVD:
4296       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4297     case Op_DivVF:
4298       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4299     case Op_DivVD:
4300       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4301     case Op_FmaVF:
4302       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4303     case Op_FmaVD:
4304       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4305     case Op_MaxV:
4306       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4307     case Op_MinV:
4308       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4309     case Op_XorV:
4310       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4311     case Op_OrV:
4312       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4313     case Op_AndV:
4314       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4315     default:
4316       fatal("Unsupported masked operation"); break;
4317   }
4318 }
4319 
4320 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4321                                   KRegister src1, KRegister src2) {
4322   BasicType etype = T_ILLEGAL;
4323   switch(mask_len) {
4324     case 2:
4325     case 4:
4326     case 8:  etype = T_BYTE; break;
4327     case 16: etype = T_SHORT; break;
4328     case 32: etype = T_INT; break;
4329     case 64: etype = T_LONG; break;
4330     default: fatal("Unsupported type"); break;
4331   }
4332   assert(etype != T_ILLEGAL, "");
4333   switch(ideal_opc) {
4334     case Op_AndVMask:
4335       kand(etype, dst, src1, src2); break;
4336     case Op_OrVMask:
4337       kor(etype, dst, src1, src2); break;
4338     case Op_XorVMask:
4339       kxor(etype, dst, src1, src2); break;
4340     default:
4341       fatal("Unsupported masked operation"); break;
4342   }
4343 }
4344 
4345 /*
4346  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the cast operation.
4347  * If src is NaN, the result is 0.
4348  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4349  * the result is equal to the value of Integer.MIN_VALUE.
4350  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4351  * the result is equal to the value of Integer.MAX_VALUE.
4352  */
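// A minimal scalar sketch of the F2I semantics described above (illustrative only;
// the helper name f2i_reference is hypothetical and not part of this file):
//
//   static jint f2i_reference(jfloat src) {
//     if (src != src)              return 0;         // NaN -> 0
//     if (src <= (jfloat)min_jint) return min_jint;  // -Inf or too small
//     if (src >= (jfloat)max_jint) return max_jint;  // +Inf or too large
//     return (jint)src;                              // regular truncating cast
//   }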
4353 void C2_MacroAssembler::vector_cast_float_special_cases_avx(XMMRegister dst, XMMRegister src, AddressLiteral float_sign_flip, int vec_enc,
4354                                                             XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4355                                                             Register rscratch) {
4356   Label done;
4357   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4358   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4359   vptest(xtmp2, xtmp2, vec_enc);
4360   jccb(Assembler::equal, done);
4361 
4362   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4363   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4364 
4365   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4366   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4367   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4368 
4369   // Recompute the mask for the remaining special values.
4370   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4371   // Extract SRC values corresponding to TRUE mask lanes.
4372   vpand(xtmp4, xtmp2, src, vec_enc);
4373   // Flip the mask bits so that the MSB of the MASK lanes corresponding to positive
4374   // special values is set.
4375   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4376 
4377   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4378   bind(done);
4379 }
4380 
4381 void C2_MacroAssembler::vector_cast_float_special_cases_evex(XMMRegister dst, XMMRegister src, AddressLiteral float_sign_flip, int vec_enc,
4382                                                              XMMRegister xtmp1, XMMRegister xtmp2,
4383                                                              KRegister ktmp1, KRegister ktmp2,
4384                                                              Register rscratch) {
4385   Label done;
4386   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4387   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4388   kortestwl(ktmp1, ktmp1);
4389   jccb(Assembler::equal, done);
4390 
4391   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4392   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4393   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4394 
4395   kxorwl(ktmp1, ktmp1, ktmp2);
4396   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4397   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4398   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4399   bind(done);
4400 }
4401 
4402 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src,
4403                                                                      AddressLiteral double_sign_flip, int vec_enc,
4404                                                                      XMMRegister xtmp1, XMMRegister xtmp2,
4405                                                                      KRegister ktmp1, KRegister ktmp2,
4406                                                                      Register rscratch) {
4407   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4408 
4409   Label done;
4410   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4411   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
4412   kortestwl(ktmp1, ktmp1);
4413   jccb(Assembler::equal, done);
4414 
4415   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4416   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4417   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4418 
4419   kxorwl(ktmp1, ktmp1, ktmp2);
4420   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4421   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4422   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4423   bind(done);
4424 }
4425 
4426 /*
4427  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the cast operation.
4428  * If src is NaN, the result is 0.
4429  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
4430  * the result is equal to the value of Long.MIN_VALUE.
4431  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
4432  * the result is equal to the value of Long.MAX_VALUE.
4433  */
4434 void C2_MacroAssembler::vector_cast_double_special_cases_evex(XMMRegister dst, XMMRegister src,
4435                                                               AddressLiteral double_sign_flip, int vec_enc,
4436                                                               XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4437                                                               Register rscratch) {
4438   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4439 
4440   Label done;
4441   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4442   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
4443   kortestwl(ktmp1, ktmp1);
4444   jccb(Assembler::equal, done);
4445 
4446   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4447   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4448   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4449 
4450   kxorwl(ktmp1, ktmp1, ktmp2);
4451   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4452   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4453   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4454   bind(done);
4455 }
4456 
4457 /*
4458  * Algorithm for vector D2L and F2I conversions:
4459  * a) Perform the vector D2L/F2I cast.
4460  * b) Take the fast path if no lane of the result vector contains the value 0x80000000;
4461  *    such a lane signifies that the source value was one of the special floating point
4462  *    values (NaN, -Inf, Inf, Max, -Min).
4463  * c) Set the destination lane to zero if the corresponding source lane is NaN.
4464  * d) Replace 0x80000000 with MaxInt if the corresponding source lane contains a positive value.
4465  */
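// Scalar outline of steps a)-d) for F2I (illustrative sketch only; cvt_f2i_trunc models
// the per-lane behaviour of the truncating hardware cast, which produces min_jint,
// i.e. the 0x80000000 bit pattern, for NaN and out-of-range inputs; both helper
// names are hypothetical):
//
//   jint cast_lane_f2i(jfloat src) {
//     jint dst = cvt_f2i_trunc(src);          // step a) raw vector cast
//     if (dst != min_jint) return dst;        // step b) fast path, no special value
//     if (src != src)      return 0;          // step c) NaN -> 0
//     if (src > 0.0f)      return max_jint;   // step d) positive special -> MaxInt
//     return dst;                             // -Inf / too small keeps MIN_VALUE
//   }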
4466 
4467 void C2_MacroAssembler::vector_castD2L_evex(XMMRegister dst, XMMRegister src, AddressLiteral double_sign_flip, int vec_enc,
4468                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, Register rscratch) {
4469   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4470 
4471   evcvttpd2qq(dst, src, vec_enc);
4472   vector_cast_double_special_cases_evex(dst, src, double_sign_flip, vec_enc,
4473                                         xtmp1, xtmp2, ktmp1, ktmp2, rscratch);
4474 }
4475 
4476 void C2_MacroAssembler::vector_castF2I_avx(XMMRegister dst, XMMRegister src, AddressLiteral float_sign_flip, int vec_enc,
4477                                            XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, Register rscratch) {
4478   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4479 
4480   vcvttps2dq(dst, src, vec_enc);
4481   vector_cast_float_special_cases_avx(dst, src, float_sign_flip, vec_enc,
4482                                       xtmp1, xtmp2, xtmp3, xtmp4, rscratch);
4483 }
4484 
4485 void C2_MacroAssembler::vector_castF2I_evex(XMMRegister dst, XMMRegister src, AddressLiteral float_sign_flip, int vec_enc,
4486                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, Register rscratch) {
4487   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4488 
4489   vcvttps2dq(dst, src, vec_enc);
4490   vector_cast_float_special_cases_evex(dst, src, float_sign_flip, vec_enc,
4491                                        xtmp1, xtmp2, ktmp1, ktmp2, rscratch);
4492 }
4493 
4494 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, AddressLiteral float_sign_flip, int vec_enc,
4495                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, Register rscratch) {
4496   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4497 
4498   evcvttps2qq(dst, src, vec_enc);
4499   vector_cast_float_to_long_special_cases_evex(dst, src, float_sign_flip, vec_enc,
4500                                                xtmp1, xtmp2, ktmp1, ktmp2, rscratch);
4501 }
4502 
4503 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, AddressLiteral double_sign_flip, int vec_enc,
4504                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, Register rscratch) {
4505   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4506 
4507   vector_castD2L_evex(dst, src, double_sign_flip, vec_enc,
4508                       xtmp1, xtmp2, ktmp1, ktmp2, rscratch);
4509   if (to_elem_bt != T_LONG) {
4510     switch(to_elem_bt) {
4511       case T_INT:
4512         evpmovsqd(dst, dst, vec_enc);
4513         break;
4514       case T_SHORT:
4515         evpmovsqd(dst, dst, vec_enc);
4516         evpmovdw(dst, dst, vec_enc);
4517         break;
4518       case T_BYTE:
4519         evpmovsqd(dst, dst, vec_enc);
4520         evpmovdb(dst, dst, vec_enc);
4521         break;
4522       default: assert(false, "%s", type2name(to_elem_bt));
4523     }
4524   }
4525 }
4526 
4527 #ifdef _LP64
4528 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
4529                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4530                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
4531   // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round towards -inf,
4532   // and restore the original MXCSR.RC mode afterwards.
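  // For example (illustrative): val = 2.5 gives floor(3.0) = 3 and val = -2.5 gives
  // floor(-2.0) = -2, matching Math.round's round-half-up semantics.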
4533   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4534 
4535   mov64(tmp, julong_cast(0.5L));
4536   evpbroadcastq(xtmp1, tmp, vec_enc);
4537   vaddpd(xtmp1, src , xtmp1, vec_enc);
4538   evcvtpd2qq(dst, xtmp1, vec_enc);
4539   vector_cast_double_special_cases_evex(dst, src, double_sign_flip, vec_enc,
4540                                         xtmp1, xtmp2, ktmp1, ktmp2, tmp);
4541 
4542   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4543 }
4544 
4545 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
4546                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4547                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
4548   // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round towards -inf,
4549   // and restore the original MXCSR.RC mode afterwards.
4550   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4551 
4552   movl(tmp, jint_cast(0.5));
4553   movq(xtmp1, tmp);
4554   vbroadcastss(xtmp1, xtmp1, vec_enc);
4555   vaddps(xtmp1, src , xtmp1, vec_enc);
4556   vcvtps2dq(dst, xtmp1, vec_enc);
4557   vector_cast_float_special_cases_evex(dst, src, float_sign_flip, vec_enc,
4558                                        xtmp1, xtmp2, ktmp1, ktmp2, tmp);
4559 
4560   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4561 }
4562 
4563 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
4564                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4565                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
4566   // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round towards -inf,
4567   // and restore the original MXCSR.RC mode afterwards.
4568   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4569 
4570   movl(tmp, jint_cast(0.5));
4571   movq(xtmp1, tmp);
4572   vbroadcastss(xtmp1, xtmp1, vec_enc);
4573   vaddps(xtmp1, src , xtmp1, vec_enc);
4574   vcvtps2dq(dst, xtmp1, vec_enc);
4575   vector_cast_float_special_cases_avx(dst, src, float_sign_flip, vec_enc,
4576                                       xtmp1, xtmp2, xtmp3, xtmp4, tmp);
4577 
4578   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4579 }
4580 #endif // _LP64
4581 
4582 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
4583                                              BasicType from_elem_bt, BasicType to_elem_bt) {
4584   switch (from_elem_bt) {
4585     case T_BYTE:
4586       switch (to_elem_bt) {
4587         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
4588         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
4589         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
4590         default: ShouldNotReachHere();
4591       }
4592       break;
4593     case T_SHORT:
4594       switch (to_elem_bt) {
4595         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
4596         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
4597         default: ShouldNotReachHere();
4598       }
4599       break;
4600     case T_INT:
4601       assert(to_elem_bt == T_LONG, "");
4602       vpmovzxdq(dst, src, vlen_enc);
4603       break;
4604     default:
4605       ShouldNotReachHere();
4606   }
4607 }
4608 
4609 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
4610                                    bool merge, BasicType bt, int vlen_enc) {
4611   if (bt == T_INT) {
4612     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
4613   } else {
4614     assert(bt == T_LONG, "");
4615     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
4616   }
4617 }
4618 
4619 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
4620                                    bool merge, BasicType bt, int vlen_enc) {
4621   if (bt == T_INT) {
4622     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
4623   } else {
4624     assert(bt == T_LONG, "");
4625     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
4626   }
4627 }
4628 
4629 #ifdef _LP64
4630 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
4631                                                Register rtmp2, XMMRegister xtmp, int mask_len,
4632                                                int vec_enc) {
4633   int index = 0;
4634   int vindex = 0;
4635   mov64(rtmp1, 0x0101010101010101L);
4636   pdepq(rtmp1, src, rtmp1);
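  // pdep deposits successive low bits of src into bit 0 of each byte of rtmp1,
  // e.g. (illustrative) src = 0b10110001 gives rtmp1 = 0x0100010100000001.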
4637   if (mask_len > 8) {
4638     movq(rtmp2, src);
4639     vpxor(xtmp, xtmp, xtmp, vec_enc);
4640     movq(xtmp, rtmp1);
4641   }
4642   movq(dst, rtmp1);
4643 
4644   mask_len -= 8;
4645   while (mask_len > 0) {
4646     assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
4647     index++;
4648     if ((index % 2) == 0) {
4649       pxor(xtmp, xtmp);
4650     }
4651     mov64(rtmp1, 0x0101010101010101L);
4652     shrq(rtmp2, 8);
4653     pdepq(rtmp1, rtmp2, rtmp1);
4654     pinsrq(xtmp, rtmp1, index % 2);
4655     vindex = index / 2;
4656     if (vindex) {
4657       // Write the entire 16 byte vector only when both 64 bit
4658       // lanes have been updated, to save redundant instructions.
4659       if (index % 2) {
4660         vinsertf128(dst, dst, xtmp, vindex);
4661       }
4662     } else {
4663       vmovdqu(dst, xtmp);
4664     }
4665     mask_len -= 8;
4666   }
4667 }
4668 
4669 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
4670   switch(opc) {
4671     case Op_VectorMaskTrueCount:
4672       popcntq(dst, tmp);
4673       break;
4674     case Op_VectorMaskLastTrue:
4675       if (VM_Version::supports_lzcnt()) {
4676         lzcntq(tmp, tmp);
4677         movl(dst, 63);
4678         subl(dst, tmp);
4679       } else {
4680         movl(dst, -1);
4681         bsrq(tmp, tmp);
4682         cmov32(Assembler::notZero, dst, tmp);
4683       }
4684       break;
4685     case Op_VectorMaskFirstTrue:
4686       if (VM_Version::supports_bmi1()) {
4687         if (masklen < 32) {
4688           orl(tmp, 1 << masklen);
4689           tzcntl(dst, tmp);
4690         } else if (masklen == 32) {
4691           tzcntl(dst, tmp);
4692         } else {
4693           assert(masklen == 64, "");
4694           tzcntq(dst, tmp);
4695         }
4696       } else {
4697         if (masklen < 32) {
4698           orl(tmp, 1 << masklen);
4699           bsfl(dst, tmp);
4700         } else {
4701           assert(masklen == 32 || masklen == 64, "");
4702           movl(dst, masklen);
4703           if (masklen == 32)  {
4704             bsfl(tmp, tmp);
4705           } else {
4706             bsfq(tmp, tmp);
4707           }
4708           cmov32(Assembler::notZero, dst, tmp);
4709         }
4710       }
4711       break;
4712     case Op_VectorMaskToLong:
4713       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
4714       break;
4715     default: assert(false, "Unhandled mask operation");
4716   }
4717 }
4718 
4719 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
4720                                               int masklen, int masksize, int vec_enc) {
4721   assert(VM_Version::supports_popcnt(), "");
4722 
4723   if(VM_Version::supports_avx512bw()) {
4724     kmovql(tmp, mask);
4725   } else {
4726     assert(masklen <= 16, "");
4727     kmovwl(tmp, mask);
4728   }
4729 
4730   // Masks generated by partial vector comparison/replicate/mask manipulation
4731   // operations need to be clipped.
4732   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
4733     andq(tmp, (1 << masklen) - 1);
4734   }
4735 
4736   vector_mask_operation_helper(opc, dst, tmp, masklen);
4737 }
4738 
4739 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
4740                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
4741   assert(vec_enc == AVX_128bit && VM_Version::supports_avx() ||
4742          vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4), "");
4743   assert(VM_Version::supports_popcnt(), "");
4744 
4745   bool need_clip = false;
4746   switch(bt) {
4747     case T_BOOLEAN:
4748       // Masks of other types contain lane values of 0 or -1; boolean masks contain lane values of 0 or 1.
4749       vpxor(xtmp, xtmp, xtmp, vec_enc);
4750       vpsubb(xtmp, xtmp, mask, vec_enc);
4751       vpmovmskb(tmp, xtmp, vec_enc);
4752       need_clip = masklen < 16;
4753       break;
4754     case T_BYTE:
4755       vpmovmskb(tmp, mask, vec_enc);
4756       need_clip = masklen < 16;
4757       break;
4758     case T_SHORT:
4759       vpacksswb(xtmp, mask, mask, vec_enc);
4760       if (masklen >= 16) {
4761         vpermpd(xtmp, xtmp, 8, vec_enc);
4762       }
4763       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
4764       need_clip = masklen < 16;
4765       break;
4766     case T_INT:
4767     case T_FLOAT:
4768       vmovmskps(tmp, mask, vec_enc);
4769       need_clip = masklen < 4;
4770       break;
4771     case T_LONG:
4772     case T_DOUBLE:
4773       vmovmskpd(tmp, mask, vec_enc);
4774       need_clip = masklen < 2;
4775       break;
4776     default: assert(false, "Unhandled type, %s", type2name(bt));
4777   }
4778 
4779   // Masks generated by partial vector comparison/replicate/mask manipulation
4780   // operations need to be clipped.
4781   if (need_clip && opc != Op_VectorMaskFirstTrue) {
4782     // need_clip implies masklen < 32
4783     andq(tmp, (1 << masklen) - 1);
4784   }
4785 
4786   vector_mask_operation_helper(opc, dst, tmp, masklen);
4787 }
4788 
4789 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
4790                                              Register rtmp2, int mask_len) {
4791   kmov(rtmp1, src);
4792   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
4793   mov64(rtmp2, -1L);
4794   pextq(rtmp2, rtmp2, rtmp1);
4795   kmov(dst, rtmp2);
4796 }
4797 
4798 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
4799                                                bool merge, BasicType bt, int vec_enc) {
4800   if (opcode == Op_CompressV) {
4801     switch(bt) {
4802     case T_BYTE:
4803       evpcompressb(dst, mask, src, merge, vec_enc);
4804       break;
4805     case T_CHAR:
4806     case T_SHORT:
4807       evpcompressw(dst, mask, src, merge, vec_enc);
4808       break;
4809     case T_INT:
4810       evpcompressd(dst, mask, src, merge, vec_enc);
4811       break;
4812     case T_FLOAT:
4813       evcompressps(dst, mask, src, merge, vec_enc);
4814       break;
4815     case T_LONG:
4816       evpcompressq(dst, mask, src, merge, vec_enc);
4817       break;
4818     case T_DOUBLE:
4819       evcompresspd(dst, mask, src, merge, vec_enc);
4820       break;
4821     default:
4822       fatal("Unsupported type %s", type2name(bt));
4823       break;
4824     }
4825   } else {
4826     assert(opcode == Op_ExpandV, "");
4827     switch(bt) {
4828     case T_BYTE:
4829       evpexpandb(dst, mask, src, merge, vec_enc);
4830       break;
4831     case T_CHAR:
4832     case T_SHORT:
4833       evpexpandw(dst, mask, src, merge, vec_enc);
4834       break;
4835     case T_INT:
4836       evpexpandd(dst, mask, src, merge, vec_enc);
4837       break;
4838     case T_FLOAT:
4839       evexpandps(dst, mask, src, merge, vec_enc);
4840       break;
4841     case T_LONG:
4842       evpexpandq(dst, mask, src, merge, vec_enc);
4843       break;
4844     case T_DOUBLE:
4845       evexpandpd(dst, mask, src, merge, vec_enc);
4846       break;
4847     default:
4848       fatal("Unsupported type %s", type2name(bt));
4849       break;
4850     }
4851   }
4852 }
4853 #endif
4854 
4855 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
4856                                            KRegister ktmp1, int vec_enc) {
4857   if (opcode == Op_SignumVD) {
4858     vsubpd(dst, zero, one, vec_enc);
4859     // if src < 0 ? -1 : 1
4860     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
4861     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
4862     // if src == NaN, -0.0 or 0.0 return src.
4863     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
4864     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
4865   } else {
4866     assert(opcode == Op_SignumVF, "");
4867     vsubps(dst, zero, one, vec_enc);
4868     // if src < 0 ? -1 : 1
4869     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
4870     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
4871     // if src == NaN, -0.0 or 0.0 return src.
4872     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
4873     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
4874   }
4875 }
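
// A scalar reference of the Signum semantics implemented by the two routines here
// (illustrative sketch only; the helper name signum_ref is hypothetical):
//
//   static inline double signum_ref(double src) {
//     if (src != src || src == 0.0) return src;   // NaN, -0.0 and 0.0 are returned unchanged
//     return (src < 0.0) ? -1.0 : 1.0;
//   }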
4876 
4877 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
4878                                           XMMRegister xtmp1, int vec_enc) {
4879   if (opcode == Op_SignumVD) {
4880     vsubpd(dst, zero, one, vec_enc);
4881     // if src < 0 ? -1 : 1
4882     vblendvpd(dst, one, dst, src, vec_enc);
4883     // if src == NaN, -0.0 or 0.0 return src.
4884     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
4885     vblendvpd(dst, dst, src, xtmp1, vec_enc);
4886   } else {
4887     assert(opcode == Op_SignumVF, "");
4888     vsubps(dst, zero, one, vec_enc);
4889     // if src < 0 ? -1 : 1
4890     vblendvps(dst, one, dst, src, vec_enc);
4891     // if src == NaN, -0.0 or 0.0 return src.
4892     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
4893     vblendvps(dst, dst, src, xtmp1, vec_enc);
4894   }
4895 }
4896 
4897 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
4898   if (VM_Version::supports_avx512bw()) {
4899     if (mask_len > 32) {
4900       kmovql(dst, src);
4901     } else {
4902       kmovdl(dst, src);
4903       if (mask_len != 32) {
4904         kshiftrdl(dst, dst, 32 - mask_len);
4905       }
4906     }
4907   } else {
4908     assert(mask_len <= 16, "");
4909     kmovwl(dst, src);
4910     if (mask_len != 16) {
4911       kshiftrwl(dst, dst, 16 - mask_len);
4912     }
4913   }
4914 }
4915 
4916 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
4917   int lane_size = type2aelembytes(bt);
4918   bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
4919   if ((is_LP64 || lane_size < 8) &&
4920       ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
4921        (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
4922     movptr(rtmp, imm32);
4923     switch(lane_size) {
4924       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
4925       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
4926       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
4927       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
4928       default: fatal("Unsupported lane size %d", lane_size);
4929                break;
4930     }
4931   } else {
4932     movptr(rtmp, imm32);
4933     LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
4934     switch(lane_size) {
4935       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
4936       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
4937       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
4938       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
4939       default: fatal("Unsupported lane size %d", lane_size);
4940                break;
4941     }
4942   }
4943 }
4944 
4945 //
4946 // The following is the lookup table based popcount computation algorithm:
4947 //       Index   Bit set count
4948 //     [ 0000 ->   0,
4949 //       0001 ->   1,
4950 //       0010 ->   1,
4951 //       0011 ->   2,
4952 //       0100 ->   1,
4953 //       0101 ->   2,
4954 //       0110 ->   2,
4955 //       0111 ->   3,
4956 //       1000 ->   1,
4957 //       1001 ->   2,
4958 //       1010 ->   2,
4959 //       1011 ->   3,
4960 //       1100 ->   2,
4961 //       1101 ->   3,
4962 //       1110 ->   3,   1111 ->   4 ]
4963 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
4964 //     shuffle indices for lookup table access.
4965 //  b. Right shift each byte of vector lane by 4 positions.
4966 //  c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
4967 //     shuffle indices for lookup table access.
4968 //  d. Add the bitset count of upper and lower 4 bits of each byte.
4969 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
4970 //     count of all the bytes of a quadword.
4971 //  f. Perform step e. for upper 128bit vector lane.
4972 //  g. Pack the bitset count of quadwords back to double word.
4973 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
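//
//  A scalar sketch of steps a)-d) for a single byte, using a 16-entry nibble popcount
//  table like the one behind StubRoutines::x86::vector_popcount_lut() (the helper name
//  below is a hypothetical illustration, not part of this file):
//
//    static inline uint8_t popcount_byte_lut(uint8_t b) {
//      static const uint8_t lut[16] = {0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4};
//      return lut[b & 0x0F] + lut[b >> 4];   // steps a)-c) look up both nibbles; step d) adds the counts
//    }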
4974 
4975 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4976                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
4977   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
4978   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
4979   vpsrlw(dst, src, 4, vec_enc);
4980   vpand(dst, dst, xtmp1, vec_enc);
4981   vpand(xtmp1, src, xtmp1, vec_enc);
4982   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
4983   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
4984   vpshufb(dst, xtmp2, dst, vec_enc);
4985   vpaddb(dst, dst, xtmp1, vec_enc);
4986 }
4987 
4988 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4989                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
4990   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
4991   // Following code is as per steps e,f,g and h of above algorithm.
4992   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4993   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
4994   vpsadbw(dst, dst, xtmp2, vec_enc);
4995   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
4996   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
4997   vpackuswb(dst, xtmp1, dst, vec_enc);
4998 }
4999 
5000 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5001                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5002   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5003   // Add the popcount of upper and lower bytes of word.
5004   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5005   vpsrlw(dst, xtmp1, 8, vec_enc);
5006   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5007   vpaddw(dst, dst, xtmp1, vec_enc);
5008 }
5009 
5010 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5011                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5012   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5013   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5014   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5015 }
5016 
5017 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5018                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5019   switch(bt) {
5020     case T_LONG:
5021       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5022       break;
5023     case T_INT:
5024       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5025       break;
5026     case T_CHAR:
5027     case T_SHORT:
5028       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5029       break;
5030     case T_BYTE:
5031     case T_BOOLEAN:
5032       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5033       break;
5034     default:
5035       fatal("Unsupported type %s", type2name(bt));
5036       break;
5037   }
5038 }
5039 
5040 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5041                                                       KRegister mask, bool merge, int vec_enc) {
5042   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5043   switch(bt) {
5044     case T_LONG:
5045       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5046       evpopcntq(dst, mask, src, merge, vec_enc);
5047       break;
5048     case T_INT:
5049       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5050       evpopcntd(dst, mask, src, merge, vec_enc);
5051       break;
5052     case T_CHAR:
5053     case T_SHORT:
5054       assert(VM_Version::supports_avx512_bitalg(), "");
5055       evpopcntw(dst, mask, src, merge, vec_enc);
5056       break;
5057     case T_BYTE:
5058     case T_BOOLEAN:
5059       assert(VM_Version::supports_avx512_bitalg(), "");
5060       evpopcntb(dst, mask, src, merge, vec_enc);
5061       break;
5062     default:
5063       fatal("Unsupported type %s", type2name(bt));
5064       break;
5065   }
5066 }
5067 
5068 #ifndef _LP64
5069 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
5070   assert(VM_Version::supports_avx512bw(), "");
5071   kmovdl(tmp, src);
5072   kunpckdql(dst, tmp, tmp);
5073 }
5074 #endif
5075 
5076 // The bit reversal algorithm first reverses the bits of each byte, followed by
5077 // a byte level reversal for multi-byte primitive types (short/int/long).
5078 // The algorithm performs a lookup table access to get the reversed bit sequence
5079 // corresponding to a 4 bit value; the reversed bit sequence of a byte is then
5080 // obtained by swapping the reversed bit sequences of its upper and lower
5081 // nibbles.
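//
// A scalar model of the nibble-lookup byte reversal described above (illustrative only;
// the helper name reverse_byte_lut is hypothetical):
//
//   static inline uint8_t reverse_byte_lut(uint8_t b) {
//     static const uint8_t rev4[16] = {0x0,0x8,0x4,0xC, 0x2,0xA,0x6,0xE,
//                                      0x1,0x9,0x5,0xD, 0x3,0xB,0x7,0xF};
//     return (uint8_t)((rev4[b & 0x0F] << 4) | rev4[b >> 4]);   // swap the reversed nibbles
//   }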
5082 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5083                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
5084   if (VM_Version::supports_avx512vlbw()) {
5085 
5086     // Get the reverse bit sequence of lower nibble of each byte.
5087     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5088     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5089     vpandq(dst, xtmp2, src, vec_enc);
5090     vpshufb(dst, xtmp1, dst, vec_enc);
5091     vpsllq(dst, dst, 4, vec_enc);
5092 
5093     // Get the reverse bit sequence of upper nibble of each byte.
5094     vpandn(xtmp2, xtmp2, src, vec_enc);
5095     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5096     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5097 
5098     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5099     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5100     vporq(xtmp2, dst, xtmp2, vec_enc);
5101     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5102 
5103   } else if(vec_enc == Assembler::AVX_512bit) {
5104     // Shift based bit reversal.
5105     assert(bt == T_LONG || bt == T_INT, "");
5106 
5107     // Swap lower and upper nibble of each byte.
5108     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
5109 
5110     // Swap two least and most significant bits of each nibble.
5111     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
5112 
5113     // Swap adjacent pair of bits.
5114     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5115     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
5116 
5117     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5118     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
5119   } else {
5120     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
5121     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5122 
5123     // Get the reverse bit sequence of lower nibble of each byte.
5124     vpand(dst, xtmp2, src, vec_enc);
5125     vpshufb(dst, xtmp1, dst, vec_enc);
5126     vpsllq(dst, dst, 4, vec_enc);
5127 
5128     // Get the reverse bit sequence of upper nibble of each byte.
5129     vpandn(xtmp2, xtmp2, src, vec_enc);
5130     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5131     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5132 
5133     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5134     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5135     vpor(xtmp2, dst, xtmp2, vec_enc);
5136     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5137   }
5138 }
5139 
5140 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
5141                                                 XMMRegister xtmp, Register rscratch) {
5142   assert(VM_Version::supports_gfni(), "");
5143   assert(rscratch != noreg || always_reachable(mask), "missing");
5144 
5145   // Galois field instruction based bit reversal based on following algorithm.
5146   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5147   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
5148   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
5149   vector_reverse_byte(bt, dst, xtmp, vec_enc);
5150 }
5151 
5152 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
5153                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
5154   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
5155   vpandq(dst, xtmp1, src, vec_enc);
5156   vpsllq(dst, dst, nbits, vec_enc);
5157   vpandn(xtmp1, xtmp1, src, vec_enc);
5158   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
5159   vporq(dst, dst, xtmp1, vec_enc);
5160 }
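
// Scalar model of the swap performed by vector_swap_nbits (illustrative only; the
// helper name is hypothetical):
//
//   static inline uint32_t swap_nbits32(int nbits, uint32_t bitmask, uint32_t x) {
//     return ((x & bitmask) << nbits) | ((x & ~bitmask) >> nbits);
//   }
//
// e.g. swap_nbits32(4, 0x0F0F0F0F, x) swaps the two nibbles of every byte of x.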
5161 
5162 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5163                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5164   // Shift based bit reversal.
5165   assert(VM_Version::supports_evex(), "");
5166   switch(bt) {
5167     case T_LONG:
5168       // Swap upper and lower double word of each quad word.
5169       evprorq(xtmp1, k0, src, 32, true, vec_enc);
5170       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
5171       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5172       break;
5173     case T_INT:
5174       // Swap upper and lower word of each double word.
5175       evprord(xtmp1, k0, src, 16, true, vec_enc);
5176       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5177       break;
5178     case T_CHAR:
5179     case T_SHORT:
5180       // Swap upper and lower byte of each word.
5181       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
5182       break;
5183     case T_BYTE:
5184       evmovdquq(dst, k0, src, true, vec_enc);
5185       break;
5186     default:
5187       fatal("Unsupported type %s", type2name(bt));
5188       break;
5189   }
5190 }
5191 
5192 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5193   if (bt == T_BYTE) {
5194     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
5195       evmovdquq(dst, k0, src, true, vec_enc);
5196     } else {
5197       vmovdqu(dst, src);
5198     }
5199     return;
5200   }
5201   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
5202   // pre-computed shuffle indices.
5203   switch(bt) {
5204     case T_LONG:
5205       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
5206       break;
5207     case T_INT:
5208       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
5209       break;
5210     case T_CHAR:
5211     case T_SHORT:
5212       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
5213       break;
5214     default:
5215       fatal("Unsupported type %s", type2name(bt));
5216       break;
5217   }
5218   vpshufb(dst, src, dst, vec_enc);
5219 }
5220 
5221 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5222                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5223                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
5224   assert(is_integral_type(bt), "");
5225   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5226   assert(VM_Version::supports_avx512cd(), "");
5227   switch(bt) {
5228     case T_LONG:
5229       evplzcntq(dst, ktmp, src, merge, vec_enc);
5230       break;
5231     case T_INT:
5232       evplzcntd(dst, ktmp, src, merge, vec_enc);
5233       break;
5234     case T_SHORT:
5235       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
5236       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
5237       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
5238       vpunpckhwd(dst, xtmp1, src, vec_enc);
5239       evplzcntd(dst, ktmp, dst, merge, vec_enc);
5240       vpackusdw(dst, xtmp2, dst, vec_enc);
5241       break;
5242     case T_BYTE:
5243       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
5244       // accessing the lookup table.
5245       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
5246       // accessing the lookup table.
5247       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
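      // A scalar sketch of this per-byte computation (illustrative only; the helper
      // name clz_byte_lut is hypothetical):
      //   static inline uint8_t clz_byte_lut(uint8_t b) {
      //     static const uint8_t lut[16] = {4,3,2,2, 1,1,1,1, 0,0,0,0, 0,0,0,0}; // clz of a nibble
      //     uint8_t t2 = lut[b >> 4];                                   // T2: clz of the 4 MSB bits
      //     return (b >> 4) == 0 ? (uint8_t)(t2 + lut[b & 0x0F]) : t2;  // add T1 only when the MSB nibble is zero
      //   }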
5248       assert(VM_Version::supports_avx512bw(), "");
5249       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
5250       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
5251       vpand(xtmp2, dst, src, vec_enc);
5252       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5253       vpsrlw(xtmp3, src, 4, vec_enc);
5254       vpand(xtmp3, dst, xtmp3, vec_enc);
5255       vpshufb(dst, xtmp1, xtmp3, vec_enc);
5256       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
5257       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
5258       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
5259       break;
5260     default:
5261       fatal("Unsupported type %s", type2name(bt));
5262       break;
5263   }
5264 }
5265 
5266 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5267                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5268   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
5269   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5270   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
5271   // accessing the lookup table.
5272   vpand(dst, xtmp2, src, vec_enc);
5273   vpshufb(dst, xtmp1, dst, vec_enc);
5274   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
5275   // accessing the lookup table.
5276   vpsrlw(xtmp3, src, 4, vec_enc);
5277   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
5278   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
5279   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
5280   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
5281   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
5282   vpaddb(dst, dst, xtmp2, vec_enc);
5283   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
5284 }
5285 
5286 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5287                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5288   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5289   // Add zero counts of lower byte and upper byte of a word if
5290   // upper byte holds a zero value.
5291   vpsrlw(xtmp3, src, 8, vec_enc);
5292   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
5293   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
5294   vpsllw(xtmp2, dst, 8, vec_enc);
5295   vpaddw(xtmp2, xtmp2, dst, vec_enc);
5296   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5297   vpsrlw(dst, dst, 8, vec_enc);
5298 }
5299 
5300 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5301                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
5302   // Since the IEEE 754 floating point format represents the mantissa in 1.x form,
5303   // the biased exponent can be used to compute the leading zero count as per the
5304   // following formula:
5305   // LZCNT = 31 - (biased_exp - 127)
5306   // Special handling is needed for zero, Max_Int and negative source values.
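  // Worked example (illustrative): src = 8 converts to 8.0f with biased_exp = 130,
  // so LZCNT = 31 - (130 - 127) = 28, which matches lzcnt(8) for a 32 bit lane.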
5307 
5308   // Broadcast 0xFF
5309   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
5310   vpsrld(xtmp1, xtmp1, 24, vec_enc);
5311 
5312   // Extract biased exponent.
5313   vcvtdq2ps(dst, src, vec_enc);
5314   vpsrld(dst, dst, 23, vec_enc);
5315   vpand(dst, dst, xtmp1, vec_enc);
5316 
5317   // Broadcast 127.
5318   vpsrld(xtmp1, xtmp1, 1, vec_enc);
5319   // Exponent = biased_exp - 127
5320   vpsubd(dst, dst, xtmp1, vec_enc);
5321 
5322   // Exponent = Exponent  + 1
5323   vpsrld(xtmp3, xtmp1, 6, vec_enc);
5324   vpaddd(dst, dst, xtmp3, vec_enc);
5325 
5326   // Replace -ve exponent with zero, exponent is -ve when src
5327   // lane contains a zero value.
5328   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5329   vblendvps(dst, dst, xtmp2, dst, vec_enc);
5330 
5331   // Rematerialize broadcast 32.
5332   vpslld(xtmp1, xtmp3, 5, vec_enc);
5333   // Exponent is 32 if corresponding source lane contains max_int value.
5334   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
5335   // LZCNT = 32 - exponent
5336   vpsubd(dst, xtmp1, dst, vec_enc);
5337 
5338   // Replace LZCNT with a value 1 if corresponding source lane
5339   // contains max_int value.
5340   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
5341 
5342   // Replace biased_exp with 0 if source lane value is less than zero.
5343   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5344   vblendvps(dst, dst, xtmp2, src, vec_enc);
5345 }
5346 
5347 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5348                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5349   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5350   // Add zero counts of lower word and upper word of a double word if
5351   // upper word holds a zero value.
5352   vpsrld(xtmp3, src, 16, vec_enc);
5353   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
5354   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
5355   vpslld(xtmp2, dst, 16, vec_enc);
5356   vpaddd(xtmp2, xtmp2, dst, vec_enc);
5357   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5358   vpsrld(dst, dst, 16, vec_enc);
5359   // Add zero counts of lower doubleword and upper doubleword of a
5360   // quadword if upper doubleword holds a zero value.
5361   vpsrlq(xtmp3, src, 32, vec_enc);
5362   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
5363   vpsllq(xtmp2, dst, 32, vec_enc);
5364   vpaddq(xtmp2, xtmp2, dst, vec_enc);
5365   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5366   vpsrlq(dst, dst, 32, vec_enc);
5367 }
5368 
5369 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
5370                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5371                                                        Register rtmp, int vec_enc) {
5372   assert(is_integral_type(bt), "unexpected type");
5373   assert(vec_enc < Assembler::AVX_512bit, "");
5374   switch(bt) {
5375     case T_LONG:
5376       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5377       break;
5378     case T_INT:
5379       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
5380       break;
5381     case T_SHORT:
5382       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5383       break;
5384     case T_BYTE:
5385       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5386       break;
5387     default:
5388       fatal("Unsupported type %s", type2name(bt));
5389       break;
5390   }
5391 }
5392 
5393 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
5394   switch(bt) {
5395     case T_BYTE:
5396       vpsubb(dst, src1, src2, vec_enc);
5397       break;
5398     case T_SHORT:
5399       vpsubw(dst, src1, src2, vec_enc);
5400       break;
5401     case T_INT:
5402       vpsubd(dst, src1, src2, vec_enc);
5403       break;
5404     case T_LONG:
5405       vpsubq(dst, src1, src2, vec_enc);
5406       break;
5407     default:
5408       fatal("Unsupported type %s", type2name(bt));
5409       break;
5410   }
5411 }
5412 
// Trailing zero count computation is based on the leading zero count operation as per
// the following equation. All AVX3 targets support the AVX512CD feature, which offers
// a direct vector instruction to compute the leading zero count.
//      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
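// A minimal scalar sketch of the same identity (illustrative only, not part of the
// generated code), assuming lzcnt semantics where clz32(0) == 32:
//
//   static uint32_t clz32(uint32_t v) {          // reference lzcnt: clz32(0) == 32
//     uint32_t n = 0;
//     for (uint32_t bit = 1u << 31; bit != 0 && (v & bit) == 0; bit >>= 1) n++;
//     return n;
//   }
//   static uint32_t ctz32_via_lzcnt(uint32_t x) {
//     // (x - 1) & ~x sets exactly the bits below the lowest set bit of x
//     // (all 32 bits when x == 0), so 32 minus its leading zero count is the
//     // trailing zero count of x.
//     return 32 - clz32((x - 1) & ~x);
//   }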
5417 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5418                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5419                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
5420   assert(is_integral_type(bt), "");
  // xtmp4 = -1
  vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
  // xtmp4 = xtmp4 + src = src - 1
  vpadd(bt, xtmp4, xtmp4, src, vec_enc);
  // xtmp4 = xtmp4 & ~src = (src - 1) & ~src
  vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
5427   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
5428   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
5429   vpsub(bt, dst, xtmp4, dst, vec_enc);
5430 }
5431 
// Trailing zero count computation for AVX2 targets is based on the popcount operation
// as per the following equation:
//      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
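// A minimal scalar sketch of this identity (illustrative only, not part of the
// generated code):
//
//   static uint32_t popcount32(uint32_t v) {
//     uint32_t n = 0;
//     for (; v != 0; v &= v - 1) n++;            // clear the lowest set bit each iteration
//     return n;
//   }
//   static uint32_t ctz32_via_popcnt(uint32_t x) {
//     // x | -x keeps the lowest set bit of x and every bit above it (zero when
//     // x == 0), so its popcount equals 32 minus the trailing zero count of x.
//     return 32 - popcount32(x | (0u - x));
//   }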
5434 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5435                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5436   assert(is_integral_type(bt), "");
  // xtmp3 = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
  // xtmp3 = 0 - src = -src
  vpsub(bt, xtmp3, xtmp3, src, vec_enc);
  // xtmp3 = xtmp3 | src = -src | src
  vpor(xtmp3, xtmp3, src, vec_enc);
5443   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
5444   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
5445   vpsub(bt, dst, xtmp1, dst, vec_enc);
5446 }
5447 
5448 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
5449   Label done;
5450   Label neg_divisor_fastpath;
5451   cmpl(divisor, 0);
5452   jccb(Assembler::less, neg_divisor_fastpath);
5453   xorl(rdx, rdx);
5454   divl(divisor);
5455   jmpb(done);
5456   bind(neg_divisor_fastpath);
5457   // Fastpath for divisor < 0:
5458   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
5459   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
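  // Illustrative scalar sketch of this fastpath (not part of the generated code):
  // when the divisor has its sign bit set, i.e. is >= 2^31 as an unsigned value,
  // the unsigned quotient can only be 0 or 1, and it is 1 exactly when
  // dividend >= divisor (unsigned):
  //
  //   static uint32_t udiv_by_large_divisor(uint32_t dividend, uint32_t divisor) {
  //     return (dividend & ~(dividend - divisor)) >> 31;   // 1 iff dividend >= divisor
  //   }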
5460   movl(rdx, rax);
5461   subl(rdx, divisor);
5462   if (VM_Version::supports_bmi1()) {
5463     andnl(rax, rdx, rax);
5464   } else {
5465     notl(rdx);
5466     andl(rax, rdx);
5467   }
5468   shrl(rax, 31);
5469   bind(done);
5470 }
5471 
5472 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
5473   Label done;
5474   Label neg_divisor_fastpath;
5475   cmpl(divisor, 0);
5476   jccb(Assembler::less, neg_divisor_fastpath);
5477   xorl(rdx, rdx);
5478   divl(divisor);
5479   jmpb(done);
5480   bind(neg_divisor_fastpath);
5481   // Fastpath when divisor < 0:
5482   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
5483   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
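  // Illustrative scalar sketch of this fastpath (not part of the generated code),
  // assuming arithmetic right shift on int32_t as performed by sarl below: with
  // divisor >= 2^31 (unsigned) the quotient is 0 or 1, so the remainder is either
  // dividend or dividend - divisor:
  //
  //   static uint32_t umod_by_large_divisor(uint32_t dividend, uint32_t divisor) {
  //     // mask is all ones exactly when the quotient would be 1
  //     uint32_t mask = (uint32_t)((int32_t)(dividend & ~(dividend - divisor)) >> 31);
  //     return dividend - (mask & divisor);
  //   }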
5484   movl(rdx, rax);
5485   subl(rax, divisor);
5486   if (VM_Version::supports_bmi1()) {
5487     andnl(rax, rax, rdx);
5488   } else {
5489     notl(rax);
5490     andl(rax, rdx);
5491   }
5492   sarl(rax, 31);
5493   andl(rax, divisor);
5494   subl(rdx, rax);
5495   bind(done);
5496 }
5497 
5498 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
5499   Label done;
5500   Label neg_divisor_fastpath;
5501 
5502   cmpl(divisor, 0);
5503   jccb(Assembler::less, neg_divisor_fastpath);
5504   xorl(rdx, rdx);
5505   divl(divisor);
5506   jmpb(done);
5507   bind(neg_divisor_fastpath);
5508   // Fastpath for divisor < 0:
5509   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
5510   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
5511   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
5512   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
5513   movl(rdx, rax);
5514   subl(rax, divisor);
5515   if (VM_Version::supports_bmi1()) {
5516     andnl(rax, rax, rdx);
5517   } else {
5518     notl(rax);
5519     andl(rax, rdx);
5520   }
5521   movl(tmp, rax);
5522   shrl(rax, 31); // quotient
5523   sarl(tmp, 31);
5524   andl(tmp, divisor);
5525   subl(rdx, tmp); // remainder
5526   bind(done);
5527 }
5528 
5529 #ifdef _LP64
5530 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
5531                                  XMMRegister xtmp2, Register rtmp) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, based on the following algorithm:
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
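    // The 64-bit constant below is the 8x8 bit matrix for gf2p8affineqb that
    // reverses the bit order within each byte; the bswapl at the end of this
    // method then reverses the byte order, completing the 32-bit bit reversal.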
5535     mov64(rtmp, 0x8040201008040201L);
5536     movq(xtmp1, src);
5537     movq(xtmp2, rtmp);
5538     gf2p8affineqb(xtmp1, xtmp2, 0);
5539     movq(dst, xtmp1);
5540   } else {
5541     // Swap even and odd numbered bits.
5542     movl(rtmp, src);
5543     andl(rtmp, 0x55555555);
5544     shll(rtmp, 1);
5545     movl(dst, src);
5546     andl(dst, 0xAAAAAAAA);
5547     shrl(dst, 1);
5548     orl(dst, rtmp);
5549 
    // Swap the lower and upper 2 bits of each nibble.
5551     movl(rtmp, dst);
5552     andl(rtmp, 0x33333333);
5553     shll(rtmp, 2);
5554     andl(dst, 0xCCCCCCCC);
5555     shrl(dst, 2);
5556     orl(dst, rtmp);
5557 
    // Swap the lower and upper 4 bits of each byte.
5559     movl(rtmp, dst);
5560     andl(rtmp, 0x0F0F0F0F);
5561     shll(rtmp, 4);
5562     andl(dst, 0xF0F0F0F0);
5563     shrl(dst, 4);
5564     orl(dst, rtmp);
5565   }
5566   bswapl(dst);
5567 }
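
// A minimal scalar sketch of the non-GFNI fallback above (illustrative only, not
// part of the generated code): swap progressively larger bit groups, then swap bytes.
//
//   static uint32_t reverse_bits32(uint32_t v) {
//     v = ((v & 0x55555555) << 1) | ((v & 0xAAAAAAAA) >> 1);   // swap adjacent bits
//     v = ((v & 0x33333333) << 2) | ((v & 0xCCCCCCCC) >> 2);   // swap 2-bit pairs
//     v = ((v & 0x0F0F0F0F) << 4) | ((v & 0xF0F0F0F0) >> 4);   // swap nibbles
//     return (v << 24) | ((v & 0xFF00) << 8) |
//            ((v >> 8) & 0xFF00) | (v >> 24);                  // byte swap (bswapl)
//   }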
5568 
5569 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
5570                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, based on the following algorithm:
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5574     mov64(rtmp1, 0x8040201008040201L);
5575     movq(xtmp1, src);
5576     movq(xtmp2, rtmp1);
5577     gf2p8affineqb(xtmp1, xtmp2, 0);
5578     movq(dst, xtmp1);
5579   } else {
5580     // Swap even and odd numbered bits.
5581     movq(rtmp1, src);
5582     mov64(rtmp2, 0x5555555555555555L);
5583     andq(rtmp1, rtmp2);
5584     shlq(rtmp1, 1);
5585     movq(dst, src);
5586     notq(rtmp2);
5587     andq(dst, rtmp2);
5588     shrq(dst, 1);
5589     orq(dst, rtmp1);
5590 
    // Swap the lower and upper 2 bits of each nibble.
5592     movq(rtmp1, dst);
5593     mov64(rtmp2, 0x3333333333333333L);
5594     andq(rtmp1, rtmp2);
5595     shlq(rtmp1, 2);
5596     notq(rtmp2);
5597     andq(dst, rtmp2);
5598     shrq(dst, 2);
5599     orq(dst, rtmp1);
5600 
    // Swap the lower and upper 4 bits of each byte.
5602     movq(rtmp1, dst);
5603     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
5604     andq(rtmp1, rtmp2);
5605     shlq(rtmp1, 4);
5606     notq(rtmp2);
5607     andq(dst, rtmp2);
5608     shrq(dst, 4);
5609     orq(dst, rtmp1);
5610   }
5611   bswapq(dst);
5612 }
5613 
5614 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
5615   Label done;
5616   Label neg_divisor_fastpath;
5617   cmpq(divisor, 0);
5618   jccb(Assembler::less, neg_divisor_fastpath);
5619   xorl(rdx, rdx);
5620   divq(divisor);
5621   jmpb(done);
5622   bind(neg_divisor_fastpath);
5623   // Fastpath for divisor < 0:
5624   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
5625   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
5626   movq(rdx, rax);
5627   subq(rdx, divisor);
5628   if (VM_Version::supports_bmi1()) {
5629     andnq(rax, rdx, rax);
5630   } else {
5631     notq(rdx);
5632     andq(rax, rdx);
5633   }
5634   shrq(rax, 63);
5635   bind(done);
5636 }
5637 
5638 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
5639   Label done;
5640   Label neg_divisor_fastpath;
5641   cmpq(divisor, 0);
5642   jccb(Assembler::less, neg_divisor_fastpath);
5643   xorq(rdx, rdx);
5644   divq(divisor);
5645   jmp(done);
5646   bind(neg_divisor_fastpath);
5647   // Fastpath when divisor < 0:
5648   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
5649   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
5650   movq(rdx, rax);
5651   subq(rax, divisor);
5652   if (VM_Version::supports_bmi1()) {
5653     andnq(rax, rax, rdx);
5654   } else {
5655     notq(rax);
5656     andq(rax, rdx);
5657   }
5658   sarq(rax, 63);
5659   andq(rax, divisor);
5660   subq(rdx, rax);
5661   bind(done);
5662 }
5663 
5664 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
5665   Label done;
5666   Label neg_divisor_fastpath;
5667   cmpq(divisor, 0);
5668   jccb(Assembler::less, neg_divisor_fastpath);
5669   xorq(rdx, rdx);
5670   divq(divisor);
5671   jmp(done);
5672   bind(neg_divisor_fastpath);
5673   // Fastpath for divisor < 0:
5674   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
5675   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
5676   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
5677   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
5678   movq(rdx, rax);
5679   subq(rax, divisor);
5680   if (VM_Version::supports_bmi1()) {
5681     andnq(rax, rax, rdx);
5682   } else {
5683     notq(rax);
5684     andq(rax, rdx);
5685   }
5686   movq(tmp, rax);
5687   shrq(rax, 63); // quotient
5688   sarq(tmp, 63);
5689   andq(tmp, divisor);
5690   subq(rdx, tmp); // remainder
5691   bind(done);
5692 }
5693 #endif
5694 
5695 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
5696                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
5697                                         int vlen_enc) {
5698   assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations; vpshufb determines the index from the
  // lower 4 bits of each shuffle lane, so all shuffle indices are effectively
  // normalized to the range 0-15. As a result, indices that differ by a multiple
  // of 16 select the same relative position within a 128-bit lane, e.g. shuffle
  // indices 16, 32 and 48 all map to position 0 of their respective 128-bit
  // source lanes.
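  // Scalar model of the rearrangement performed below (illustrative only, not part
  // of the generated code; shown for a 512-bit vector, with src, dst and shuffle
  // viewed as arrays of 64 bytes):
  //
  //   for (int i = 0; i < 64; i++) {
  //     int idx  = shuffle[i] & 0x3F;    // 6-bit shuffle index
  //     int lane = idx >> 4;             // which 128-bit source lane (0-3) to read
  //     int pos  = idx & 0x0F;           // position inside that lane (what vpshufb uses)
  //     dst[i]   = src[lane * 16 + pos];
  //   }
  //
  // The four masked evpshufb steps below handle lane == 0, 1, 2 and 3 in turn, each
  // broadcasting the corresponding 128-bit source lane and merging under a k-mask.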
5705   movl(rtmp, 16);
5706   evpbroadcastb(xtmp1, rtmp, vlen_enc);
5707 
  // Compute a mask for the shuffle vector by comparing the indices against the
  // expression INDEX < 16, broadcast the first 128-bit lane of src across the
  // entire vector, shuffle it using the original shuffle indices, and move the
  // shuffled lanes corresponding to a true mask into the destination vector.
5712   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
5713   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
5714   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
5715 
  // Perform the above steps with the lane comparison expression INDEX >= 16 && INDEX < 32,
  // broadcasting the second 128-bit lane.
5718   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
5719   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
5720   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
5721   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
5722   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
5723 
  // Perform the above steps with the lane comparison expression INDEX >= 32 && INDEX < 48,
  // broadcasting the third 128-bit lane.
5726   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
5727   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
5728   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
5729   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
5730   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
5731 
  // Perform the above steps with the lane comparison expression INDEX >= 48 && INDEX < 64,
  // broadcasting the fourth 128-bit lane.
5734   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
5735   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
5736   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
5737   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
5738   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
5739 }
5740