/*
 * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/output.hpp"
#include "opto/opcodes.hpp"
#include "opto/subnode.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// C2 compiled method's prolog code.
void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {

  // WARNING: Initial instruction MUST be 5 bytes or longer so that
  // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes,
  // the frame allocation can be either 3 or 6 bytes. So if we don't do
  // stack bang then we must use the 6 byte frame allocation even if
  // we have no frame. :-(
  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require their callers to bang for them.  But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack.  The stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp so that on return to the interpreter rbp will be
    // restored correctly, allowing us to correct the stack.
    push(rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // Create frame (force generation of a 4 byte immediate value)
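    // Forcing the 32-bit immediate selects the long encoding of the subtract,
    // keeping this first instruction at least 5 bytes as required by the
    // WARNING at the top of this method.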
    subptr_imm32(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifndef _LP64
  // If method sets FPU control word do it now
  if (fp_mode_24b) {
    fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
  }
  if (UseSSE >= 2 && VerifyFPU) {
    verify_FPU(0, "FPU stack must be clean on entry");
  }
#endif

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif

  if (!is_stub) {
    BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
#ifdef _LP64
    if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
      // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
      Label dummy_slow_path;
      Label dummy_continuation;
      Label* slow_path = &dummy_slow_path;
      Label* continuation = &dummy_continuation;
      if (!Compile::current()->output()->in_scratch_emit_size()) {
        // Use real labels from actual stub when not emitting code for the purpose of measuring its size
        C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
        Compile::current()->output()->add_stub(stub);
        slow_path = &stub->entry();
        continuation = &stub->continuation();
      }
      bs->nmethod_entry_barrier(this, slow_path, continuation);
    }
#else
    // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
    bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
#endif
  }
}

inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

#if INCLUDE_RTM_OPT

// Update rtm_counters based on abort status
// input: abort_status
//        rtm_counters (RTMLockingCounters*)
// flags are killed
void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {

  atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  if (PrintPreciseRTMLockingStatistics) {
    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
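      // Bit i of the abort status selects the matching abortX counter; the
      // increment below is skipped when that bit is clear.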
      Label check_abort;
      testl(abort_status, (1<<i));
      jccb(Assembler::equal, check_abort);
      atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
      bind(check_abort);
    }
  }
}

// Branch if (random & (count-1) != 0), count is 2^n
// tmp, scr and flags are killed
void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  assert(tmp == rax, "");
  assert(scr == rdx, "");
  rdtsc(); // modifies EDX:EAX
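  // The low-order TSC bits serve as a cheap pseudo-random value: with count a
  // power of two, the masked value is non-zero, and we take the branch, for
  // all but roughly 1 in count calls.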
  andptr(tmp, count-1);
  jccb(Assembler::notZero, brLabel);
}

// Perform abort ratio calculation, set no_rtm bit if high ratio
// input:  rtm_counters_Reg (RTMLockingCounters* address)
// tmpReg, rtm_counters_Reg and flags are killed
void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
                                                    Register rtm_counters_Reg,
                                                    RTMLockingCounters* rtm_counters,
                                                    Metadata* method_data) {
  Label L_done, L_check_always_rtm1, L_check_always_rtm2;

  if (RTMLockingCalculationDelay > 0) {
    // Delay calculation
    movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()));
    testptr(tmpReg, tmpReg);
    jccb(Assembler::equal, L_done);
  }
  // Abort ratio calculation only if abort_count > RTMAbortThreshold
  //   Aborted transactions = abort_count * 100
  //   All transactions = total_count *  RTMTotalCountIncrRate
  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)

  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
  cmpptr(tmpReg, RTMAbortThreshold);
  jccb(Assembler::below, L_check_always_rtm2);
  imulptr(tmpReg, tmpReg, 100);

  Register scrReg = rtm_counters_Reg;
  movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
  imulptr(scrReg, scrReg, RTMAbortRatio);
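  // At this point tmpReg = aborted transactions * 100 and
  // scrReg = all transactions * RTMAbortRatio, so the comparison below checks
  // whether the abort percentage has reached RTMAbortRatio.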
  cmpptr(tmpReg, scrReg);
  jccb(Assembler::below, L_check_always_rtm1);
  if (method_data != nullptr) {
    // set rtm_state to "no rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset()), NoRTM);
  }
  jmpb(L_done);
  bind(L_check_always_rtm1);
  // Reload RTMLockingCounters* address
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  bind(L_check_always_rtm2);
  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
  jccb(Assembler::below, L_done);
  if (method_data != nullptr) {
    // set rtm_state to "always rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset()), UseRTM);
  }
  bind(L_done);
}

// Update counters and perform abort ratio calculation
// input:  abort_status_Reg
// rtm_counters_Reg, flags are killed
void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
                                      Register rtm_counters_Reg,
                                      RTMLockingCounters* rtm_counters,
                                      Metadata* method_data,
                                      bool profile_rtm) {

  assert(rtm_counters != nullptr, "should not be null when profiling RTM");
  // update rtm counters based on rax value at abort
  // reads abort_status_Reg, updates flags
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
  if (profile_rtm) {
    // Save abort status because abort_status_Reg is used by following code.
    if (RTMRetryCount > 0) {
      push(abort_status_Reg);
    }
    assert(rtm_counters != nullptr, "should not be null when profiling RTM");
    rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
    // restore abort status
    if (RTMRetryCount > 0) {
      pop(abort_status_Reg);
    }
  }
}

// Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
// inputs: retry_count_Reg
//       : abort_status_Reg
// output: retry_count_Reg decremented by 1
// flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
  Label doneRetry;
  assert(abort_status_Reg == rax, "");
  // The abort reason bits are in eax (see all states in rtmLocking.hpp)
  // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
  // if reason is in 0x6 and retry count != 0 then retry
  andptr(abort_status_Reg, 0x6);
  jccb(Assembler::zero, doneRetry);
  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  pause();
  decrementl(retry_count_Reg);
  jmp(retryLabel);
  bind(doneRetry);
}

// Spin and retry if lock is busy,
// inputs: box_Reg (monitor address)
//       : retry_count_Reg
// output: retry_count_Reg decremented by 1
//       : clear z flag if retry count exceeded
// tmp_Reg, scr_Reg, flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
                                               Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
  Label SpinLoop, SpinExit, doneRetry;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  decrementl(retry_count_Reg);
  movptr(scr_Reg, RTMSpinLoopCount);

  bind(SpinLoop);
  pause();
  decrementl(scr_Reg);
  jccb(Assembler::lessEqual, SpinExit);
  movptr(tmp_Reg, Address(box_Reg, owner_offset));
  testptr(tmp_Reg, tmp_Reg);
  jccb(Assembler::notZero, SpinLoop);

  bind(SpinExit);
  jmp(retryLabel);
  bind(doneRetry);
  incrementl(retry_count_Reg); // clear z flag
}

// Use RTM for normal stack locks
// Input: objReg (object to lock)
void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
                                         Register retry_on_abort_count_Reg,
                                         RTMLockingCounters* stack_rtm_counters,
                                         Metadata* method_data, bool profile_rtm,
                                         Label& DONE_LABEL, Label& IsInflated) {
  assert(UseRTMForStackLocks, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;

  if (RTMRetryCount > 0) {
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(stack_rtm_counters != nullptr, "should not be null when profiling RTM");
    atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
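  // Start the transactional region; on abort the processor resumes at
  // L_on_abort with the abort status in EAX (tmpReg).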
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
  andptr(tmpReg, markWord::lock_mask_in_place);     // look at 2 lock bits
  cmpptr(tmpReg, markWord::unlocked_value);         // bits = 01 unlocked
  jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked

  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (UseRTMXendForLockBusy) {
    xend();
    movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
  }
  bind(L_decrement_retry);
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }
}

// Use RTM for inflating locks
// inputs: objReg (object to lock)
//         boxReg (on-stack box address (displaced header location) - KILLED)
//         tmpReg (ObjectMonitor address + markWord::monitor_value)
void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
                                            Register scrReg, Register retry_on_busy_count_Reg,
                                            Register retry_on_abort_count_Reg,
                                            RTMLockingCounters* rtm_counters,
                                            Metadata* method_data, bool profile_rtm,
                                            Label& DONE_LABEL) {
  assert(UseRTMLocking, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
  movptr(boxReg, tmpReg); // Save ObjectMonitor address

  if (RTMRetryCount > 0) {
    movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(rtm_counters != nullptr, "should not be null when profiling RTM");
    atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  movptr(tmpReg, Address(tmpReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jcc(Assembler::zero, DONE_LABEL);
  if (UseRTMXendForLockBusy) {
    xend();
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
  }
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }

  movptr(tmpReg, Address(boxReg, owner_offset)) ;
  testptr(tmpReg, tmpReg) ;
  jccb(Assembler::notZero, L_decrement_retry) ;

  // Appears unlocked - try to swing _owner from null to non-null.
  // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
#ifdef _LP64
  Register threadReg = r15_thread;
#else
  get_thread(scrReg);
  Register threadReg = scrReg;
#endif
  lock();
  cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg

  if (RTMRetryCount > 0) {
    // success done else retry
    jccb(Assembler::equal, DONE_LABEL) ;
    bind(L_decrement_retry);
    // Spin and retry if lock is busy.
    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
  }
  else {
    bind(L_decrement_retry);
  }
}

#endif //  INCLUDE_RTM_OPT

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time.  These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP, since
// the helper routines would be called from multiple synchronization sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods.  That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads.  We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) issue explicit barriers or fence operations.
//
// TODO:
//
// *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
//    the lock operators would typically be faster than reifying Self.
//
// *  Ideally I'd define the primitives as:
//       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
//    Furthermore, the register assignments are overconstrained, possibly resulting in
//    sub-optimal code near the synchronization site.
//
// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
//    Alternately, use a better sp-proximity test.
//
// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//    Either one is sufficient to uniquely identify a thread.
//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty.
//    This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
//
// *  use jccb and jmpb instead of jcc and jmp to improve code density.
//    But beware of excessive branch density on AMD Opterons.
//
// *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//    or failure of the fast path.  If the fast path fails then we pass
//    control to the slow path, typically in C.  In fast_lock and
//    fast_unlock we often branch to DONE_LABEL, just to find that C2
//    will emit a conditional branch immediately after the node.
//    So we have branches to branches and lots of ICC.ZF games.
//    Instead, it might be better to have C2 pass a "FailureLabel"
//    into fast_lock and fast_unlock.  In the case of success, control
//    will drop through the node.  ICC.ZF is undefined at exit.
//    In the case of failure, the node will branch directly to the
//    FailureLabel


// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                 Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
                                 RTMLockingCounters* rtm_counters,
                                 RTMLockingCounters* stack_rtm_counters,
                                 Metadata* method_data,
                                 bool use_rtm, bool profile_rtm) {
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");

  if (use_rtm) {
    assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
  } else {
    assert(cx1Reg == noreg, "");
    assert(cx2Reg == noreg, "");
    assert_different_registers(objReg, boxReg, tmpReg, scrReg);
  }

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;

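  // Value-based classes must not be synchronized on: if the klass has
  // JVM_ACC_IS_VALUE_BASED_CLASS set, leave ZF clear and take the slow path,
  // where the runtime handles the diagnosis.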
  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmpReg, objReg, scrReg);
    movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
    testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
    jcc(Assembler::notZero, DONE_LABEL);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
    rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
                      stack_rtm_counters, method_data, profile_rtm,
                      DONE_LABEL, IsInflated);
  }
#endif // INCLUDE_RTM_OPT

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (LockingMode == LM_MONITOR) {
    // Clear ZF so that we take the slow path at the DONE label. objReg is known to be non-zero.
    testptr(objReg, objReg);
  } else if (LockingMode == LM_LEGACY) {
    // Attempt stack-locking ...
    orptr (tmpReg, markWord::unlocked_value);
    movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
    lock();
    cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
    jcc(Assembler::equal, COUNT);           // Success

    // Recursive locking.
    // The object is stack-locked: markword contains stack pointer to BasicLock.
    // Locked by current thread if difference with current SP is less than one page.
    subptr(tmpReg, rsp);
    // Next instruction sets ZFlag == 1 (Success) if difference is less than one page.
    andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
    movptr(Address(boxReg, 0), tmpReg);
  } else {
    assert(LockingMode == LM_LIGHTWEIGHT, "");
    lightweight_lock(objReg, tmpReg, thread, scrReg, NO_COUNT);
    jmp(COUNT);
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#if INCLUDE_RTM_OPT
  // Use the same RTM locking code in 32- and 64-bit VM.
  if (use_rtm) {
    rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
                         rtm_counters, method_data, profile_rtm, DONE_LABEL);
  } else {
#endif // INCLUDE_RTM_OPT

#ifndef _LP64
  // The object is inflated.

  // boxReg refers to the on-stack BasicLock in the current frame.
  // We'd like to write:
  //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
  // This is convenient but results in an ST-before-CAS penalty.  The following CAS suffers
  // additional latency as we have another ST in the store buffer that must drain.

  // avoid ST-before-CAS
  // register juggle because we need tmpReg for cmpxchgptr below
  movptr(scrReg, boxReg);
  movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]

  // Optimistic form: consider XORL tmpReg,tmpReg
  movptr(tmpReg, NULL_WORD);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Ideally, I'd manifest "Self" with get_thread and then attempt
  // to CAS the register containing Self into m->Owner.
  // But we don't have enough registers, so instead we can either try to CAS
  // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
  // we later store "Self" into m->Owner.  Transiently storing a stack address
  // (rsp or the address of the box) into  m->owner is harmless.
  // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
  lock();
  cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
  // If we weren't able to swing _owner from null to the BasicLock
  // then take the slow path.
  jccb  (Assembler::notZero, NO_COUNT);
  // update _owner from BasicLock to thread
  get_thread (scrReg);                    // beware: clobbers ICCs
  movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
  xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success

  // If the CAS fails we can either retry or pass control to the slow path.
  // We use the latter tactic.
  // Pass the CAS result in the icc.ZFlag into DONE_LABEL
  // If the CAS was successful ...
  //   Self has acquired the lock
  //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  // Intentional fall-through into DONE_LABEL ...
#else // _LP64
  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
  // Propagate ICC.ZF from CAS above into DONE_LABEL.
  jccb(Assembler::equal, COUNT);          // CAS above succeeded; propagate ZF = 1 (success)

  cmpptr(thread, rax);                // Check if we are already the owner (recursive lock)
  jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
#endif // _LP64
#if INCLUDE_RTM_OPT
  } // use_rtm()
#endif
  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0

  bind(COUNT);
  // Count monitors in fast path
  increment(Address(thread, JavaThread::held_monitor_count_offset()));

  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);

  // At NO_COUNT the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed.  Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame).  Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame, the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa.  The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking the owner in fast_unlock().
// In the interest of performance we elide m->Owner==Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.

void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, COUNT, NO_COUNT;

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
    Label L_regular_unlock;
    movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
    andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
    cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
    jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
    xend();                                                           // otherwise end...
    jmp(DONE_LABEL);                                                  // ... and we're done
    bind(L_regular_unlock);
  }
#endif

  if (LockingMode == LM_LEGACY) {
    cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
    jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
  if (LockingMode != LM_MONITOR) {
    testptr(tmpReg, markWord::monitor_value);                         // Inflated?
    jcc(Assembler::zero, Stacked);
  }

  // It's inflated.
  if (LockingMode == LM_LIGHTWEIGHT) {
    // If the owner is ANONYMOUS, we need to fix it -  in an outline stub.
    testb(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t) ObjectMonitor::ANONYMOUS_OWNER);
#ifdef _LP64
    if (!Compile::current()->output()->in_scratch_emit_size()) {
      C2HandleAnonOMOwnerStub* stub = new (Compile::current()->comp_arena()) C2HandleAnonOMOwnerStub(tmpReg, boxReg);
      Compile::current()->output()->add_stub(stub);
      jcc(Assembler::notEqual, stub->entry());
      bind(stub->continuation());
    } else
#endif
    {
      // We can't easily implement this optimization on 32 bit because we don't have a thread register.
      // Call the slow-path instead.
      jcc(Assembler::notEqual, NO_COUNT);
    }
  }

#if INCLUDE_RTM_OPT
  if (use_rtm) {
    Label L_regular_inflated_unlock;
    int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
    movptr(boxReg, Address(tmpReg, owner_offset));
    testptr(boxReg, boxReg);
    jccb(Assembler::notZero, L_regular_inflated_unlock);
    xend();
    jmp(DONE_LABEL);
    bind(L_regular_inflated_unlock);
  }
#endif

  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  // Refer to the comments in synchronizer.cpp for how we might encode extra
  // state in _succ so we can avoid fetching EntryList|cxq.
  //
  // If there's no contention try a 1-0 exit.  That is, exit without
  // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock.  Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory-model is SPO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
#ifndef _LP64
  // Note that we could employ various encoding schemes to reduce
  // the number of loads below (currently 4) to just 2 or 3.
  // Refer to the comments in synchronizer.cpp.
  // In practice the chain of fetches doesn't seem to impact performance, however.
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);
#else // _LP64
  // It's inflated
  Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath;

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
  jccb(Assembler::equal, LNotRecursive);

  // Recursive inflated unlock
  decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jmpb(LSuccess);

  bind(LNotRecursive);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  jccb  (Assembler::notZero, CheckSucc);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);

  // Try to avoid passing control into the slow_path ...
  bind  (CheckSucc);

  // The following optional optimization can be elided if necessary
  // Effectively: if (succ == null) goto slow path
  // The code reduces the window for a race, however,
  // and thus benefits performance.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
  jccb  (Assembler::zero, LGoSlowPath);

  xorptr(boxReg, boxReg);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);

  // Memory barrier/fence
  // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
  // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
  // This is faster on Nehalem and AMD Shanghai/Barcelona.
  // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
  // We might also restructure (ST Owner=0;barrier;LD _Succ) to
  // (mov box,0; xchgq box, &m->Owner; LD _succ) .
  lock(); addl(Address(rsp, 0), 0);

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
  jccb  (Assembler::notZero, LSuccess);

  // Rare inopportune interleaving - race.
  // The successor vanished in the small window above.
  // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
  // We need to ensure progress and succession.
  // Try to reacquire the lock.
  // If that fails then the new owner is responsible for succession and this
  // thread needs to take no further action and can exit via the fast path (success).
  // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generate more
  // coherence traffic on the lock *and* artificially extend the critical section
  // length by virtue of passing control into the slow path.

  // box is really RAX -- the following CMPXCHG depends on that binding
  // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
  lock();
  cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // There's no successor so we tried to regrab the lock.
  // If that didn't work, then another thread grabbed the
  // lock so we're done (and exit was a success).
  jccb  (Assembler::notEqual, LSuccess);
  // Intentional fall-through into slow path

  bind  (LGoSlowPath);
  orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
  jmpb  (DONE_LABEL);

  bind  (LSuccess);
  testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
  jmpb  (DONE_LABEL);

#endif
  if (LockingMode != LM_MONITOR) {
    bind  (Stacked);
    if (LockingMode == LM_LIGHTWEIGHT) {
      mov(boxReg, tmpReg);
      lightweight_unlock(objReg, boxReg, tmpReg, NO_COUNT);
      jmp(COUNT);
    } else if (LockingMode == LM_LEGACY) {
      movptr(tmpReg, Address (boxReg, 0));      // re-fetch
      lock();
      cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
    }
    // Intentional fall-thru into DONE_LABEL
  }
  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT);

  bind(COUNT);
  // Count monitors in fast path
#ifndef _LP64
  get_thread(tmpReg);
  decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
#else // _LP64
  decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
#endif

  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);
}

//-------------------------------------------------------------------------------------------
// Generic instructions support for use in .ad files C2 code generation

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
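      // There is no SSE packed min of 64-bit elements, so build a (dst > src)
      // mask with pcmpgtq and let blendvpd (which reads xmm0 as the implicit
      // mask) pick the smaller lane: dst = min(dst, src).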
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src);  // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
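      // Likewise for max: the mask is (src > dst), so blendvpd replaces dst
      // with src exactly where src is larger.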
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src);  // xmm0 as mask
    }
  }
}

void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
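        // Without AVX-512 vpminsq, build the (src1 > src2) mask and blend so
        // that src2 is chosen exactly where src1 is larger, yielding the
        // element-wise minimum.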
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
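        // Same mask as for min, but with the blend operands swapped so that
        // src1 is chosen where it is larger, yielding the element-wise maximum.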
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

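  // vminps/vmaxps alone do not implement Java semantics: they return the second
  // source operand when the inputs are both zero or either is NaN.  The
  // sign-based blends below order the operands so that -0.0/+0.0 ties resolve
  // correctly, and the final unordered compare re-selects a NaN operand so it
  // propagates into the result.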
  if (!is_double_word && is_min) {
    vblendvps(atmp, a, b, a, vlen_enc);
    vblendvps(btmp, b, a, a, vlen_enc);
    vminps(tmp, atmp, btmp, vlen_enc);
    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
  } else if (!is_double_word && !is_min) {
    vblendvps(btmp, b, a, b, vlen_enc);
    vblendvps(atmp, a, b, b, vlen_enc);
    vmaxps(tmp, atmp, btmp, vlen_enc);
    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
  } else if (is_double_word && is_min) {
    vblendvpd(atmp, a, b, a, vlen_enc);
    vblendvpd(btmp, b, a, a, vlen_enc);
    vminpd(tmp, atmp, btmp, vlen_enc);
    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    vblendvpd(btmp, b, a, b, vlen_enc);
    vblendvpd(atmp, a, b, b, vlen_enc);
    vmaxpd(tmp, atmp, btmp, vlen_enc);
    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
  }
}

void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

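  // AVX-512 variant of the pattern above: evpmovd2m/evpmovq2m extract the sign
  // bits into a mask register, the masked blends order the operands, and the
  // final unordered compare merges any NaN lanes back into the result.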
  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}

// Float/Double signum
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  if (opcode == Op_SignumF) {
    assert(UseSSE > 0, "required");
    ucomiss(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
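    // The argument was below zero (the ucomiss flags survive movflt), so flip
    // the sign bit of 1.0f to produce -1.0f.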
1183     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1184   } else if (opcode == Op_SignumD) {
1185     assert(UseSSE > 1, "required");
1186     ucomisd(dst, zero);
1187     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1188     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1189     movdbl(dst, one);
1190     jcc(Assembler::above, DONE_LABEL);
1191     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1192   }
1193 
1194   bind(DONE_LABEL);
1195 }
1196 
1197 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1198   if (sign) {
1199     pmovsxbw(dst, src);
1200   } else {
1201     pmovzxbw(dst, src);
1202   }
1203 }
1204 
1205 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1206   if (sign) {
1207     vpmovsxbw(dst, src, vector_len);
1208   } else {
1209     vpmovzxbw(dst, src, vector_len);
1210   }
1211 }
1212 
1213 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1214   if (sign) {
1215     vpmovsxbd(dst, src, vector_len);
1216   } else {
1217     vpmovzxbd(dst, src, vector_len);
1218   }
1219 }
1220 
1221 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1222   if (sign) {
1223     vpmovsxwd(dst, src, vector_len);
1224   } else {
1225     vpmovzxwd(dst, src, vector_len);
1226   }
1227 }
1228 
1229 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1230                                      int shift, int vector_len) {
1231   if (opcode == Op_RotateLeftV) {
1232     if (etype == T_INT) {
1233       evprold(dst, src, shift, vector_len);
1234     } else {
1235       assert(etype == T_LONG, "expected type T_LONG");
1236       evprolq(dst, src, shift, vector_len);
1237     }
1238   } else {
1239     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1240     if (etype == T_INT) {
1241       evprord(dst, src, shift, vector_len);
1242     } else {
1243       assert(etype == T_LONG, "expected type T_LONG");
1244       evprorq(dst, src, shift, vector_len);
1245     }
1246   }
1247 }
1248 
1249 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1250                                      XMMRegister shift, int vector_len) {
1251   if (opcode == Op_RotateLeftV) {
1252     if (etype == T_INT) {
1253       evprolvd(dst, src, shift, vector_len);
1254     } else {
1255       assert(etype == T_LONG, "expected type T_LONG");
1256       evprolvq(dst, src, shift, vector_len);
1257     }
1258   } else {
1259     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1260     if (etype == T_INT) {
1261       evprorvd(dst, src, shift, vector_len);
1262     } else {
1263       assert(etype == T_LONG, "expected type T_LONG");
1264       evprorvq(dst, src, shift, vector_len);
1265     }
1266   }
1267 }
1268 
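     // The vshift*/varshift* helpers below map C2 vector shift opcodes
     // (left, arithmetic right, logical right) onto the corresponding
     // SSE/AVX shift instructions; byte shifts share the word forms.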
1269 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1270   if (opcode == Op_RShiftVI) {
1271     psrad(dst, shift);
1272   } else if (opcode == Op_LShiftVI) {
1273     pslld(dst, shift);
1274   } else {
1275     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1276     psrld(dst, shift);
1277   }
1278 }
1279 
1280 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1281   switch (opcode) {
1282     case Op_RShiftVI:  psrad(dst, shift); break;
1283     case Op_LShiftVI:  pslld(dst, shift); break;
1284     case Op_URShiftVI: psrld(dst, shift); break;
1285 
1286     default: assert(false, "%s", NodeClassNames[opcode]);
1287   }
1288 }
1289 
1290 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1291   if (opcode == Op_RShiftVI) {
1292     vpsrad(dst, nds, shift, vector_len);
1293   } else if (opcode == Op_LShiftVI) {
1294     vpslld(dst, nds, shift, vector_len);
1295   } else {
1296     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1297     vpsrld(dst, nds, shift, vector_len);
1298   }
1299 }
1300 
1301 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1302   switch (opcode) {
1303     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1304     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1305     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1306 
1307     default: assert(false, "%s", NodeClassNames[opcode]);
1308   }
1309 }
1310 
1311 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1312   switch (opcode) {
1313     case Op_RShiftVB:  // fall-through
1314     case Op_RShiftVS:  psraw(dst, shift); break;
1315 
1316     case Op_LShiftVB:  // fall-through
1317     case Op_LShiftVS:  psllw(dst, shift);   break;
1318 
1319     case Op_URShiftVS: // fall-through
1320     case Op_URShiftVB: psrlw(dst, shift);  break;
1321 
1322     default: assert(false, "%s", NodeClassNames[opcode]);
1323   }
1324 }
1325 
1326 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1327   switch (opcode) {
1328     case Op_RShiftVB:  // fall-through
1329     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1330 
1331     case Op_LShiftVB:  // fall-through
1332     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1333 
1334     case Op_URShiftVS: // fall-through
1335     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1336 
1337     default: assert(false, "%s", NodeClassNames[opcode]);
1338   }
1339 }
1340 
1341 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1342   switch (opcode) {
1343     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1344     case Op_LShiftVL:  psllq(dst, shift); break;
1345     case Op_URShiftVL: psrlq(dst, shift); break;
1346 
1347     default: assert(false, "%s", NodeClassNames[opcode]);
1348   }
1349 }
1350 
1351 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1352   if (opcode == Op_RShiftVL) {
1353     psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1354   } else if (opcode == Op_LShiftVL) {
1355     psllq(dst, shift);
1356   } else {
1357     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1358     psrlq(dst, shift);
1359   }
1360 }
1361 
1362 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1363   switch (opcode) {
1364     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1365     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1366     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1367 
1368     default: assert(false, "%s", NodeClassNames[opcode]);
1369   }
1370 }
1371 
1372 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1373   if (opcode == Op_RShiftVL) {
1374     evpsraq(dst, nds, shift, vector_len);
1375   } else if (opcode == Op_LShiftVL) {
1376     vpsllq(dst, nds, shift, vector_len);
1377   } else {
1378     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1379     vpsrlq(dst, nds, shift, vector_len);
1380   }
1381 }
1382 
1383 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1384   switch (opcode) {
1385     case Op_RShiftVB:  // fall-through
1386     case Op_RShiftVS:  // fall-through
1387     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1388 
1389     case Op_LShiftVB:  // fall-through
1390     case Op_LShiftVS:  // fall-through
1391     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1392 
1393     case Op_URShiftVB: // fall-through
1394     case Op_URShiftVS: // fall-through
1395     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1396 
1397     default: assert(false, "%s", NodeClassNames[opcode]);
1398   }
1399 }
1400 
1401 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1402   switch (opcode) {
1403     case Op_RShiftVB:  // fall-through
1404     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1405 
1406     case Op_LShiftVB:  // fall-through
1407     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1408 
1409     case Op_URShiftVB: // fall-through
1410     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1411 
1412     default: assert(false, "%s", NodeClassNames[opcode]);
1413   }
1414 }
1415 
1416 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1417   assert(UseAVX >= 2, "required");
1418   switch (opcode) {
1419     case Op_RShiftVL: {
1420       if (UseAVX > 2) {
1421         assert(tmp == xnoreg, "not used");
1422         if (!VM_Version::supports_avx512vl()) {
1423           vlen_enc = Assembler::AVX_512bit;
1424         }
1425         evpsravq(dst, src, shift, vlen_enc);
1426       } else {
1427         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1428         vpsrlvq(dst, src, shift, vlen_enc);
1429         vpsrlvq(tmp, tmp, shift, vlen_enc);
1430         vpxor(dst, dst, tmp, vlen_enc);
1431         vpsubq(dst, dst, tmp, vlen_enc);
1432       }
1433       break;
1434     }
1435     case Op_LShiftVL: {
1436       assert(tmp == xnoreg, "not used");
1437       vpsllvq(dst, src, shift, vlen_enc);
1438       break;
1439     }
1440     case Op_URShiftVL: {
1441       assert(tmp == xnoreg, "not used");
1442       vpsrlvq(dst, src, shift, vlen_enc);
1443       break;
1444     }
1445     default: assert(false, "%s", NodeClassNames[opcode]);
1446   }
1447 }
1448 
1449 // Variable shift src by shift using vtmp as a TEMP, giving a word result in dst
1450 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1451   assert(opcode == Op_LShiftVB ||
1452          opcode == Op_RShiftVB ||
1453          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1454   bool sign = (opcode != Op_URShiftVB);
1455   assert(vector_len == 0, "required");
1456   vextendbd(sign, dst, src, 1);
1457   vpmovzxbd(vtmp, shift, 1);
1458   varshiftd(opcode, dst, dst, vtmp, 1);
1459   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1460   vextracti128_high(vtmp, dst);
1461   vpackusdw(dst, dst, vtmp, 0);
1462 }
1463 
1464 // Variable shift src by shift using vtmp as a TEMP, giving a byte result in dst
1465 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1466   assert(opcode == Op_LShiftVB ||
1467          opcode == Op_RShiftVB ||
1468          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1469   bool sign = (opcode != Op_URShiftVB);
1470   int ext_vector_len = vector_len + 1;
1471   vextendbw(sign, dst, src, ext_vector_len);
1472   vpmovzxbw(vtmp, shift, ext_vector_len);
1473   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1474   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1475   if (vector_len == 0) {
1476     vextracti128_high(vtmp, dst);
1477     vpackuswb(dst, dst, vtmp, vector_len);
1478   } else {
1479     vextracti64x4_high(vtmp, dst);
1480     vpackuswb(dst, dst, vtmp, vector_len);
1481     vpermq(dst, dst, 0xD8, vector_len);
1482   }
1483 }
1484 
1485 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1486   switch(typ) {
1487     case T_BYTE:
1488       pinsrb(dst, val, idx);
1489       break;
1490     case T_SHORT:
1491       pinsrw(dst, val, idx);
1492       break;
1493     case T_INT:
1494       pinsrd(dst, val, idx);
1495       break;
1496     case T_LONG:
1497       pinsrq(dst, val, idx);
1498       break;
1499     default:
1500       assert(false,"Should not reach here.");
1501       break;
1502   }
1503 }
1504 
1505 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1506   switch(typ) {
1507     case T_BYTE:
1508       vpinsrb(dst, src, val, idx);
1509       break;
1510     case T_SHORT:
1511       vpinsrw(dst, src, val, idx);
1512       break;
1513     case T_INT:
1514       vpinsrd(dst, src, val, idx);
1515       break;
1516     case T_LONG:
1517       vpinsrq(dst, src, val, idx);
1518       break;
1519     default:
1520       assert(false,"Should not reach here.");
1521       break;
1522   }
1523 }
1524 
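     // Gather/scatter dispatch by element type. Indices are 32-bit and are
     // scaled by the element size when forming the memory address.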
1525 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1526   switch(typ) {
1527     case T_INT:
1528       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1529       break;
1530     case T_FLOAT:
1531       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1532       break;
1533     case T_LONG:
1534       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1535       break;
1536     case T_DOUBLE:
1537       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1538       break;
1539     default:
1540       assert(false,"Should not reach here.");
1541       break;
1542   }
1543 }
1544 
1545 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1546   switch(typ) {
1547     case T_INT:
1548       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1549       break;
1550     case T_FLOAT:
1551       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1552       break;
1553     case T_LONG:
1554       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1555       break;
1556     case T_DOUBLE:
1557       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1558       break;
1559     default:
1560       assert(false,"Should not reach here.");
1561       break;
1562   }
1563 }
1564 
1565 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1566   switch(typ) {
1567     case T_INT:
1568       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1569       break;
1570     case T_FLOAT:
1571       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1572       break;
1573     case T_LONG:
1574       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1575       break;
1576     case T_DOUBLE:
1577       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1578       break;
1579     default:
1580       assert(false,"Should not reach here.");
1581       break;
1582   }
1583 }
1584 
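     // Expands a vector of booleans (one byte per element) into a full-width
     // element mask: negate to 0/-1 per byte, then sign-extend each byte to the
     // element size.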
1585 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1586   if (vlen_in_bytes <= 16) {
1587     pxor (dst, dst);
1588     psubb(dst, src);
1589     switch (elem_bt) {
1590       case T_BYTE:   /* nothing to do */ break;
1591       case T_SHORT:  pmovsxbw(dst, dst); break;
1592       case T_INT:    pmovsxbd(dst, dst); break;
1593       case T_FLOAT:  pmovsxbd(dst, dst); break;
1594       case T_LONG:   pmovsxbq(dst, dst); break;
1595       case T_DOUBLE: pmovsxbq(dst, dst); break;
1596 
1597       default: assert(false, "%s", type2name(elem_bt));
1598     }
1599   } else {
1600     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1601     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1602 
1603     vpxor (dst, dst, dst, vlen_enc);
1604     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1605 
1606     switch (elem_bt) {
1607       case T_BYTE:   /* nothing to do */            break;
1608       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1609       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1610       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1611       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1612       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1613 
1614       default: assert(false, "%s", type2name(elem_bt));
1615     }
1616   }
1617 }
1618 
1619 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1620   if (novlbwdq) {
1621     vpmovsxbd(xtmp, src, vlen_enc);
1622     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1623             Assembler::eq, true, vlen_enc, noreg);
1624   } else {
1625     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1626     vpsubb(xtmp, xtmp, src, vlen_enc);
1627     evpmovb2m(dst, xtmp, vlen_enc);
1628   }
1629 }
1630 
1631 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) {
1632   switch (vlen_in_bytes) {
1633     case 4:  movdl(dst, src);   break;
1634     case 8:  movq(dst, src);    break;
1635     case 16: movdqu(dst, src);  break;
1636     case 32: vmovdqu(dst, src); break;
1637     case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1638     default: ShouldNotReachHere();
1639   }
1640 }
1641 
1642 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1643   assert(rscratch != noreg || always_reachable(src), "missing");
1644 
1645   if (reachable(src)) {
1646     load_vector(dst, as_Address(src), vlen_in_bytes);
1647   } else {
1648     lea(rscratch, src);
1649     load_vector(dst, Address(rscratch, 0), vlen_in_bytes);
1650   }
1651 }
1652 
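     // Broadcasts a scalar constant from memory to every lane, picking a
     // broadcast form based on the available ISA (AVX2 vpbroadcast, AVX
     // vbroadcastss/vbroadcastsd/vmovddup, SSE3 movddup, or movq + punpcklqdq).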
1653 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1654   int vlen_enc = vector_length_encoding(vlen);
1655   if (VM_Version::supports_avx()) {
1656     if (bt == T_LONG) {
1657       if (VM_Version::supports_avx2()) {
1658         vpbroadcastq(dst, src, vlen_enc);
1659       } else {
1660         vmovddup(dst, src, vlen_enc);
1661       }
1662     } else if (bt == T_DOUBLE) {
1663       if (vlen_enc != Assembler::AVX_128bit) {
1664         vbroadcastsd(dst, src, vlen_enc, noreg);
1665       } else {
1666         vmovddup(dst, src, vlen_enc);
1667       }
1668     } else {
1669       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1670         vpbroadcastd(dst, src, vlen_enc);
1671       } else {
1672         vbroadcastss(dst, src, vlen_enc);
1673       }
1674     }
1675   } else if (VM_Version::supports_sse3()) {
1676     movddup(dst, src);
1677   } else {
1678     movq(dst, src);
1679     if (vlen == 16) {
1680       punpcklqdq(dst, dst);
1681     }
1682   }
1683 }
1684 
1685 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1686   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
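       // For example, T_INT gives exact_log2(4) << 6 = 128, and the extra 128 for
       // floating-point types maps B/S/I/L/F/D to offsets 0/64/128/192/256/320.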
1687   int offset = exact_log2(type2aelembytes(bt)) << 6;
1688   if (is_floating_point_type(bt)) {
1689     offset += 128;
1690   }
1691   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1692   load_vector(dst, addr, vlen_in_bytes);
1693 }
1694 
1695 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1696 
1697 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1698   int vector_len = Assembler::AVX_128bit;
1699 
1700   switch (opcode) {
1701     case Op_AndReductionV:  pand(dst, src); break;
1702     case Op_OrReductionV:   por (dst, src); break;
1703     case Op_XorReductionV:  pxor(dst, src); break;
1704     case Op_MinReductionV:
1705       switch (typ) {
1706         case T_BYTE:        pminsb(dst, src); break;
1707         case T_SHORT:       pminsw(dst, src); break;
1708         case T_INT:         pminsd(dst, src); break;
1709         case T_LONG:        assert(UseAVX > 2, "required");
1710                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1711         default:            assert(false, "wrong type");
1712       }
1713       break;
1714     case Op_MaxReductionV:
1715       switch (typ) {
1716         case T_BYTE:        pmaxsb(dst, src); break;
1717         case T_SHORT:       pmaxsw(dst, src); break;
1718         case T_INT:         pmaxsd(dst, src); break;
1719         case T_LONG:        assert(UseAVX > 2, "required");
1720                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1721         default:            assert(false, "wrong type");
1722       }
1723       break;
1724     case Op_AddReductionVF: addss(dst, src); break;
1725     case Op_AddReductionVD: addsd(dst, src); break;
1726     case Op_AddReductionVI:
1727       switch (typ) {
1728         case T_BYTE:        paddb(dst, src); break;
1729         case T_SHORT:       paddw(dst, src); break;
1730         case T_INT:         paddd(dst, src); break;
1731         default:            assert(false, "wrong type");
1732       }
1733       break;
1734     case Op_AddReductionVL: paddq(dst, src); break;
1735     case Op_MulReductionVF: mulss(dst, src); break;
1736     case Op_MulReductionVD: mulsd(dst, src); break;
1737     case Op_MulReductionVI:
1738       switch (typ) {
1739         case T_SHORT:       pmullw(dst, src); break;
1740         case T_INT:         pmulld(dst, src); break;
1741         default:            assert(false, "wrong type");
1742       }
1743       break;
1744     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1745                             evpmullq(dst, dst, src, vector_len); break;
1746     default:                assert(false, "wrong opcode");
1747   }
1748 }
1749 
1750 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1751   int vector_len = Assembler::AVX_256bit;
1752 
1753   switch (opcode) {
1754     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1755     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1756     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1757     case Op_MinReductionV:
1758       switch (typ) {
1759         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1760         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1761         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1762         case T_LONG:        assert(UseAVX > 2, "required");
1763                             vpminsq(dst, src1, src2, vector_len); break;
1764         default:            assert(false, "wrong type");
1765       }
1766       break;
1767     case Op_MaxReductionV:
1768       switch (typ) {
1769         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1770         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1771         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1772         case T_LONG:        assert(UseAVX > 2, "required");
1773                             vpmaxsq(dst, src1, src2, vector_len); break;
1774         default:            assert(false, "wrong type");
1775       }
1776       break;
1777     case Op_AddReductionVI:
1778       switch (typ) {
1779         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1780         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1781         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1782         default:            assert(false, "wrong type");
1783       }
1784       break;
1785     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1786     case Op_MulReductionVI:
1787       switch (typ) {
1788         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1789         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1790         default:            assert(false, "wrong type");
1791       }
1792       break;
1793     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
1794     default:                assert(false, "wrong opcode");
1795   }
1796 }
1797 
1798 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1799                                   XMMRegister dst, XMMRegister src,
1800                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1801   switch (opcode) {
1802     case Op_AddReductionVF:
1803     case Op_MulReductionVF:
1804       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1805       break;
1806 
1807     case Op_AddReductionVD:
1808     case Op_MulReductionVD:
1809       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1810       break;
1811 
1812     default: assert(false, "wrong opcode");
1813   }
1814 }
1815 
1816 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1817                              Register dst, Register src1, XMMRegister src2,
1818                              XMMRegister vtmp1, XMMRegister vtmp2) {
1819   switch (vlen) {
1820     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1821     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1822     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1823     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1824 
1825     default: assert(false, "wrong vector length");
1826   }
1827 }
1828 
1829 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1830                              Register dst, Register src1, XMMRegister src2,
1831                              XMMRegister vtmp1, XMMRegister vtmp2) {
1832   switch (vlen) {
1833     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1834     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1835     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1836     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1837 
1838     default: assert(false, "wrong vector length");
1839   }
1840 }
1841 
1842 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1843                              Register dst, Register src1, XMMRegister src2,
1844                              XMMRegister vtmp1, XMMRegister vtmp2) {
1845   switch (vlen) {
1846     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1847     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1848     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1849     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1850 
1851     default: assert(false, "wrong vector length");
1852   }
1853 }
1854 
1855 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1856                              Register dst, Register src1, XMMRegister src2,
1857                              XMMRegister vtmp1, XMMRegister vtmp2) {
1858   switch (vlen) {
1859     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1860     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1861     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1862     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1863 
1864     default: assert(false, "wrong vector length");
1865   }
1866 }
1867 
1868 #ifdef _LP64
1869 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1870                              Register dst, Register src1, XMMRegister src2,
1871                              XMMRegister vtmp1, XMMRegister vtmp2) {
1872   switch (vlen) {
1873     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1874     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1875     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1876 
1877     default: assert(false, "wrong vector length");
1878   }
1879 }
1880 #endif // _LP64
1881 
1882 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1883   switch (vlen) {
1884     case 2:
1885       assert(vtmp2 == xnoreg, "");
1886       reduce2F(opcode, dst, src, vtmp1);
1887       break;
1888     case 4:
1889       assert(vtmp2 == xnoreg, "");
1890       reduce4F(opcode, dst, src, vtmp1);
1891       break;
1892     case 8:
1893       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1894       break;
1895     case 16:
1896       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1897       break;
1898     default: assert(false, "wrong vector length");
1899   }
1900 }
1901 
1902 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1903   switch (vlen) {
1904     case 2:
1905       assert(vtmp2 == xnoreg, "");
1906       reduce2D(opcode, dst, src, vtmp1);
1907       break;
1908     case 4:
1909       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1910       break;
1911     case 8:
1912       reduce8D(opcode, dst, src, vtmp1, vtmp2);
1913       break;
1914     default: assert(false, "wrong vector length");
1915   }
1916 }
1917 
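     // The reduce*I/L/B/S helpers below fold the vector in halves (via shuffles
     // or lane extracts) down to a single element and then combine it with the
     // scalar accumulator in src1.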
1918 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1919   if (opcode == Op_AddReductionVI) {
1920     if (vtmp1 != src2) {
1921       movdqu(vtmp1, src2);
1922     }
1923     phaddd(vtmp1, vtmp1);
1924   } else {
1925     pshufd(vtmp1, src2, 0x1);
1926     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1927   }
1928   movdl(vtmp2, src1);
1929   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1930   movdl(dst, vtmp1);
1931 }
1932 
1933 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1934   if (opcode == Op_AddReductionVI) {
1935     if (vtmp1 != src2) {
1936       movdqu(vtmp1, src2);
1937     }
1938     phaddd(vtmp1, src2);
1939     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1940   } else {
1941     pshufd(vtmp2, src2, 0xE);
1942     reduce_operation_128(T_INT, opcode, vtmp2, src2);
1943     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1944   }
1945 }
1946 
1947 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1948   if (opcode == Op_AddReductionVI) {
1949     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
1950     vextracti128_high(vtmp2, vtmp1);
1951     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
1952     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1953   } else {
1954     vextracti128_high(vtmp1, src2);
1955     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1956     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1957   }
1958 }
1959 
1960 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1961   vextracti64x4_high(vtmp2, src2);
1962   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
1963   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1964 }
1965 
1966 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1967   pshufd(vtmp2, src2, 0x1);
1968   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1969   movdqu(vtmp1, vtmp2);
1970   psrldq(vtmp1, 2);
1971   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1972   movdqu(vtmp2, vtmp1);
1973   psrldq(vtmp2, 1);
1974   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1975   movdl(vtmp2, src1);
1976   pmovsxbd(vtmp1, vtmp1);
1977   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1978   pextrb(dst, vtmp1, 0x0);
1979   movsbl(dst, dst);
1980 }
1981 
1982 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1983   pshufd(vtmp1, src2, 0xE);
1984   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
1985   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1986 }
1987 
1988 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1989   vextracti128_high(vtmp2, src2);
1990   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1991   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1992 }
1993 
1994 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1995   vextracti64x4_high(vtmp1, src2);
1996   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
1997   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1998 }
1999 
2000 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2001   pmovsxbw(vtmp2, src2);
2002   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2003 }
2004 
2005 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2006   if (UseAVX > 1) {
2007     int vector_len = Assembler::AVX_256bit;
2008     vpmovsxbw(vtmp1, src2, vector_len);
2009     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2010   } else {
2011     pmovsxbw(vtmp2, src2);
2012     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2013     pshufd(vtmp2, src2, 0xE);  // move the upper 8 bytes of src2 into the low half
2014     pmovsxbw(vtmp2, vtmp2);
2015     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2016   }
2017 }
2018 
2019 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2020   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2021     int vector_len = Assembler::AVX_512bit;
2022     vpmovsxbw(vtmp1, src2, vector_len);
2023     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2024   } else {
2025     assert(UseAVX >= 2,"Should not reach here.");
2026     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2027     vextracti128_high(vtmp2, src2);
2028     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2029   }
2030 }
2031 
2032 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2033   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2034   vextracti64x4_high(vtmp2, src2);
2035   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2036 }
2037 
2038 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2039   if (opcode == Op_AddReductionVI) {
2040     if (vtmp1 != src2) {
2041       movdqu(vtmp1, src2);
2042     }
2043     phaddw(vtmp1, vtmp1);
2044     phaddw(vtmp1, vtmp1);
2045   } else {
2046     pshufd(vtmp2, src2, 0x1);
2047     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2048     movdqu(vtmp1, vtmp2);
2049     psrldq(vtmp1, 2);
2050     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2051   }
2052   movdl(vtmp2, src1);
2053   pmovsxwd(vtmp1, vtmp1);
2054   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2055   pextrw(dst, vtmp1, 0x0);
2056   movswl(dst, dst);
2057 }
2058 
2059 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2060   if (opcode == Op_AddReductionVI) {
2061     if (vtmp1 != src2) {
2062       movdqu(vtmp1, src2);
2063     }
2064     phaddw(vtmp1, src2);
2065   } else {
2066     pshufd(vtmp1, src2, 0xE);
2067     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2068   }
2069   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2070 }
2071 
2072 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2073   if (opcode == Op_AddReductionVI) {
2074     int vector_len = Assembler::AVX_256bit;
2075     vphaddw(vtmp2, src2, src2, vector_len);
2076     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2077   } else {
2078     vextracti128_high(vtmp2, src2);
2079     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2080   }
2081   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2082 }
2083 
2084 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2085   int vector_len = Assembler::AVX_256bit;
2086   vextracti64x4_high(vtmp1, src2);
2087   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2088   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2089 }
2090 
2091 #ifdef _LP64
2092 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2093   pshufd(vtmp2, src2, 0xE);
2094   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2095   movdq(vtmp1, src1);
2096   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2097   movdq(dst, vtmp1);
2098 }
2099 
2100 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2101   vextracti128_high(vtmp1, src2);
2102   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2103   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2104 }
2105 
2106 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2107   vextracti64x4_high(vtmp2, src2);
2108   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2109   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2110 }
2111 
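     // Builds a mask register whose low 'len' bits are set, using BZHI on an
     // all-ones value.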
2112 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
2113   mov64(temp, -1L);
2114   bzhiq(temp, temp, len);
2115   kmovql(dst, temp);
2116 }
2117 #endif // _LP64
2118 
2119 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2120   reduce_operation_128(T_FLOAT, opcode, dst, src);
2121   pshufd(vtmp, src, 0x1);
2122   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2123 }
2124 
2125 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2126   reduce2F(opcode, dst, src, vtmp);
2127   pshufd(vtmp, src, 0x2);
2128   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2129   pshufd(vtmp, src, 0x3);
2130   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2131 }
2132 
2133 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2134   reduce4F(opcode, dst, src, vtmp2);
2135   vextractf128_high(vtmp2, src);
2136   reduce4F(opcode, dst, vtmp2, vtmp1);
2137 }
2138 
2139 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2140   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2141   vextracti64x4_high(vtmp1, src);
2142   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2143 }
2144 
2145 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2146   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2147   pshufd(vtmp, src, 0xE);
2148   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2149 }
2150 
2151 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2152   reduce2D(opcode, dst, src, vtmp2);
2153   vextractf128_high(vtmp2, src);
2154   reduce2D(opcode, dst, vtmp2, vtmp1);
2155 }
2156 
2157 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2158   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2159   vextracti64x4_high(vtmp1, src);
2160   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2161 }
2162 
2163 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2164   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2165 }
2166 
2167 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2168   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2169 }
2170 
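     // Masked vector load/store using VMASKMOVPS/VMASKMOVPD; only 32- and
     // 64-bit element types are supported.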
2171 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2172                                  int vec_enc) {
2173   switch(elem_bt) {
2174     case T_INT:
2175     case T_FLOAT:
2176       vmaskmovps(dst, src, mask, vec_enc);
2177       break;
2178     case T_LONG:
2179     case T_DOUBLE:
2180       vmaskmovpd(dst, src, mask, vec_enc);
2181       break;
2182     default:
2183       fatal("Unsupported type %s", type2name(elem_bt));
2184       break;
2185   }
2186 }
2187 
2188 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2189                                  int vec_enc) {
2190   switch(elem_bt) {
2191     case T_INT:
2192     case T_FLOAT:
2193       vmaskmovps(dst, src, mask, vec_enc);
2194       break;
2195     case T_LONG:
2196     case T_DOUBLE:
2197       vmaskmovpd(dst, src, mask, vec_enc);
2198       break;
2199     default:
2200       fatal("Unsupported type %s", type2name(elem_bt));
2201       break;
2202   }
2203 }
2204 
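     // Min/max reduction over float lanes: each iteration brings the upper half
     // down (vextract or vpermilps) and combines it with the lower half via
     // vminmax_fp, halving the active width; a valid partial result already in
     // dst is folded in at the end.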
2205 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2206                                           XMMRegister dst, XMMRegister src,
2207                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2208                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2209   const int permconst[] = {1, 14};
2210   XMMRegister wsrc = src;
2211   XMMRegister wdst = xmm_0;
2212   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2213 
2214   int vlen_enc = Assembler::AVX_128bit;
2215   if (vlen == 16) {
2216     vlen_enc = Assembler::AVX_256bit;
2217   }
2218 
2219   for (int i = log2(vlen) - 1; i >=0; i--) {
2220     if (i == 0 && !is_dst_valid) {
2221       wdst = dst;
2222     }
2223     if (i == 3) {
2224       vextracti64x4_high(wtmp, wsrc);
2225     } else if (i == 2) {
2226       vextracti128_high(wtmp, wsrc);
2227     } else { // i = [0,1]
2228       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2229     }
2230     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2231     wsrc = wdst;
2232     vlen_enc = Assembler::AVX_128bit;
2233   }
2234   if (is_dst_valid) {
2235     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2236   }
2237 }
2238 
2239 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2240                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2241                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2242   XMMRegister wsrc = src;
2243   XMMRegister wdst = xmm_0;
2244   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2245   int vlen_enc = Assembler::AVX_128bit;
2246   if (vlen == 8) {
2247     vlen_enc = Assembler::AVX_256bit;
2248   }
2249   for (int i = log2(vlen) - 1; i >=0; i--) {
2250     if (i == 0 && !is_dst_valid) {
2251       wdst = dst;
2252     }
2253     if (i == 1) {
2254       vextracti128_high(wtmp, wsrc);
2255     } else if (i == 2) {
2256       vextracti64x4_high(wtmp, wsrc);
2257     } else {
2258       assert(i == 0, "%d", i);
2259       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2260     }
2261     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2262     wsrc = wdst;
2263     vlen_enc = Assembler::AVX_128bit;
2264   }
2265   if (is_dst_valid) {
2266     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2267   }
2268 }
2269 
2270 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2271   switch (bt) {
2272     case T_BYTE:  pextrb(dst, src, idx); break;
2273     case T_SHORT: pextrw(dst, src, idx); break;
2274     case T_INT:   pextrd(dst, src, idx); break;
2275     case T_LONG:  pextrq(dst, src, idx); break;
2276 
2277     default:
2278       assert(false,"Should not reach here.");
2279       break;
2280   }
2281 }
2282 
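     // Returns the 128-bit lane that contains 'elemindex': lane 0 is used from
     // src directly, higher lanes are first extracted into dst.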
2283 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2284   int esize =  type2aelembytes(typ);
2285   int elem_per_lane = 16/esize;
2286   int lane = elemindex / elem_per_lane;
2287   int eindex = elemindex % elem_per_lane;
2288 
2289   if (lane >= 2) {
2290     assert(UseAVX > 2, "required");
2291     vextractf32x4(dst, src, lane & 3);
2292     return dst;
2293   } else if (lane > 0) {
2294     assert(UseAVX > 0, "required");
2295     vextractf128(dst, src, lane);
2296     return dst;
2297   } else {
2298     return src;
2299   }
2300 }
2301 
2302 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2303   if (typ == T_BYTE) {
2304     movsbl(dst, dst);
2305   } else if (typ == T_SHORT) {
2306     movswl(dst, dst);
2307   }
2308 }
2309 
2310 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2311   int esize =  type2aelembytes(typ);
2312   int elem_per_lane = 16/esize;
2313   int eindex = elemindex % elem_per_lane;
2314   assert(is_integral_type(typ),"required");
2315 
2316   if (eindex == 0) {
2317     if (typ == T_LONG) {
2318       movq(dst, src);
2319     } else {
2320       movdl(dst, src);
2321       movsxl(typ, dst);
2322     }
2323   } else {
2324     extract(typ, dst, src, eindex);
2325     movsxl(typ, dst);
2326   }
2327 }
2328 
2329 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2330   int esize =  type2aelembytes(typ);
2331   int elem_per_lane = 16/esize;
2332   int eindex = elemindex % elem_per_lane;
2333   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2334 
2335   if (eindex == 0) {
2336     movq(dst, src);
2337   } else {
2338     if (typ == T_FLOAT) {
2339       if (UseAVX == 0) {
2340         movdqu(dst, src);
2341         shufps(dst, dst, eindex);
2342       } else {
2343         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2344       }
2345     } else {
2346       if (UseAVX == 0) {
2347         movdqu(dst, src);
2348         psrldq(dst, eindex*esize);
2349       } else {
2350         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2351       }
2352       movq(dst, dst);
2353     }
2354   }
2355   // Zero upper bits
2356   if (typ == T_FLOAT) {
2357     if (UseAVX == 0) {
2358       assert(vtmp != xnoreg, "required.");
2359       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2360       pand(dst, vtmp);
2361     } else {
2362       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2363     }
2364   }
2365 }
2366 
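     // AVX-512 masked compare dispatch: the compare width follows the element
     // type, with T_FLOAT/T_DOUBLE sharing the 32/64-bit integer forms.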
2367 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2368   switch(typ) {
2369     case T_BYTE:
2370     case T_BOOLEAN:
2371       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2372       break;
2373     case T_SHORT:
2374     case T_CHAR:
2375       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2376       break;
2377     case T_INT:
2378     case T_FLOAT:
2379       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2380       break;
2381     case T_LONG:
2382     case T_DOUBLE:
2383       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2384       break;
2385     default:
2386       assert(false,"Should not reach here.");
2387       break;
2388   }
2389 }
2390 
2391 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2392   assert(rscratch != noreg || always_reachable(src2), "missing");
2393 
2394   switch(typ) {
2395     case T_BOOLEAN:
2396     case T_BYTE:
2397       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2398       break;
2399     case T_CHAR:
2400     case T_SHORT:
2401       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2402       break;
2403     case T_INT:
2404     case T_FLOAT:
2405       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2406       break;
2407     case T_LONG:
2408     case T_DOUBLE:
2409       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2410       break;
2411     default:
2412       assert(false,"Should not reach here.");
2413       break;
2414   }
2415 }
2416 
2417 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2418   switch(typ) {
2419     case T_BYTE:
2420       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2421       break;
2422     case T_SHORT:
2423       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2424       break;
2425     case T_INT:
2426     case T_FLOAT:
2427       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2428       break;
2429     case T_LONG:
2430     case T_DOUBLE:
2431       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2432       break;
2433     default:
2434       assert(false,"Should not reach here.");
2435       break;
2436   }
2437 }
2438 
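     // Sets the condition flags by testing the vector lanes with ptest/vtestps;
     // vectors shorter than 16 bytes are first widened by duplicating the low
     // part of src1.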
2439 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2440   assert(vlen_in_bytes <= 32, "");
2441   int esize = type2aelembytes(bt);
2442   if (vlen_in_bytes == 32) {
2443     assert(vtmp == xnoreg, "required.");
2444     if (esize >= 4) {
2445       vtestps(src1, src2, AVX_256bit);
2446     } else {
2447       vptest(src1, src2, AVX_256bit);
2448     }
2449     return;
2450   }
2451   if (vlen_in_bytes < 16) {
2452     // Duplicate the lower part to fill the whole register;
2453     // there is no need to do so for src2.
2454     assert(vtmp != xnoreg, "required");
2455     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2456     pshufd(vtmp, src1, shuffle_imm);
2457   } else {
2458     assert(vtmp == xnoreg, "required");
2459     vtmp = src1;
2460   }
2461   if (esize >= 4 && VM_Version::supports_avx()) {
2462     vtestps(vtmp, src2, AVX_128bit);
2463   } else {
2464     ptest(vtmp, src2);
2465   }
2466 }
2467 
2468 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2469   assert(UseAVX >= 2, "required");
2470 #ifdef ASSERT
2471   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2472   bool is_bw_supported = VM_Version::supports_avx512bw();
2473   if (is_bw && !is_bw_supported) {
2474     assert(vlen_enc != Assembler::AVX_512bit, "required");
2475     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2476            "XMM register should be 0-15");
2477   }
2478 #endif // ASSERT
2479   switch (elem_bt) {
2480     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2481     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2482     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2483     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2484     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2485     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2486     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2487   }
2488 }
2489 
2490 #ifdef _LP64
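     // Broadcasts a general-purpose register to all vector lanes. With AVX-512
     // (plus BW/VL where required) the EVEX GPR-source broadcasts are used;
     // otherwise the value is moved into an XMM register and broadcast from there.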
2491 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2492   assert(UseAVX >= 2, "required");
2493   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2494   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2495   if ((UseAVX > 2) &&
2496       (!is_bw || VM_Version::supports_avx512bw()) &&
2497       (!is_vl || VM_Version::supports_avx512vl())) {
2498     switch (elem_bt) {
2499       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2500       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2501       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2502       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2503       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2504     }
2505   } else {
2506     assert(vlen_enc != Assembler::AVX_512bit, "required");
2507     assert((dst->encoding() < 16),"XMM register should be 0-15");
2508     switch (elem_bt) {
2509       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2510       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2511       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2512       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2513       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2514       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2515       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2516     }
2517   }
2518 }
2519 #endif
2520 
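     // Converts packed bytes to the requested element type: sign-extend for
     // integral targets, sign-extend then vcvtdq2ps/vcvtdq2pd for FP targets.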
2521 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2522   switch (to_elem_bt) {
2523     case T_SHORT:
2524       vpmovsxbw(dst, src, vlen_enc);
2525       break;
2526     case T_INT:
2527       vpmovsxbd(dst, src, vlen_enc);
2528       break;
2529     case T_FLOAT:
2530       vpmovsxbd(dst, src, vlen_enc);
2531       vcvtdq2ps(dst, dst, vlen_enc);
2532       break;
2533     case T_LONG:
2534       vpmovsxbq(dst, src, vlen_enc);
2535       break;
2536     case T_DOUBLE: {
2537       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2538       vpmovsxbd(dst, src, mid_vlen_enc);
2539       vcvtdq2pd(dst, dst, vlen_enc);
2540       break;
2541     }
2542     default:
2543       fatal("Unsupported type %s", type2name(to_elem_bt));
2544       break;
2545   }
2546 }
2547 
2548 //-------------------------------------------------------------------------------------------
2549 
2550 // IndexOf for constant substrings with size >= 8 chars
2551 // which don't need to be loaded through the stack.
2552 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2553                                          Register cnt1, Register cnt2,
2554                                          int int_cnt2,  Register result,
2555                                          XMMRegister vec, Register tmp,
2556                                          int ae) {
2557   ShortBranchVerifier sbv(this);
2558   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2559   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2560 
2561   // This method uses the pcmpestri instruction with bound registers
2562   //   inputs:
2563   //     xmm - substring
2564   //     rax - substring length (elements count)
2565   //     mem - scanned string
2566   //     rdx - string length (elements count)
2567   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2568   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2569   //   outputs:
2570   //     rcx - matched index in string
2571   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2572   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2573   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2574   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2575   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2576 
2577   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2578         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2579         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2580 
2581   // Note, inline_string_indexOf() generates checks:
2582   // if (substr.count > string.count) return -1;
2583   // if (substr.count == 0) return 0;
2584   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2585 
2586   // Load substring.
2587   if (ae == StrIntrinsicNode::UL) {
2588     pmovzxbw(vec, Address(str2, 0));
2589   } else {
2590     movdqu(vec, Address(str2, 0));
2591   }
2592   movl(cnt2, int_cnt2);
2593   movptr(result, str1); // string addr
2594 
2595   if (int_cnt2 > stride) {
2596     jmpb(SCAN_TO_SUBSTR);
2597 
2598     // Reload substr for rescan; this code
2599     // is executed only for large substrings (> 8 chars).
2600     bind(RELOAD_SUBSTR);
2601     if (ae == StrIntrinsicNode::UL) {
2602       pmovzxbw(vec, Address(str2, 0));
2603     } else {
2604       movdqu(vec, Address(str2, 0));
2605     }
2606     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2607 
2608     bind(RELOAD_STR);
2609     // We came here after the beginning of the substring was
2610     // matched but the rest of it was not, so we need to search
2611     // again. Start from the next element after the previous match.
2612 
2613     // cnt2 is the number of remaining substring elements and
2614     // cnt1 is the number of remaining string elements when the compare failed.
2615     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2616     subl(cnt1, cnt2);
2617     addl(cnt1, int_cnt2);
2618     movl(cnt2, int_cnt2); // Now restore cnt2
2619 
2620     decrementl(cnt1);     // Shift to next element
2621     cmpl(cnt1, cnt2);
2622     jcc(Assembler::negative, RET_NOT_FOUND);  // Fewer chars left than the substring
2623 
2624     addptr(result, (1<<scale1));
2625 
2626   } // (int_cnt2 > 8)
2627 
2628   // Scan string for start of substr in 16-byte vectors
2629   bind(SCAN_TO_SUBSTR);
2630   pcmpestri(vec, Address(result, 0), mode);
2631   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2632   subl(cnt1, stride);
2633   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2634   cmpl(cnt1, cnt2);
2635   jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer chars left than the substring
2636   addptr(result, 16);
2637   jmpb(SCAN_TO_SUBSTR);
2638 
2639   // Found a potential substr
2640   bind(FOUND_CANDIDATE);
2641   // Matched whole vector if first element matched (tmp(rcx) == 0).
2642   if (int_cnt2 == stride) {
2643     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2644   } else { // int_cnt2 > 8
2645     jccb(Assembler::overflow, FOUND_SUBSTR);
2646   }
2647   // After pcmpestri tmp(rcx) contains matched element index
2648   // Compute start addr of substr
2649   lea(result, Address(result, tmp, scale1));
2650 
2651   // Make sure string is still long enough
2652   subl(cnt1, tmp);
2653   cmpl(cnt1, cnt2);
2654   if (int_cnt2 == stride) {
2655     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2656   } else { // int_cnt2 > 8
2657     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2658   }
2659   // Left less than substring.
2660 
2661   bind(RET_NOT_FOUND);
2662   movl(result, -1);
2663   jmp(EXIT);
2664 
2665   if (int_cnt2 > stride) {
2666     // This code is optimized for the case when the whole substring
2667     // is matched once its head is matched.
2668     bind(MATCH_SUBSTR_HEAD);
2669     pcmpestri(vec, Address(result, 0), mode);
2670     // Reload only the string if it does not match
2671     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2672 
2673     Label CONT_SCAN_SUBSTR;
2674     // Compare the rest of substring (> 8 chars).
2675     bind(FOUND_SUBSTR);
2676     // First 8 chars are already matched.
2677     negptr(cnt2);
2678     addptr(cnt2, stride);
2679 
2680     bind(SCAN_SUBSTR);
2681     subl(cnt1, stride);
2682     cmpl(cnt2, -stride); // Do not read beyond substring
2683     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2684     // Back-up strings to avoid reading beyond substring:
2685     // cnt1 = cnt1 - cnt2 + 8
2686     addl(cnt1, cnt2); // cnt2 is negative
2687     addl(cnt1, stride);
2688     movl(cnt2, stride); negptr(cnt2);
2689     bind(CONT_SCAN_SUBSTR);
2690     if (int_cnt2 < (int)G) {
2691       int tail_off1 = int_cnt2<<scale1;
2692       int tail_off2 = int_cnt2<<scale2;
2693       if (ae == StrIntrinsicNode::UL) {
2694         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2695       } else {
2696         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2697       }
2698       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2699     } else {
2700       // calculate index in register to avoid integer overflow (int_cnt2*2)
2701       movl(tmp, int_cnt2);
2702       addptr(tmp, cnt2);
2703       if (ae == StrIntrinsicNode::UL) {
2704         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2705       } else {
2706         movdqu(vec, Address(str2, tmp, scale2, 0));
2707       }
2708       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2709     }
2710     // Need to reload string pointers if the whole vector did not match
2711     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2712     addptr(cnt2, stride);
2713     jcc(Assembler::negative, SCAN_SUBSTR);
2714     // Fall through if found full substring
2715 
2716   } // (int_cnt2 > 8)
2717 
2718   bind(RET_FOUND);
2719   // Found result if we matched full small substring.
2720   // Compute substr offset
2721   subptr(result, str1);
2722   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2723     shrl(result, 1); // index
2724   }
2725   bind(EXIT);
2726 
2727 } // string_indexofC8
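
     // A rough scalar sketch of what the vector code above implements (illustrative
     // only; the method name and parameters are invented, this is not the Java source
     // of the intrinsic):
     //
     //   static int indexOfC8(char[] str, int strLen, char[] sub, int subLen) {
     //     // caller guarantees subLen >= one full vector and subLen <= strLen
     //     for (int i = 0; i + subLen <= strLen; i++) {      // candidate start
     //       int j = 0;
     //       while (j < subLen && str[i + j] == sub[j]) {    // verify the rest
     //         j++;
     //       }
     //       if (j == subLen) {
     //         return i;                                     // full match
     //       }
     //     }
     //     return -1;
     //   }
     //
     // The pcmpestri loop above does the candidate scan and the verification
     // 8 (UU/UL) or 16 (LL) elements at a time; the back-up steps keep the
     // 16-byte verification reads inside the substring.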
2728 
2729 // Small strings are loaded through the stack if they cross a page boundary.
2730 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2731                                        Register cnt1, Register cnt2,
2732                                        int int_cnt2,  Register result,
2733                                        XMMRegister vec, Register tmp,
2734                                        int ae) {
2735   ShortBranchVerifier sbv(this);
2736   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2737   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2738 
2739   //
2740   // int_cnt2 is length of small (< 8 chars) constant substring
2741   // or (-1) for non constant substring in which case its length
2742   // is in cnt2 register.
2743   //
2744   // Note, inline_string_indexOf() generates checks:
2745   // if (substr.count > string.count) return -1;
2746   // if (substr.count == 0) return 0;
2747   //
2748   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2749   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2750   // This method uses the pcmpestri instruction with bound registers
2751   //   inputs:
2752   //     xmm - substring
2753   //     rax - substring length (elements count)
2754   //     mem - scanned string
2755   //     rdx - string length (elements count)
2756   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2757   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2758   //   outputs:
2759   //     rcx - matched index in string
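       //
       // imm8 layout of these modes (per the Intel SDM encoding of pcmpestri):
       //   bits [1:0] - source data format: 00 = unsigned bytes, 01 = unsigned words
       //   bits [3:2] - aggregation op:     11 = "equal ordered" (substring search)
       //   bits [5:4] - polarity:           00 = positive
       //   bit  [6]   - index selection:    0  = least significant matching index
       // hence 0x0c for byte elements (LL) and 0x0d for word elements (UU, UL).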
2760   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2761   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2762   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2763   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2764 
2765   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2766         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2767         FOUND_CANDIDATE;
2768 
2769   { //========================================================
2770     // We don't know where these strings are located
2771     // and we can't read beyond them. Load them through stack.
2772     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2773 
2774     movptr(tmp, rsp); // save old SP
2775 
2776     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2777       if (int_cnt2 == (1>>scale2)) { // One byte
2778         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2779         load_unsigned_byte(result, Address(str2, 0));
2780         movdl(vec, result); // move 32 bits
2781       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2782         // Not enough header space in 32-bit VM: 12+3 = 15.
2783         movl(result, Address(str2, -1));
2784         shrl(result, 8);
2785         movdl(vec, result); // move 32 bits
2786       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2787         load_unsigned_short(result, Address(str2, 0));
2788         movdl(vec, result); // move 32 bits
2789       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2790         movdl(vec, Address(str2, 0)); // move 32 bits
2791       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2792         movq(vec, Address(str2, 0));  // move 64 bits
2793       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
2794         // Array header size is 12 bytes in 32-bit VM
2795         // + 6 bytes for 3 chars == 18 bytes,
2796         // enough space to load vec and shift.
2797         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
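             // The loads below read the 8/16 bytes that end exactly at the end of the
             // substring (the slack before str2 is provided by the array header checked
             // above); psrldq then shifts the unwanted leading bytes out so the
             // substring starts at byte 0 of vec.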
2798         if (ae == StrIntrinsicNode::UL) {
2799           int tail_off = int_cnt2-8;
2800           pmovzxbw(vec, Address(str2, tail_off));
2801           psrldq(vec, -2*tail_off);
2802         }
2803         else {
2804           int tail_off = int_cnt2*(1<<scale2);
2805           movdqu(vec, Address(str2, tail_off-16));
2806           psrldq(vec, 16-tail_off);
2807         }
2808       }
2809     } else { // not constant substring
2810       cmpl(cnt2, stride);
2811       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2812 
2813       // We can read beyond the string if str+16 does not cross a page boundary
2814       // since heaps are aligned and mapped by pages.
2815       assert(os::vm_page_size() < (int)G, "default page should be small");
2816       movl(result, str2); // We need only low 32 bits
2817       andl(result, ((int)os::vm_page_size()-1));
2818       cmpl(result, ((int)os::vm_page_size()-16));
2819       jccb(Assembler::belowEqual, CHECK_STR);
2820 
2821       // Move small strings to the stack to allow loading 16 bytes into vec.
2822       subptr(rsp, 16);
2823       int stk_offset = wordSize-(1<<scale2);
2824       push(cnt2);
2825 
2826       bind(COPY_SUBSTR);
2827       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2828         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2829         movb(Address(rsp, cnt2, scale2, stk_offset), result);
2830       } else if (ae == StrIntrinsicNode::UU) {
2831         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2832         movw(Address(rsp, cnt2, scale2, stk_offset), result);
2833       }
2834       decrement(cnt2);
2835       jccb(Assembler::notZero, COPY_SUBSTR);
2836 
2837       pop(cnt2);
2838       movptr(str2, rsp);  // New substring address
2839     } // non constant
2840 
2841     bind(CHECK_STR);
2842     cmpl(cnt1, stride);
2843     jccb(Assembler::aboveEqual, BIG_STRINGS);
2844 
2845     // Check cross page boundary.
2846     movl(result, str1); // We need only low 32 bits
2847     andl(result, ((int)os::vm_page_size()-1));
2848     cmpl(result, ((int)os::vm_page_size()-16));
2849     jccb(Assembler::belowEqual, BIG_STRINGS);
2850 
2851     subptr(rsp, 16);
2852     int stk_offset = -(1<<scale1);
2853     if (int_cnt2 < 0) { // not constant
2854       push(cnt2);
2855       stk_offset += wordSize;
2856     }
2857     movl(cnt2, cnt1);
2858 
2859     bind(COPY_STR);
2860     if (ae == StrIntrinsicNode::LL) {
2861       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
2862       movb(Address(rsp, cnt2, scale1, stk_offset), result);
2863     } else {
2864       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
2865       movw(Address(rsp, cnt2, scale1, stk_offset), result);
2866     }
2867     decrement(cnt2);
2868     jccb(Assembler::notZero, COPY_STR);
2869 
2870     if (int_cnt2 < 0) { // not constant
2871       pop(cnt2);
2872     }
2873     movptr(str1, rsp);  // New string address
2874 
2875     bind(BIG_STRINGS);
2876     // Load substring.
2877     if (int_cnt2 < 0) { // -1
2878       if (ae == StrIntrinsicNode::UL) {
2879         pmovzxbw(vec, Address(str2, 0));
2880       } else {
2881         movdqu(vec, Address(str2, 0));
2882       }
2883       push(cnt2);       // substr count
2884       push(str2);       // substr addr
2885       push(str1);       // string addr
2886     } else {
2887       // Small (< 8 chars) constant substrings are loaded already.
2888       movl(cnt2, int_cnt2);
2889     }
2890     push(tmp);  // original SP
2891 
2892   } // Finished loading
2893 
2894   //========================================================
2895   // Start search
2896   //
2897 
2898   movptr(result, str1); // string addr
2899 
2900   if (int_cnt2  < 0) {  // Only for non constant substring
2901     jmpb(SCAN_TO_SUBSTR);
2902 
2903     // SP saved at sp+0
2904     // String saved at sp+1*wordSize
2905     // Substr saved at sp+2*wordSize
2906     // Substr count saved at sp+3*wordSize
2907 
2908     // Reload substr for rescan; this code
2909     // is executed only for large substrings (> 8 chars)
2910     bind(RELOAD_SUBSTR);
2911     movptr(str2, Address(rsp, 2*wordSize));
2912     movl(cnt2, Address(rsp, 3*wordSize));
2913     if (ae == StrIntrinsicNode::UL) {
2914       pmovzxbw(vec, Address(str2, 0));
2915     } else {
2916       movdqu(vec, Address(str2, 0));
2917     }
2918     // We came here after the beginning of the substring was
2919     // matched but the rest of it was not so we need to search
2920     // again. Start from the next element after the previous match.
2921     subptr(str1, result); // Restore counter
2922     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2923       shrl(str1, 1);
2924     }
2925     addl(cnt1, str1);
2926     decrementl(cnt1);   // Shift to next element
2927     cmpl(cnt1, cnt2);
2928     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2929 
2930     addptr(result, (1<<scale1));
2931   } // non constant
2932 
2933   // Scan string for start of substr in 16-byte vectors
2934   bind(SCAN_TO_SUBSTR);
2935   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2936   pcmpestri(vec, Address(result, 0), mode);
2937   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2938   subl(cnt1, stride);
2939   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2940   cmpl(cnt1, cnt2);
2941   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2942   addptr(result, 16);
2943 
2944   bind(ADJUST_STR);
2945   cmpl(cnt1, stride); // Do not read beyond string
2946   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2947   // Back-up string to avoid reading beyond string.
2948   lea(result, Address(result, cnt1, scale1, -16));
2949   movl(cnt1, stride);
2950   jmpb(SCAN_TO_SUBSTR);
2951 
2952   // Found a potential substr
2953   bind(FOUND_CANDIDATE);
2954   // After pcmpestri tmp(rcx) contains matched element index
2955 
2956   // Make sure string is still long enough
2957   subl(cnt1, tmp);
2958   cmpl(cnt1, cnt2);
2959   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
2960   // Left less than substring.
2961 
2962   bind(RET_NOT_FOUND);
2963   movl(result, -1);
2964   jmp(CLEANUP);
2965 
2966   bind(FOUND_SUBSTR);
2967   // Compute start addr of substr
2968   lea(result, Address(result, tmp, scale1));
2969   if (int_cnt2 > 0) { // Constant substring
2970     // Repeat search for small substring (< 8 chars)
2971     // from new point without reloading substring.
2972     // Have to check that we don't read beyond string.
2973     cmpl(tmp, stride-int_cnt2);
2974     jccb(Assembler::greater, ADJUST_STR);
2975     // Fall through if matched whole substring.
2976   } else { // non constant
2977     assert(int_cnt2 == -1, "should be != 0");
2978 
2979     addl(tmp, cnt2);
2980     // Found result if we matched whole substring.
2981     cmpl(tmp, stride);
2982     jcc(Assembler::lessEqual, RET_FOUND);
2983 
2984     // Repeat search for small substring (<= 8 chars)
2985     // from new point 'str1' without reloading substring.
2986     cmpl(cnt2, stride);
2987     // Have to check that we don't read beyond string.
2988     jccb(Assembler::lessEqual, ADJUST_STR);
2989 
2990     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
2991     // Compare the rest of substring (> 8 chars).
2992     movptr(str1, result);
2993 
2994     cmpl(tmp, cnt2);
2995     // First 8 chars are already matched.
2996     jccb(Assembler::equal, CHECK_NEXT);
2997 
2998     bind(SCAN_SUBSTR);
2999     pcmpestri(vec, Address(str1, 0), mode);
3000     // Need to reload string pointers if the whole vector did not match
3001     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3002 
3003     bind(CHECK_NEXT);
3004     subl(cnt2, stride);
3005     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3006     addptr(str1, 16);
3007     if (ae == StrIntrinsicNode::UL) {
3008       addptr(str2, 8);
3009     } else {
3010       addptr(str2, 16);
3011     }
3012     subl(cnt1, stride);
3013     cmpl(cnt2, stride); // Do not read beyond substring
3014     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3015     // Back-up strings to avoid reading beyond substring.
3016 
3017     if (ae == StrIntrinsicNode::UL) {
3018       lea(str2, Address(str2, cnt2, scale2, -8));
3019       lea(str1, Address(str1, cnt2, scale1, -16));
3020     } else {
3021       lea(str2, Address(str2, cnt2, scale2, -16));
3022       lea(str1, Address(str1, cnt2, scale1, -16));
3023     }
3024     subl(cnt1, cnt2);
3025     movl(cnt2, stride);
3026     addl(cnt1, stride);
3027     bind(CONT_SCAN_SUBSTR);
3028     if (ae == StrIntrinsicNode::UL) {
3029       pmovzxbw(vec, Address(str2, 0));
3030     } else {
3031       movdqu(vec, Address(str2, 0));
3032     }
3033     jmp(SCAN_SUBSTR);
3034 
3035     bind(RET_FOUND_LONG);
3036     movptr(str1, Address(rsp, wordSize));
3037   } // non constant
3038 
3039   bind(RET_FOUND);
3040   // Compute substr offset
3041   subptr(result, str1);
3042   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3043     shrl(result, 1); // index
3044   }
3045   bind(CLEANUP);
3046   pop(rsp); // restore SP
3047 
3048 } // string_indexof
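
     // The stack-copy prologue above exists because the 16-byte pcmpestri loads may
     // read past the end of a short string, which is only safe when the read cannot
     // spill into an unmapped page. A rough sketch of the per-address check
     // (illustrative only; the name and parameters are invented):
     //
     //   static boolean mayCrossPage(long addr, int pageSize) { // pageSize is a power of two
     //     return (addr & (pageSize - 1)) > pageSize - 16;      // a 16-byte load would span pages
     //   }
     //
     // Short strings for which this is true are first copied to the stack, where a
     // full 16-byte load is always safe.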
3049 
3050 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3051                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3052   ShortBranchVerifier sbv(this);
3053   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3054 
3055   int stride = 8;
3056 
3057   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3058         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3059         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3060         FOUND_SEQ_CHAR, DONE_LABEL;
3061 
3062   movptr(result, str1);
3063   if (UseAVX >= 2) {
3064     cmpl(cnt1, stride);
3065     jcc(Assembler::less, SCAN_TO_CHAR);
3066     cmpl(cnt1, 2*stride);
3067     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3068     movdl(vec1, ch);
3069     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3070     vpxor(vec2, vec2);
3071     movl(tmp, cnt1);
3072     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3073     andl(cnt1,0x0000000F);  //tail count (in chars)
3074 
3075     bind(SCAN_TO_16_CHAR_LOOP);
3076     vmovdqu(vec3, Address(result, 0));
3077     vpcmpeqw(vec3, vec3, vec1, 1);
3078     vptest(vec2, vec3);
3079     jcc(Assembler::carryClear, FOUND_CHAR);
3080     addptr(result, 32);
3081     subl(tmp, 2*stride);
3082     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3083     jmp(SCAN_TO_8_CHAR);
3084     bind(SCAN_TO_8_CHAR_INIT);
3085     movdl(vec1, ch);
3086     pshuflw(vec1, vec1, 0x00);
3087     pshufd(vec1, vec1, 0);
3088     pxor(vec2, vec2);
3089   }
3090   bind(SCAN_TO_8_CHAR);
3091   cmpl(cnt1, stride);
3092   jcc(Assembler::less, SCAN_TO_CHAR);
3093   if (UseAVX < 2) {
3094     movdl(vec1, ch);
3095     pshuflw(vec1, vec1, 0x00);
3096     pshufd(vec1, vec1, 0);
3097     pxor(vec2, vec2);
3098   }
3099   movl(tmp, cnt1);
3100   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3101   andl(cnt1,0x00000007);  //tail count (in chars)
3102 
3103   bind(SCAN_TO_8_CHAR_LOOP);
3104   movdqu(vec3, Address(result, 0));
3105   pcmpeqw(vec3, vec1);
3106   ptest(vec2, vec3);
3107   jcc(Assembler::carryClear, FOUND_CHAR);
3108   addptr(result, 16);
3109   subl(tmp, stride);
3110   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3111   bind(SCAN_TO_CHAR);
3112   testl(cnt1, cnt1);
3113   jcc(Assembler::zero, RET_NOT_FOUND);
3114   bind(SCAN_TO_CHAR_LOOP);
3115   load_unsigned_short(tmp, Address(result, 0));
3116   cmpl(ch, tmp);
3117   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3118   addptr(result, 2);
3119   subl(cnt1, 1);
3120   jccb(Assembler::zero, RET_NOT_FOUND);
3121   jmp(SCAN_TO_CHAR_LOOP);
3122 
3123   bind(RET_NOT_FOUND);
3124   movl(result, -1);
3125   jmpb(DONE_LABEL);
3126 
3127   bind(FOUND_CHAR);
3128   if (UseAVX >= 2) {
3129     vpmovmskb(tmp, vec3);
3130   } else {
3131     pmovmskb(tmp, vec3);
3132   }
3133   bsfl(ch, tmp);
3134   addptr(result, ch);
3135 
3136   bind(FOUND_SEQ_CHAR);
3137   subptr(result, str1);
3138   shrl(result, 1);
3139 
3140   bind(DONE_LABEL);
3141 } // string_indexof_char
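
     // Scalar equivalent of the character search above (illustrative sketch only;
     // the method name and parameters are invented):
     //
     //   static int indexOfChar(char[] value, char ch, int count) {
     //     for (int i = 0; i < count; i++) {
     //       if (value[i] == ch) {
     //         return i;
     //       }
     //     }
     //     return -1;
     //   }
     //
     // The vector paths above test 16 chars (AVX2) or 8 chars (SSE4.2) per iteration
     // and use the element-at-a-time loop at SCAN_TO_CHAR_LOOP for the tail.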
3142 
3143 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3144                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3145   ShortBranchVerifier sbv(this);
3146   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3147 
3148   int stride = 16;
3149 
3150   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3151         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3152         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3153         FOUND_SEQ_CHAR, DONE_LABEL;
3154 
3155   movptr(result, str1);
3156   if (UseAVX >= 2) {
3157     cmpl(cnt1, stride);
3158     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3159     cmpl(cnt1, stride*2);
3160     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3161     movdl(vec1, ch);
3162     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3163     vpxor(vec2, vec2);
3164     movl(tmp, cnt1);
3165     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3166     andl(cnt1,0x0000001F);  //tail count (in chars)
3167 
3168     bind(SCAN_TO_32_CHAR_LOOP);
3169     vmovdqu(vec3, Address(result, 0));
3170     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3171     vptest(vec2, vec3);
3172     jcc(Assembler::carryClear, FOUND_CHAR);
3173     addptr(result, 32);
3174     subl(tmp, stride*2);
3175     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3176     jmp(SCAN_TO_16_CHAR);
3177 
3178     bind(SCAN_TO_16_CHAR_INIT);
3179     movdl(vec1, ch);
3180     pxor(vec2, vec2);
3181     pshufb(vec1, vec2);
3182   }
3183 
3184   bind(SCAN_TO_16_CHAR);
3185   cmpl(cnt1, stride);
3186   jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left
3187   if (UseAVX < 2) {
3188     movdl(vec1, ch);
3189     pxor(vec2, vec2);
3190     pshufb(vec1, vec2);
3191   }
3192   movl(tmp, cnt1);
3193   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3194   andl(cnt1,0x0000000F);  //tail count (in bytes)
3195 
3196   bind(SCAN_TO_16_CHAR_LOOP);
3197   movdqu(vec3, Address(result, 0));
3198   pcmpeqb(vec3, vec1);
3199   ptest(vec2, vec3);
3200   jcc(Assembler::carryClear, FOUND_CHAR);
3201   addptr(result, 16);
3202   subl(tmp, stride);
3203   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...
3204 
3205   bind(SCAN_TO_CHAR_INIT);
3206   testl(cnt1, cnt1);
3207   jcc(Assembler::zero, RET_NOT_FOUND);
3208   bind(SCAN_TO_CHAR_LOOP);
3209   load_unsigned_byte(tmp, Address(result, 0));
3210   cmpl(ch, tmp);
3211   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3212   addptr(result, 1);
3213   subl(cnt1, 1);
3214   jccb(Assembler::zero, RET_NOT_FOUND);
3215   jmp(SCAN_TO_CHAR_LOOP);
3216 
3217   bind(RET_NOT_FOUND);
3218   movl(result, -1);
3219   jmpb(DONE_LABEL);
3220 
3221   bind(FOUND_CHAR);
3222   if (UseAVX >= 2) {
3223     vpmovmskb(tmp, vec3);
3224   } else {
3225     pmovmskb(tmp, vec3);
3226   }
3227   bsfl(ch, tmp);
3228   addptr(result, ch);
3229 
3230   bind(FOUND_SEQ_CHAR);
3231   subptr(result, str1);
3232 
3233   bind(DONE_LABEL);
3234 } // stringL_indexof_char
3235 
3236 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3237   switch (eltype) {
3238   case T_BOOLEAN: return sizeof(jboolean);
3239   case T_BYTE:  return sizeof(jbyte);
3240   case T_SHORT: return sizeof(jshort);
3241   case T_CHAR:  return sizeof(jchar);
3242   case T_INT:   return sizeof(jint);
3243   default:
3244     ShouldNotReachHere();
3245     return -1;
3246   }
3247 }
3248 
3249 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3250   switch (eltype) {
3251   // T_BOOLEAN used as surrogate for unsigned byte
3252   case T_BOOLEAN: movzbl(dst, src);   break;
3253   case T_BYTE:    movsbl(dst, src);   break;
3254   case T_SHORT:   movswl(dst, src);   break;
3255   case T_CHAR:    movzwl(dst, src);   break;
3256   case T_INT:     movl(dst, src);     break;
3257   default:
3258     ShouldNotReachHere();
3259   }
3260 }
3261 
3262 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3263   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3264 }
3265 
3266 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3267   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3268 }
3269 
3270 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3271   const int vlen = Assembler::AVX_256bit;
3272   switch (eltype) {
3273   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3274   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3275   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3276   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3277   case T_INT:
3278     // do nothing
3279     break;
3280   default:
3281     ShouldNotReachHere();
3282   }
3283 }
3284 
3285 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3286                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3287                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3288                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3289                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3290                                         BasicType eltype) {
3291   ShortBranchVerifier sbv(this);
3292   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3293   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3294   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3295 
3296   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3297         SHORT_UNROLLED_LOOP_EXIT,
3298         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3299         UNROLLED_VECTOR_LOOP_BEGIN,
3300         END;
3301   switch (eltype) {
3302   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3303   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3304   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3305   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3306   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3307   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3308   }
3309 
3310   // Register "renaming" for readability of the code
3311   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3312                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3313                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3314 
3315   const int elsize = arrays_hashcode_elsize(eltype);
3316 
3317   /*
3318     if (cnt1 >= 2) {
3319       if (cnt1 >= 32) {
3320         UNROLLED VECTOR LOOP
3321       }
3322       UNROLLED SCALAR LOOP
3323     }
3324     SINGLE SCALAR
3325    */
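       //
       // The vector loop below computes the usual polynomial hash in blocks of 32
       // elements. Starting from a running hash h, one block is equivalent to this
       // illustrative sketch (element type widened to int as in the loads below):
       //
       //   static int hashBlock(int h, int[] a, int off) {
       //     // h' = h*31^32 + a[off]*31^31 + a[off+1]*31^30 + ... + a[off+31]*31^0
       //     for (int i = 0; i < 32; i++) {
       //       h = 31 * h + a[off + i];
       //     }
       //     return h;
       //   }
       //
       // Assuming arrays_hashcode_powers_of_31() holds decreasing powers of 31 as the
       // comments below suggest, this is why 'result' is multiplied once per block by
       // powers_of_31[0] (31^32 truncated to int) and the vresult lanes are scaled by
       // the remaining backwards powers before the final add reduction.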
3326 
3327   cmpl(cnt1, 32);
3328   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3329 
3330   // cnt1 >= 32 && generate_vectorized_loop
3331   xorl(index, index);
3332 
3333   // vresult = IntVector.zero(I256);
3334   for (int idx = 0; idx < 4; idx++) {
3335     vpxor(vresult[idx], vresult[idx]);
3336   }
3337   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3338   Register bound = tmp2;
3339   Register next = tmp3;
3340   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3341   movl(next, Address(tmp2, 0));
3342   movdl(vnext, next);
3343   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3344 
3345   // index = 0;
3346   // bound = cnt1 & ~(32 - 1);
3347   movl(bound, cnt1);
3348   andl(bound, ~(32 - 1));
3349   // for (; index < bound; index += 32) {
3350   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3351   // result *= next;
3352   imull(result, next);
3353   // Loop fission to front-load the cost of fetching from memory; OOO execution
3354   // can then hopefully do a better job of prefetching.
3355   for (int idx = 0; idx < 4; idx++) {
3356     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3357   }
3358   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3359   for (int idx = 0; idx < 4; idx++) {
3360     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3361     arrays_hashcode_elvcast(vtmp[idx], eltype);
3362     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3363   }
3364   // index += 32;
3365   addl(index, 32);
3366   // index < bound;
3367   cmpl(index, bound);
3368   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3369   // }
3370 
3371   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3372   subl(cnt1, bound);
3373   // release bound
3374 
3375   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3376   for (int idx = 0; idx < 4; idx++) {
3377     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3378     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3379     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3380   }
3381   // result += vresult.reduceLanes(ADD);
3382   for (int idx = 0; idx < 4; idx++) {
3383     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3384   }
3385 
3386   // } else if (cnt1 < 32) {
3387 
3388   bind(SHORT_UNROLLED_BEGIN);
3389   // int i = 1;
3390   movl(index, 1);
3391   cmpl(index, cnt1);
3392   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3393 
3394   // for (; i < cnt1 ; i += 2) {
3395   bind(SHORT_UNROLLED_LOOP_BEGIN);
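       // Two elements per iteration:
       //   result = result*31*31 + ary1[i-1]*31 + ary1[i]
       // where 31*31 == 961 and 31*x is computed as (x << 5) - x.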
3396   movl(tmp3, 961);
3397   imull(result, tmp3);
3398   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3399   movl(tmp3, tmp2);
3400   shll(tmp3, 5);
3401   subl(tmp3, tmp2);
3402   addl(result, tmp3);
3403   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3404   addl(result, tmp3);
3405   addl(index, 2);
3406   cmpl(index, cnt1);
3407   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3408 
3409   // }
3410   // if (i >= cnt1) {
3411   bind(SHORT_UNROLLED_LOOP_EXIT);
3412   jccb(Assembler::greater, END);
3413   movl(tmp2, result);
3414   shll(result, 5);
3415   subl(result, tmp2);
3416   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3417   addl(result, tmp3);
3418   // }
3419   bind(END);
3420 
3421   BLOCK_COMMENT("} // arrays_hashcode");
3422 
3423 } // arrays_hashcode
3424 
3425 // helper function for string_compare
3426 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3427                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3428                                            Address::ScaleFactor scale2, Register index, int ae) {
3429   if (ae == StrIntrinsicNode::LL) {
3430     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3431     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3432   } else if (ae == StrIntrinsicNode::UU) {
3433     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3434     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3435   } else {
3436     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3437     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3438   }
3439 }
3440 
3441 // Compare strings, used for char[] and byte[].
3442 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3443                                        Register cnt1, Register cnt2, Register result,
3444                                        XMMRegister vec1, int ae, KRegister mask) {
3445   ShortBranchVerifier sbv(this);
3446   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3447   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3448   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3449   int stride2x2 = 0x40;
3450   Address::ScaleFactor scale = Address::no_scale;
3451   Address::ScaleFactor scale1 = Address::no_scale;
3452   Address::ScaleFactor scale2 = Address::no_scale;
3453 
3454   if (ae != StrIntrinsicNode::LL) {
3455     stride2x2 = 0x20;
3456   }
3457 
3458   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3459     shrl(cnt2, 1);
3460   }
3461   // Compute the minimum of the string lengths and the
3462   // difference of the string lengths (stack).
3463   // Do the conditional move stuff
3464   movl(result, cnt1);
3465   subl(cnt1, cnt2);
3466   push(cnt1);
3467   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3468 
3469   // Is the minimum length zero?
3470   testl(cnt2, cnt2);
3471   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3472   if (ae == StrIntrinsicNode::LL) {
3473     // Load first bytes
3474     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3475     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3476   } else if (ae == StrIntrinsicNode::UU) {
3477     // Load first characters
3478     load_unsigned_short(result, Address(str1, 0));
3479     load_unsigned_short(cnt1, Address(str2, 0));
3480   } else {
3481     load_unsigned_byte(result, Address(str1, 0));
3482     load_unsigned_short(cnt1, Address(str2, 0));
3483   }
3484   subl(result, cnt1);
3485   jcc(Assembler::notZero,  POP_LABEL);
3486 
3487   if (ae == StrIntrinsicNode::UU) {
3488     // Divide length by 2 to get number of chars
3489     shrl(cnt2, 1);
3490   }
3491   cmpl(cnt2, 1);
3492   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3493 
3494   // Check if the strings start at the same location and setup scale and stride
3495   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3496     cmpptr(str1, str2);
3497     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3498     if (ae == StrIntrinsicNode::LL) {
3499       scale = Address::times_1;
3500       stride = 16;
3501     } else {
3502       scale = Address::times_2;
3503       stride = 8;
3504     }
3505   } else {
3506     scale1 = Address::times_1;
3507     scale2 = Address::times_2;
3508     // scale not used
3509     stride = 8;
3510   }
3511 
3512   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3513     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3514     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3515     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3516     Label COMPARE_TAIL_LONG;
3517     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3518 
3519     int pcmpmask = 0x19;
3520     if (ae == StrIntrinsicNode::LL) {
3521       pcmpmask &= ~0x01;
3522     }
3523 
3524     // Set up to compare 16-char (32-byte) vectors;
3525     // start from the first character again because it has an aligned address.
3526     if (ae == StrIntrinsicNode::LL) {
3527       stride2 = 32;
3528     } else {
3529       stride2 = 16;
3530     }
3531     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3532       adr_stride = stride << scale;
3533     } else {
3534       adr_stride1 = 8;  //stride << scale1;
3535       adr_stride2 = 16; //stride << scale2;
3536     }
3537 
3538     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3539     // rax and rdx are used by pcmpestri as element counters
3540     movl(result, cnt2);
3541     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3542     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3543 
3544     // fast path : compare first 2 8-char vectors.
3545     bind(COMPARE_16_CHARS);
3546     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3547       movdqu(vec1, Address(str1, 0));
3548     } else {
3549       pmovzxbw(vec1, Address(str1, 0));
3550     }
3551     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3552     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3553 
3554     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3555       movdqu(vec1, Address(str1, adr_stride));
3556       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3557     } else {
3558       pmovzxbw(vec1, Address(str1, adr_stride1));
3559       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3560     }
3561     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3562     addl(cnt1, stride);
3563 
3564     // Compare the characters at index in cnt1
3565     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3566     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3567     subl(result, cnt2);
3568     jmp(POP_LABEL);
3569 
3570     // Set up the registers to start the vector comparison loop
3571     bind(COMPARE_WIDE_VECTORS);
3572     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3573       lea(str1, Address(str1, result, scale));
3574       lea(str2, Address(str2, result, scale));
3575     } else {
3576       lea(str1, Address(str1, result, scale1));
3577       lea(str2, Address(str2, result, scale2));
3578     }
3579     subl(result, stride2);
3580     subl(cnt2, stride2);
3581     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3582     negptr(result);
3583 
3584     //  In a loop, compare 16 chars (32 bytes) at once using (vpxor+vptest)
3585     bind(COMPARE_WIDE_VECTORS_LOOP);
3586 
3587 #ifdef _LP64
3588     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3589       cmpl(cnt2, stride2x2);
3590       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3591       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3592       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3593 
3594       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3595       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3596         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3597         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3598       } else {
3599         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3600         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3601       }
3602       kortestql(mask, mask);
3603       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3604       addptr(result, stride2x2);  // update since we already compared at this addr
3605       subl(cnt2, stride2x2);      // and sub the size too
3606       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3607 
3608       vpxor(vec1, vec1);
3609       jmpb(COMPARE_WIDE_TAIL);
3610     }//if (VM_Version::supports_avx512vlbw())
3611 #endif // _LP64
3612 
3613 
3614     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3615     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3616       vmovdqu(vec1, Address(str1, result, scale));
3617       vpxor(vec1, Address(str2, result, scale));
3618     } else {
3619       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3620       vpxor(vec1, Address(str2, result, scale2));
3621     }
3622     vptest(vec1, vec1);
3623     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3624     addptr(result, stride2);
3625     subl(cnt2, stride2);
3626     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3627     // clean upper bits of YMM registers
3628     vpxor(vec1, vec1);
3629 
3630     // compare wide vectors tail
3631     bind(COMPARE_WIDE_TAIL);
3632     testptr(result, result);
3633     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3634 
3635     movl(result, stride2);
3636     movl(cnt2, result);
3637     negptr(result);
3638     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3639 
3640     // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3641     bind(VECTOR_NOT_EQUAL);
3642     // clean upper bits of YMM registers
3643     vpxor(vec1, vec1);
3644     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3645       lea(str1, Address(str1, result, scale));
3646       lea(str2, Address(str2, result, scale));
3647     } else {
3648       lea(str1, Address(str1, result, scale1));
3649       lea(str2, Address(str2, result, scale2));
3650     }
3651     jmp(COMPARE_16_CHARS);
3652 
3653     // Compare tail chars, length between 1 and 15 chars
3654     bind(COMPARE_TAIL_LONG);
3655     movl(cnt2, result);
3656     cmpl(cnt2, stride);
3657     jcc(Assembler::less, COMPARE_SMALL_STR);
3658 
3659     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3660       movdqu(vec1, Address(str1, 0));
3661     } else {
3662       pmovzxbw(vec1, Address(str1, 0));
3663     }
3664     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3665     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3666     subptr(cnt2, stride);
3667     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3668     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3669       lea(str1, Address(str1, result, scale));
3670       lea(str2, Address(str2, result, scale));
3671     } else {
3672       lea(str1, Address(str1, result, scale1));
3673       lea(str2, Address(str2, result, scale2));
3674     }
3675     negptr(cnt2);
3676     jmpb(WHILE_HEAD_LABEL);
3677 
3678     bind(COMPARE_SMALL_STR);
3679   } else if (UseSSE42Intrinsics) {
3680     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3681     int pcmpmask = 0x19;
3682     // Set up to compare 8-char (16-byte) vectors;
3683     // start from the first character again because it has an aligned address.
3684     movl(result, cnt2);
3685     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3686     if (ae == StrIntrinsicNode::LL) {
3687       pcmpmask &= ~0x01;
3688     }
3689     jcc(Assembler::zero, COMPARE_TAIL);
3690     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3691       lea(str1, Address(str1, result, scale));
3692       lea(str2, Address(str2, result, scale));
3693     } else {
3694       lea(str1, Address(str1, result, scale1));
3695       lea(str2, Address(str2, result, scale2));
3696     }
3697     negptr(result);
3698 
3699     // pcmpestri
3700     //   inputs:
3701     //     vec1- substring
3702     //     rax - negative string length (elements count)
3703     //     mem - scanned string
3704     //     rdx - string length (elements count)
3705     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3706     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3707     //   outputs:
3708     //     rcx - first mismatched element index
3709     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3710 
3711     bind(COMPARE_WIDE_VECTORS);
3712     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3713       movdqu(vec1, Address(str1, result, scale));
3714       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3715     } else {
3716       pmovzxbw(vec1, Address(str1, result, scale1));
3717       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3718     }
3719     // After pcmpestri cnt1(rcx) contains mismatched element index
3720 
3721     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3722     addptr(result, stride);
3723     subptr(cnt2, stride);
3724     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3725 
3726     // compare wide vectors tail
3727     testptr(result, result);
3728     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3729 
3730     movl(cnt2, stride);
3731     movl(result, stride);
3732     negptr(result);
3733     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3734       movdqu(vec1, Address(str1, result, scale));
3735       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3736     } else {
3737       pmovzxbw(vec1, Address(str1, result, scale1));
3738       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3739     }
3740     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3741 
3742     // Mismatched characters in the vectors
3743     bind(VECTOR_NOT_EQUAL);
3744     addptr(cnt1, result);
3745     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3746     subl(result, cnt2);
3747     jmpb(POP_LABEL);
3748 
3749     bind(COMPARE_TAIL); // limit is zero
3750     movl(cnt2, result);
3751     // Fallthru to tail compare
3752   }
3753   // Shift str2 and str1 to the end of the arrays, negate min
3754   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3755     lea(str1, Address(str1, cnt2, scale));
3756     lea(str2, Address(str2, cnt2, scale));
3757   } else {
3758     lea(str1, Address(str1, cnt2, scale1));
3759     lea(str2, Address(str2, cnt2, scale2));
3760   }
3761   decrementl(cnt2);  // first character was compared already
3762   negptr(cnt2);
3763 
3764   // Compare the rest of the elements
3765   bind(WHILE_HEAD_LABEL);
3766   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3767   subl(result, cnt1);
3768   jccb(Assembler::notZero, POP_LABEL);
3769   increment(cnt2);
3770   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3771 
3772   // Strings are equal up to min length.  Return the length difference.
3773   bind(LENGTH_DIFF_LABEL);
3774   pop(result);
3775   if (ae == StrIntrinsicNode::UU) {
3776     // Divide diff by 2 to get number of chars
3777     sarl(result, 1);
3778   }
3779   jmpb(DONE_LABEL);
3780 
3781 #ifdef _LP64
3782   if (VM_Version::supports_avx512vlbw()) {
3783 
3784     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3785 
3786     kmovql(cnt1, mask);
3787     notq(cnt1);
3788     bsfq(cnt2, cnt1);
3789     if (ae != StrIntrinsicNode::LL) {
3790       // Divide diff by 2 to get number of chars
3791       sarl(cnt2, 1);
3792     }
3793     addq(result, cnt2);
3794     if (ae == StrIntrinsicNode::LL) {
3795       load_unsigned_byte(cnt1, Address(str2, result));
3796       load_unsigned_byte(result, Address(str1, result));
3797     } else if (ae == StrIntrinsicNode::UU) {
3798       load_unsigned_short(cnt1, Address(str2, result, scale));
3799       load_unsigned_short(result, Address(str1, result, scale));
3800     } else {
3801       load_unsigned_short(cnt1, Address(str2, result, scale2));
3802       load_unsigned_byte(result, Address(str1, result, scale1));
3803     }
3804     subl(result, cnt1);
3805     jmpb(POP_LABEL);
3806   }//if (VM_Version::supports_avx512vlbw())
3807 #endif // _LP64
3808 
3809   // Discard the stored length difference
3810   bind(POP_LABEL);
3811   pop(cnt1);
3812 
3813   // That's it
3814   bind(DONE_LABEL);
3815   if(ae == StrIntrinsicNode::UL) {
3816     negl(result);
3817   }
3818 
3819 }
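
     // Scalar form of the comparison implemented above (illustrative sketch; names
     // are invented, element types shown as char for simplicity):
     //
     //   static int compare(char[] s1, int len1, char[] s2, int len2) {
     //     int min = Math.min(len1, len2);
     //     for (int i = 0; i < min; i++) {
     //       if (s1[i] != s2[i]) {
     //         return s1[i] - s2[i];   // difference of the first mismatching elements
     //       }
     //     }
     //     return len1 - len2;         // equal prefix: the length difference decides
     //   }
     //
     // The vector paths only locate the first mismatching element; the final scalar
     // subtraction and the pushed length difference produce the same result as above.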
3820 
3821 // Search for Non-ASCII character (Negative byte value) in a byte array,
3822 // return the index of the first such character, otherwise the length
3823 // of the array segment searched.
3824 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3825 //   @IntrinsicCandidate
3826 //   public static int countPositives(byte[] ba, int off, int len) {
3827 //     for (int i = off; i < off + len; i++) {
3828 //       if (ba[i] < 0) {
3829 //         return i - off;
3830 //       }
3831 //     }
3832 //     return len;
3833 //   }
3834 void C2_MacroAssembler::count_positives(Register ary1, Register len,
3835   Register result, Register tmp1,
3836   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
3837   // rsi: byte array
3838   // rcx: len
3839   // rax: result
3840   ShortBranchVerifier sbv(this);
3841   assert_different_registers(ary1, len, result, tmp1);
3842   assert_different_registers(vec1, vec2);
3843   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
3844 
3845   movl(result, len); // copy
3846   // len == 0
3847   testl(len, len);
3848   jcc(Assembler::zero, DONE);
3849 
3850   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
3851     VM_Version::supports_avx512vlbw() &&
3852     VM_Version::supports_bmi2()) {
3853 
3854     Label test_64_loop, test_tail, BREAK_LOOP;
3855     Register tmp3_aliased = len;
3856 
3857     movl(tmp1, len);
3858     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
3859 
3860     andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
3861     andl(len, ~(64 - 1));    // vector count (in chars)
3862     jccb(Assembler::zero, test_tail);
3863 
3864     lea(ary1, Address(ary1, len, Address::times_1));
3865     negptr(len);
3866 
3867     bind(test_64_loop);
3868     // Check whether our 64 elements of size byte contain negatives
3869     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
3870     kortestql(mask1, mask1);
3871     jcc(Assembler::notZero, BREAK_LOOP);
3872 
3873     addptr(len, 64);
3874     jccb(Assembler::notZero, test_64_loop);
3875 
3876     bind(test_tail);
3877     // bail out when there is nothing to be done
3878     testl(tmp1, -1);
3879     jcc(Assembler::zero, DONE);
3880 
3881     // ~(~0 << len) applied up to two times (for 32-bit scenario)
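         // e.g. tmp1 == 5:  ~(~0 << 5) == 0b11111, a mask selecting the 5 tail bytes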
3882 #ifdef _LP64
3883     mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
3884     shlxq(tmp3_aliased, tmp3_aliased, tmp1);
3885     notq(tmp3_aliased);
3886     kmovql(mask2, tmp3_aliased);
3887 #else
3888     Label k_init;
3889     jmp(k_init);
3890 
3891     // We cannot read 64 bits from a general purpose register, thus we move the
3892     // data required to compose 64 1's into the instruction stream.
3893     // We emit a 64-byte-wide series of elements from 0..63 which is later used as
3894     // a compare target against the tail count contained in the tmp1 register.
3895     // The result is a k register holding tmp1 consecutive 1's,
3896     // counting from the least significant bit.
3897     address tmp = pc();
3898     emit_int64(0x0706050403020100);
3899     emit_int64(0x0F0E0D0C0B0A0908);
3900     emit_int64(0x1716151413121110);
3901     emit_int64(0x1F1E1D1C1B1A1918);
3902     emit_int64(0x2726252423222120);
3903     emit_int64(0x2F2E2D2C2B2A2928);
3904     emit_int64(0x3736353433323130);
3905     emit_int64(0x3F3E3D3C3B3A3938);
3906 
3907     bind(k_init);
3908     lea(len, InternalAddress(tmp));
3909     // create mask to test for negative byte inside a vector
3910     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
3911     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
3912 
3913 #endif
3914     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
3915     ktestq(mask1, mask2);
3916     jcc(Assembler::zero, DONE);
3917 
3918     bind(BREAK_LOOP);
3919     // At least one byte in the last 64 bytes is negative.
3920     // Set up to look at the last 64 bytes as if they were a tail
3921     lea(ary1, Address(ary1, len, Address::times_1));
3922     addptr(result, len);
3923     // Ignore the very last byte: if all others are positive,
3924     // it must be negative, so we can skip right to the 2+1 byte
3925     // end comparison at this point
3926     orl(result, 63);
3927     movl(len, 63);
3928     // Fallthru to tail compare
3929   } else {
3930 
3931     if (UseAVX >= 2 && UseSSE >= 2) {
3932       // With AVX2, use 32-byte vector compare
3933       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
3934 
3935       // Compare 32-byte vectors
3936       testl(len, 0xffffffe0);   // vector count (in bytes)
3937       jccb(Assembler::zero, TAIL_START);
3938 
3939       andl(len, 0xffffffe0);
3940       lea(ary1, Address(ary1, len, Address::times_1));
3941       negptr(len);
3942 
3943       movl(tmp1, 0x80808080);   // create mask to test for negative bytes in vector
3944       movdl(vec2, tmp1);
3945       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
3946 
3947       bind(COMPARE_WIDE_VECTORS);
3948       vmovdqu(vec1, Address(ary1, len, Address::times_1));
3949       vptest(vec1, vec2);
3950       jccb(Assembler::notZero, BREAK_LOOP);
3951       addptr(len, 32);
3952       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3953 
3954       testl(result, 0x0000001f);   // any bytes remaining?
3955       jcc(Assembler::zero, DONE);
3956 
3957       // Quick test using the already prepared vector mask
3958       movl(len, result);
3959       andl(len, 0x0000001f);
3960       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
3961       vptest(vec1, vec2);
3962       jcc(Assembler::zero, DONE);
3963       // There are zeros, jump to the tail to determine exactly where
3964       jmpb(TAIL_START);
3965 
3966       bind(BREAK_LOOP);
3967       // At least one byte in the last 32-byte vector is negative.
3968       // Set up to look at the last 32 bytes as if they were a tail
3969       lea(ary1, Address(ary1, len, Address::times_1));
3970       addptr(result, len);
3971       // Ignore the very last byte: if all others are positive,
3972       // it must be negative, so we can skip right to the 2+1 byte
3973       // end comparison at this point
3974       orl(result, 31);
3975       movl(len, 31);
3976       // Fallthru to tail compare
3977     } else if (UseSSE42Intrinsics) {
3978       // With SSE4.2, use double quad vector compare
3979       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
3980 
3981       // Compare 16-byte vectors
3982       testl(len, 0xfffffff0);   // vector count (in bytes)
3983       jcc(Assembler::zero, TAIL_START);
3984 
3985       andl(len, 0xfffffff0);
3986       lea(ary1, Address(ary1, len, Address::times_1));
3987       negptr(len);
3988 
3989       movl(tmp1, 0x80808080);
3990       movdl(vec2, tmp1);
3991       pshufd(vec2, vec2, 0);
3992 
3993       bind(COMPARE_WIDE_VECTORS);
3994       movdqu(vec1, Address(ary1, len, Address::times_1));
3995       ptest(vec1, vec2);
3996       jccb(Assembler::notZero, BREAK_LOOP);
3997       addptr(len, 16);
3998       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3999 
4000       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4001       jcc(Assembler::zero, DONE);
4002 
4003       // Quick test using the already prepared vector mask
4004       movl(len, result);
4005       andl(len, 0x0000000f);   // tail count (in bytes)
4006       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4007       ptest(vec1, vec2);
4008       jcc(Assembler::zero, DONE);
4009       jmpb(TAIL_START);
4010 
4011       bind(BREAK_LOOP);
4012       // At least one byte in the last 16-byte vector is negative.
4013       // Set up and look at the last 16 bytes as if they were a tail
4014       lea(ary1, Address(ary1, len, Address::times_1));
4015       addptr(result, len);
4016       // Ignore the very last byte: if all others are positive,
4017       // it must be negative, so we can skip right to the 2+1 byte
4018       // end comparison at this point
4019       orl(result, 15);
4020       movl(len, 15);
4021       // Fallthru to tail compare
4022     }
4023   }
4024 
4025   bind(TAIL_START);
4026   // Compare 4-byte vectors
4027   andl(len, 0xfffffffc); // vector count (in bytes)
4028   jccb(Assembler::zero, COMPARE_CHAR);
4029 
4030   lea(ary1, Address(ary1, len, Address::times_1));
4031   negptr(len);
4032 
4033   bind(COMPARE_VECTORS);
4034   movl(tmp1, Address(ary1, len, Address::times_1));
4035   andl(tmp1, 0x80808080);
4036   jccb(Assembler::notZero, TAIL_ADJUST);
4037   addptr(len, 4);
4038   jccb(Assembler::notZero, COMPARE_VECTORS);
4039 
4040   // Compare trailing char (final 2-3 bytes), if any
4041   bind(COMPARE_CHAR);
4042 
4043   testl(result, 0x2);   // tail  char
4044   jccb(Assembler::zero, COMPARE_BYTE);
4045   load_unsigned_short(tmp1, Address(ary1, 0));
4046   andl(tmp1, 0x00008080);
4047   jccb(Assembler::notZero, CHAR_ADJUST);
4048   lea(ary1, Address(ary1, 2));
4049 
4050   bind(COMPARE_BYTE);
4051   testl(result, 0x1);   // tail  byte
4052   jccb(Assembler::zero, DONE);
4053   load_unsigned_byte(tmp1, Address(ary1, 0));
4054   testl(tmp1, 0x00000080);
4055   jccb(Assembler::zero, DONE);
4056   subptr(result, 1);
4057   jmpb(DONE);
4058 
4059   bind(TAIL_ADJUST);
4060   // there are negative bits in the last 4 byte block.
4061   // Adjust result and check the next three bytes
4062   addptr(result, len);
4063   orl(result, 3);
4064   lea(ary1, Address(ary1, len, Address::times_1));
4065   jmpb(COMPARE_CHAR);
4066 
4067   bind(CHAR_ADJUST);
4068   // We are looking at a char + optional byte tail, and found that one
4069   // of the bytes in the char is negative. Adjust the result, check the
4070   // first byte and readjust if needed.
4071   andl(result, 0xfffffffc);
4072   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4073   jccb(Assembler::notZero, DONE);
4074   addptr(result, 1);
4075 
4076   // That's it
4077   bind(DONE);
4078   if (UseAVX >= 2 && UseSSE >= 2) {
4079     // clean upper bits of YMM registers
4080     vpxor(vec1, vec1);
4081     vpxor(vec2, vec2);
4082   }
4083 }
4084 
4085 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
4086 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4087                                       Register limit, Register result, Register chr,
4088                                       XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
4089   ShortBranchVerifier sbv(this);
4090   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4091 
4092   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4093   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4094 
4095   if (is_array_equ) {
4096     // Check the input args
4097     cmpoop(ary1, ary2);
4098     jcc(Assembler::equal, TRUE_LABEL);
4099 
4100     // Need additional checks for arrays_equals.
4101     testptr(ary1, ary1);
4102     jcc(Assembler::zero, FALSE_LABEL);
4103     testptr(ary2, ary2);
4104     jcc(Assembler::zero, FALSE_LABEL);
4105 
4106     // Check the lengths
4107     movl(limit, Address(ary1, length_offset));
4108     cmpl(limit, Address(ary2, length_offset));
4109     jcc(Assembler::notEqual, FALSE_LABEL);
4110   }
4111 
4112   // count == 0
4113   testl(limit, limit);
4114   jcc(Assembler::zero, TRUE_LABEL);
4115 
4116   if (is_array_equ) {
4117     // Load array address
4118     lea(ary1, Address(ary1, base_offset));
4119     lea(ary2, Address(ary2, base_offset));
4120   }
4121 
4122   if (is_array_equ && is_char) {
4123     // arrays_equals when used for char[].
4124     shll(limit, 1);      // byte count != 0
4125   }
4126   movl(result, limit); // copy
4127 
4128   if (UseAVX >= 2) {
4129     // With AVX2, use 32-byte vector compare
4130     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4131 
4132     // Compare 32-byte vectors
4133     andl(result, 0x0000001f);  //   tail count (in bytes)
4134     andl(limit, 0xffffffe0);   // vector count (in bytes)
4135     jcc(Assembler::zero, COMPARE_TAIL);
4136 
4137     lea(ary1, Address(ary1, limit, Address::times_1));
4138     lea(ary2, Address(ary2, limit, Address::times_1));
4139     negptr(limit);
4140 
4141 #ifdef _LP64
4142     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4143       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4144 
4145       cmpl(limit, -64);
4146       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4147 
4148       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4149 
4150       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4151       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4152       kortestql(mask, mask);
4153       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4154       addptr(limit, 64);  // update since we already compared at this addr
4155       cmpl(limit, -64);
4156       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4157 
4158       // At this point we may still need to compare -limit+result bytes.
4159       // We could execute the next two instructions and just continue via the non-wide path:
4160       //  cmpl(limit, 0);
4161       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
4162       // But since we stopped at the points ary{1,2}+limit which are
4163       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4164       // (|limit| <= 32 and result < 32),
4165       // we may just compare the last 64 bytes.
4166       //
4167       addptr(result, -64);   // it is safe, bc we just came from this area
4168       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4169       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4170       kortestql(mask, mask);
4171       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4172 
4173       jmp(TRUE_LABEL);
4174 
4175       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4176 
4177     } // if (VM_Version::supports_avx512vlbw())
4178 #endif //_LP64
4179     bind(COMPARE_WIDE_VECTORS);
4180     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
4181     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4182     vpxor(vec1, vec2);
4183 
4184     vptest(vec1, vec1);
4185     jcc(Assembler::notZero, FALSE_LABEL);
4186     addptr(limit, 32);
4187     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4188 
4189     testl(result, result);
4190     jcc(Assembler::zero, TRUE_LABEL);
4191 
4192     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
4193     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4194     vpxor(vec1, vec2);
4195 
4196     vptest(vec1, vec1);
4197     jccb(Assembler::notZero, FALSE_LABEL);
4198     jmpb(TRUE_LABEL);
4199 
4200     bind(COMPARE_TAIL); // limit is zero
4201     movl(limit, result);
4202     // Fallthru to tail compare
4203   } else if (UseSSE42Intrinsics) {
4204     // With SSE4.2, use double quad vector compare
4205     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4206 
4207     // Compare 16-byte vectors
4208     andl(result, 0x0000000f);  //   tail count (in bytes)
4209     andl(limit, 0xfffffff0);   // vector count (in bytes)
4210     jcc(Assembler::zero, COMPARE_TAIL);
4211 
4212     lea(ary1, Address(ary1, limit, Address::times_1));
4213     lea(ary2, Address(ary2, limit, Address::times_1));
4214     negptr(limit);
4215 
4216     bind(COMPARE_WIDE_VECTORS);
4217     movdqu(vec1, Address(ary1, limit, Address::times_1));
4218     movdqu(vec2, Address(ary2, limit, Address::times_1));
4219     pxor(vec1, vec2);
4220 
4221     ptest(vec1, vec1);
4222     jcc(Assembler::notZero, FALSE_LABEL);
4223     addptr(limit, 16);
4224     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4225 
4226     testl(result, result);
4227     jcc(Assembler::zero, TRUE_LABEL);
4228 
4229     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4230     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4231     pxor(vec1, vec2);
4232 
4233     ptest(vec1, vec1);
4234     jccb(Assembler::notZero, FALSE_LABEL);
4235     jmpb(TRUE_LABEL);
4236 
4237     bind(COMPARE_TAIL); // limit is zero
4238     movl(limit, result);
4239     // Fallthru to tail compare
4240   }
4241 
4242   // Compare 4-byte vectors
4243   andl(limit, 0xfffffffc); // vector count (in bytes)
4244   jccb(Assembler::zero, COMPARE_CHAR);
4245 
4246   lea(ary1, Address(ary1, limit, Address::times_1));
4247   lea(ary2, Address(ary2, limit, Address::times_1));
4248   negptr(limit);
4249 
4250   bind(COMPARE_VECTORS);
4251   movl(chr, Address(ary1, limit, Address::times_1));
4252   cmpl(chr, Address(ary2, limit, Address::times_1));
4253   jccb(Assembler::notEqual, FALSE_LABEL);
4254   addptr(limit, 4);
4255   jcc(Assembler::notZero, COMPARE_VECTORS);
4256 
4257   // Compare trailing char (final 2 bytes), if any
4258   bind(COMPARE_CHAR);
4259   testl(result, 0x2);   // tail  char
4260   jccb(Assembler::zero, COMPARE_BYTE);
4261   load_unsigned_short(chr, Address(ary1, 0));
4262   load_unsigned_short(limit, Address(ary2, 0));
4263   cmpl(chr, limit);
4264   jccb(Assembler::notEqual, FALSE_LABEL);
4265 
4266   if (is_array_equ && is_char) {
4267     bind(COMPARE_BYTE);
4268   } else {
4269     lea(ary1, Address(ary1, 2));
4270     lea(ary2, Address(ary2, 2));
4271 
4272     bind(COMPARE_BYTE);
4273     testl(result, 0x1);   // tail  byte
4274     jccb(Assembler::zero, TRUE_LABEL);
4275     load_unsigned_byte(chr, Address(ary1, 0));
4276     load_unsigned_byte(limit, Address(ary2, 0));
4277     cmpl(chr, limit);
4278     jccb(Assembler::notEqual, FALSE_LABEL);
4279   }
4280   bind(TRUE_LABEL);
4281   movl(result, 1);   // return true
4282   jmpb(DONE);
4283 
4284   bind(FALSE_LABEL);
4285   xorl(result, result); // return false
4286 
4287   // That's it
4288   bind(DONE);
4289   if (UseAVX >= 2) {
4290     // clean upper bits of YMM registers
4291     vpxor(vec1, vec1);
4292     vpxor(vec2, vec2);
4293   }
4294 }
4295 
4296 #ifdef _LP64
4297 
4298 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4299 #define __ masm.
4300   Register dst = stub.data<0>();
4301   XMMRegister src = stub.data<1>();
4302   address target = stub.data<2>();
4303   __ bind(stub.entry());
4304   __ subptr(rsp, 8);
4305   __ movdbl(Address(rsp), src);
4306   __ call(RuntimeAddress(target));
4307   __ pop(dst);
4308   __ jmp(stub.continuation());
4309 #undef __
4310 }
4311 
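     // Convert a float/double in 'src' to an int/long in 'dst'. The truncating cvtt* conversions
     // return the "integer indefinite" value (0x80000000 or 0x8000000000000000) when the input is
     // NaN or out of range, so the fast path below compares the result against that sentinel and,
     // only on a match, branches to an out-of-line stub that invokes the corresponding
     // StubRoutines fixup entry to produce the Java result (NaN -> 0, +/-Inf clamped to MIN/MAX).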
4312 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4313   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4314   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4315 
4316   address slowpath_target;
4317   if (dst_bt == T_INT) {
4318     if (src_bt == T_FLOAT) {
4319       cvttss2sil(dst, src);
4320       cmpl(dst, 0x80000000);
4321       slowpath_target = StubRoutines::x86::f2i_fixup();
4322     } else {
4323       cvttsd2sil(dst, src);
4324       cmpl(dst, 0x80000000);
4325       slowpath_target = StubRoutines::x86::d2i_fixup();
4326     }
4327   } else {
4328     if (src_bt == T_FLOAT) {
4329       cvttss2siq(dst, src);
4330       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4331       slowpath_target = StubRoutines::x86::f2l_fixup();
4332     } else {
4333       cvttsd2siq(dst, src);
4334       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4335       slowpath_target = StubRoutines::x86::d2l_fixup();
4336     }
4337   }
4338 
4339   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath);
4340   jcc(Assembler::equal, stub->entry());
4341   bind(stub->continuation());
4342 }
4343 
4344 #endif // _LP64
4345 
4346 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4347                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4348   switch(ideal_opc) {
4349     case Op_LShiftVS:
4350       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4351     case Op_LShiftVI:
4352       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4353     case Op_LShiftVL:
4354       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4355     case Op_RShiftVS:
4356       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4357     case Op_RShiftVI:
4358       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4359     case Op_RShiftVL:
4360       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4361     case Op_URShiftVS:
4362       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4363     case Op_URShiftVI:
4364       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4365     case Op_URShiftVL:
4366       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4367     case Op_RotateRightV:
4368       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4369     case Op_RotateLeftV:
4370       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4371     default:
4372       fatal("Unsupported masked operation"); break;
4373   }
4374 }
4375 
4376 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4377                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4378                                     bool is_varshift) {
4379   switch (ideal_opc) {
4380     case Op_AddVB:
4381       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4382     case Op_AddVS:
4383       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4384     case Op_AddVI:
4385       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4386     case Op_AddVL:
4387       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4388     case Op_AddVF:
4389       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4390     case Op_AddVD:
4391       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4392     case Op_SubVB:
4393       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4394     case Op_SubVS:
4395       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4396     case Op_SubVI:
4397       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4398     case Op_SubVL:
4399       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4400     case Op_SubVF:
4401       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4402     case Op_SubVD:
4403       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4404     case Op_MulVS:
4405       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4406     case Op_MulVI:
4407       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4408     case Op_MulVL:
4409       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4410     case Op_MulVF:
4411       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4412     case Op_MulVD:
4413       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4414     case Op_DivVF:
4415       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4416     case Op_DivVD:
4417       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4418     case Op_SqrtVF:
4419       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4420     case Op_SqrtVD:
4421       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4422     case Op_AbsVB:
4423       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4424     case Op_AbsVS:
4425       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4426     case Op_AbsVI:
4427       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4428     case Op_AbsVL:
4429       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4430     case Op_FmaVF:
4431       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4432     case Op_FmaVD:
4433       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4434     case Op_VectorRearrange:
4435       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4436     case Op_LShiftVS:
4437       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4438     case Op_LShiftVI:
4439       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4440     case Op_LShiftVL:
4441       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4442     case Op_RShiftVS:
4443       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4444     case Op_RShiftVI:
4445       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4446     case Op_RShiftVL:
4447       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4448     case Op_URShiftVS:
4449       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4450     case Op_URShiftVI:
4451       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4452     case Op_URShiftVL:
4453       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4454     case Op_RotateLeftV:
4455       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4456     case Op_RotateRightV:
4457       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4458     case Op_MaxV:
4459       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4460     case Op_MinV:
4461       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4462     case Op_XorV:
4463       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4464     case Op_OrV:
4465       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4466     case Op_AndV:
4467       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4468     default:
4469       fatal("Unsupported masked operation"); break;
4470   }
4471 }
4472 
4473 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4474                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4475   switch (ideal_opc) {
4476     case Op_AddVB:
4477       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4478     case Op_AddVS:
4479       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4480     case Op_AddVI:
4481       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4482     case Op_AddVL:
4483       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4484     case Op_AddVF:
4485       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4486     case Op_AddVD:
4487       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4488     case Op_SubVB:
4489       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4490     case Op_SubVS:
4491       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4492     case Op_SubVI:
4493       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4494     case Op_SubVL:
4495       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4496     case Op_SubVF:
4497       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4498     case Op_SubVD:
4499       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4500     case Op_MulVS:
4501       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4502     case Op_MulVI:
4503       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4504     case Op_MulVL:
4505       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4506     case Op_MulVF:
4507       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4508     case Op_MulVD:
4509       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4510     case Op_DivVF:
4511       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4512     case Op_DivVD:
4513       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4514     case Op_FmaVF:
4515       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4516     case Op_FmaVD:
4517       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4518     case Op_MaxV:
4519       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4520     case Op_MinV:
4521       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4522     case Op_XorV:
4523       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4524     case Op_OrV:
4525       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4526     case Op_AndV:
4527       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4528     default:
4529       fatal("Unsupported masked operation"); break;
4530   }
4531 }
4532 
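     // Combine two opmask registers. The mask length is mapped below to the narrowest element type
     // whose k-instruction width covers it (<= 8 bits -> byte, 16 -> word, 32 -> dword,
     // 64 -> qword), and that element type in turn selects the matching kand/kor/kxor flavour.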
4533 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4534                                   KRegister src1, KRegister src2) {
4535   BasicType etype = T_ILLEGAL;
4536   switch(mask_len) {
4537     case 2:
4538     case 4:
4539     case 8:  etype = T_BYTE; break;
4540     case 16: etype = T_SHORT; break;
4541     case 32: etype = T_INT; break;
4542     case 64: etype = T_LONG; break;
4543     default: fatal("Unsupported type"); break;
4544   }
4545   assert(etype != T_ILLEGAL, "");
4546   switch(ideal_opc) {
4547     case Op_AndVMask:
4548       kand(etype, dst, src1, src2); break;
4549     case Op_OrVMask:
4550       kor(etype, dst, src1, src2); break;
4551     case Op_XorVMask:
4552       kxor(etype, dst, src1, src2); break;
4553     default:
4554       fatal("Unsupported masked operation"); break;
4555   }
4556 }
4557 
4558 /*
4559  * The following routines handle special floating point values (NaN/Inf/-Inf/Max/Min) for the cast operation.
4560  * If src is NaN, the result is 0.
4561  * If src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4562  * the result is equal to the value of Integer.MIN_VALUE.
4563  * If src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4564  * the result is equal to the value of Integer.MAX_VALUE.
4565  */
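     //
     // Roughly equivalent scalar semantics, shown only as an illustrative sketch of what the
     // vector fix-up below implements per lane (the f2i helper is hypothetical, nothing here is
     // emitted or compiled):
     //
     //   int f2i(float f) {
     //     if (f != f)                 return 0;                  // NaN
     //     if (f <= Integer.MIN_VALUE) return Integer.MIN_VALUE;  // includes -Inf
     //     if (f >= Integer.MAX_VALUE) return Integer.MAX_VALUE;  // includes +Inf
     //     return (int) f;
     //   }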
4566 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4567                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4568                                                                    Register rscratch, AddressLiteral float_sign_flip,
4569                                                                    int vec_enc) {
4570   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4571   Label done;
4572   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4573   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4574   vptest(xtmp2, xtmp2, vec_enc);
4575   jccb(Assembler::equal, done);
4576 
4577   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4578   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4579 
4580   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4581   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4582   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4583 
4584   // Recompute the mask for remaining special value.
4585   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4586   // Extract SRC values corresponding to TRUE mask lanes.
4587   vpand(xtmp4, xtmp2, src, vec_enc);
4588   // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special
4589   // values are set.
4590   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4591 
4592   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4593   bind(done);
4594 }
4595 
4596 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4597                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4598                                                                     Register rscratch, AddressLiteral float_sign_flip,
4599                                                                     int vec_enc) {
4600   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4601   Label done;
4602   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4603   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4604   kortestwl(ktmp1, ktmp1);
4605   jccb(Assembler::equal, done);
4606 
4607   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4608   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4609   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4610 
4611   kxorwl(ktmp1, ktmp1, ktmp2);
4612   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4613   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4614   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4615   bind(done);
4616 }
4617 
4618 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4619                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4620                                                                      Register rscratch, AddressLiteral double_sign_flip,
4621                                                                      int vec_enc) {
4622   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4623 
4624   Label done;
4625   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4626   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
4627   kortestwl(ktmp1, ktmp1);
4628   jccb(Assembler::equal, done);
4629 
4630   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4631   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4632   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4633 
4634   kxorwl(ktmp1, ktmp1, ktmp2);
4635   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4636   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4637   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4638   bind(done);
4639 }
4640 
4641 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4642                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4643                                                                      Register rscratch, AddressLiteral float_sign_flip,
4644                                                                      int vec_enc) {
4645   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4646   Label done;
4647   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4648   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4649   kortestwl(ktmp1, ktmp1);
4650   jccb(Assembler::equal, done);
4651 
4652   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4653   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4654   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4655 
4656   kxorwl(ktmp1, ktmp1, ktmp2);
4657   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4658   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4659   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4660   bind(done);
4661 }
4662 
4663 /*
4664  * The following routines handle special floating point values (NaN/Inf/-Inf/Max/Min) for the cast operation.
4665  * If src is NaN, the result is 0.
4666  * If src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
4667  * the result is equal to the value of Long.MIN_VALUE.
4668  * If src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
4669  * the result is equal to the value of Long.MAX_VALUE.
4670  */
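     //
     // Illustrative only: per lane this matches the scalar rule NaN -> 0L,
     // x <= Long.MIN_VALUE -> Long.MIN_VALUE, x >= Long.MAX_VALUE -> Long.MAX_VALUE,
     // otherwise (long) x.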
4671 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4672                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4673                                                                       Register rscratch, AddressLiteral double_sign_flip,
4674                                                                       int vec_enc) {
4675   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4676 
4677   Label done;
4678   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4679   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
4680   kortestwl(ktmp1, ktmp1);
4681   jccb(Assembler::equal, done);
4682 
4683   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4684   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4685   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4686 
4687   kxorwl(ktmp1, ktmp1, ktmp2);
4688   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4689   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4690   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4691   bind(done);
4692 }
4693 
4694 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
4695                                                              XMMRegister xtmp, int index, int vec_enc) {
4696   assert(vec_enc < Assembler::AVX_512bit, "");
4697   if (vec_enc == Assembler::AVX_256bit) {
4698     vextractf128_high(xtmp, src);
4699     vshufps(dst, src, xtmp, index, vec_enc);
4700   } else {
4701     vshufps(dst, src, zero, index, vec_enc);
4702   }
4703 }
4704 
4705 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4706                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
4707                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
4708   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4709 
4710   Label done;
4711   // Compare the destination lanes with float_sign_flip
4712   // value to get mask for all special values.
4713   movdqu(xtmp1, float_sign_flip, rscratch);
4714   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
4715   ptest(xtmp2, xtmp2);
4716   jccb(Assembler::equal, done);
4717 
4718   // Flip float_sign_flip to get max integer value.
4719   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
4720   pxor(xtmp1, xtmp4);
4721 
4722   // Set destination lanes corresponding to unordered source lanes to zero.
4723   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
4724   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
4725 
4726   // Shuffle the mask vector and pack the lower double word from each quadword lane.
4727   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
4728   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
4729 
4730   // Recompute the mask for remaining special value.
4731   pxor(xtmp2, xtmp3);
4732   // Extract mask corresponding to non-negative source lanes.
4733   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
4734 
4735   // Shuffle the mask vector and pack the lower double word from each quadword lane.
4736   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
4737   pand(xtmp3, xtmp2);
4738 
4739   // Replace destination lanes holding the special value (0x80000000) with max int
4740   // if the corresponding source lane holds a +ve value.
4741   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
4742   bind(done);
4743 }
4744 
4745 
4746 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
4747                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
4748   switch(to_elem_bt) {
4749     case T_SHORT:
4750       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
4751       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
4752       vpackusdw(dst, dst, zero, vec_enc);
4753       if (vec_enc == Assembler::AVX_256bit) {
4754         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
4755       }
4756       break;
4757     case  T_BYTE:
4758       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
4759       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
4760       vpackusdw(dst, dst, zero, vec_enc);
4761       if (vec_enc == Assembler::AVX_256bit) {
4762         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
4763       }
4764       vpackuswb(dst, dst, zero, vec_enc);
4765       break;
4766     default: assert(false, "%s", type2name(to_elem_bt));
4767   }
4768 }
4769 
4770 /*
4771  * Algorithm for vector D2L and F2I conversions:
4772  * a) Perform the vector D2L/F2I cast.
4773  * b) Choose the fast path if none of the result vector lanes contains the value 0x80000000.
4774  *    A lane holding 0x80000000 signifies that the source value could be any of the special
4775  *    floating point values (NaN, -Inf, Inf, Max, -Min).
4776  * c) Set the destination lane to zero if the source lane is NaN.
4777  * d) Replace 0x80000000 with MaxInt if the source lane contains a +ve value.
4778  */
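     //
     // Worked example (values chosen only for illustration): a truncating vector cast of
     // {1.5f, NaN, +Inf, -Inf} yields {1, 0x80000000, 0x80000000, 0x80000000}. The special-case
     // pass then zeroes the NaN lane and rewrites the +Inf lane to Integer.MAX_VALUE, while the
     // -Inf lane keeps 0x80000000, which already equals Integer.MIN_VALUE.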
4779 
4780 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4781                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4782                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
4783   int to_elem_sz = type2aelembytes(to_elem_bt);
4784   assert(to_elem_sz <= 4, "");
4785   vcvttps2dq(dst, src, vec_enc);
4786   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
4787   if (to_elem_sz < 4) {
4788     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4789     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
4790   }
4791 }
4792 
4793 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4794                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
4795                                             Register rscratch, int vec_enc) {
4796   int to_elem_sz = type2aelembytes(to_elem_bt);
4797   assert(to_elem_sz <= 4, "");
4798   vcvttps2dq(dst, src, vec_enc);
4799   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
4800   switch(to_elem_bt) {
4801     case T_INT:
4802       break;
4803     case T_SHORT:
4804       evpmovdw(dst, dst, vec_enc);
4805       break;
4806     case T_BYTE:
4807       evpmovdb(dst, dst, vec_enc);
4808       break;
4809     default: assert(false, "%s", type2name(to_elem_bt));
4810   }
4811 }
4812 
4813 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4814                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
4815                                             Register rscratch, int vec_enc) {
4816   evcvttps2qq(dst, src, vec_enc);
4817   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
4818 }
4819 
4820 // Handling for downcasting from double to integer or sub-word types on AVX2.
4821 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4822                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
4823                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
4824   int to_elem_sz = type2aelembytes(to_elem_bt);
4825   assert(to_elem_sz < 8, "");
4826   vcvttpd2dq(dst, src, vec_enc);
4827   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
4828                                               float_sign_flip, vec_enc);
4829   if (to_elem_sz < 4) {
4830     // xtmp4 holds all zero lanes.
4831     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
4832   }
4833 }
4834 
4835 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
4836                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
4837                                             KRegister ktmp2, AddressLiteral sign_flip,
4838                                             Register rscratch, int vec_enc) {
4839   if (VM_Version::supports_avx512dq()) {
4840     evcvttpd2qq(dst, src, vec_enc);
4841     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
4842     switch(to_elem_bt) {
4843       case T_LONG:
4844         break;
4845       case T_INT:
4846         evpmovsqd(dst, dst, vec_enc);
4847         break;
4848       case T_SHORT:
4849         evpmovsqd(dst, dst, vec_enc);
4850         evpmovdw(dst, dst, vec_enc);
4851         break;
4852       case T_BYTE:
4853         evpmovsqd(dst, dst, vec_enc);
4854         evpmovdb(dst, dst, vec_enc);
4855         break;
4856       default: assert(false, "%s", type2name(to_elem_bt));
4857     }
4858   } else {
4859     assert(type2aelembytes(to_elem_bt) <= 4, "");
4860     vcvttpd2dq(dst, src, vec_enc);
4861     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
4862     switch(to_elem_bt) {
4863       case T_INT:
4864         break;
4865       case T_SHORT:
4866         evpmovdw(dst, dst, vec_enc);
4867         break;
4868       case T_BYTE:
4869         evpmovdb(dst, dst, vec_enc);
4870         break;
4871       default: assert(false, "%s", type2name(to_elem_bt));
4872     }
4873   }
4874 }
4875 
4876 #ifdef _LP64
4877 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
4878                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4879                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
4880   // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round-towards -inf,
4881   // then restore the original MXCSR.RC mode afterwards.
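       // For example (values are illustrative only): with RC = round-towards -inf,
       // 2.5 + 0.5 = 3.0 converts to 3 and -2.5 + 0.5 = -2.0 converts to -2, matching
       // Math.round's round-half-up behaviour.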
4882   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4883 
4884   mov64(tmp, julong_cast(0.5L));
4885   evpbroadcastq(xtmp1, tmp, vec_enc);
4886   vaddpd(xtmp1, src , xtmp1, vec_enc);
4887   evcvtpd2qq(dst, xtmp1, vec_enc);
4888   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
4889                                                 double_sign_flip, vec_enc);
4890 
4891   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4892 }
4893 
4894 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
4895                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4896                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
4897   // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round-towards -inf,
4898   // then restore the original MXCSR.RC mode afterwards.
4899   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4900 
4901   movl(tmp, jint_cast(0.5));
4902   movq(xtmp1, tmp);
4903   vbroadcastss(xtmp1, xtmp1, vec_enc);
4904   vaddps(xtmp1, src , xtmp1, vec_enc);
4905   vcvtps2dq(dst, xtmp1, vec_enc);
4906   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
4907                                               float_sign_flip, vec_enc);
4908 
4909   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4910 }
4911 
4912 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
4913                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4914                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
4915   // Perform the floor(val+0.5) operation with the MXCSR.RC mode set to round-towards -inf,
4916   // then restore the original MXCSR.RC mode afterwards.
4917   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4918 
4919   movl(tmp, jint_cast(0.5));
4920   movq(xtmp1, tmp);
4921   vbroadcastss(xtmp1, xtmp1, vec_enc);
4922   vaddps(xtmp1, src , xtmp1, vec_enc);
4923   vcvtps2dq(dst, xtmp1, vec_enc);
4924   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
4925 
4926   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4927 }
4928 #endif // _LP64
4929 
4930 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
4931                                              BasicType from_elem_bt, BasicType to_elem_bt) {
4932   switch (from_elem_bt) {
4933     case T_BYTE:
4934       switch (to_elem_bt) {
4935         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
4936         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
4937         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
4938         default: ShouldNotReachHere();
4939       }
4940       break;
4941     case T_SHORT:
4942       switch (to_elem_bt) {
4943         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
4944         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
4945         default: ShouldNotReachHere();
4946       }
4947       break;
4948     case T_INT:
4949       assert(to_elem_bt == T_LONG, "");
4950       vpmovzxdq(dst, src, vlen_enc);
4951       break;
4952     default:
4953       ShouldNotReachHere();
4954   }
4955 }
4956 
4957 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
4958                                            BasicType from_elem_bt, BasicType to_elem_bt) {
4959   switch (from_elem_bt) {
4960     case T_BYTE:
4961       switch (to_elem_bt) {
4962         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
4963         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
4964         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
4965         default: ShouldNotReachHere();
4966       }
4967       break;
4968     case T_SHORT:
4969       switch (to_elem_bt) {
4970         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
4971         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
4972         default: ShouldNotReachHere();
4973       }
4974       break;
4975     case T_INT:
4976       assert(to_elem_bt == T_LONG, "");
4977       vpmovsxdq(dst, src, vlen_enc);
4978       break;
4979     default:
4980       ShouldNotReachHere();
4981   }
4982 }
4983 
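     // Cast a vector mask between element sizes: widening uses sign extension and narrowing uses
     // signed saturating packs (both preserve the all-zeros / all-ones lane patterns), while the
     // vpermq/vpshufd steps fix up cross-lane ordering for 256-bit vectors.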
4984 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
4985                                          BasicType dst_bt, BasicType src_bt, int vlen) {
4986   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
4987   assert(vlen_enc != AVX_512bit, "");
4988 
4989   int dst_bt_size = type2aelembytes(dst_bt);
4990   int src_bt_size = type2aelembytes(src_bt);
4991   if (dst_bt_size > src_bt_size) {
4992     switch (dst_bt_size / src_bt_size) {
4993       case 2: vpmovsxbw(dst, src, vlen_enc); break;
4994       case 4: vpmovsxbd(dst, src, vlen_enc); break;
4995       case 8: vpmovsxbq(dst, src, vlen_enc); break;
4996       default: ShouldNotReachHere();
4997     }
4998   } else {
4999     assert(dst_bt_size < src_bt_size, "");
5000     switch (src_bt_size / dst_bt_size) {
5001       case 2: {
5002         if (vlen_enc == AVX_128bit) {
5003           vpacksswb(dst, src, src, vlen_enc);
5004         } else {
5005           vpacksswb(dst, src, src, vlen_enc);
5006           vpermq(dst, dst, 0x08, vlen_enc);
5007         }
5008         break;
5009       }
5010       case 4: {
5011         if (vlen_enc == AVX_128bit) {
5012           vpackssdw(dst, src, src, vlen_enc);
5013           vpacksswb(dst, dst, dst, vlen_enc);
5014         } else {
5015           vpackssdw(dst, src, src, vlen_enc);
5016           vpermq(dst, dst, 0x08, vlen_enc);
5017           vpacksswb(dst, dst, dst, AVX_128bit);
5018         }
5019         break;
5020       }
5021       case 8: {
5022         if (vlen_enc == AVX_128bit) {
5023           vpshufd(dst, src, 0x08, vlen_enc);
5024           vpackssdw(dst, dst, dst, vlen_enc);
5025           vpacksswb(dst, dst, dst, vlen_enc);
5026         } else {
5027           vpshufd(dst, src, 0x08, vlen_enc);
5028           vpermq(dst, dst, 0x08, vlen_enc);
5029           vpackssdw(dst, dst, dst, AVX_128bit);
5030           vpacksswb(dst, dst, dst, AVX_128bit);
5031         }
5032         break;
5033       }
5034       default: ShouldNotReachHere();
5035     }
5036   }
5037 }
5038 
5039 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5040                                    bool merge, BasicType bt, int vlen_enc) {
5041   if (bt == T_INT) {
5042     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5043   } else {
5044     assert(bt == T_LONG, "");
5045     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5046   }
5047 }
5048 
5049 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5050                                    bool merge, BasicType bt, int vlen_enc) {
5051   if (bt == T_INT) {
5052     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5053   } else {
5054     assert(bt == T_LONG, "");
5055     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5056   }
5057 }
5058 
5059 #ifdef _LP64
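     // Expand the low mask_len bits of 'src' into a byte vector with one byte per mask bit.
     // PDEP against the constant 0x0101010101010101 deposits eight mask bits into the lowest bit
     // of eight consecutive bytes; for example (illustrative) the bits 0b101 become the bytes
     // 01 00 01, i.e. the quadword 0x0000000000010001.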
5060 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5061                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5062                                                int vec_enc) {
5063   int index = 0;
5064   int vindex = 0;
5065   mov64(rtmp1, 0x0101010101010101L);
5066   pdepq(rtmp1, src, rtmp1);
5067   if (mask_len > 8) {
5068     movq(rtmp2, src);
5069     vpxor(xtmp, xtmp, xtmp, vec_enc);
5070     movq(xtmp, rtmp1);
5071   }
5072   movq(dst, rtmp1);
5073 
5074   mask_len -= 8;
5075   while (mask_len > 0) {
5076     assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
5077     index++;
5078     if ((index % 2) == 0) {
5079       pxor(xtmp, xtmp);
5080     }
5081     mov64(rtmp1, 0x0101010101010101L);
5082     shrq(rtmp2, 8);
5083     pdepq(rtmp1, rtmp2, rtmp1);
5084     pinsrq(xtmp, rtmp1, index % 2);
5085     vindex = index / 2;
5086     if (vindex) {
5087       // Write the entire 16-byte vector only when both 64-bit
5088       // lanes are updated, to save redundant instructions.
5089       if (index % 2) {
5090         vinsertf128(dst, dst, xtmp, vindex);
5091       }
5092     } else {
5093       vmovdqu(dst, xtmp);
5094     }
5095     mask_len -= 8;
5096   }
5097 }
5098 
5099 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5100   switch(opc) {
5101     case Op_VectorMaskTrueCount:
5102       popcntq(dst, tmp);
5103       break;
5104     case Op_VectorMaskLastTrue:
5105       if (VM_Version::supports_lzcnt()) {
5106         lzcntq(tmp, tmp);
5107         movl(dst, 63);
5108         subl(dst, tmp);
5109       } else {
5110         movl(dst, -1);
5111         bsrq(tmp, tmp);
5112         cmov32(Assembler::notZero, dst, tmp);
5113       }
5114       break;
5115     case Op_VectorMaskFirstTrue:
5116       if (VM_Version::supports_bmi1()) {
5117         if (masklen < 32) {
5118           orl(tmp, 1 << masklen);
5119           tzcntl(dst, tmp);
5120         } else if (masklen == 32) {
5121           tzcntl(dst, tmp);
5122         } else {
5123           assert(masklen == 64, "");
5124           tzcntq(dst, tmp);
5125         }
5126       } else {
5127         if (masklen < 32) {
5128           orl(tmp, 1 << masklen);
5129           bsfl(dst, tmp);
5130         } else {
5131           assert(masklen == 32 || masklen == 64, "");
5132           movl(dst, masklen);
5133           if (masklen == 32)  {
5134             bsfl(tmp, tmp);
5135           } else {
5136             bsfq(tmp, tmp);
5137           }
5138           cmov32(Assembler::notZero, dst, tmp);
5139         }
5140       }
5141       break;
5142     case Op_VectorMaskToLong:
5143       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5144       break;
5145     default: assert(false, "Unhandled mask operation");
5146   }
5147 }
5148 
5149 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5150                                               int masklen, int masksize, int vec_enc) {
5151   assert(VM_Version::supports_popcnt(), "");
5152 
5153   if (VM_Version::supports_avx512bw()) {
5154     kmovql(tmp, mask);
5155   } else {
5156     assert(masklen <= 16, "");
5157     kmovwl(tmp, mask);
5158   }
5159 
5160   // A mask generated by partial vector comparison/replicate/mask-manipulation
5161   // operations needs to be clipped.
5162   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5163     andq(tmp, (1 << masklen) - 1);
5164   }
5165 
5166   vector_mask_operation_helper(opc, dst, tmp, masklen);
5167 }
5168 
5169 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5170                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5171   assert(vec_enc == AVX_128bit && VM_Version::supports_avx() ||
5172          vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4), "");
5173   assert(VM_Version::supports_popcnt(), "");
5174 
5175   bool need_clip = false;
5176   switch(bt) {
5177     case T_BOOLEAN:
5178       // While masks of other types contain 0 or -1, boolean masks contain lane values of 0 or 1
5179       vpxor(xtmp, xtmp, xtmp, vec_enc);
5180       vpsubb(xtmp, xtmp, mask, vec_enc);
5181       vpmovmskb(tmp, xtmp, vec_enc);
5182       need_clip = masklen < 16;
5183       break;
5184     case T_BYTE:
5185       vpmovmskb(tmp, mask, vec_enc);
5186       need_clip = masklen < 16;
5187       break;
5188     case T_SHORT:
5189       vpacksswb(xtmp, mask, mask, vec_enc);
5190       if (masklen >= 16) {
5191         vpermpd(xtmp, xtmp, 8, vec_enc);
5192       }
5193       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5194       need_clip = masklen < 16;
5195       break;
5196     case T_INT:
5197     case T_FLOAT:
5198       vmovmskps(tmp, mask, vec_enc);
5199       need_clip = masklen < 4;
5200       break;
5201     case T_LONG:
5202     case T_DOUBLE:
5203       vmovmskpd(tmp, mask, vec_enc);
5204       need_clip = masklen < 2;
5205       break;
5206     default: assert(false, "Unhandled type, %s", type2name(bt));
5207   }
5208 
5209   // A mask generated by partial vector comparison/replicate/mask-manipulation
5210   // operations needs to be clipped.
5211   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5212     // need_clip implies masklen < 32
5213     andq(tmp, (1 << masklen) - 1);
5214   }
5215 
5216   vector_mask_operation_helper(opc, dst, tmp, masklen);
5217 }
5218 
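     // Compress an opmask: clip 'src' to mask_len bits and produce a mask whose popcount(src)
     // lowest bits are set. PEXT with an all-ones source gathers one 1-bit for every set selector
     // bit and packs them contiguously, so, for example (illustrative), src = 0b1010 with
     // mask_len = 4 yields 0b0011 in 'dst'.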
5219 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5220                                              Register rtmp2, int mask_len) {
5221   kmov(rtmp1, src);
5222   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5223   mov64(rtmp2, -1L);
5224   pextq(rtmp2, rtmp2, rtmp1);
5225   kmov(dst, rtmp2);
5226 }
5227 
5228 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5229                                                bool merge, BasicType bt, int vec_enc) {
5230   if (opcode == Op_CompressV) {
5231     switch(bt) {
5232     case T_BYTE:
5233       evpcompressb(dst, mask, src, merge, vec_enc);
5234       break;
5235     case T_CHAR:
5236     case T_SHORT:
5237       evpcompressw(dst, mask, src, merge, vec_enc);
5238       break;
5239     case T_INT:
5240       evpcompressd(dst, mask, src, merge, vec_enc);
5241       break;
5242     case T_FLOAT:
5243       evcompressps(dst, mask, src, merge, vec_enc);
5244       break;
5245     case T_LONG:
5246       evpcompressq(dst, mask, src, merge, vec_enc);
5247       break;
5248     case T_DOUBLE:
5249       evcompresspd(dst, mask, src, merge, vec_enc);
5250       break;
5251     default:
5252       fatal("Unsupported type %s", type2name(bt));
5253       break;
5254     }
5255   } else {
5256     assert(opcode == Op_ExpandV, "");
5257     switch(bt) {
5258     case T_BYTE:
5259       evpexpandb(dst, mask, src, merge, vec_enc);
5260       break;
5261     case T_CHAR:
5262     case T_SHORT:
5263       evpexpandw(dst, mask, src, merge, vec_enc);
5264       break;
5265     case T_INT:
5266       evpexpandd(dst, mask, src, merge, vec_enc);
5267       break;
5268     case T_FLOAT:
5269       evexpandps(dst, mask, src, merge, vec_enc);
5270       break;
5271     case T_LONG:
5272       evpexpandq(dst, mask, src, merge, vec_enc);
5273       break;
5274     case T_DOUBLE:
5275       evexpandpd(dst, mask, src, merge, vec_enc);
5276       break;
5277     default:
5278       fatal("Unsupported type %s", type2name(bt));
5279       break;
5280     }
5281   }
5282 }
5283 #endif
5284 
5285 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5286                                            KRegister ktmp1, int vec_enc) {
5287   if (opcode == Op_SignumVD) {
5288     vsubpd(dst, zero, one, vec_enc);
5289     // if src < 0 ? -1 : 1
5290     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5291     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
5292     // if src == NaN, -0.0 or 0.0 return src.
5293     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5294     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5295   } else {
5296     assert(opcode == Op_SignumVF, "");
5297     vsubps(dst, zero, one, vec_enc);
5298     // if src < 0 ? -1 : 1
5299     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5300     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
5301     // if src == NaN, -0.0 or 0.0 return src.
5302     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5303     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5304   }
5305 }
5306 
5307 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5308                                           XMMRegister xtmp1, int vec_enc) {
5309   if (opcode == Op_SignumVD) {
5310     vsubpd(dst, zero, one, vec_enc);
5311     // if src < 0 ? -1 : 1
5312     vblendvpd(dst, one, dst, src, vec_enc);
5313     // if src == NaN, -0.0 or 0.0 return src.
5314     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5315     vblendvpd(dst, dst, src, xtmp1, vec_enc);
5316   } else {
5317     assert(opcode == Op_SignumVF, "");
5318     vsubps(dst, zero, one, vec_enc);
5319     // if src < 0 ? -1 : 1
5320     vblendvps(dst, one, dst, src, vec_enc);
5321     // if src == NaN, -0.0 or 0.0 return src.
5322     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5323     vblendvps(dst, dst, src, xtmp1, vec_enc);
5324   }
5325 }
5326 
5327 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5328   if (VM_Version::supports_avx512bw()) {
5329     if (mask_len > 32) {
5330       kmovql(dst, src);
5331     } else {
5332       kmovdl(dst, src);
5333       if (mask_len != 32) {
5334         kshiftrdl(dst, dst, 32 - mask_len);
5335       }
5336     }
5337   } else {
5338     assert(mask_len <= 16, "");
5339     kmovwl(dst, src);
5340     if (mask_len != 16) {
5341       kshiftrwl(dst, dst, 16 - mask_len);
5342     }
5343   }
5344 }
5345 
5346 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5347   int lane_size = type2aelembytes(bt);
5348   bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
5349   if ((is_LP64 || lane_size < 8) &&
5350       ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5351        (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
5352     movptr(rtmp, imm32);
5353     switch(lane_size) {
5354       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5355       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5356       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5357       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5358       default: fatal("Unsupported lane size %d", lane_size); break;
5360     }
5361   } else {
5362     movptr(rtmp, imm32);
5363     LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
5364     switch(lane_size) {
5365       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5366       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5367       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5368       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5369       default: fatal("Unsupported lane size %d", lane_size); break;
5371     }
5372   }
5373 }
5374 
5375 //
5376 // Following is the lookup-table-based popcount computation algorithm:
5377 //       Index   Bit set count
5378 //     [ 0000 ->   0,
5379 //       0001 ->   1,
5380 //       0010 ->   1,
5381 //       0011 ->   2,
5382 //       0100 ->   1,
5383 //       0101 ->   2,
5384 //       0110 ->   2,
5385 //       0111 ->   3,
5386 //       1000 ->   1,
5387 //       1001 ->   2,
5388 //       1010 ->   2,
5389 //       1011 ->   3,
5390 //       1100 ->   2,
5391 //       1101 ->   3,
     //       1110 ->   3,
5392 //       1111 ->   4 ]
5393 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5394 //     shuffle indices for lookup table access.
5395 //  b. Right shift each byte of vector lane by 4 positions.
5396 //  c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
5397 //     shuffle indices for lookup table access.
5398 //  d. Add the bitset count of upper and lower 4 bits of each byte.
5399 //  e. Unpack double words to quad words and compute the sum of absolute differences against
5400 //     zero to add up the bitset counts of all the bytes of each quadword.
5401 //  f. Perform step e. for upper 128bit vector lane.
5402 //  g. Pack the bitset count of quadwords back to double word.
5403 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
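     //
     // Worked example (illustrative): the byte 0xB6 = 0b10110110 has low nibble 0110 (lookup
     // count 2) and, after the shift in step b., high nibble 1011 (lookup count 3); step d. adds
     // them to give the correct popcount of 5 for that byte.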
5404 
5405 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5406                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5407   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5408   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5409   vpsrlw(dst, src, 4, vec_enc);
5410   vpand(dst, dst, xtmp1, vec_enc);
5411   vpand(xtmp1, src, xtmp1, vec_enc);
5412   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5413   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5414   vpshufb(dst, xtmp2, dst, vec_enc);
5415   vpaddb(dst, dst, xtmp1, vec_enc);
5416 }
5417 
5418 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5419                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5420   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5421   // The following code implements steps e, f, g and h of the above algorithm.
5422   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5423   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5424   vpsadbw(dst, dst, xtmp2, vec_enc);
5425   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5426   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5427   vpackuswb(dst, xtmp1, dst, vec_enc);
5428 }
5429 
5430 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5431                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5432   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5433   // Add the popcounts of the upper and lower byte of each word.
5434   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5435   vpsrlw(dst, xtmp1, 8, vec_enc);
5436   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5437   vpaddw(dst, dst, xtmp1, vec_enc);
5438 }
5439 
5440 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5441                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5442   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5443   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5444   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5445 }
5446 
5447 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5448                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5449   switch(bt) {
5450     case T_LONG:
5451       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5452       break;
5453     case T_INT:
5454       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5455       break;
5456     case T_CHAR:
5457     case T_SHORT:
5458       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5459       break;
5460     case T_BYTE:
5461     case T_BOOLEAN:
5462       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5463       break;
5464     default:
5465       fatal("Unsupported type %s", type2name(bt));
5466       break;
5467   }
5468 }
5469 
5470 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5471                                                       KRegister mask, bool merge, int vec_enc) {
5472   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5473   switch(bt) {
5474     case T_LONG:
5475       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5476       evpopcntq(dst, mask, src, merge, vec_enc);
5477       break;
5478     case T_INT:
5479       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5480       evpopcntd(dst, mask, src, merge, vec_enc);
5481       break;
5482     case T_CHAR:
5483     case T_SHORT:
5484       assert(VM_Version::supports_avx512_bitalg(), "");
5485       evpopcntw(dst, mask, src, merge, vec_enc);
5486       break;
5487     case T_BYTE:
5488     case T_BOOLEAN:
5489       assert(VM_Version::supports_avx512_bitalg(), "");
5490       evpopcntb(dst, mask, src, merge, vec_enc);
5491       break;
5492     default:
5493       fatal("Unsupported type %s", type2name(bt));
5494       break;
5495   }
5496 }
5497 
5498 #ifndef _LP64
5499 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
5500   assert(VM_Version::supports_avx512bw(), "");
5501   kmovdl(tmp, src);
5502   kunpckdql(dst, tmp, tmp);
5503 }
5504 #endif
5505 
5506 // The bit reversal algorithm first reverses the bits of each byte and then performs
5507 // a byte level reversal for multi-byte primitive types (short/int/long).
5508 // A lookup table access yields the reversed bit sequence of a 4 bit value, so the
5509 // reversed bit sequence of a byte is obtained by reversing both nibbles through the
5510 // table and swapping the positions of the upper and lower nibble (see the scalar
5511 // sketch below).
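     //
     // Illustrative scalar sketch for a single byte (not used by the generated code;
     // the table and helper names below are ours):
     //
     //   static const uint8_t reverse_nibble_lut[16] =
     //     {0x0, 0x8, 0x4, 0xC, 0x2, 0xA, 0x6, 0xE, 0x1, 0x9, 0x5, 0xD, 0x3, 0xB, 0x7, 0xF};
     //   static uint8_t reverse_bits_byte(uint8_t b) {
     //     uint8_t lo = reverse_nibble_lut[b & 0x0F];        // reversed lower nibble
     //     uint8_t hi = reverse_nibble_lut[(b >> 4) & 0x0F]; // reversed upper nibble
     //     return (uint8_t)((lo << 4) | hi);                 // swap nibble positions
     //   }
     //
     // For short/int/long the reversed bytes are then byte swapped by vector_reverse_byte().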
5512 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5513                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
5514   if (VM_Version::supports_avx512vlbw()) {
5515 
5516     // Get the reverse bit sequence of lower nibble of each byte.
5517     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5518     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5519     evpandq(dst, xtmp2, src, vec_enc);
5520     vpshufb(dst, xtmp1, dst, vec_enc);
5521     vpsllq(dst, dst, 4, vec_enc);
5522 
5523     // Get the reverse bit sequence of upper nibble of each byte.
5524     vpandn(xtmp2, xtmp2, src, vec_enc);
5525     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5526     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5527 
5528     // OR the left shifted reversed lower nibbles with the right shifted reversed
5529     // upper nibbles to obtain the reversed bit sequence of each byte.
5530     evporq(xtmp2, dst, xtmp2, vec_enc);
5531     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5532 
5533   } else if (vec_enc == Assembler::AVX_512bit) {
5534     // Shift based bit reversal.
5535     assert(bt == T_LONG || bt == T_INT, "");
5536 
5537     // Swap lower and upper nibble of each byte.
5538     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
5539 
5540     // Swap two least and most significant bits of each nibble.
5541     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
5542 
5543     // Swap adjacent pair of bits.
5544     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5545     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
5546 
5547     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5548     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
5549   } else {
5550     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
5551     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5552 
5553     // Get the reverse bit sequence of lower nibble of each byte.
5554     vpand(dst, xtmp2, src, vec_enc);
5555     vpshufb(dst, xtmp1, dst, vec_enc);
5556     vpsllq(dst, dst, 4, vec_enc);
5557 
5558     // Get the reverse bit sequence of upper nibble of each byte.
5559     vpandn(xtmp2, xtmp2, src, vec_enc);
5560     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5561     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5562 
5563     // OR the left shifted reversed lower nibbles with the right shifted reversed
5564     // upper nibbles to obtain the reversed bit sequence of each byte.
5565     vpor(xtmp2, dst, xtmp2, vec_enc);
5566     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5567   }
5568 }
5569 
5570 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
5571                                                 XMMRegister xtmp, Register rscratch) {
5572   assert(VM_Version::supports_gfni(), "");
5573   assert(rscratch != noreg || always_reachable(mask), "missing");
5574 
5575   // Galois field instruction based bit reversal, following the algorithm described at
5576   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5577   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
5578   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
5579   vector_reverse_byte(bt, dst, xtmp, vec_enc);
5580 }
5581 
5582 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
5583                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
5584   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
5585   evpandq(dst, xtmp1, src, vec_enc);
5586   vpsllq(dst, dst, nbits, vec_enc);
5587   vpandn(xtmp1, xtmp1, src, vec_enc);
5588   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
5589   evporq(dst, dst, xtmp1, vec_enc);
5590 }
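     // Scalar form of the swap performed above (illustrative only):
     //   dst = ((src & bitmask) << nbits) | ((src & ~bitmask) >> nbits)
     // e.g. nbits == 4 with bitmask == 0x0F0F0F0F swaps the two nibbles of every byte.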
5591 
5592 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5593                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5594   // Shift based bit reversal.
5595   assert(VM_Version::supports_evex(), "");
5596   switch(bt) {
5597     case T_LONG:
5598       // Swap the double words of each quad word, then the words of each double word.
5599       evprorq(xtmp1, k0, src, 32, true, vec_enc);
5600       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
5601       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5602       break;
5603     case T_INT:
5604       // Swap upper and lower word of each double word.
5605       evprord(xtmp1, k0, src, 16, true, vec_enc);
5606       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5607       break;
5608     case T_CHAR:
5609     case T_SHORT:
5610       // Swap upper and lower byte of each word.
5611       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
5612       break;
5613     case T_BYTE:
5614       evmovdquq(dst, k0, src, true, vec_enc);
5615       break;
5616     default:
5617       fatal("Unsupported type %s", type2name(bt));
5618       break;
5619   }
5620 }
5621 
5622 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5623   if (bt == T_BYTE) {
5624     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
5625       evmovdquq(dst, k0, src, true, vec_enc);
5626     } else {
5627       vmovdqu(dst, src);
5628     }
5629     return;
5630   }
5631   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
5632   // pre-computed shuffle indices.
5633   switch(bt) {
5634     case T_LONG:
5635       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
5636       break;
5637     case T_INT:
5638       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
5639       break;
5640     case T_CHAR:
5641     case T_SHORT:
5642       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
5643       break;
5644     default:
5645       fatal("Unsupported type %s", type2name(bt));
5646       break;
5647   }
5648   vpshufb(dst, src, dst, vec_enc);
5649 }
5650 
5651 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5652                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5653                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
5654   assert(is_integral_type(bt), "");
5655   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5656   assert(VM_Version::supports_avx512cd(), "");
5657   switch(bt) {
5658     case T_LONG:
5659       evplzcntq(dst, ktmp, src, merge, vec_enc);
5660       break;
5661     case T_INT:
5662       evplzcntd(dst, ktmp, src, merge, vec_enc);
5663       break;
5664     case T_SHORT:
5665       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
5666       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
5667       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
5668       vpunpckhwd(dst, xtmp1, src, vec_enc);
5669       evplzcntd(dst, ktmp, dst, merge, vec_enc);
5670       vpackusdw(dst, xtmp2, dst, vec_enc);
5671       break;
5672     case T_BYTE:
5673       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
5674       // accessing the lookup table.
5675       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
5676       // accessing the lookup table.
5677       // Add T1 to T2 if the 4 MSB bits of the byte are all zeros (a scalar sketch
           // of this byte algorithm follows this method).
5678       assert(VM_Version::supports_avx512bw(), "");
5679       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
5680       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
5681       vpand(xtmp2, dst, src, vec_enc);
5682       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5683       vpsrlw(xtmp3, src, 4, vec_enc);
5684       vpand(xtmp3, dst, xtmp3, vec_enc);
5685       vpshufb(dst, xtmp1, xtmp3, vec_enc);
5686       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
5687       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
5688       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
5689       break;
5690     default:
5691       fatal("Unsupported type %s", type2name(bt));
5692       break;
5693   }
5694 }
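     //
     // Illustrative scalar sketch of the T_BYTE case of vector_count_leading_zeros_evex()
     // above (not used by the generated code; the table and helper names below are ours):
     //
     //   // leading zero count of a 4 bit value
     //   static const uint8_t clz4_lut[16] = {4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0};
     //   static uint8_t clz_byte(uint8_t b) {
     //     uint8_t t2 = clz4_lut[(b >> 4) & 0x0F];         // clz of the 4 MSB bits
     //     uint8_t t1 = clz4_lut[b & 0x0F];                // clz of the 4 LSB bits
     //     return (b >> 4) == 0 ? (uint8_t)(t2 + t1) : t2; // t2 == 4 when the MSB nibble is 0
     //   }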
5695 
5696 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5697                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5698   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
5699   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5700   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
5701   // accessing the lookup table.
5702   vpand(dst, xtmp2, src, vec_enc);
5703   vpshufb(dst, xtmp1, dst, vec_enc);
5704   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
5705   // accessing the lookup table.
5706   vpsrlw(xtmp3, src, 4, vec_enc);
5707   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
5708   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
5709   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
5710   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
5711   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
5712   vpaddb(dst, dst, xtmp2, vec_enc);
5713   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
5714 }
5715 
5716 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5717                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5718   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5719   // Add zero counts of lower byte and upper byte of a word if
5720   // upper byte holds a zero value.
5721   vpsrlw(xtmp3, src, 8, vec_enc);
5722   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
5723   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
5724   vpsllw(xtmp2, dst, 8, vec_enc);
5725   vpaddw(xtmp2, xtmp2, dst, vec_enc);
5726   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5727   vpsrlw(dst, dst, 8, vec_enc);
5728 }
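     // Scalar form of the combining step in vector_count_leading_zeros_short_avx() above
     // (illustrative only), for a 16 bit word w with upper byte hi and lower byte lo:
     //   clz16(w) = (hi == 0) ? clz8(hi) + clz8(lo)   // clz8(hi) == 8 here
     //                        : clz8(hi);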
5729 
5730 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5731                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
5732   // Since the IEEE 754 floating point format represents the mantissa in normalized
5733   // 1.x form, the biased exponent of the float conversion of src can be used to
5734   // compute the leading zero count as per the following formula:
5735   //   LZCNT = 31 - (biased_exp - 127), computed below as 32 - ((biased_exp - 127) + 1)
5736   // Special handling is needed for zero, MAX_INT and negative source values (a scalar sketch follows this method).
5737 
5738   // Broadcast 0xFF
5739   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
5740   vpsrld(xtmp1, xtmp1, 24, vec_enc);
5741 
5742   // Extract biased exponent.
5743   vcvtdq2ps(dst, src, vec_enc);
5744   vpsrld(dst, dst, 23, vec_enc);
5745   vpand(dst, dst, xtmp1, vec_enc);
5746 
5747   // Broadcast 127.
5748   vpsrld(xtmp1, xtmp1, 1, vec_enc);
5749   // Exponent = biased_exp - 127
5750   vpsubd(dst, dst, xtmp1, vec_enc);
5751 
5752   // Exponent = Exponent + 1
5753   vpsrld(xtmp3, xtmp1, 6, vec_enc);
5754   vpaddd(dst, dst, xtmp3, vec_enc);
5755 
5756   // Replace a negative exponent with zero; the exponent is negative when the src
5757   // lane contains a zero value.
5758   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5759   vblendvps(dst, dst, xtmp2, dst, vec_enc);
5760 
5761   // Rematerialize broadcast 32.
5762   vpslld(xtmp1, xtmp3, 5, vec_enc);
5763   // Exponent is 32 if corresponding source lane contains max_int value.
5764   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
5765   // LZCNT = 32 - exponent
5766   vpsubd(dst, xtmp1, dst, vec_enc);
5767 
5768   // Replace LZCNT with a value 1 if corresponding source lane
5769   // contains max_int value.
5770   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
5771 
5772   // Replace the count with 0 if the source lane value is negative, i.e. its sign bit is set.
5773   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5774   vblendvps(dst, dst, xtmp2, src, vec_enc);
5775 }
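     //
     // Scalar rendering of the steps in vector_count_leading_zeros_int_avx() above
     // (illustrative only; the helper name is ours):
     //
     //   static int clz_int_via_float(jint x) {
     //     if (x < 0) return 0;                            // sign bit is the leading one bit
     //     jfloat f = (jfloat) x;                          // vcvtdq2ps
     //     int exp = ((jint_cast(f) >> 23) & 0xFF) - 127;  // unbiased exponent
     //     exp = exp + 1;                                  // number of significant bits
     //     if (exp < 0) exp = 0;                           // x == 0, force the count to 32
     //     int lzcnt = 32 - exp;
     //     if (exp == 32) lzcnt = 1;                       // MAX_INT rounds up to 2^31 during conversion
     //     return lzcnt;
     //   }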
5776 
5777 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5778                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5779   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5780   // Add zero counts of lower word and upper word of a double word if
5781   // upper word holds a zero value.
5782   vpsrld(xtmp3, src, 16, vec_enc);
5783   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
5784   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
5785   vpslld(xtmp2, dst, 16, vec_enc);
5786   vpaddd(xtmp2, xtmp2, dst, vec_enc);
5787   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5788   vpsrld(dst, dst, 16, vec_enc);
5789   // Add zero counts of lower doubleword and upper doubleword of a
5790   // quadword if upper doubleword holds a zero value.
5791   vpsrlq(xtmp3, src, 32, vec_enc);
5792   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
5793   vpsllq(xtmp2, dst, 32, vec_enc);
5794   vpaddq(xtmp2, xtmp2, dst, vec_enc);
5795   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5796   vpsrlq(dst, dst, 32, vec_enc);
5797 }
5798 
5799 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
5800                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5801                                                        Register rtmp, int vec_enc) {
5802   assert(is_integral_type(bt), "unexpected type");
5803   assert(vec_enc < Assembler::AVX_512bit, "");
5804   switch(bt) {
5805     case T_LONG:
5806       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5807       break;
5808     case T_INT:
5809       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
5810       break;
5811     case T_SHORT:
5812       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5813       break;
5814     case T_BYTE:
5815       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5816       break;
5817     default:
5818       fatal("Unsupported type %s", type2name(bt));
5819       break;
5820   }
5821 }
5822 
5823 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
5824   switch(bt) {
5825     case T_BYTE:
5826       vpsubb(dst, src1, src2, vec_enc);
5827       break;
5828     case T_SHORT:
5829       vpsubw(dst, src1, src2, vec_enc);
5830       break;
5831     case T_INT:
5832       vpsubd(dst, src1, src2, vec_enc);
5833       break;
5834     case T_LONG:
5835       vpsubq(dst, src1, src2, vec_enc);
5836       break;
5837     default:
5838       fatal("Unsupported type %s", type2name(bt));
5839       break;
5840   }
5841 }
5842 
5843 // Trailing zero count is computed from the leading zero count using the following
5844 // identity. All AVX3 targets support the AVX512CD feature, which offers a direct
5845 // vector instruction to compute the leading zero count.
5846 //      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
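     // For example, with a 32 bit lane and x == 0x00000018 (binary ...11000):
     //   (x - 1) & ~x == 0x00000007, CLZ(0x00000007) == 29, so CTZ == 32 - 29 == 3.
     // For x == 0, (x - 1) & ~x == 0xFFFFFFFF and CTZ == 32 - 0 == 32.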
5847 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5848                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5849                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
5850   assert(is_integral_type(bt), "");
5851   // xtmp4 = -1
5852   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
5853   // xtmp4 = src - 1
5854   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
5855   // xtmp4 = (src - 1) & ~src
5856   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
5857   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
5858   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
5859   vpsub(bt, dst, xtmp4, dst, vec_enc);
5860 }
5861 
5862 // Trailing zero count computation for AVX2 targets is based on the popcount operation as per the following equation:
5863 //      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
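     // For example, with a 32 bit lane and x == 0x00000018:
     //   x | -x == 0xFFFFFFF8, POPC(0xFFFFFFF8) == 29, so CTZ == 32 - 29 == 3.
     // For x == 0, x | -x == 0 and CTZ == 32 - 0 == 32.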
5864 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5865                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5866   assert(is_integral_type(bt), "");
5867   // xtmp3 = 0
5868   vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
5869   // xtmp3 = -src
5870   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
5871   // xtmp3 = src | -src
5872   vpor(xtmp3, xtmp3, src, vec_enc);
5873   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
5874   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
5875   vpsub(bt, dst, xtmp1, dst, vec_enc);
5876 }
5877 
5878 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
5879   Label done;
5880   Label neg_divisor_fastpath;
5881   cmpl(divisor, 0);
5882   jccb(Assembler::less, neg_divisor_fastpath);
5883   xorl(rdx, rdx);
5884   divl(divisor);
5885   jmpb(done);
5886   bind(neg_divisor_fastpath);
5887   // Fastpath for divisor < 0:
5888   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
5889   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
5890   movl(rdx, rax);
5891   subl(rdx, divisor);
5892   if (VM_Version::supports_bmi1()) {
5893     andnl(rax, rdx, rax);
5894   } else {
5895     notl(rdx);
5896     andl(rax, rdx);
5897   }
5898   shrl(rax, 31);
5899   bind(done);
5900 }
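     //
     // Illustrative scalar sketch of the negative-divisor fastpath in udivI() above
     // (not part of the generated code; the helper name is ours). When the divisor has
     // its sign bit set, the unsigned quotient can only be 0 or 1, and the bit trick
     // computes it branch free:
     //
     //   static juint udiv_by_neg_divisor(juint dividend, juint divisor) {
     //     assert((jint)divisor < 0, "fastpath requires a divisor >= 2^31");
     //     return (dividend & ~(dividend - divisor)) >> 31; // 1 iff dividend >= divisor (unsigned)
     //   }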
5901 
5902 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
5903   Label done;
5904   Label neg_divisor_fastpath;
5905   cmpl(divisor, 0);
5906   jccb(Assembler::less, neg_divisor_fastpath);
5907   xorl(rdx, rdx);
5908   divl(divisor);
5909   jmpb(done);
5910   bind(neg_divisor_fastpath);
5911   // Fastpath when divisor < 0:
5912   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
5913   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
5914   movl(rdx, rax);
5915   subl(rax, divisor);
5916   if (VM_Version::supports_bmi1()) {
5917     andnl(rax, rax, rdx);
5918   } else {
5919     notl(rax);
5920     andl(rax, rdx);
5921   }
5922   sarl(rax, 31);
5923   andl(rax, divisor);
5924   subl(rdx, rax);
5925   bind(done);
5926 }
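     // Scalar form of the remainder fastpath in umodI() above (illustrative only): with
     // the divisor's sign bit set the quotient is 0 or 1, so
     //   remainder = dividend - (dividend >= divisor (unsigned) ? divisor : 0)
     // which the code computes branch free by sign extending the quotient bit into a mask
     // (arithmetic shift right by 31) and AND-ing that mask with the divisor.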
5927 
5928 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
5929   Label done;
5930   Label neg_divisor_fastpath;
5931 
5932   cmpl(divisor, 0);
5933   jccb(Assembler::less, neg_divisor_fastpath);
5934   xorl(rdx, rdx);
5935   divl(divisor);
5936   jmpb(done);
5937   bind(neg_divisor_fastpath);
5938   // Fastpath for divisor < 0:
5939   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
5940   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
5941   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
5942   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
5943   movl(rdx, rax);
5944   subl(rax, divisor);
5945   if (VM_Version::supports_bmi1()) {
5946     andnl(rax, rax, rdx);
5947   } else {
5948     notl(rax);
5949     andl(rax, rdx);
5950   }
5951   movl(tmp, rax);
5952   shrl(rax, 31); // quotient
5953   sarl(tmp, 31);
5954   andl(tmp, divisor);
5955   subl(rdx, tmp); // remainder
5956   bind(done);
5957 }
5958 
5959 #ifdef _LP64
5960 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
5961                                  XMMRegister xtmp2, Register rtmp) {
5962   if (VM_Version::supports_gfni()) {
5963     // Galois field instruction based bit reversal, following the algorithm described at
5964     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5965     mov64(rtmp, 0x8040201008040201L);
5966     movq(xtmp1, src);
5967     movq(xtmp2, rtmp);
5968     gf2p8affineqb(xtmp1, xtmp2, 0);
5969     movq(dst, xtmp1);
5970   } else {
5971     // Swap even and odd numbered bits.
5972     movl(rtmp, src);
5973     andl(rtmp, 0x55555555);
5974     shll(rtmp, 1);
5975     movl(dst, src);
5976     andl(dst, 0xAAAAAAAA);
5977     shrl(dst, 1);
5978     orl(dst, rtmp);
5979 
5980     // Swap LSB and MSB 2 bits of each nibble.
5981     movl(rtmp, dst);
5982     andl(rtmp, 0x33333333);
5983     shll(rtmp, 2);
5984     andl(dst, 0xCCCCCCCC);
5985     shrl(dst, 2);
5986     orl(dst, rtmp);
5987 
5988     // Swap LSB and MSB 4 bits of each byte.
5989     movl(rtmp, dst);
5990     andl(rtmp, 0x0F0F0F0F);
5991     shll(rtmp, 4);
5992     andl(dst, 0xF0F0F0F0);
5993     shrl(dst, 4);
5994     orl(dst, rtmp);
5995   }
5996   bswapl(dst);
5997 }
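     //
     // Scalar equivalent of the non-GFNI path in reverseI() above (illustrative only;
     // the helper name is ours):
     //
     //   static juint reverse_bits_int(juint x) {
     //     x = ((x & 0x55555555) << 1) | ((x & 0xAAAAAAAA) >> 1); // swap even and odd bits
     //     x = ((x & 0x33333333) << 2) | ((x & 0xCCCCCCCC) >> 2); // swap 2-bit pairs of each nibble
     //     x = ((x & 0x0F0F0F0F) << 4) | ((x & 0xF0F0F0F0) >> 4); // swap nibbles of each byte
     //     return (x << 24) | ((x & 0xFF00) << 8) | ((x >> 8) & 0xFF00) | (x >> 24); // bswapl
     //   }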
5998 
5999 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6000                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
6001   if (VM_Version::supports_gfni()) {
6002     // Galois field instruction based bit reversal, following the algorithm described at
6003     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6004     mov64(rtmp1, 0x8040201008040201L);
6005     movq(xtmp1, src);
6006     movq(xtmp2, rtmp1);
6007     gf2p8affineqb(xtmp1, xtmp2, 0);
6008     movq(dst, xtmp1);
6009   } else {
6010     // Swap even and odd numbered bits.
6011     movq(rtmp1, src);
6012     mov64(rtmp2, 0x5555555555555555L);
6013     andq(rtmp1, rtmp2);
6014     shlq(rtmp1, 1);
6015     movq(dst, src);
6016     notq(rtmp2);
6017     andq(dst, rtmp2);
6018     shrq(dst, 1);
6019     orq(dst, rtmp1);
6020 
6021     // Swap LSB and MSB 2 bits of each nibble.
6022     movq(rtmp1, dst);
6023     mov64(rtmp2, 0x3333333333333333L);
6024     andq(rtmp1, rtmp2);
6025     shlq(rtmp1, 2);
6026     notq(rtmp2);
6027     andq(dst, rtmp2);
6028     shrq(dst, 2);
6029     orq(dst, rtmp1);
6030 
6031     // Swap LSB and MSB 4 bits of each byte.
6032     movq(rtmp1, dst);
6033     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6034     andq(rtmp1, rtmp2);
6035     shlq(rtmp1, 4);
6036     notq(rtmp2);
6037     andq(dst, rtmp2);
6038     shrq(dst, 4);
6039     orq(dst, rtmp1);
6040   }
6041   bswapq(dst);
6042 }
6043 
6044 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6045   Label done;
6046   Label neg_divisor_fastpath;
6047   cmpq(divisor, 0);
6048   jccb(Assembler::less, neg_divisor_fastpath);
6049   xorl(rdx, rdx);
6050   divq(divisor);
6051   jmpb(done);
6052   bind(neg_divisor_fastpath);
6053   // Fastpath for divisor < 0:
6054   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6055   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6056   movq(rdx, rax);
6057   subq(rdx, divisor);
6058   if (VM_Version::supports_bmi1()) {
6059     andnq(rax, rdx, rax);
6060   } else {
6061     notq(rdx);
6062     andq(rax, rdx);
6063   }
6064   shrq(rax, 63);
6065   bind(done);
6066 }
6067 
6068 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6069   Label done;
6070   Label neg_divisor_fastpath;
6071   cmpq(divisor, 0);
6072   jccb(Assembler::less, neg_divisor_fastpath);
6073   xorq(rdx, rdx);
6074   divq(divisor);
6075   jmp(done);
6076   bind(neg_divisor_fastpath);
6077   // Fastpath when divisor < 0:
6078   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6079   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6080   movq(rdx, rax);
6081   subq(rax, divisor);
6082   if (VM_Version::supports_bmi1()) {
6083     andnq(rax, rax, rdx);
6084   } else {
6085     notq(rax);
6086     andq(rax, rdx);
6087   }
6088   sarq(rax, 63);
6089   andq(rax, divisor);
6090   subq(rdx, rax);
6091   bind(done);
6092 }
6093 
6094 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6095   Label done;
6096   Label neg_divisor_fastpath;
6097   cmpq(divisor, 0);
6098   jccb(Assembler::less, neg_divisor_fastpath);
6099   xorq(rdx, rdx);
6100   divq(divisor);
6101   jmp(done);
6102   bind(neg_divisor_fastpath);
6103   // Fastpath for divisor < 0:
6104   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6105   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6106   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6107   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6108   movq(rdx, rax);
6109   subq(rax, divisor);
6110   if (VM_Version::supports_bmi1()) {
6111     andnq(rax, rax, rdx);
6112   } else {
6113     notq(rax);
6114     andq(rax, rdx);
6115   }
6116   movq(tmp, rax);
6117   shrq(rax, 63); // quotient
6118   sarq(tmp, 63);
6119   andq(tmp, divisor);
6120   subq(rdx, tmp); // remainder
6121   bind(done);
6122 }
6123 #endif
6124 
6125 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6126                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6127                                         int vlen_enc) {
6128   assert(VM_Version::supports_avx512bw(), "");
6129   // Byte shuffles are in-lane operations and the indices are determined using the
6130   // lower 4 bits of each shuffle lane, so all shuffle indices are effectively
6131   // normalized to the index range 0-15. This ensures that indices which differ only
6132   // by a multiple of 16 select the same relative position within a 128 bit lane,
6133   // i.e. shuffle indices 0, 16, 32 and 48 all select the first element of their
6134   // respective 128 bit lanes.
6135   movl(rtmp, 16);
6136   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6137 
6138   // Compute a mask for the shuffle vector by comparing the indices with the expression
6139   // INDEX < 16. Broadcast the first 128 bit lane across the entire vector, shuffle it
6140   // using the original shuffle indices, and move the shuffled bytes corresponding to a
6141   // true mask into the destination vector.
6142   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6143   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6144   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6145 
6146   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6147   // and broadcasting second 128 bit lane.
6148   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6149   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6150   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6151   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6152   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6153 
6154   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6155   // and broadcasting third 128 bit lane.
6156   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6157   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6158   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6159   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6160   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6161 
6162   // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
6163   // and broadcasting the fourth 128 bit lane.
6164   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6165   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6166   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6167   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6168   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6169 }
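     //
     // Functionally, the sequence in rearrange_bytes() above implements the following
     // per-byte selection over a 512 bit vector of 64 bytes, assuming shuffle indices in
     // the range 0..63 (illustrative only):
     //
     //   for (int i = 0; i < 64; i++) {
     //     dst[i] = src[shuffle[i]];   // == broadcast lane (shuffle[i] / 16), byte (shuffle[i] % 16)
     //   }
     //
     // Each of the four passes broadcasts one 128 bit source lane and merges only the
     // destination bytes whose shuffle index falls into that lane's index range.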
6170 
6171 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6172                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6173   if (vlen_enc == AVX_128bit) {
6174     vpermilps(dst, src, shuffle, vlen_enc);
6175   } else if (bt == T_INT) {
6176     vpermd(dst, shuffle, src, vlen_enc);
6177   } else {
6178     assert(bt == T_FLOAT, "");
6179     vpermps(dst, shuffle, src, vlen_enc);
6180   }
6181 }