1 /*
   2  * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "gc/shared/barrierSet.hpp"
  29 #include "gc/shared/barrierSetAssembler.hpp"
  30 #include "oops/methodData.hpp"
  31 #include "opto/c2_MacroAssembler.hpp"
  32 #include "opto/intrinsicnode.hpp"
  33 #include "opto/output.hpp"
  34 #include "opto/opcodes.hpp"
  35 #include "opto/subnode.hpp"
  36 #include "runtime/objectMonitor.hpp"
  37 #include "runtime/stubRoutines.hpp"
  38 
  39 #ifdef PRODUCT
  40 #define BLOCK_COMMENT(str) /* nothing */
  41 #define STOP(error) stop(error)
  42 #else
  43 #define BLOCK_COMMENT(str) block_comment(str)
  44 #define STOP(error) block_comment(error); stop(error)
  45 #endif
  46 
  47 // C2 compiled method's prolog code.
  48 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
  49 
  50   // WARNING: Initial instruction MUST be 5 bytes or longer so that
  51   // NativeJump::patch_verified_entry will be able to patch out the entry
  52   // code safely. The push to verify stack depth is ok at 5 bytes,
  53   // the frame allocation can be either 3 or 6 bytes. So if we don't do
  54   // stack bang then we must use the 6 byte frame allocation even if
  55   // we have no frame. :-(
  56   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  57 
  58   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  59   // Remove word for return addr
  60   framesize -= wordSize;
  61   stack_bang_size -= wordSize;
  62 
  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers bang for them.  Be careful, though: some VM
  // calls (such as call site linkage) can use several kilobytes of stack, but
  // the stack safety zone should account for that.
  67   // See bugs 4446381, 4468289, 4497237.
  68   if (stack_bang_size > 0) {
  69     generate_stack_overflow_check(stack_bang_size);
  70 
    // We always push rbp so that on return to the interpreter rbp will be
    // restored correctly and we can correct the stack.
  73     push(rbp);
  74     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  75     if (PreserveFramePointer) {
  76       mov(rbp, rsp);
  77     }
  78     // Remove word for ebp
  79     framesize -= wordSize;
  80 
  81     // Create frame
  82     if (framesize) {
  83       subptr(rsp, framesize);
  84     }
  85   } else {
  86     // Create frame (force generation of a 4 byte immediate value)
  87     subptr_imm32(rsp, framesize);
  88 
  89     // Save RBP register now.
  90     framesize -= wordSize;
  91     movptr(Address(rsp, framesize), rbp);
  92     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  93     if (PreserveFramePointer) {
  94       movptr(rbp, rsp);
  95       if (framesize > 0) {
  96         addptr(rbp, framesize);
  97       }
  98     }
  99   }
 100 
 101   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
 102     framesize -= wordSize;
 103     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 104   }
 105 
 106 #ifndef _LP64
 107   // If method sets FPU control word do it now
 108   if (fp_mode_24b) {
 109     fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
 110   }
 111   if (UseSSE >= 2 && VerifyFPU) {
 112     verify_FPU(0, "FPU stack must be clean on entry");
 113   }
 114 #endif
 115 
 116 #ifdef ASSERT
 117   if (VerifyStackAtCalls) {
 118     Label L;
 119     push(rax);
 120     mov(rax, rsp);
 121     andptr(rax, StackAlignmentInBytes-1);
 122     cmpptr(rax, StackAlignmentInBytes-wordSize);
 123     pop(rax);
 124     jcc(Assembler::equal, L);
 125     STOP("Stack is not properly aligned!");
 126     bind(L);
 127   }
 128 #endif
 129 
 130   if (!is_stub) {
 131     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
#ifdef _LP64
 133     if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
 134       // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 135       Label dummy_slow_path;
 136       Label dummy_continuation;
 137       Label* slow_path = &dummy_slow_path;
 138       Label* continuation = &dummy_continuation;
 139       if (!Compile::current()->output()->in_scratch_emit_size()) {
 140         // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 141         C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
 142         Compile::current()->output()->add_stub(stub);
 143         slow_path = &stub->entry();
 144         continuation = &stub->continuation();
 145       }
 146       bs->nmethod_entry_barrier(this, slow_path, continuation);
 147     }
 148 #else
 149     // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
 150     bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
 151 #endif
 152   }
 153 }
 154 
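// Map a vector length in bytes to the AVX vector-length encoding expected by the
// assembler.  Vectors of 4 and 8 bytes are emitted with the 128-bit encoding.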
 155 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 156   switch (vlen_in_bytes) {
 157     case  4: // fall-through
 158     case  8: // fall-through
 159     case 16: return Assembler::AVX_128bit;
 160     case 32: return Assembler::AVX_256bit;
 161     case 64: return Assembler::AVX_512bit;
 162 
 163     default: {
 164       ShouldNotReachHere();
 165       return Assembler::AVX_NoVec;
 166     }
 167   }
 168 }
 169 
 170 #if INCLUDE_RTM_OPT
 171 
 172 // Update rtm_counters based on abort status
 173 // input: abort_status
 174 //        rtm_counters (RTMLockingCounters*)
 175 // flags are killed
 176 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
 177 
 178   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
 179   if (PrintPreciseRTMLockingStatistics) {
 180     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
 181       Label check_abort;
 182       testl(abort_status, (1<<i));
 183       jccb(Assembler::equal, check_abort);
 184       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
 185       bind(check_abort);
 186     }
 187   }
 188 }
 189 
 190 // Branch if (random & (count-1) != 0), count is 2^n
 191 // tmp, scr and flags are killed
 192 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
 193   assert(tmp == rax, "");
 194   assert(scr == rdx, "");
 195   rdtsc(); // modifies EDX:EAX
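  // The low-order bits of the time-stamp counter act as a cheap pseudo-random
  // value, so on average only one in every 'count' executions falls through
  // (the rest branch to brLabel).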
 196   andptr(tmp, count-1);
 197   jccb(Assembler::notZero, brLabel);
 198 }
 199 
 200 // Perform abort ratio calculation, set no_rtm bit if high ratio
 201 // input:  rtm_counters_Reg (RTMLockingCounters* address)
 202 // tmpReg, rtm_counters_Reg and flags are killed
 203 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
 204                                                     Register rtm_counters_Reg,
 205                                                     RTMLockingCounters* rtm_counters,
 206                                                     Metadata* method_data) {
 207   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
 208 
 209   if (RTMLockingCalculationDelay > 0) {
 210     // Delay calculation
 211     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()));
 212     testptr(tmpReg, tmpReg);
 213     jccb(Assembler::equal, L_done);
 214   }
 215   // Abort ratio calculation only if abort_count > RTMAbortThreshold
 216   //   Aborted transactions = abort_count * 100
 217   //   All transactions = total_count *  RTMTotalCountIncrRate
 218   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
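  //   e.g. with RTMAbortRatio == 50 the no_rtm bit is set once aborted
  //   transactions account for at least half of all transactions.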
 219 
 220   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
 221   cmpptr(tmpReg, RTMAbortThreshold);
 222   jccb(Assembler::below, L_check_always_rtm2);
 223   imulptr(tmpReg, tmpReg, 100);
 224 
 225   Register scrReg = rtm_counters_Reg;
 226   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 227   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
 228   imulptr(scrReg, scrReg, RTMAbortRatio);
 229   cmpptr(tmpReg, scrReg);
 230   jccb(Assembler::below, L_check_always_rtm1);
 231   if (method_data != nullptr) {
 232     // set rtm_state to "no rtm" in MDO
 233     mov_metadata(tmpReg, method_data);
 234     lock();
 235     orl(Address(tmpReg, MethodData::rtm_state_offset()), NoRTM);
 236   }
 237   jmpb(L_done);
 238   bind(L_check_always_rtm1);
 239   // Reload RTMLockingCounters* address
 240   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 241   bind(L_check_always_rtm2);
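  // "Always rtm" is chosen once total_count * RTMTotalCountIncrRate, i.e. the
  // estimated number of transactions, reaches RTMLockingThreshold.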
 242   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 243   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
 244   jccb(Assembler::below, L_done);
 245   if (method_data != nullptr) {
 246     // set rtm_state to "always rtm" in MDO
 247     mov_metadata(tmpReg, method_data);
 248     lock();
 249     orl(Address(tmpReg, MethodData::rtm_state_offset()), UseRTM);
 250   }
 251   bind(L_done);
 252 }
 253 
 254 // Update counters and perform abort ratio calculation
 255 // input:  abort_status_Reg
 256 // rtm_counters_Reg, flags are killed
 257 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
 258                                       Register rtm_counters_Reg,
 259                                       RTMLockingCounters* rtm_counters,
 260                                       Metadata* method_data,
 261                                       bool profile_rtm) {
 262 
 263   assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 264   // update rtm counters based on rax value at abort
 265   // reads abort_status_Reg, updates flags
 266   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 267   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
 268   if (profile_rtm) {
 269     // Save abort status because abort_status_Reg is used by following code.
 270     if (RTMRetryCount > 0) {
 271       push(abort_status_Reg);
 272     }
 273     assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 274     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
 275     // restore abort status
 276     if (RTMRetryCount > 0) {
 277       pop(abort_status_Reg);
 278     }
 279   }
 280 }
 281 
 282 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
 283 // inputs: retry_count_Reg
 284 //       : abort_status_Reg
 285 // output: retry_count_Reg decremented by 1
 286 // flags are killed
 287 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
 288   Label doneRetry;
 289   assert(abort_status_Reg == rax, "");
 290   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
 291   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
 292   // if reason is in 0x6 and retry count != 0 then retry
 293   andptr(abort_status_Reg, 0x6);
 294   jccb(Assembler::zero, doneRetry);
 295   testl(retry_count_Reg, retry_count_Reg);
 296   jccb(Assembler::zero, doneRetry);
 297   pause();
 298   decrementl(retry_count_Reg);
 299   jmp(retryLabel);
 300   bind(doneRetry);
 301 }
 302 
 303 // Spin and retry if lock is busy,
 304 // inputs: box_Reg (monitor address)
 305 //       : retry_count_Reg
 306 // output: retry_count_Reg decremented by 1
 307 //       : clear z flag if retry count exceeded
 308 // tmp_Reg, scr_Reg, flags are killed
 309 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
 310                                                Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
 311   Label SpinLoop, SpinExit, doneRetry;
 312   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 313 
 314   testl(retry_count_Reg, retry_count_Reg);
 315   jccb(Assembler::zero, doneRetry);
 316   decrementl(retry_count_Reg);
 317   movptr(scr_Reg, RTMSpinLoopCount);
 318 
 319   bind(SpinLoop);
 320   pause();
 321   decrementl(scr_Reg);
 322   jccb(Assembler::lessEqual, SpinExit);
 323   movptr(tmp_Reg, Address(box_Reg, owner_offset));
 324   testptr(tmp_Reg, tmp_Reg);
 325   jccb(Assembler::notZero, SpinLoop);
 326 
 327   bind(SpinExit);
 328   jmp(retryLabel);
 329   bind(doneRetry);
 330   incrementl(retry_count_Reg); // clear z flag
 331 }
 332 
 333 // Use RTM for normal stack locks
 334 // Input: objReg (object to lock)
 335 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
 336                                          Register retry_on_abort_count_Reg,
 337                                          RTMLockingCounters* stack_rtm_counters,
 338                                          Metadata* method_data, bool profile_rtm,
 339                                          Label& DONE_LABEL, Label& IsInflated) {
 340   assert(UseRTMForStackLocks, "why call this otherwise?");
 341   assert(tmpReg == rax, "");
 342   assert(scrReg == rdx, "");
 343   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 344 
 345   if (RTMRetryCount > 0) {
 346     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 347     bind(L_rtm_retry);
 348   }
 349   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 350   testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral
 351   jcc(Assembler::notZero, IsInflated);
 352 
 353   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 354     Label L_noincrement;
 355     if (RTMTotalCountIncrRate > 1) {
 356       // tmpReg, scrReg and flags are killed
 357       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 358     }
 359     assert(stack_rtm_counters != nullptr, "should not be null when profiling RTM");
 360     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
 361     bind(L_noincrement);
 362   }
 363   xbegin(L_on_abort);
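  // xbegin falls through when the transaction starts successfully; on an abort,
  // execution resumes at L_on_abort with the abort status in EAX.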
 364   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
 365   andptr(tmpReg, markWord::lock_mask_in_place);     // look at 2 lock bits
 366   cmpptr(tmpReg, markWord::unlocked_value);         // bits = 01 unlocked
 367   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
 368 
 369   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 370   if (UseRTMXendForLockBusy) {
 371     xend();
 372     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
 373     jmp(L_decrement_retry);
 374   }
 375   else {
 376     xabort(0);
 377   }
 378   bind(L_on_abort);
 379   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 380     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
 381   }
 382   bind(L_decrement_retry);
 383   if (RTMRetryCount > 0) {
 384     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 385     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 386   }
 387 }
 388 
 389 // Use RTM for inflating locks
 390 // inputs: objReg (object to lock)
 391 //         boxReg (on-stack box address (displaced header location) - KILLED)
 392 //         tmpReg (ObjectMonitor address + markWord::monitor_value)
 393 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
 394                                             Register scrReg, Register retry_on_busy_count_Reg,
 395                                             Register retry_on_abort_count_Reg,
 396                                             RTMLockingCounters* rtm_counters,
 397                                             Metadata* method_data, bool profile_rtm,
 398                                             Label& DONE_LABEL) {
 399   assert(UseRTMLocking, "why call this otherwise?");
 400   assert(tmpReg == rax, "");
 401   assert(scrReg == rdx, "");
 402   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 403   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 404 
 405   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 406   movptr(boxReg, tmpReg); // Save ObjectMonitor address
 407 
 408   if (RTMRetryCount > 0) {
 409     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
 410     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 411     bind(L_rtm_retry);
 412   }
 413   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 414     Label L_noincrement;
 415     if (RTMTotalCountIncrRate > 1) {
 416       // tmpReg, scrReg and flags are killed
 417       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 418     }
 419     assert(rtm_counters != nullptr, "should not be null when profiling RTM");
 420     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
 421     bind(L_noincrement);
 422   }
 423   xbegin(L_on_abort);
 424   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 425   movptr(tmpReg, Address(tmpReg, owner_offset));
 426   testptr(tmpReg, tmpReg);
 427   jcc(Assembler::zero, DONE_LABEL);
 428   if (UseRTMXendForLockBusy) {
 429     xend();
 430     jmp(L_decrement_retry);
 431   }
 432   else {
 433     xabort(0);
 434   }
 435   bind(L_on_abort);
 436   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 437   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 438     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
 439   }
 440   if (RTMRetryCount > 0) {
 441     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 442     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 443   }
 444 
  movptr(tmpReg, Address(boxReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jccb(Assembler::notZero, L_decrement_retry);
 448 
 449   // Appears unlocked - try to swing _owner from null to non-null.
 450   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 451 #ifdef _LP64
 452   Register threadReg = r15_thread;
 453 #else
 454   get_thread(scrReg);
 455   Register threadReg = scrReg;
 456 #endif
 457   lock();
 458   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
 459 
 460   if (RTMRetryCount > 0) {
    // On success we are done, otherwise retry.
 462     jccb(Assembler::equal, DONE_LABEL) ;
 463     bind(L_decrement_retry);
 464     // Spin and retry if lock is busy.
 465     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
 466   }
 467   else {
 468     bind(L_decrement_retry);
 469   }
 470 }
 471 
 472 #endif //  INCLUDE_RTM_OPT
 473 
 474 // fast_lock and fast_unlock used by C2
 475 
 476 // Because the transitions from emitted code to the runtime
 477 // monitorenter/exit helper stubs are so slow it's critical that
 478 // we inline both the stack-locking fast path and the inflated fast path.
 479 //
 480 // See also: cmpFastLock and cmpFastUnlock.
 481 //
 482 // What follows is a specialized inline transliteration of the code
 483 // in enter() and exit(). If we're concerned about I$ bloat another
 484 // option would be to emit TrySlowEnter and TrySlowExit methods
 485 // at startup-time.  These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 487 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 488 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 489 // In practice, however, the # of lock sites is bounded and is usually small.
 490 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
 493 // sites.
 494 //
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods.  That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
 500 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 501 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 502 // (b) explicit barriers or fence operations.
 503 //
 504 // TODO:
 505 //
 506 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 507 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 508 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 509 //    the lock operators would typically be faster than reifying Self.
 510 //
 511 // *  Ideally I'd define the primitives as:
 512 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 513 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 514 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
 516 //    Furthermore the register assignments are overconstrained, possibly resulting in
 517 //    sub-optimal code near the synchronization site.
 518 //
 519 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 520 //    Alternately, use a better sp-proximity test.
 521 //
 522 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 523 //    Either one is sufficient to uniquely identify a thread.
 524 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 525 //
 526 // *  Intrinsify notify() and notifyAll() for the common cases where the
 527 //    object is locked by the calling thread but the waitlist is empty.
//    This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 529 //
 530 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 531 //    But beware of excessive branch density on AMD Opterons.
 532 //
 533 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 534 //    or failure of the fast path.  If the fast path fails then we pass
 535 //    control to the slow path, typically in C.  In fast_lock and
 536 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 537 //    will emit a conditional branch immediately after the node.
 538 //    So we have branches to branches and lots of ICC.ZF games.
 539 //    Instead, it might be better to have C2 pass a "FailureLabel"
 540 //    into fast_lock and fast_unlock.  In the case of success, control
 541 //    will drop through the node.  ICC.ZF is undefined at exit.
 542 //    In the case of failure, the node will branch directly to the
 543 //    FailureLabel
 544 
 545 
 546 // obj: object to lock
 547 // box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
 549 // scr: tmp -- KILLED
 550 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 551                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
 552                                  RTMLockingCounters* rtm_counters,
 553                                  RTMLockingCounters* stack_rtm_counters,
 554                                  Metadata* method_data,
 555                                  bool use_rtm, bool profile_rtm) {
 556   // Ensure the register assignments are disjoint
 557   assert(tmpReg == rax, "");
 558 
 559   if (use_rtm) {
 560     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
 561   } else {
 562     assert(cx1Reg == noreg, "");
 563     assert(cx2Reg == noreg, "");
 564     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 565   }
 566 
 567   // Possible cases that we'll encounter in fast_lock
 568   // ------------------------------------------------
 569   // * Inflated
 570   //    -- unlocked
 571   //    -- Locked
 572   //       = by self
 573   //       = by other
 574   // * neutral
 575   // * stack-locked
 576   //    -- by self
 577   //       = sp-proximity test hits
 578   //       = sp-proximity test generates false-negative
 579   //    -- by other
 580   //
 581 
 582   Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;
 583 
 584   if (DiagnoseSyncOnValueBasedClasses != 0) {
 585     load_klass(tmpReg, objReg, scrReg);
 586     movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
 587     testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
 588     jcc(Assembler::notZero, DONE_LABEL);
 589   }
 590 
 591 #if INCLUDE_RTM_OPT
 592   if (UseRTMForStackLocks && use_rtm) {
 593     assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
 594     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
 595                       stack_rtm_counters, method_data, profile_rtm,
 596                       DONE_LABEL, IsInflated);
 597   }
 598 #endif // INCLUDE_RTM_OPT
 599 
 600   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 601   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 602   jcc(Assembler::notZero, IsInflated);
 603 
 604   if (LockingMode == LM_MONITOR) {
 605     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 606     testptr(objReg, objReg);
 607   } else if (LockingMode == LM_LEGACY) {
 608     // Attempt stack-locking ...
 609     orptr (tmpReg, markWord::unlocked_value);
 610     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 611     lock();
 612     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 613     jcc(Assembler::equal, COUNT);           // Success
 614 
 615     // Recursive locking.
 616     // The object is stack-locked: markword contains stack pointer to BasicLock.
 617     // Locked by current thread if difference with current SP is less than one page.
 618     subptr(tmpReg, rsp);
    // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
 620     andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
 621     movptr(Address(boxReg, 0), tmpReg);
 622   } else {
 623     assert(LockingMode == LM_LIGHTWEIGHT, "");
 624     fast_lock_impl(objReg, tmpReg, thread, scrReg, NO_COUNT);
 625     jmp(COUNT);
 626   }
 627   jmp(DONE_LABEL);
 628 
 629   bind(IsInflated);
 630   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
 631 
 632 #if INCLUDE_RTM_OPT
 633   // Use the same RTM locking code in 32- and 64-bit VM.
 634   if (use_rtm) {
 635     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
 636                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
 637   } else {
 638 #endif // INCLUDE_RTM_OPT
 639 
 640 #ifndef _LP64
 641   // The object is inflated.
 642 
 643   // boxReg refers to the on-stack BasicLock in the current frame.
 644   // We'd like to write:
 645   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
  // This is convenient but results in an ST-before-CAS penalty.  The following CAS suffers
 647   // additional latency as we have another ST in the store buffer that must drain.
 648 
 649   // avoid ST-before-CAS
 650   // register juggle because we need tmpReg for cmpxchgptr below
 651   movptr(scrReg, boxReg);
 652   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 653 
 654   // Optimistic form: consider XORL tmpReg,tmpReg
 655   movptr(tmpReg, NULL_WORD);
 656 
 657   // Appears unlocked - try to swing _owner from null to non-null.
 658   // Ideally, I'd manifest "Self" with get_thread and then attempt
 659   // to CAS the register containing Self into m->Owner.
 660   // But we don't have enough registers, so instead we can either try to CAS
 661   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 662   // we later store "Self" into m->Owner.  Transiently storing a stack address
 663   // (rsp or the address of the box) into  m->owner is harmless.
 664   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 665   lock();
 666   cmpxchgptr(thread, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 667   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 668 
 669   // If the CAS fails we can either retry or pass control to the slow path.
 670   // We use the latter tactic.
 671   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 672   // If the CAS was successful ...
 673   //   Self has acquired the lock
 674   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 675   // Intentional fall-through into DONE_LABEL ...
 676 #else // _LP64
 677   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 678   movq(scrReg, tmpReg);
 679   xorq(tmpReg, tmpReg);
 680   lock();
 681   cmpxchgptr(thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 682   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 683   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 684   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 685   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 686   jccb(Assembler::equal, COUNT);          // CAS above succeeded; propagate ZF = 1 (success)
 687 
 688   cmpptr(thread, rax);                // Check if we are already the owner (recursive lock)
 689   jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
 690   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 691   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 692 #endif // _LP64
 693 #if INCLUDE_RTM_OPT
 694   } // use_rtm()
 695 #endif
 696   bind(DONE_LABEL);
 697 
 698   // ZFlag == 1 count in fast path
 699   // ZFlag == 0 count in slow path
 700   jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0
 701 
 702   bind(COUNT);
 703   // Count monitors in fast path
 704   increment(Address(thread, JavaThread::held_monitor_count_offset()));
 705 
 706   xorl(tmpReg, tmpReg); // Set ZF == 1
 707 
 708   bind(NO_COUNT);
 709 
 710   // At NO_COUNT the icc ZFlag is set as follows ...
 711   // fast_unlock uses the same protocol.
 712   // ZFlag == 1 -> Success
 713   // ZFlag == 0 -> Failure - force control through the slow path
 714 }
 715 
 716 // obj: object to unlock
 717 // box: box address (displaced header location), killed.  Must be EAX.
 718 // tmp: killed, cannot be obj nor box.
 719 //
 720 // Some commentary on balanced locking:
 721 //
 722 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 723 // Methods that don't have provably balanced locking are forced to run in the
 724 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 725 // The interpreter provides two properties:
 726 // I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired in the current activation (frame).  Recall that the
 728 //      interpreter maintains an on-stack list of locks currently held by
 729 //      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame, the interpreter throws IMSX (IllegalMonitorStateException).
 732 //
 733 // Lets say A(), which has provably balanced locking, acquires O and then calls B().
 734 // B() doesn't have provably balanced locking so it runs in the interpreter.
 735 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 736 // is still locked by A().
 737 //
 738 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 739 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 740 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 741 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking the owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
 745 // A perfectly viable alternative is to elide the owner check except when
 746 // Xcheck:jni is enabled.
 747 
 748 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
 749   assert(boxReg == rax, "");
 750   assert_different_registers(objReg, boxReg, tmpReg);
 751 
 752   Label DONE_LABEL, Stacked, COUNT, NO_COUNT;
 753 
 754 #if INCLUDE_RTM_OPT
 755   if (UseRTMForStackLocks && use_rtm) {
 756     assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
 757     Label L_regular_unlock;
 758     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
 759     andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
 760     cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
 761     jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
 762     xend();                                                           // otherwise end...
 763     jmp(DONE_LABEL);                                                  // ... and we're done
 764     bind(L_regular_unlock);
 765   }
 766 #endif
 767 
 768   if (LockingMode == LM_LEGACY) {
 769     cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
 770     jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
 771   }
 772   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
 773   if (LockingMode != LM_MONITOR) {
 774     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 775     jcc(Assembler::zero, Stacked);
 776   }
 777 
 778   // It's inflated.
 779   if (LockingMode == LM_LIGHTWEIGHT) {
    // If the owner is ANONYMOUS, we need to fix it - in an out-of-line stub.
 781     testb(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t) ObjectMonitor::ANONYMOUS_OWNER);
 782 #ifdef _LP64
 783     if (!Compile::current()->output()->in_scratch_emit_size()) {
 784       C2HandleAnonOMOwnerStub* stub = new (Compile::current()->comp_arena()) C2HandleAnonOMOwnerStub(tmpReg, boxReg);
 785       Compile::current()->output()->add_stub(stub);
 786       jcc(Assembler::notEqual, stub->entry());
 787       bind(stub->continuation());
 788     } else
 789 #endif
 790     {
 791       // We can't easily implement this optimization on 32 bit because we don't have a thread register.
 792       // Call the slow-path instead.
 793       jcc(Assembler::notEqual, NO_COUNT);
 794     }
 795   }
 796 
 797 #if INCLUDE_RTM_OPT
 798   if (use_rtm) {
 799     Label L_regular_inflated_unlock;
 800     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 801     movptr(boxReg, Address(tmpReg, owner_offset));
 802     testptr(boxReg, boxReg);
 803     jccb(Assembler::notZero, L_regular_inflated_unlock);
 804     xend();
 805     jmp(DONE_LABEL);
 806     bind(L_regular_inflated_unlock);
 807   }
 808 #endif
 809 
 810   // Despite our balanced locking property we still check that m->_owner == Self
 811   // as java routines or native JNI code called by this thread might
 812   // have released the lock.
 813   // Refer to the comments in synchronizer.cpp for how we might encode extra
 814   // state in _succ so we can avoid fetching EntryList|cxq.
 815   //
 816   // If there's no contention try a 1-0 exit.  That is, exit without
 817   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 818   // we detect and recover from the race that the 1-0 exit admits.
 819   //
 820   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 821   // before it STs null into _owner, releasing the lock.  Updates
 822   // to data protected by the critical section must be visible before
 823   // we drop the lock (and thus before any other thread could acquire
 824   // the lock and observe the fields protected by the lock).
 825   // IA32's memory-model is SPO, so STs are ordered with respect to
 826   // each other and there's no need for an explicit barrier (fence).
 827   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
 828 #ifndef _LP64
 829   // Note that we could employ various encoding schemes to reduce
 830   // the number of loads below (currently 4) to just 2 or 3.
 831   // Refer to the comments in synchronizer.cpp.
 832   // In practice the chain of fetches doesn't seem to impact performance, however.
 833   xorptr(boxReg, boxReg);
 834   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 835   jccb  (Assembler::notZero, DONE_LABEL);
 836   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 837   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 838   jccb  (Assembler::notZero, DONE_LABEL);
 839   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 840   jmpb  (DONE_LABEL);
 841 #else // _LP64
 842   // It's inflated
 843   Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath;
 844 
 845   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 846   jccb(Assembler::equal, LNotRecursive);
 847 
 848   // Recursive inflated unlock
 849   decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 850   jmpb(LSuccess);
 851 
 852   bind(LNotRecursive);
 853   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 854   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 855   jccb  (Assembler::notZero, CheckSucc);
 856   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 857   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 858   jmpb  (DONE_LABEL);
 859 
 860   // Try to avoid passing control into the slow_path ...
 861   bind  (CheckSucc);
 862 
 863   // The following optional optimization can be elided if necessary
 864   // Effectively: if (succ == null) goto slow path
 865   // The code reduces the window for a race, however,
 866   // and thus benefits performance.
 867   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 868   jccb  (Assembler::zero, LGoSlowPath);
 869 
 870   xorptr(boxReg, boxReg);
 871   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 872   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 873 
 874   // Memory barrier/fence
 875   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
 876   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
 877   // This is faster on Nehalem and AMD Shanghai/Barcelona.
 878   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
 879   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
 880   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
 881   lock(); addl(Address(rsp, 0), 0);
 882 
 883   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 884   jccb  (Assembler::notZero, LSuccess);
 885 
 886   // Rare inopportune interleaving - race.
 887   // The successor vanished in the small window above.
 888   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
 889   // We need to ensure progress and succession.
 890   // Try to reacquire the lock.
 891   // If that fails then the new owner is responsible for succession and this
 892   // thread needs to take no further action and can exit via the fast path (success).
 893   // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generate more
  // coherence traffic on the lock *and* artificially extend the critical section
  // length by virtue of passing control into the slow path.
 897 
 898   // box is really RAX -- the following CMPXCHG depends on that binding
 899   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
 900   lock();
 901   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 902   // There's no successor so we tried to regrab the lock.
 903   // If that didn't work, then another thread grabbed the
 904   // lock so we're done (and exit was a success).
 905   jccb  (Assembler::notEqual, LSuccess);
 906   // Intentional fall-through into slow path
 907 
 908   bind  (LGoSlowPath);
 909   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 910   jmpb  (DONE_LABEL);
 911 
 912   bind  (LSuccess);
 913   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 914   jmpb  (DONE_LABEL);
 915 
 916 #endif
 917   if (LockingMode != LM_MONITOR) {
 918     bind  (Stacked);
 919     if (LockingMode == LM_LIGHTWEIGHT) {
 920       mov(boxReg, tmpReg);
 921       fast_unlock_impl(objReg, boxReg, tmpReg, NO_COUNT);
 922       jmp(COUNT);
 923     } else if (LockingMode == LM_LEGACY) {
 924       movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 925       lock();
 926       cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 927     }
 928     // Intentional fall-thru into DONE_LABEL
 929   }
 930   bind(DONE_LABEL);
 931 
 932   // ZFlag == 1 count in fast path
 933   // ZFlag == 0 count in slow path
 934   jccb(Assembler::notZero, NO_COUNT);
 935 
 936   bind(COUNT);
 937   // Count monitors in fast path
 938 #ifndef _LP64
 939   get_thread(tmpReg);
 940   decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
 941 #else // _LP64
 942   decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 943 #endif
 944 
 945   xorl(tmpReg, tmpReg); // Set ZF == 1
 946 
 947   bind(NO_COUNT);
 948 }
 949 
 950 //-------------------------------------------------------------------------------------------
 951 // Generic instructions support for use in .ad files C2 code generation
 952 
 953 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 954   if (dst != src) {
 955     movdqu(dst, src);
 956   }
 957   if (opcode == Op_AbsVD) {
 958     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 959   } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
 961     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 962   }
 963 }
 964 
 965 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 966   if (opcode == Op_AbsVD) {
 967     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
 968   } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
 970     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
 971   }
 972 }
 973 
 974 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
 975   if (dst != src) {
 976     movdqu(dst, src);
 977   }
 978   if (opcode == Op_AbsVF) {
 979     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
 980   } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
 982     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
 983   }
 984 }
 985 
 986 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 987   if (opcode == Op_AbsVF) {
 988     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
 989   } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
 991     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
 992   }
 993 }
 994 
 995 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
 996   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 997   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
 998 
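  // There is no packed 64-bit signed min/max below AVX-512, so the T_LONG case
  // is emulated with a signed compare (pcmpgtq) leaving its mask in xmm0 and a
  // variable blend (blendvpd) that selects lanes based on that mask.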
 999   if (opcode == Op_MinV) {
1000     if (elem_bt == T_BYTE) {
1001       pminsb(dst, src);
1002     } else if (elem_bt == T_SHORT) {
1003       pminsw(dst, src);
1004     } else if (elem_bt == T_INT) {
1005       pminsd(dst, src);
1006     } else {
1007       assert(elem_bt == T_LONG, "required");
1008       assert(tmp == xmm0, "required");
1009       assert_different_registers(dst, src, tmp);
1010       movdqu(xmm0, dst);
1011       pcmpgtq(xmm0, src);
1012       blendvpd(dst, src);  // xmm0 as mask
1013     }
1014   } else { // opcode == Op_MaxV
1015     if (elem_bt == T_BYTE) {
1016       pmaxsb(dst, src);
1017     } else if (elem_bt == T_SHORT) {
1018       pmaxsw(dst, src);
1019     } else if (elem_bt == T_INT) {
1020       pmaxsd(dst, src);
1021     } else {
1022       assert(elem_bt == T_LONG, "required");
1023       assert(tmp == xmm0, "required");
1024       assert_different_registers(dst, src, tmp);
1025       movdqu(xmm0, src);
1026       pcmpgtq(xmm0, dst);
1027       blendvpd(dst, src);  // xmm0 as mask
1028     }
1029   }
1030 }
1031 
1032 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
1033                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
1034                                  int vlen_enc) {
1035   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1036 
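  // For T_LONG lanes vpminsq/vpmaxsq need AVX-512 (VL for sub-512-bit vectors);
  // otherwise the operation is emulated with vpcmpgtq and vblendvpd.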
1037   if (opcode == Op_MinV) {
1038     if (elem_bt == T_BYTE) {
1039       vpminsb(dst, src1, src2, vlen_enc);
1040     } else if (elem_bt == T_SHORT) {
1041       vpminsw(dst, src1, src2, vlen_enc);
1042     } else if (elem_bt == T_INT) {
1043       vpminsd(dst, src1, src2, vlen_enc);
1044     } else {
1045       assert(elem_bt == T_LONG, "required");
1046       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1047         vpminsq(dst, src1, src2, vlen_enc);
1048       } else {
1049         assert_different_registers(dst, src1, src2);
1050         vpcmpgtq(dst, src1, src2, vlen_enc);
1051         vblendvpd(dst, src1, src2, dst, vlen_enc);
1052       }
1053     }
1054   } else { // opcode == Op_MaxV
1055     if (elem_bt == T_BYTE) {
1056       vpmaxsb(dst, src1, src2, vlen_enc);
1057     } else if (elem_bt == T_SHORT) {
1058       vpmaxsw(dst, src1, src2, vlen_enc);
1059     } else if (elem_bt == T_INT) {
1060       vpmaxsd(dst, src1, src2, vlen_enc);
1061     } else {
1062       assert(elem_bt == T_LONG, "required");
1063       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1064         vpmaxsq(dst, src1, src2, vlen_enc);
1065       } else {
1066         assert_different_registers(dst, src1, src2);
1067         vpcmpgtq(dst, src1, src2, vlen_enc);
1068         vblendvpd(dst, src2, src1, dst, vlen_enc);
1069       }
1070     }
1071   }
1072 }
1073 
1074 // Float/Double min max
1075 
1076 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
1077                                    XMMRegister dst, XMMRegister a, XMMRegister b,
1078                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1079                                    int vlen_enc) {
1080   assert(UseAVX > 0, "required");
1081   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1082          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1083   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1084   assert_different_registers(a, b, tmp, atmp, btmp);
1085 
1086   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1087   bool is_double_word = is_double_word_type(elem_bt);
1088 
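  // The blend pre-passes order the operands by their sign bits so that -0.0 is
  // treated as smaller than +0.0, and the final unordered compare propagates a
  // NaN operand, matching Java's Math.min/max semantics.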
1089   if (!is_double_word && is_min) {
1090     vblendvps(atmp, a, b, a, vlen_enc);
1091     vblendvps(btmp, b, a, a, vlen_enc);
1092     vminps(tmp, atmp, btmp, vlen_enc);
1093     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1094     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
1095   } else if (!is_double_word && !is_min) {
1096     vblendvps(btmp, b, a, b, vlen_enc);
1097     vblendvps(atmp, a, b, b, vlen_enc);
1098     vmaxps(tmp, atmp, btmp, vlen_enc);
1099     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1100     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
1101   } else if (is_double_word && is_min) {
1102     vblendvpd(atmp, a, b, a, vlen_enc);
1103     vblendvpd(btmp, b, a, a, vlen_enc);
1104     vminpd(tmp, atmp, btmp, vlen_enc);
1105     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1106     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
1107   } else {
1108     assert(is_double_word && !is_min, "sanity");
1109     vblendvpd(btmp, b, a, b, vlen_enc);
1110     vblendvpd(atmp, a, b, b, vlen_enc);
1111     vmaxpd(tmp, atmp, btmp, vlen_enc);
1112     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1113     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
1114   }
1115 }
1116 
1117 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1118                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1119                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1120                                     int vlen_enc) {
1121   assert(UseAVX > 2, "required");
1122   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1123          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1124   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1125   assert_different_registers(dst, a, b, atmp, btmp);
1126 
1127   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1128   bool is_double_word = is_double_word_type(elem_bt);
1129   bool merge = true;
1130 
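  // AVX-512 variant of the above: operand sign bits are extracted into a mask
  // register (evpmovd2m/evpmovq2m), the operands are reordered with masked
  // blends, and NaN lanes are merged back in after the unordered compare.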
1131   if (!is_double_word && is_min) {
1132     evpmovd2m(ktmp, a, vlen_enc);
1133     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1134     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1135     vminps(dst, atmp, btmp, vlen_enc);
1136     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1137     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1138   } else if (!is_double_word && !is_min) {
1139     evpmovd2m(ktmp, b, vlen_enc);
1140     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1141     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1142     vmaxps(dst, atmp, btmp, vlen_enc);
1143     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1144     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1145   } else if (is_double_word && is_min) {
1146     evpmovq2m(ktmp, a, vlen_enc);
1147     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1148     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1149     vminpd(dst, atmp, btmp, vlen_enc);
1150     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1151     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1152   } else {
1153     assert(is_double_word && !is_min, "sanity");
1154     evpmovq2m(ktmp, b, vlen_enc);
1155     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1156     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1157     vmaxpd(dst, atmp, btmp, vlen_enc);
1158     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1159     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1160   }
1161 }
1162 
1163 // Float/Double signum
1164 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1165   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1166 
1167   Label DONE_LABEL;
1168 
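  // Compare the argument against zero: +/-0.0 and NaN are returned unchanged;
  // otherwise load 1.0 and flip its sign bit when the argument was negative,
  // yielding -1.0.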
1169   if (opcode == Op_SignumF) {
1170     assert(UseSSE > 0, "required");
1171     ucomiss(dst, zero);
1172     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1173     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1174     movflt(dst, one);
1175     jcc(Assembler::above, DONE_LABEL);
1176     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1177   } else if (opcode == Op_SignumD) {
1178     assert(UseSSE > 1, "required");
1179     ucomisd(dst, zero);
1180     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1181     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1182     movdbl(dst, one);
1183     jcc(Assembler::above, DONE_LABEL);
1184     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1185   }
1186 
1187   bind(DONE_LABEL);
1188 }
1189 
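// Sign- or zero-extend packed elements to the next wider element type
// (byte->word, byte->dword, word->dword).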
1190 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1191   if (sign) {
1192     pmovsxbw(dst, src);
1193   } else {
1194     pmovzxbw(dst, src);
1195   }
1196 }
1197 
1198 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1199   if (sign) {
1200     vpmovsxbw(dst, src, vector_len);
1201   } else {
1202     vpmovzxbw(dst, src, vector_len);
1203   }
1204 }
1205 
1206 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1207   if (sign) {
1208     vpmovsxbd(dst, src, vector_len);
1209   } else {
1210     vpmovzxbd(dst, src, vector_len);
1211   }
1212 }
1213 
1214 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1215   if (sign) {
1216     vpmovsxwd(dst, src, vector_len);
1217   } else {
1218     vpmovzxwd(dst, src, vector_len);
1219   }
1220 }
1221 
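// Vector rotate by an immediate or a variable amount, using the AVX-512
// rotate instructions (evprold/evprorq and their variable-count forms).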
1222 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1223                                      int shift, int vector_len) {
1224   if (opcode == Op_RotateLeftV) {
1225     if (etype == T_INT) {
1226       evprold(dst, src, shift, vector_len);
1227     } else {
1228       assert(etype == T_LONG, "expected type T_LONG");
1229       evprolq(dst, src, shift, vector_len);
1230     }
1231   } else {
1232     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1233     if (etype == T_INT) {
1234       evprord(dst, src, shift, vector_len);
1235     } else {
1236       assert(etype == T_LONG, "expected type T_LONG");
1237       evprorq(dst, src, shift, vector_len);
1238     }
1239   }
1240 }
1241 
1242 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1243                                      XMMRegister shift, int vector_len) {
1244   if (opcode == Op_RotateLeftV) {
1245     if (etype == T_INT) {
1246       evprolvd(dst, src, shift, vector_len);
1247     } else {
1248       assert(etype == T_LONG, "expected type T_LONG");
1249       evprolvq(dst, src, shift, vector_len);
1250     }
1251   } else {
1252     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1253     if (etype == T_INT) {
1254       evprorvd(dst, src, shift, vector_len);
1255     } else {
1256       assert(etype == T_LONG, "expected type T_LONG");
1257       evprorvq(dst, src, shift, vector_len);
1258     }
1259   }
1260 }
1261 
1262 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1263   if (opcode == Op_RShiftVI) {
1264     psrad(dst, shift);
1265   } else if (opcode == Op_LShiftVI) {
1266     pslld(dst, shift);
1267   } else {
1268     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1269     psrld(dst, shift);
1270   }
1271 }
1272 
1273 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1274   switch (opcode) {
1275     case Op_RShiftVI:  psrad(dst, shift); break;
1276     case Op_LShiftVI:  pslld(dst, shift); break;
1277     case Op_URShiftVI: psrld(dst, shift); break;
1278 
1279     default: assert(false, "%s", NodeClassNames[opcode]);
1280   }
1281 }
1282 
1283 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1284   if (opcode == Op_RShiftVI) {
1285     vpsrad(dst, nds, shift, vector_len);
1286   } else if (opcode == Op_LShiftVI) {
1287     vpslld(dst, nds, shift, vector_len);
1288   } else {
1289     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1290     vpsrld(dst, nds, shift, vector_len);
1291   }
1292 }
1293 
1294 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1295   switch (opcode) {
1296     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1297     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1298     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1299 
1300     default: assert(false, "%s", NodeClassNames[opcode]);
1301   }
1302 }
1303 
1304 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1305   switch (opcode) {
1306     case Op_RShiftVB:  // fall-through
1307     case Op_RShiftVS:  psraw(dst, shift); break;
1308 
1309     case Op_LShiftVB:  // fall-through
1310     case Op_LShiftVS:  psllw(dst, shift);   break;
1311 
1312     case Op_URShiftVS: // fall-through
1313     case Op_URShiftVB: psrlw(dst, shift);  break;
1314 
1315     default: assert(false, "%s", NodeClassNames[opcode]);
1316   }
1317 }
1318 
1319 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1320   switch (opcode) {
1321     case Op_RShiftVB:  // fall-through
1322     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1323 
1324     case Op_LShiftVB:  // fall-through
1325     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1326 
1327     case Op_URShiftVS: // fall-through
1328     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1329 
1330     default: assert(false, "%s", NodeClassNames[opcode]);
1331   }
1332 }
1333 
1334 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1335   switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1337     case Op_LShiftVL:  psllq(dst, shift); break;
1338     case Op_URShiftVL: psrlq(dst, shift); break;
1339 
1340     default: assert(false, "%s", NodeClassNames[opcode]);
1341   }
1342 }
1343 
1344 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1345   if (opcode == Op_RShiftVL) {
    psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1347   } else if (opcode == Op_LShiftVL) {
1348     psllq(dst, shift);
1349   } else {
1350     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1351     psrlq(dst, shift);
1352   }
1353 }
1354 
1355 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1356   switch (opcode) {
1357     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1358     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1359     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1360 
1361     default: assert(false, "%s", NodeClassNames[opcode]);
1362   }
1363 }
1364 
1365 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1366   if (opcode == Op_RShiftVL) {
1367     evpsraq(dst, nds, shift, vector_len);
1368   } else if (opcode == Op_LShiftVL) {
1369     vpsllq(dst, nds, shift, vector_len);
1370   } else {
1371     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1372     vpsrlq(dst, nds, shift, vector_len);
1373   }
1374 }
1375 
1376 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1377   switch (opcode) {
1378     case Op_RShiftVB:  // fall-through
1379     case Op_RShiftVS:  // fall-through
1380     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1381 
1382     case Op_LShiftVB:  // fall-through
1383     case Op_LShiftVS:  // fall-through
1384     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1385 
1386     case Op_URShiftVB: // fall-through
1387     case Op_URShiftVS: // fall-through
1388     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1389 
1390     default: assert(false, "%s", NodeClassNames[opcode]);
1391   }
1392 }
1393 
1394 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1395   switch (opcode) {
1396     case Op_RShiftVB:  // fall-through
1397     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1398 
1399     case Op_LShiftVB:  // fall-through
1400     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1401 
1402     case Op_URShiftVB: // fall-through
1403     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1404 
1405     default: assert(false, "%s", NodeClassNames[opcode]);
1406   }
1407 }
1408 
1409 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1410   assert(UseAVX >= 2, "required");
1411   switch (opcode) {
1412     case Op_RShiftVL: {
1413       if (UseAVX > 2) {
1414         assert(tmp == xnoreg, "not used");
1415         if (!VM_Version::supports_avx512vl()) {
1416           vlen_enc = Assembler::AVX_512bit;
1417         }
1418         evpsravq(dst, src, shift, vlen_enc);
1419       } else {
1420         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1421         vpsrlvq(dst, src, shift, vlen_enc);
1422         vpsrlvq(tmp, tmp, shift, vlen_enc);
1423         vpxor(dst, dst, tmp, vlen_enc);
1424         vpsubq(dst, dst, tmp, vlen_enc);
1425       }
1426       break;
1427     }
1428     case Op_LShiftVL: {
1429       assert(tmp == xnoreg, "not used");
1430       vpsllvq(dst, src, shift, vlen_enc);
1431       break;
1432     }
1433     case Op_URShiftVL: {
1434       assert(tmp == xnoreg, "not used");
1435       vpsrlvq(dst, src, shift, vlen_enc);
1436       break;
1437     }
1438     default: assert(false, "%s", NodeClassNames[opcode]);
1439   }
1440 }
1441 
// Variable shift of byte elements in src by the counts in shift, using vtmp as a TEMP, giving a word result in dst
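// Each byte is widened to a dword, shifted with the variable dword shift, masked
// back to byte range, and the dwords are then packed down to words.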
1443 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1444   assert(opcode == Op_LShiftVB ||
1445          opcode == Op_RShiftVB ||
1446          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1447   bool sign = (opcode != Op_URShiftVB);
1448   assert(vector_len == 0, "required");
1449   vextendbd(sign, dst, src, 1);
1450   vpmovzxbd(vtmp, shift, 1);
1451   varshiftd(opcode, dst, dst, vtmp, 1);
1452   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1453   vextracti128_high(vtmp, dst);
1454   vpackusdw(dst, dst, vtmp, 0);
1455 }
1456 
// Variable shift of byte elements in src by the counts in shift, using vtmp as a TEMP, giving a byte result in dst
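// Bytes are widened to words in a vector of twice the input length, shifted with
// the variable word shift, masked back to byte range, and then packed back to
// bytes (with a VPERMQ lane fix-up when the result is wider than 128 bits).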
1458 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1459   assert(opcode == Op_LShiftVB ||
1460          opcode == Op_RShiftVB ||
1461          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1462   bool sign = (opcode != Op_URShiftVB);
1463   int ext_vector_len = vector_len + 1;
1464   vextendbw(sign, dst, src, ext_vector_len);
1465   vpmovzxbw(vtmp, shift, ext_vector_len);
1466   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1467   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1468   if (vector_len == 0) {
1469     vextracti128_high(vtmp, dst);
1470     vpackuswb(dst, dst, vtmp, vector_len);
1471   } else {
1472     vextracti64x4_high(vtmp, dst);
1473     vpackuswb(dst, dst, vtmp, vector_len);
1474     vpermq(dst, dst, 0xD8, vector_len);
1475   }
1476 }
1477 
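// Scalar-into-vector insertion: place the GPR value 'val' into element 'idx'
// of the destination. The 'vinsert' form copies 'src' to 'dst' with the
// selected element replaced.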
1478 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1479   switch(typ) {
1480     case T_BYTE:
1481       pinsrb(dst, val, idx);
1482       break;
1483     case T_SHORT:
1484       pinsrw(dst, val, idx);
1485       break;
1486     case T_INT:
1487       pinsrd(dst, val, idx);
1488       break;
1489     case T_LONG:
1490       pinsrq(dst, val, idx);
1491       break;
1492     default:
1493       assert(false,"Should not reach here.");
1494       break;
1495   }
1496 }
1497 
1498 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1499   switch(typ) {
1500     case T_BYTE:
1501       vpinsrb(dst, src, val, idx);
1502       break;
1503     case T_SHORT:
1504       vpinsrw(dst, src, val, idx);
1505       break;
1506     case T_INT:
1507       vpinsrd(dst, src, val, idx);
1508       break;
1509     case T_LONG:
1510       vpinsrq(dst, src, val, idx);
1511       break;
1512     default:
1513       assert(false,"Should not reach here.");
1514       break;
1515   }
1516 }
1517 
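// Gather/scatter helpers. Elements are loaded from (or stored to)
// base + idx[i] * sizeof(element), using 32-bit indices in 'idx'; the XMM or
// K-register mask selects the active lanes.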
1518 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1519   switch(typ) {
1520     case T_INT:
1521       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1522       break;
1523     case T_FLOAT:
1524       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1525       break;
1526     case T_LONG:
1527       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1528       break;
1529     case T_DOUBLE:
1530       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1531       break;
1532     default:
1533       assert(false,"Should not reach here.");
1534       break;
1535   }
1536 }
1537 
1538 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1539   switch(typ) {
1540     case T_INT:
1541       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1542       break;
1543     case T_FLOAT:
1544       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1545       break;
1546     case T_LONG:
1547       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1548       break;
1549     case T_DOUBLE:
1550       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1551       break;
1552     default:
1553       assert(false,"Should not reach here.");
1554       break;
1555   }
1556 }
1557 
1558 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1559   switch(typ) {
1560     case T_INT:
1561       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1562       break;
1563     case T_FLOAT:
1564       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1565       break;
1566     case T_LONG:
1567       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1568       break;
1569     case T_DOUBLE:
1570       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1571       break;
1572     default:
1573       assert(false,"Should not reach here.");
1574       break;
1575   }
1576 }
1577 
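// Expand a boolean vector (one byte per element, 0 or 1) into a vector mask
// with all bits of each element set for true and cleared for false. The
// KRegister overload produces the equivalent AVX-512 opmask.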
1578 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1579   if (vlen_in_bytes <= 16) {
1580     pxor (dst, dst);
1581     psubb(dst, src);
1582     switch (elem_bt) {
1583       case T_BYTE:   /* nothing to do */ break;
1584       case T_SHORT:  pmovsxbw(dst, dst); break;
1585       case T_INT:    pmovsxbd(dst, dst); break;
1586       case T_FLOAT:  pmovsxbd(dst, dst); break;
1587       case T_LONG:   pmovsxbq(dst, dst); break;
1588       case T_DOUBLE: pmovsxbq(dst, dst); break;
1589 
1590       default: assert(false, "%s", type2name(elem_bt));
1591     }
1592   } else {
1593     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1594     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1595 
1596     vpxor (dst, dst, dst, vlen_enc);
1597     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1598 
1599     switch (elem_bt) {
1600       case T_BYTE:   /* nothing to do */            break;
1601       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1602       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1603       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1604       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1605       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1606 
1607       default: assert(false, "%s", type2name(elem_bt));
1608     }
1609   }
1610 }
1611 
1612 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1613   if (novlbwdq) {
1614     vpmovsxbd(xtmp, src, vlen_enc);
1615     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1616             Assembler::eq, true, vlen_enc, noreg);
1617   } else {
1618     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1619     vpsubb(xtmp, xtmp, src, vlen_enc);
1620     evpmovb2m(dst, xtmp, vlen_enc);
1621   }
1622 }
1623 
1624 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) {
1625   switch (vlen_in_bytes) {
1626     case 4:  movdl(dst, src);   break;
1627     case 8:  movq(dst, src);    break;
1628     case 16: movdqu(dst, src);  break;
1629     case 32: vmovdqu(dst, src); break;
1630     case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1631     default: ShouldNotReachHere();
1632   }
1633 }
1634 
1635 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1636   assert(rscratch != noreg || always_reachable(src), "missing");
1637 
1638   if (reachable(src)) {
1639     load_vector(dst, as_Address(src), vlen_in_bytes);
1640   } else {
1641     lea(rscratch, src);
1642     load_vector(dst, Address(rscratch, 0), vlen_in_bytes);
1643   }
1644 }
1645 
1646 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1647   int vlen_enc = vector_length_encoding(vlen);
1648   if (VM_Version::supports_avx()) {
1649     if (bt == T_LONG) {
1650       if (VM_Version::supports_avx2()) {
1651         vpbroadcastq(dst, src, vlen_enc);
1652       } else {
1653         vmovddup(dst, src, vlen_enc);
1654       }
1655     } else if (bt == T_DOUBLE) {
1656       if (vlen_enc != Assembler::AVX_128bit) {
1657         vbroadcastsd(dst, src, vlen_enc, noreg);
1658       } else {
1659         vmovddup(dst, src, vlen_enc);
1660       }
1661     } else {
1662       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1663         vpbroadcastd(dst, src, vlen_enc);
1664       } else {
1665         vbroadcastss(dst, src, vlen_enc);
1666       }
1667     }
1668   } else if (VM_Version::supports_sse3()) {
1669     movddup(dst, src);
1670   } else {
1671     movq(dst, src);
1672     if (vlen == 16) {
1673       punpcklqdq(dst, dst);
1674     }
1675   }
1676 }
1677 
1678 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1679   // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
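  // For example: T_BYTE -> 0, T_SHORT -> 64, T_INT -> 128, T_LONG -> 192,
  // T_FLOAT -> 128 + 128 = 256, T_DOUBLE -> 192 + 128 = 320.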
1680   int offset = exact_log2(type2aelembytes(bt)) << 6;
1681   if (is_floating_point_type(bt)) {
1682     offset += 128;
1683   }
1684   ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1685   load_vector(dst, addr, vlen_in_bytes);
1686 }
1687 
1688 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
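//
// The common pattern: repeatedly fold the upper half of the vector into the
// lower half (reduce_operation_128/256), then combine the final element with
// the scalar accumulator in src1 and move the result to dst. Floating-point
// reductions have no scalar input and accumulate element by element into dst.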
1689 
1690 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1691   int vector_len = Assembler::AVX_128bit;
1692 
1693   switch (opcode) {
1694     case Op_AndReductionV:  pand(dst, src); break;
1695     case Op_OrReductionV:   por (dst, src); break;
1696     case Op_XorReductionV:  pxor(dst, src); break;
1697     case Op_MinReductionV:
1698       switch (typ) {
1699         case T_BYTE:        pminsb(dst, src); break;
1700         case T_SHORT:       pminsw(dst, src); break;
1701         case T_INT:         pminsd(dst, src); break;
1702         case T_LONG:        assert(UseAVX > 2, "required");
1703                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1704         default:            assert(false, "wrong type");
1705       }
1706       break;
1707     case Op_MaxReductionV:
1708       switch (typ) {
1709         case T_BYTE:        pmaxsb(dst, src); break;
1710         case T_SHORT:       pmaxsw(dst, src); break;
1711         case T_INT:         pmaxsd(dst, src); break;
1712         case T_LONG:        assert(UseAVX > 2, "required");
1713                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1714         default:            assert(false, "wrong type");
1715       }
1716       break;
1717     case Op_AddReductionVF: addss(dst, src); break;
1718     case Op_AddReductionVD: addsd(dst, src); break;
1719     case Op_AddReductionVI:
1720       switch (typ) {
1721         case T_BYTE:        paddb(dst, src); break;
1722         case T_SHORT:       paddw(dst, src); break;
1723         case T_INT:         paddd(dst, src); break;
1724         default:            assert(false, "wrong type");
1725       }
1726       break;
1727     case Op_AddReductionVL: paddq(dst, src); break;
1728     case Op_MulReductionVF: mulss(dst, src); break;
1729     case Op_MulReductionVD: mulsd(dst, src); break;
1730     case Op_MulReductionVI:
1731       switch (typ) {
1732         case T_SHORT:       pmullw(dst, src); break;
1733         case T_INT:         pmulld(dst, src); break;
1734         default:            assert(false, "wrong type");
1735       }
1736       break;
1737     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1738                             evpmullq(dst, dst, src, vector_len); break;
1739     default:                assert(false, "wrong opcode");
1740   }
1741 }
1742 
1743 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1744   int vector_len = Assembler::AVX_256bit;
1745 
1746   switch (opcode) {
1747     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1748     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1749     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1750     case Op_MinReductionV:
1751       switch (typ) {
1752         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1753         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1754         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1755         case T_LONG:        assert(UseAVX > 2, "required");
1756                             vpminsq(dst, src1, src2, vector_len); break;
1757         default:            assert(false, "wrong type");
1758       }
1759       break;
1760     case Op_MaxReductionV:
1761       switch (typ) {
1762         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1763         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1764         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1765         case T_LONG:        assert(UseAVX > 2, "required");
1766                             vpmaxsq(dst, src1, src2, vector_len); break;
1767         default:            assert(false, "wrong type");
1768       }
1769       break;
1770     case Op_AddReductionVI:
1771       switch (typ) {
1772         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1773         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1774         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1775         default:            assert(false, "wrong type");
1776       }
1777       break;
1778     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1779     case Op_MulReductionVI:
1780       switch (typ) {
1781         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1782         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1783         default:            assert(false, "wrong type");
1784       }
1785       break;
1786     case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
1787     default:                assert(false, "wrong opcode");
1788   }
1789 }
1790 
1791 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1792                                   XMMRegister dst, XMMRegister src,
1793                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1794   switch (opcode) {
1795     case Op_AddReductionVF:
1796     case Op_MulReductionVF:
1797       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1798       break;
1799 
1800     case Op_AddReductionVD:
1801     case Op_MulReductionVD:
1802       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1803       break;
1804 
1805     default: assert(false, "wrong opcode");
1806   }
1807 }
1808 
1809 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1810                              Register dst, Register src1, XMMRegister src2,
1811                              XMMRegister vtmp1, XMMRegister vtmp2) {
1812   switch (vlen) {
1813     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1814     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1815     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1816     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1817 
1818     default: assert(false, "wrong vector length");
1819   }
1820 }
1821 
1822 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1823                              Register dst, Register src1, XMMRegister src2,
1824                              XMMRegister vtmp1, XMMRegister vtmp2) {
1825   switch (vlen) {
1826     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1827     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1828     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1829     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1830 
1831     default: assert(false, "wrong vector length");
1832   }
1833 }
1834 
1835 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1836                              Register dst, Register src1, XMMRegister src2,
1837                              XMMRegister vtmp1, XMMRegister vtmp2) {
1838   switch (vlen) {
1839     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1840     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1841     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1842     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1843 
1844     default: assert(false, "wrong vector length");
1845   }
1846 }
1847 
1848 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1849                              Register dst, Register src1, XMMRegister src2,
1850                              XMMRegister vtmp1, XMMRegister vtmp2) {
1851   switch (vlen) {
1852     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1853     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1854     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1855     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1856 
1857     default: assert(false, "wrong vector length");
1858   }
1859 }
1860 
1861 #ifdef _LP64
1862 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1863                              Register dst, Register src1, XMMRegister src2,
1864                              XMMRegister vtmp1, XMMRegister vtmp2) {
1865   switch (vlen) {
1866     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1867     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1868     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1869 
1870     default: assert(false, "wrong vector length");
1871   }
1872 }
1873 #endif // _LP64
1874 
1875 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1876   switch (vlen) {
1877     case 2:
1878       assert(vtmp2 == xnoreg, "");
1879       reduce2F(opcode, dst, src, vtmp1);
1880       break;
1881     case 4:
1882       assert(vtmp2 == xnoreg, "");
1883       reduce4F(opcode, dst, src, vtmp1);
1884       break;
1885     case 8:
1886       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1887       break;
1888     case 16:
1889       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1890       break;
1891     default: assert(false, "wrong vector length");
1892   }
1893 }
1894 
1895 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1896   switch (vlen) {
1897     case 2:
1898       assert(vtmp2 == xnoreg, "");
1899       reduce2D(opcode, dst, src, vtmp1);
1900       break;
1901     case 4:
1902       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1903       break;
1904     case 8:
1905       reduce8D(opcode, dst, src, vtmp1, vtmp2);
1906       break;
1907     default: assert(false, "wrong vector length");
1908   }
1909 }
1910 
1911 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1912   if (opcode == Op_AddReductionVI) {
1913     if (vtmp1 != src2) {
1914       movdqu(vtmp1, src2);
1915     }
1916     phaddd(vtmp1, vtmp1);
1917   } else {
1918     pshufd(vtmp1, src2, 0x1);
1919     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1920   }
1921   movdl(vtmp2, src1);
1922   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1923   movdl(dst, vtmp1);
1924 }
1925 
1926 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1927   if (opcode == Op_AddReductionVI) {
1928     if (vtmp1 != src2) {
1929       movdqu(vtmp1, src2);
1930     }
1931     phaddd(vtmp1, src2);
1932     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1933   } else {
1934     pshufd(vtmp2, src2, 0xE);
1935     reduce_operation_128(T_INT, opcode, vtmp2, src2);
1936     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1937   }
1938 }
1939 
1940 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1941   if (opcode == Op_AddReductionVI) {
1942     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
1943     vextracti128_high(vtmp2, vtmp1);
1944     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
1945     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1946   } else {
1947     vextracti128_high(vtmp1, src2);
1948     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1949     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1950   }
1951 }
1952 
1953 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1954   vextracti64x4_high(vtmp2, src2);
1955   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
1956   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1957 }
1958 
1959 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1960   pshufd(vtmp2, src2, 0x1);
1961   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1962   movdqu(vtmp1, vtmp2);
1963   psrldq(vtmp1, 2);
1964   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1965   movdqu(vtmp2, vtmp1);
1966   psrldq(vtmp2, 1);
1967   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1968   movdl(vtmp2, src1);
1969   pmovsxbd(vtmp1, vtmp1);
1970   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1971   pextrb(dst, vtmp1, 0x0);
1972   movsbl(dst, dst);
1973 }
1974 
1975 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1976   pshufd(vtmp1, src2, 0xE);
1977   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
1978   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1979 }
1980 
1981 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1982   vextracti128_high(vtmp2, src2);
1983   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1984   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1985 }
1986 
1987 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1988   vextracti64x4_high(vtmp1, src2);
1989   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
1990   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1991 }
1992 
1993 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1994   pmovsxbw(vtmp2, src2);
1995   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1996 }
1997 
1998 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1999   if (UseAVX > 1) {
2000     int vector_len = Assembler::AVX_256bit;
2001     vpmovsxbw(vtmp1, src2, vector_len);
2002     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2003   } else {
2004     pmovsxbw(vtmp2, src2);
2005     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
    pshufd(vtmp2, src2, 0xE);   // bring the upper 8 bytes of src2 into the low qword
    pmovsxbw(vtmp2, vtmp2);
2008     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2009   }
2010 }
2011 
2012 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2013   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2014     int vector_len = Assembler::AVX_512bit;
2015     vpmovsxbw(vtmp1, src2, vector_len);
2016     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2017   } else {
2018     assert(UseAVX >= 2,"Should not reach here.");
2019     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2020     vextracti128_high(vtmp2, src2);
2021     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2022   }
2023 }
2024 
2025 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2026   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2027   vextracti64x4_high(vtmp2, src2);
2028   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2029 }
2030 
2031 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2032   if (opcode == Op_AddReductionVI) {
2033     if (vtmp1 != src2) {
2034       movdqu(vtmp1, src2);
2035     }
2036     phaddw(vtmp1, vtmp1);
2037     phaddw(vtmp1, vtmp1);
2038   } else {
2039     pshufd(vtmp2, src2, 0x1);
2040     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2041     movdqu(vtmp1, vtmp2);
2042     psrldq(vtmp1, 2);
2043     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2044   }
2045   movdl(vtmp2, src1);
2046   pmovsxwd(vtmp1, vtmp1);
2047   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2048   pextrw(dst, vtmp1, 0x0);
2049   movswl(dst, dst);
2050 }
2051 
2052 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2053   if (opcode == Op_AddReductionVI) {
2054     if (vtmp1 != src2) {
2055       movdqu(vtmp1, src2);
2056     }
2057     phaddw(vtmp1, src2);
2058   } else {
2059     pshufd(vtmp1, src2, 0xE);
2060     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2061   }
2062   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2063 }
2064 
2065 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2066   if (opcode == Op_AddReductionVI) {
2067     int vector_len = Assembler::AVX_256bit;
2068     vphaddw(vtmp2, src2, src2, vector_len);
2069     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2070   } else {
2071     vextracti128_high(vtmp2, src2);
2072     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2073   }
2074   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2075 }
2076 
2077 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2078   int vector_len = Assembler::AVX_256bit;
2079   vextracti64x4_high(vtmp1, src2);
2080   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2081   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2082 }
2083 
2084 #ifdef _LP64
2085 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2086   pshufd(vtmp2, src2, 0xE);
2087   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2088   movdq(vtmp1, src1);
2089   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2090   movdq(dst, vtmp1);
2091 }
2092 
2093 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2094   vextracti128_high(vtmp1, src2);
2095   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2096   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2097 }
2098 
2099 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2100   vextracti64x4_high(vtmp2, src2);
2101   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2102   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2103 }
2104 
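// Build an opmask with the low 'len' bits set: BZHI clears the bits of an
// all-ones value from bit 'len' upward.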
2105 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
2106   mov64(temp, -1L);
2107   bzhiq(temp, temp, len);
2108   kmovql(dst, temp);
2109 }
2110 #endif // _LP64
2111 
2112 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2113   reduce_operation_128(T_FLOAT, opcode, dst, src);
2114   pshufd(vtmp, src, 0x1);
2115   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2116 }
2117 
2118 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2119   reduce2F(opcode, dst, src, vtmp);
2120   pshufd(vtmp, src, 0x2);
2121   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2122   pshufd(vtmp, src, 0x3);
2123   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2124 }
2125 
2126 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2127   reduce4F(opcode, dst, src, vtmp2);
2128   vextractf128_high(vtmp2, src);
2129   reduce4F(opcode, dst, vtmp2, vtmp1);
2130 }
2131 
2132 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2133   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2134   vextracti64x4_high(vtmp1, src);
2135   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2136 }
2137 
2138 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2139   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2140   pshufd(vtmp, src, 0xE);
2141   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2142 }
2143 
2144 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2145   reduce2D(opcode, dst, src, vtmp2);
2146   vextractf128_high(vtmp2, src);
2147   reduce2D(opcode, dst, vtmp2, vtmp1);
2148 }
2149 
2150 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2151   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2152   vextracti64x4_high(vtmp1, src);
2153   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2154 }
2155 
2156 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2157   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2158 }
2159 
2160 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2161   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2162 }
2163 
2164 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2165                                  int vec_enc) {
2166   switch(elem_bt) {
2167     case T_INT:
2168     case T_FLOAT:
2169       vmaskmovps(dst, src, mask, vec_enc);
2170       break;
2171     case T_LONG:
2172     case T_DOUBLE:
2173       vmaskmovpd(dst, src, mask, vec_enc);
2174       break;
2175     default:
2176       fatal("Unsupported type %s", type2name(elem_bt));
2177       break;
2178   }
2179 }
2180 
2181 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2182                                  int vec_enc) {
2183   switch(elem_bt) {
2184     case T_INT:
2185     case T_FLOAT:
2186       vmaskmovps(dst, src, mask, vec_enc);
2187       break;
2188     case T_LONG:
2189     case T_DOUBLE:
2190       vmaskmovpd(dst, src, mask, vec_enc);
2191       break;
2192     default:
2193       fatal("Unsupported type %s", type2name(elem_bt));
2194       break;
2195   }
2196 }
2197 
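// Min/max reduction over vlen floats in src: each iteration folds the upper
// half of the working vector into the lower half via the NaN-aware vminmax_fp
// helper, halving the active width until a single element remains. If dst
// already holds a partial result (is_dst_valid), it is folded in last.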
2198 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2199                                           XMMRegister dst, XMMRegister src,
2200                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2201                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2202   const int permconst[] = {1, 14};
2203   XMMRegister wsrc = src;
2204   XMMRegister wdst = xmm_0;
2205   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2206 
2207   int vlen_enc = Assembler::AVX_128bit;
2208   if (vlen == 16) {
2209     vlen_enc = Assembler::AVX_256bit;
2210   }
2211 
2212   for (int i = log2(vlen) - 1; i >=0; i--) {
2213     if (i == 0 && !is_dst_valid) {
2214       wdst = dst;
2215     }
2216     if (i == 3) {
2217       vextracti64x4_high(wtmp, wsrc);
2218     } else if (i == 2) {
2219       vextracti128_high(wtmp, wsrc);
2220     } else { // i = [0,1]
2221       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2222     }
2223     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2224     wsrc = wdst;
2225     vlen_enc = Assembler::AVX_128bit;
2226   }
2227   if (is_dst_valid) {
2228     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2229   }
2230 }
2231 
2232 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2233                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2234                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2235   XMMRegister wsrc = src;
2236   XMMRegister wdst = xmm_0;
2237   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2238   int vlen_enc = Assembler::AVX_128bit;
2239   if (vlen == 8) {
2240     vlen_enc = Assembler::AVX_256bit;
2241   }
2242   for (int i = log2(vlen) - 1; i >=0; i--) {
2243     if (i == 0 && !is_dst_valid) {
2244       wdst = dst;
2245     }
2246     if (i == 1) {
2247       vextracti128_high(wtmp, wsrc);
2248     } else if (i == 2) {
2249       vextracti64x4_high(wtmp, wsrc);
2250     } else {
2251       assert(i == 0, "%d", i);
2252       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2253     }
2254     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2255     wsrc = wdst;
2256     vlen_enc = Assembler::AVX_128bit;
2257   }
2258   if (is_dst_valid) {
2259     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2260   }
2261 }
2262 
2263 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2264   switch (bt) {
2265     case T_BYTE:  pextrb(dst, src, idx); break;
2266     case T_SHORT: pextrw(dst, src, idx); break;
2267     case T_INT:   pextrd(dst, src, idx); break;
2268     case T_LONG:  pextrq(dst, src, idx); break;
2269 
2270     default:
2271       assert(false,"Should not reach here.");
2272       break;
2273   }
2274 }
2275 
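// Return the XMM register holding the 128-bit lane that contains 'elemindex':
// the source itself for lane 0, otherwise the lane is extracted into dst.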
2276 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2277   int esize =  type2aelembytes(typ);
2278   int elem_per_lane = 16/esize;
2279   int lane = elemindex / elem_per_lane;
2280   int eindex = elemindex % elem_per_lane;
2281 
2282   if (lane >= 2) {
2283     assert(UseAVX > 2, "required");
2284     vextractf32x4(dst, src, lane & 3);
2285     return dst;
2286   } else if (lane > 0) {
2287     assert(UseAVX > 0, "required");
2288     vextractf128(dst, src, lane);
2289     return dst;
2290   } else {
2291     return src;
2292   }
2293 }
2294 
2295 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2296   if (typ == T_BYTE) {
2297     movsbl(dst, dst);
2298   } else if (typ == T_SHORT) {
2299     movswl(dst, dst);
2300   }
2301 }
2302 
2303 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2304   int esize =  type2aelembytes(typ);
2305   int elem_per_lane = 16/esize;
2306   int eindex = elemindex % elem_per_lane;
2307   assert(is_integral_type(typ),"required");
2308 
2309   if (eindex == 0) {
2310     if (typ == T_LONG) {
2311       movq(dst, src);
2312     } else {
2313       movdl(dst, src);
2314       movsxl(typ, dst);
2315     }
2316   } else {
2317     extract(typ, dst, src, eindex);
2318     movsxl(typ, dst);
2319   }
2320 }
2321 
2322 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2323   int esize =  type2aelembytes(typ);
2324   int elem_per_lane = 16/esize;
2325   int eindex = elemindex % elem_per_lane;
2326   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2327 
2328   if (eindex == 0) {
2329     movq(dst, src);
2330   } else {
2331     if (typ == T_FLOAT) {
2332       if (UseAVX == 0) {
2333         movdqu(dst, src);
2334         shufps(dst, dst, eindex);
2335       } else {
2336         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2337       }
2338     } else {
2339       if (UseAVX == 0) {
2340         movdqu(dst, src);
2341         psrldq(dst, eindex*esize);
2342       } else {
2343         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2344       }
2345       movq(dst, dst);
2346     }
2347   }
2348   // Zero upper bits
2349   if (typ == T_FLOAT) {
2350     if (UseAVX == 0) {
2351       assert(vtmp != xnoreg, "required.");
2352       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2353       pand(dst, vtmp);
2354     } else {
2355       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2356     }
2357   }
2358 }
2359 
2360 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2361   switch(typ) {
2362     case T_BYTE:
2363     case T_BOOLEAN:
2364       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2365       break;
2366     case T_SHORT:
2367     case T_CHAR:
2368       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2369       break;
2370     case T_INT:
2371     case T_FLOAT:
2372       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2373       break;
2374     case T_LONG:
2375     case T_DOUBLE:
2376       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2377       break;
2378     default:
2379       assert(false,"Should not reach here.");
2380       break;
2381   }
2382 }
2383 
2384 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2385   assert(rscratch != noreg || always_reachable(src2), "missing");
2386 
2387   switch(typ) {
2388     case T_BOOLEAN:
2389     case T_BYTE:
2390       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2391       break;
2392     case T_CHAR:
2393     case T_SHORT:
2394       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2395       break;
2396     case T_INT:
2397     case T_FLOAT:
2398       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2399       break;
2400     case T_LONG:
2401     case T_DOUBLE:
2402       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2403       break;
2404     default:
2405       assert(false,"Should not reach here.");
2406       break;
2407   }
2408 }
2409 
2410 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2411   switch(typ) {
2412     case T_BYTE:
2413       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2414       break;
2415     case T_SHORT:
2416       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2417       break;
2418     case T_INT:
2419     case T_FLOAT:
2420       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2421       break;
2422     case T_LONG:
2423     case T_DOUBLE:
2424       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2425       break;
2426     default:
2427       assert(false,"Should not reach here.");
2428       break;
2429   }
2430 }
2431 
2432 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2433   assert(vlen_in_bytes <= 32, "");
2434   int esize = type2aelembytes(bt);
2435   if (vlen_in_bytes == 32) {
2436     assert(vtmp == xnoreg, "required.");
2437     if (esize >= 4) {
2438       vtestps(src1, src2, AVX_256bit);
2439     } else {
2440       vptest(src1, src2, AVX_256bit);
2441     }
2442     return;
2443   }
2444   if (vlen_in_bytes < 16) {
    // Duplicate the lower part of src1 to fill the whole register;
    // src2 does not need this treatment.
2447     assert(vtmp != xnoreg, "required");
2448     int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2449     pshufd(vtmp, src1, shuffle_imm);
2450   } else {
2451     assert(vtmp == xnoreg, "required");
2452     vtmp = src1;
2453   }
2454   if (esize >= 4 && VM_Version::supports_avx()) {
2455     vtestps(vtmp, src2, AVX_128bit);
2456   } else {
2457     ptest(vtmp, src2);
2458   }
2459 }
2460 
2461 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2462   assert(UseAVX >= 2, "required");
2463 #ifdef ASSERT
2464   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2465   bool is_bw_supported = VM_Version::supports_avx512bw();
2466   if (is_bw && !is_bw_supported) {
2467     assert(vlen_enc != Assembler::AVX_512bit, "required");
2468     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2469            "XMM register should be 0-15");
2470   }
2471 #endif // ASSERT
2472   switch (elem_bt) {
2473     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2474     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2475     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2476     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2477     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2478     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2479     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2480   }
2481 }
2482 
2483 #ifdef _LP64
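// Broadcast a scalar GPR value to every element of the destination vector.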
2484 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2485   assert(UseAVX >= 2, "required");
2486   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2487   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2488   if ((UseAVX > 2) &&
2489       (!is_bw || VM_Version::supports_avx512bw()) &&
2490       (!is_vl || VM_Version::supports_avx512vl())) {
2491     switch (elem_bt) {
2492       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2493       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2494       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2495       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2496       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2497     }
2498   } else {
2499     assert(vlen_enc != Assembler::AVX_512bit, "required");
2500     assert((dst->encoding() < 16),"XMM register should be 0-15");
2501     switch (elem_bt) {
2502       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2503       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2504       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2505       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2506       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2507       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2508       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2509     }
2510   }
2511 }
2512 #endif
2513 
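// Convert packed bytes in src to the requested element type: sign-extend for
// integral targets; for float/double, sign-extend to int and then convert.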
2514 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2515   switch (to_elem_bt) {
2516     case T_SHORT:
2517       vpmovsxbw(dst, src, vlen_enc);
2518       break;
2519     case T_INT:
2520       vpmovsxbd(dst, src, vlen_enc);
2521       break;
2522     case T_FLOAT:
2523       vpmovsxbd(dst, src, vlen_enc);
2524       vcvtdq2ps(dst, dst, vlen_enc);
2525       break;
2526     case T_LONG:
2527       vpmovsxbq(dst, src, vlen_enc);
2528       break;
2529     case T_DOUBLE: {
2530       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2531       vpmovsxbd(dst, src, mid_vlen_enc);
2532       vcvtdq2pd(dst, dst, vlen_enc);
2533       break;
2534     }
2535     default:
2536       fatal("Unsupported type %s", type2name(to_elem_bt));
2537       break;
2538   }
2539 }
2540 
2541 //-------------------------------------------------------------------------------------------
2542 
2543 // IndexOf for constant substrings with size >= 8 chars
// which don't need to be loaded through the stack.
2545 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2546                                          Register cnt1, Register cnt2,
2547                                          int int_cnt2,  Register result,
2548                                          XMMRegister vec, Register tmp,
2549                                          int ae) {
2550   ShortBranchVerifier sbv(this);
2551   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2552   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2553 
2554   // This method uses the pcmpestri instruction with bound registers
2555   //   inputs:
2556   //     xmm - substring
2557   //     rax - substring length (elements count)
2558   //     mem - scanned string
2559   //     rdx - string length (elements count)
2560   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2561   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2562   //   outputs:
2563   //     rcx - matched index in string
2564   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2565   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2566   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2567   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2568   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2569 
2570   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2571         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2572         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2573 
2574   // Note, inline_string_indexOf() generates checks:
2575   // if (substr.count > string.count) return -1;
2576   // if (substr.count == 0) return 0;
2577   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2578 
2579   // Load substring.
2580   if (ae == StrIntrinsicNode::UL) {
2581     pmovzxbw(vec, Address(str2, 0));
2582   } else {
2583     movdqu(vec, Address(str2, 0));
2584   }
2585   movl(cnt2, int_cnt2);
2586   movptr(result, str1); // string addr
2587 
2588   if (int_cnt2 > stride) {
2589     jmpb(SCAN_TO_SUBSTR);
2590 
    // Reload substr for rescan; this code
    // is executed only for large substrings (> 8 chars)
2593     bind(RELOAD_SUBSTR);
2594     if (ae == StrIntrinsicNode::UL) {
2595       pmovzxbw(vec, Address(str2, 0));
2596     } else {
2597       movdqu(vec, Address(str2, 0));
2598     }
2599     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2600 
2601     bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not, so we need to search
    // again. Start from the next element after the previous match.
2605 
    // cnt2 is the number of remaining substring elements and
    // cnt1 is the number of remaining string elements when the compare failed.
2608     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2609     subl(cnt1, cnt2);
2610     addl(cnt1, int_cnt2);
2611     movl(cnt2, int_cnt2); // Now restore cnt2
2612 
2613     decrementl(cnt1);     // Shift to next element
2614     cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Fewer chars left than the substring
2616 
2617     addptr(result, (1<<scale1));
2618 
2619   } // (int_cnt2 > 8)
2620 
2621   // Scan string for start of substr in 16-byte vectors
2622   bind(SCAN_TO_SUBSTR);
2623   pcmpestri(vec, Address(result, 0), mode);
2624   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2625   subl(cnt1, stride);
2626   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2627   cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer chars left than the substring
2629   addptr(result, 16);
2630   jmpb(SCAN_TO_SUBSTR);
2631 
2632   // Found a potential substr
2633   bind(FOUND_CANDIDATE);
2634   // Matched whole vector if first element matched (tmp(rcx) == 0).
2635   if (int_cnt2 == stride) {
2636     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2637   } else { // int_cnt2 > 8
2638     jccb(Assembler::overflow, FOUND_SUBSTR);
2639   }
2640   // After pcmpestri tmp(rcx) contains matched element index
2641   // Compute start addr of substr
2642   lea(result, Address(result, tmp, scale1));
2643 
2644   // Make sure string is still long enough
2645   subl(cnt1, tmp);
2646   cmpl(cnt1, cnt2);
2647   if (int_cnt2 == stride) {
2648     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2649   } else { // int_cnt2 > 8
2650     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2651   }
  // Fewer chars left than the substring.
2653 
2654   bind(RET_NOT_FOUND);
2655   movl(result, -1);
2656   jmp(EXIT);
2657 
2658   if (int_cnt2 > stride) {
2659     // This code is optimized for the case when whole substring
2660     // is matched if its head is matched.
2661     bind(MATCH_SUBSTR_HEAD);
2662     pcmpestri(vec, Address(result, 0), mode);
    // Reload only the string if it does not match
2664     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2665 
2666     Label CONT_SCAN_SUBSTR;
2667     // Compare the rest of substring (> 8 chars).
2668     bind(FOUND_SUBSTR);
2669     // First 8 chars are already matched.
2670     negptr(cnt2);
2671     addptr(cnt2, stride);
2672 
2673     bind(SCAN_SUBSTR);
2674     subl(cnt1, stride);
2675     cmpl(cnt2, -stride); // Do not read beyond substring
2676     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2677     // Back-up strings to avoid reading beyond substring:
2678     // cnt1 = cnt1 - cnt2 + 8
2679     addl(cnt1, cnt2); // cnt2 is negative
2680     addl(cnt1, stride);
2681     movl(cnt2, stride); negptr(cnt2);
2682     bind(CONT_SCAN_SUBSTR);
2683     if (int_cnt2 < (int)G) {
2684       int tail_off1 = int_cnt2<<scale1;
2685       int tail_off2 = int_cnt2<<scale2;
2686       if (ae == StrIntrinsicNode::UL) {
2687         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2688       } else {
2689         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2690       }
2691       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2692     } else {
2693       // calculate index in register to avoid integer overflow (int_cnt2*2)
2694       movl(tmp, int_cnt2);
2695       addptr(tmp, cnt2);
2696       if (ae == StrIntrinsicNode::UL) {
2697         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2698       } else {
2699         movdqu(vec, Address(str2, tmp, scale2, 0));
2700       }
2701       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2702     }
2703     // Need to reload the string pointers if the whole vector did not match
2704     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2705     addptr(cnt2, stride);
2706     jcc(Assembler::negative, SCAN_SUBSTR);
2707     // Fall through if found full substring
2708 
2709   } // (int_cnt2 > 8)
2710 
2711   bind(RET_FOUND);
2712   // Found result if we matched full small substring.
2713   // Compute substr offset
2714   subptr(result, str1);
2715   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2716     shrl(result, 1); // index
2717   }
2718   bind(EXIT);
2719 
2720 } // string_indexofC8
2721 
2722 // Small strings are loaded through the stack if they cross a page boundary.
2723 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2724                                        Register cnt1, Register cnt2,
2725                                        int int_cnt2,  Register result,
2726                                        XMMRegister vec, Register tmp,
2727                                        int ae) {
2728   ShortBranchVerifier sbv(this);
2729   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2730   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2731 
2732   //
2733   // int_cnt2 is the length of a small (< 8 chars) constant substring
2734   // or (-1) for a non-constant substring, in which case its length
2735   // is in the cnt2 register.
2736   //
2737   // Note, inline_string_indexOf() generates checks:
2738   // if (substr.count > string.count) return -1;
2739   // if (substr.count == 0) return 0;
2740   //
2741   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2742   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2743   // This method uses the pcmpestri instruction with bound registers
2744   //   inputs:
2745   //     xmm - substring
2746   //     rax - substring length (elements count)
2747   //     mem - scanned string
2748   //     rdx - string length (elements count)
2749   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2750   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2751   //   outputs:
2752   //     rcx - matched index in string
2753   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2754   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2755   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2756   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
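       // Java-level behaviour implemented here (illustrative sketch only; the
       // generated code scans the string in 16-byte chunks with pcmpestri):
       //
       //   for (int i = 0; i <= cnt1 - cnt2; i++) {
       //     if (str1[i .. i+cnt2) equals str2[0 .. cnt2)) return i;
       //   }
       //   return -1;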
2757 
2758   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2759         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2760         FOUND_CANDIDATE;
2761 
2762   { //========================================================
2763     // We don't know where these strings are located
2764     // and we can't read beyond them. Load them through stack.
2765     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2766 
2767     movptr(tmp, rsp); // save old SP
2768 
2769     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2770       if (int_cnt2 == (1>>scale2)) { // One byte
2771         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2772         load_unsigned_byte(result, Address(str2, 0));
2773         movdl(vec, result); // move 32 bits
2774       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2775         // Not enough header space in 32-bit VM: 12+3 = 15.
2776         movl(result, Address(str2, -1));
2777         shrl(result, 8);
2778         movdl(vec, result); // move 32 bits
2779       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2780         load_unsigned_short(result, Address(str2, 0));
2781         movdl(vec, result); // move 32 bits
2782       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2783         movdl(vec, Address(str2, 0)); // move 32 bits
2784       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2785         movq(vec, Address(str2, 0));  // move 64 bits
2786       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
2787         // Array header size is 12 bytes in 32-bit VM
2788         // + 6 bytes for 3 chars == 18 bytes,
2789         // enough space to load vec and shift.
2790         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2791         if (ae == StrIntrinsicNode::UL) {
2792           int tail_off = int_cnt2-8;
2793           pmovzxbw(vec, Address(str2, tail_off));
2794           psrldq(vec, -2*tail_off);
2795         }
2796         else {
2797           int tail_off = int_cnt2*(1<<scale2);
2798           movdqu(vec, Address(str2, tail_off-16));
2799           psrldq(vec, 16-tail_off);
2800         }
2801       }
2802     } else { // not constant substring
2803       cmpl(cnt2, stride);
2804       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2805 
2806       // We can read beyond the string if str+16 does not cross a page boundary
2807       // since heaps are aligned and mapped by pages.
2808       assert(os::vm_page_size() < (int)G, "default page should be small");
2809       movl(result, str2); // We need only low 32 bits
2810       andl(result, ((int)os::vm_page_size()-1));
2811       cmpl(result, ((int)os::vm_page_size()-16));
2812       jccb(Assembler::belowEqual, CHECK_STR);
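           // i.e. if (str2 mod page_size) <= page_size - 16, the 16-byte load at
           // str2 stays within the current (mapped) page and cannot fault.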
2813 
2814       // Move small strings to the stack to allow loading 16 bytes into vec.
2815       subptr(rsp, 16);
2816       int stk_offset = wordSize-(1<<scale2);
2817       push(cnt2);
2818 
2819       bind(COPY_SUBSTR);
2820       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2821         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2822         movb(Address(rsp, cnt2, scale2, stk_offset), result);
2823       } else if (ae == StrIntrinsicNode::UU) {
2824         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2825         movw(Address(rsp, cnt2, scale2, stk_offset), result);
2826       }
2827       decrement(cnt2);
2828       jccb(Assembler::notZero, COPY_SUBSTR);
2829 
2830       pop(cnt2);
2831       movptr(str2, rsp);  // New substring address
2832     } // non constant
2833 
2834     bind(CHECK_STR);
2835     cmpl(cnt1, stride);
2836     jccb(Assembler::aboveEqual, BIG_STRINGS);
2837 
2838     // Check cross page boundary.
2839     movl(result, str1); // We need only low 32 bits
2840     andl(result, ((int)os::vm_page_size()-1));
2841     cmpl(result, ((int)os::vm_page_size()-16));
2842     jccb(Assembler::belowEqual, BIG_STRINGS);
2843 
2844     subptr(rsp, 16);
2845     int stk_offset = -(1<<scale1);
2846     if (int_cnt2 < 0) { // not constant
2847       push(cnt2);
2848       stk_offset += wordSize;
2849     }
2850     movl(cnt2, cnt1);
2851 
2852     bind(COPY_STR);
2853     if (ae == StrIntrinsicNode::LL) {
2854       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
2855       movb(Address(rsp, cnt2, scale1, stk_offset), result);
2856     } else {
2857       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
2858       movw(Address(rsp, cnt2, scale1, stk_offset), result);
2859     }
2860     decrement(cnt2);
2861     jccb(Assembler::notZero, COPY_STR);
2862 
2863     if (int_cnt2 < 0) { // not constant
2864       pop(cnt2);
2865     }
2866     movptr(str1, rsp);  // New string address
2867 
2868     bind(BIG_STRINGS);
2869     // Load substring.
2870     if (int_cnt2 < 0) { // -1
2871       if (ae == StrIntrinsicNode::UL) {
2872         pmovzxbw(vec, Address(str2, 0));
2873       } else {
2874         movdqu(vec, Address(str2, 0));
2875       }
2876       push(cnt2);       // substr count
2877       push(str2);       // substr addr
2878       push(str1);       // string addr
2879     } else {
2880       // Small (< 8 chars) constant substrings are loaded already.
2881       movl(cnt2, int_cnt2);
2882     }
2883     push(tmp);  // original SP
2884 
2885   } // Finished loading
2886 
2887   //========================================================
2888   // Start search
2889   //
2890 
2891   movptr(result, str1); // string addr
2892 
2893   if (int_cnt2 < 0) {  // Only for non constant substring
2894     jmpb(SCAN_TO_SUBSTR);
2895 
2896     // SP saved at sp+0
2897     // String saved at sp+1*wordSize
2898     // Substr saved at sp+2*wordSize
2899     // Substr count saved at sp+3*wordSize
2900 
2901     // Reload substr for rescan; this code is
2902     // executed only for large substrings (> 8 chars)
2903     bind(RELOAD_SUBSTR);
2904     movptr(str2, Address(rsp, 2*wordSize));
2905     movl(cnt2, Address(rsp, 3*wordSize));
2906     if (ae == StrIntrinsicNode::UL) {
2907       pmovzxbw(vec, Address(str2, 0));
2908     } else {
2909       movdqu(vec, Address(str2, 0));
2910     }
2911     // We came here after the beginning of the substring was
2912     // matched but the rest of it was not, so we need to search
2913     // again. Start from the next element after the previous match.
2914     subptr(str1, result); // Restore counter
2915     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2916       shrl(str1, 1);
2917     }
2918     addl(cnt1, str1);
2919     decrementl(cnt1);   // Shift to next element
2920     cmpl(cnt1, cnt2);
2921     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2922 
2923     addptr(result, (1<<scale1));
2924   } // non constant
2925 
2926   // Scan string for start of substr in 16-byte vectors
2927   bind(SCAN_TO_SUBSTR);
2928   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2929   pcmpestri(vec, Address(result, 0), mode);
2930   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2931   subl(cnt1, stride);
2932   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2933   cmpl(cnt1, cnt2);
2934   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2935   addptr(result, 16);
2936 
2937   bind(ADJUST_STR);
2938   cmpl(cnt1, stride); // Do not read beyond string
2939   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2940   // Back-up string to avoid reading beyond string.
2941   lea(result, Address(result, cnt1, scale1, -16));
2942   movl(cnt1, stride);
2943   jmpb(SCAN_TO_SUBSTR);
2944 
2945   // Found a potential substr
2946   bind(FOUND_CANDIDATE);
2947   // After pcmpestri tmp(rcx) contains matched element index
2948 
2949   // Make sure string is still long enough
2950   subl(cnt1, tmp);
2951   cmpl(cnt1, cnt2);
2952   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
2953   // Left less than substring.
2954 
2955   bind(RET_NOT_FOUND);
2956   movl(result, -1);
2957   jmp(CLEANUP);
2958 
2959   bind(FOUND_SUBSTR);
2960   // Compute start addr of substr
2961   lea(result, Address(result, tmp, scale1));
2962   if (int_cnt2 > 0) { // Constant substring
2963     // Repeat search for small substring (< 8 chars)
2964     // from new point without reloading substring.
2965     // Have to check that we don't read beyond string.
2966     cmpl(tmp, stride-int_cnt2);
2967     jccb(Assembler::greater, ADJUST_STR);
2968     // Fall through if matched whole substring.
2969   } else { // non constant
2970     assert(int_cnt2 == -1, "should be != 0");
2971 
2972     addl(tmp, cnt2);
2973     // Found result if we matched whole substring.
2974     cmpl(tmp, stride);
2975     jcc(Assembler::lessEqual, RET_FOUND);
2976 
2977     // Repeat search for small substring (<= 8 chars)
2978     // from new point 'str1' without reloading substring.
2979     cmpl(cnt2, stride);
2980     // Have to check that we don't read beyond string.
2981     jccb(Assembler::lessEqual, ADJUST_STR);
2982 
2983     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
2984     // Compare the rest of substring (> 8 chars).
2985     movptr(str1, result);
2986 
2987     cmpl(tmp, cnt2);
2988     // First 8 chars are already matched.
2989     jccb(Assembler::equal, CHECK_NEXT);
2990 
2991     bind(SCAN_SUBSTR);
2992     pcmpestri(vec, Address(str1, 0), mode);
2993     // Need to reload the string pointers if the whole vector did not match
2994     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2995 
2996     bind(CHECK_NEXT);
2997     subl(cnt2, stride);
2998     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
2999     addptr(str1, 16);
3000     if (ae == StrIntrinsicNode::UL) {
3001       addptr(str2, 8);
3002     } else {
3003       addptr(str2, 16);
3004     }
3005     subl(cnt1, stride);
3006     cmpl(cnt2, stride); // Do not read beyond substring
3007     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3008     // Back-up strings to avoid reading beyond substring.
3009 
3010     if (ae == StrIntrinsicNode::UL) {
3011       lea(str2, Address(str2, cnt2, scale2, -8));
3012       lea(str1, Address(str1, cnt2, scale1, -16));
3013     } else {
3014       lea(str2, Address(str2, cnt2, scale2, -16));
3015       lea(str1, Address(str1, cnt2, scale1, -16));
3016     }
3017     subl(cnt1, cnt2);
3018     movl(cnt2, stride);
3019     addl(cnt1, stride);
3020     bind(CONT_SCAN_SUBSTR);
3021     if (ae == StrIntrinsicNode::UL) {
3022       pmovzxbw(vec, Address(str2, 0));
3023     } else {
3024       movdqu(vec, Address(str2, 0));
3025     }
3026     jmp(SCAN_SUBSTR);
3027 
3028     bind(RET_FOUND_LONG);
3029     movptr(str1, Address(rsp, wordSize));
3030   } // non constant
3031 
3032   bind(RET_FOUND);
3033   // Compute substr offset
3034   subptr(result, str1);
3035   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3036     shrl(result, 1); // index
3037   }
3038   bind(CLEANUP);
3039   pop(rsp); // restore SP
3040 
3041 } // string_indexof
3042 
3043 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3044                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3045   ShortBranchVerifier sbv(this);
3046   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3047 
3048   int stride = 8;
3049 
3050   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3051         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3052         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3053         FOUND_SEQ_CHAR, DONE_LABEL;
3054 
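       // Search strategy (illustrative summary): with AVX2, scan 16-char
       // (32-byte) vectors while at least 16 chars remain, then fall back to
       // 8-char (16-byte) SSE vectors, and finally to a scalar char-by-char tail.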
3055   movptr(result, str1);
3056   if (UseAVX >= 2) {
3057     cmpl(cnt1, stride);
3058     jcc(Assembler::less, SCAN_TO_CHAR);
3059     cmpl(cnt1, 2*stride);
3060     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3061     movdl(vec1, ch);
3062     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3063     vpxor(vec2, vec2);
3064     movl(tmp, cnt1);
3065     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3066     andl(cnt1,0x0000000F);  //tail count (in chars)
3067 
3068     bind(SCAN_TO_16_CHAR_LOOP);
3069     vmovdqu(vec3, Address(result, 0));
3070     vpcmpeqw(vec3, vec3, vec1, 1);
3071     vptest(vec2, vec3);
3072     jcc(Assembler::carryClear, FOUND_CHAR);
3073     addptr(result, 32);
3074     subl(tmp, 2*stride);
3075     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3076     jmp(SCAN_TO_8_CHAR);
3077     bind(SCAN_TO_8_CHAR_INIT);
3078     movdl(vec1, ch);
3079     pshuflw(vec1, vec1, 0x00);
3080     pshufd(vec1, vec1, 0);
3081     pxor(vec2, vec2);
3082   }
3083   bind(SCAN_TO_8_CHAR);
3084   cmpl(cnt1, stride);
3085   jcc(Assembler::less, SCAN_TO_CHAR);
3086   if (UseAVX < 2) {
3087     movdl(vec1, ch);
3088     pshuflw(vec1, vec1, 0x00);
3089     pshufd(vec1, vec1, 0);
3090     pxor(vec2, vec2);
3091   }
3092   movl(tmp, cnt1);
3093   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3094   andl(cnt1,0x00000007);  //tail count (in chars)
3095 
3096   bind(SCAN_TO_8_CHAR_LOOP);
3097   movdqu(vec3, Address(result, 0));
3098   pcmpeqw(vec3, vec1);
3099   ptest(vec2, vec3);
3100   jcc(Assembler::carryClear, FOUND_CHAR);
3101   addptr(result, 16);
3102   subl(tmp, stride);
3103   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3104   bind(SCAN_TO_CHAR);
3105   testl(cnt1, cnt1);
3106   jcc(Assembler::zero, RET_NOT_FOUND);
3107   bind(SCAN_TO_CHAR_LOOP);
3108   load_unsigned_short(tmp, Address(result, 0));
3109   cmpl(ch, tmp);
3110   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3111   addptr(result, 2);
3112   subl(cnt1, 1);
3113   jccb(Assembler::zero, RET_NOT_FOUND);
3114   jmp(SCAN_TO_CHAR_LOOP);
3115 
3116   bind(RET_NOT_FOUND);
3117   movl(result, -1);
3118   jmpb(DONE_LABEL);
3119 
3120   bind(FOUND_CHAR);
3121   if (UseAVX >= 2) {
3122     vpmovmskb(tmp, vec3);
3123   } else {
3124     pmovmskb(tmp, vec3);
3125   }
3126   bsfl(ch, tmp);
3127   addptr(result, ch);
3128 
3129   bind(FOUND_SEQ_CHAR);
3130   subptr(result, str1);
3131   shrl(result, 1);
3132 
3133   bind(DONE_LABEL);
3134 } // string_indexof_char
3135 
3136 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3137                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3138   ShortBranchVerifier sbv(this);
3139   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3140 
3141   int stride = 16;
3142 
3143   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3144         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3145         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3146         FOUND_SEQ_CHAR, DONE_LABEL;
3147 
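       // Same search strategy as string_indexof_char, but for byte (Latin-1)
       // elements (illustrative summary): a 32-byte AVX2 loop, then a 16-byte
       // SSE loop, then a scalar byte-by-byte tail.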
3148   movptr(result, str1);
3149   if (UseAVX >= 2) {
3150     cmpl(cnt1, stride);
3151     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3152     cmpl(cnt1, stride*2);
3153     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3154     movdl(vec1, ch);
3155     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3156     vpxor(vec2, vec2);
3157     movl(tmp, cnt1);
3158     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3159     andl(cnt1,0x0000001F);  //tail count (in chars)
3160 
3161     bind(SCAN_TO_32_CHAR_LOOP);
3162     vmovdqu(vec3, Address(result, 0));
3163     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3164     vptest(vec2, vec3);
3165     jcc(Assembler::carryClear, FOUND_CHAR);
3166     addptr(result, 32);
3167     subl(tmp, stride*2);
3168     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3169     jmp(SCAN_TO_16_CHAR);
3170 
3171     bind(SCAN_TO_16_CHAR_INIT);
3172     movdl(vec1, ch);
3173     pxor(vec2, vec2);
3174     pshufb(vec1, vec2);
3175   }
3176 
3177   bind(SCAN_TO_16_CHAR);
3178   cmpl(cnt1, stride);
3179   jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
3180   if (UseAVX < 2) {
3181     movdl(vec1, ch);
3182     pxor(vec2, vec2);
3183     pshufb(vec1, vec2);
3184   }
3185   movl(tmp, cnt1);
3186   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3187   andl(cnt1,0x0000000F);  //tail count (in bytes)
3188 
3189   bind(SCAN_TO_16_CHAR_LOOP);
3190   movdqu(vec3, Address(result, 0));
3191   pcmpeqb(vec3, vec1);
3192   ptest(vec2, vec3);
3193   jcc(Assembler::carryClear, FOUND_CHAR);
3194   addptr(result, 16);
3195   subl(tmp, stride);
3196   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); // last 16 items...
3197 
3198   bind(SCAN_TO_CHAR_INIT);
3199   testl(cnt1, cnt1);
3200   jcc(Assembler::zero, RET_NOT_FOUND);
3201   bind(SCAN_TO_CHAR_LOOP);
3202   load_unsigned_byte(tmp, Address(result, 0));
3203   cmpl(ch, tmp);
3204   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3205   addptr(result, 1);
3206   subl(cnt1, 1);
3207   jccb(Assembler::zero, RET_NOT_FOUND);
3208   jmp(SCAN_TO_CHAR_LOOP);
3209 
3210   bind(RET_NOT_FOUND);
3211   movl(result, -1);
3212   jmpb(DONE_LABEL);
3213 
3214   bind(FOUND_CHAR);
3215   if (UseAVX >= 2) {
3216     vpmovmskb(tmp, vec3);
3217   } else {
3218     pmovmskb(tmp, vec3);
3219   }
3220   bsfl(ch, tmp);
3221   addptr(result, ch);
3222 
3223   bind(FOUND_SEQ_CHAR);
3224   subptr(result, str1);
3225 
3226   bind(DONE_LABEL);
3227 } // stringL_indexof_char
3228 
3229 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3230   switch (eltype) {
3231   case T_BOOLEAN: return sizeof(jboolean);
3232   case T_BYTE:  return sizeof(jbyte);
3233   case T_SHORT: return sizeof(jshort);
3234   case T_CHAR:  return sizeof(jchar);
3235   case T_INT:   return sizeof(jint);
3236   default:
3237     ShouldNotReachHere();
3238     return -1;
3239   }
3240 }
3241 
3242 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3243   switch (eltype) {
3244   // T_BOOLEAN used as surrogate for unsigned byte
3245   case T_BOOLEAN: movzbl(dst, src);   break;
3246   case T_BYTE:    movsbl(dst, src);   break;
3247   case T_SHORT:   movswl(dst, src);   break;
3248   case T_CHAR:    movzwl(dst, src);   break;
3249   case T_INT:     movl(dst, src);     break;
3250   default:
3251     ShouldNotReachHere();
3252   }
3253 }
3254 
3255 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3256   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3257 }
3258 
3259 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3260   load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8);
3261 }
3262 
3263 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3264   const int vlen = Assembler::AVX_256bit;
3265   switch (eltype) {
3266   case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3267   case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
3268   case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3269   case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3270   case T_INT:
3271     // do nothing
3272     break;
3273   default:
3274     ShouldNotReachHere();
3275   }
3276 }
3277 
3278 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3279                                         Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3280                                         XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3281                                         XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3282                                         XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3283                                         BasicType eltype) {
3284   ShortBranchVerifier sbv(this);
3285   assert(UseAVX >= 2, "AVX2 intrinsics are required");
3286   assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3287   assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3288 
3289   Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3290         SHORT_UNROLLED_LOOP_EXIT,
3291         UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3292         UNROLLED_VECTOR_LOOP_BEGIN,
3293         END;
3294   switch (eltype) {
3295   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3296   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3297   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3298   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3299   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3300   default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3301   }
3302 
3303   // Register "renaming" for readability of the code
3304   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3305                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3306                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3307 
3308   const int elsize = arrays_hashcode_elsize(eltype);
3309 
3310   /*
3311     if (cnt1 >= 2) {
3312       if (cnt1 >= 32) {
3313         UNROLLED VECTOR LOOP
3314       }
3315       UNROLLED SCALAR LOOP
3316     }
3317     SINGLE SCALAR
3318    */
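       // Conceptually (sketch only, assuming the arrays_hashcode_powers_of_31
       // stub table holds 31^32 .. 31^0), each 32-element vector iteration
       // contributes the standard polynomial hash step:
       //
       //   result = result * 31^32
       //            + a[index+0] * 31^31 + a[index+1] * 31^30 + ... + a[index+31] * 31^0
       //
       // The scalar result is multiplied by 31^32 inside the loop, while four
       // 8-lane accumulators collect the element terms; the per-lane powers of
       // 31 are applied and reduced to a scalar only after the loop.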
3319 
3320   cmpl(cnt1, 32);
3321   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3322 
3323   // cnt1 >= 32 && generate_vectorized_loop
3324   xorl(index, index);
3325 
3326   // vresult = IntVector.zero(I256);
3327   for (int idx = 0; idx < 4; idx++) {
3328     vpxor(vresult[idx], vresult[idx]);
3329   }
3330   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3331   Register bound = tmp2;
3332   Register next = tmp3;
3333   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3334   movl(next, Address(tmp2, 0));
3335   movdl(vnext, next);
3336   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3337 
3338   // index = 0;
3339   // bound = cnt1 & ~(32 - 1);
3340   movl(bound, cnt1);
3341   andl(bound, ~(32 - 1));
3342   // for (; index < bound; index += 32) {
3343   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3344   // result *= next;
3345   imull(result, next);
3346   // Loop fission to front-load the cost of fetching from memory; OOO execution
3347   // can then hopefully do a better job of prefetching.
3348   for (int idx = 0; idx < 4; idx++) {
3349     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3350   }
3351   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3352   for (int idx = 0; idx < 4; idx++) {
3353     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3354     arrays_hashcode_elvcast(vtmp[idx], eltype);
3355     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3356   }
3357   // index += 32;
3358   addl(index, 32);
3359   // index < bound;
3360   cmpl(index, bound);
3361   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3362   // }
3363 
3364   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3365   subl(cnt1, bound);
3366   // release bound
3367 
3368   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3369   for (int idx = 0; idx < 4; idx++) {
3370     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3371     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3372     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3373   }
3374   // result += vresult.reduceLanes(ADD);
3375   for (int idx = 0; idx < 4; idx++) {
3376     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3377   }
3378 
3379   // } else if (cnt1 < 32) {
3380 
3381   bind(SHORT_UNROLLED_BEGIN);
3382   // int i = 1;
3383   movl(index, 1);
3384   cmpl(index, cnt1);
3385   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3386 
3387   // for (; i < cnt1 ; i += 2) {
3388   bind(SHORT_UNROLLED_LOOP_BEGIN);
3389   movl(tmp3, 961);
3390   imull(result, tmp3);
3391   arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3392   movl(tmp3, tmp2);
3393   shll(tmp3, 5);
3394   subl(tmp3, tmp2);
3395   addl(result, tmp3);
3396   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3397   addl(result, tmp3);
3398   addl(index, 2);
3399   cmpl(index, cnt1);
3400   jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3401 
3402   // }
3403   // if (i >= cnt1) {
3404   bind(SHORT_UNROLLED_LOOP_EXIT);
3405   jccb(Assembler::greater, END);
3406   movl(tmp2, result);
3407   shll(result, 5);
3408   subl(result, tmp2);
3409   arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3410   addl(result, tmp3);
3411   // }
3412   bind(END);
3413 
3414   BLOCK_COMMENT("} // arrays_hashcode");
3415 
3416 } // arrays_hashcode
3417 
3418 // helper function for string_compare
3419 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3420                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3421                                            Address::ScaleFactor scale2, Register index, int ae) {
3422   if (ae == StrIntrinsicNode::LL) {
3423     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3424     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3425   } else if (ae == StrIntrinsicNode::UU) {
3426     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3427     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3428   } else {
3429     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3430     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3431   }
3432 }
3433 
3434 // Compare strings, used for char[] and byte[].
3435 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3436                                        Register cnt1, Register cnt2, Register result,
3437                                        XMMRegister vec1, int ae, KRegister mask) {
3438   ShortBranchVerifier sbv(this);
3439   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3440   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3441   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3442   int stride2x2 = 0x40;
3443   Address::ScaleFactor scale = Address::no_scale;
3444   Address::ScaleFactor scale1 = Address::no_scale;
3445   Address::ScaleFactor scale2 = Address::no_scale;
3446 
3447   if (ae != StrIntrinsicNode::LL) {
3448     stride2x2 = 0x20;
3449   }
3450 
3451   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3452     shrl(cnt2, 1);
3453   }
3454   // Compute the minimum of the string lengths, and push the
3455   // difference of the string lengths onto the stack.
3456   // The minimum is computed with a conditional move.
3457   movl(result, cnt1);
3458   subl(cnt1, cnt2);
3459   push(cnt1);
3460   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
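       // Here result holds the original cnt1, the pushed value is
       // cnt1 - cnt2 (the length difference), and cnt2 = min(cnt1, cnt2).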
3461 
3462   // Is the minimum length zero?
3463   testl(cnt2, cnt2);
3464   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3465   if (ae == StrIntrinsicNode::LL) {
3466     // Load first bytes
3467     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3468     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3469   } else if (ae == StrIntrinsicNode::UU) {
3470     // Load first characters
3471     load_unsigned_short(result, Address(str1, 0));
3472     load_unsigned_short(cnt1, Address(str2, 0));
3473   } else {
3474     load_unsigned_byte(result, Address(str1, 0));
3475     load_unsigned_short(cnt1, Address(str2, 0));
3476   }
3477   subl(result, cnt1);
3478   jcc(Assembler::notZero,  POP_LABEL);
3479 
3480   if (ae == StrIntrinsicNode::UU) {
3481     // Divide length by 2 to get number of chars
3482     shrl(cnt2, 1);
3483   }
3484   cmpl(cnt2, 1);
3485   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3486 
3487   // Check if the strings start at the same location and setup scale and stride
3488   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3489     cmpptr(str1, str2);
3490     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3491     if (ae == StrIntrinsicNode::LL) {
3492       scale = Address::times_1;
3493       stride = 16;
3494     } else {
3495       scale = Address::times_2;
3496       stride = 8;
3497     }
3498   } else {
3499     scale1 = Address::times_1;
3500     scale2 = Address::times_2;
3501     // scale not used
3502     stride = 8;
3503   }
3504 
3505   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3506     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3507     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3508     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3509     Label COMPARE_TAIL_LONG;
3510     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3511 
3512     int pcmpmask = 0x19;
3513     if (ae == StrIntrinsicNode::LL) {
3514       pcmpmask &= ~0x01;
3515     }
3516 
3517     // Setup to compare 16-char (32-byte) vectors,
3518     // start from first character again because it has aligned address.
3519     if (ae == StrIntrinsicNode::LL) {
3520       stride2 = 32;
3521     } else {
3522       stride2 = 16;
3523     }
3524     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3525       adr_stride = stride << scale;
3526     } else {
3527       adr_stride1 = 8;  //stride << scale1;
3528       adr_stride2 = 16; //stride << scale2;
3529     }
3530 
3531     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3532     // rax and rdx are used by pcmpestri as element counters
3533     movl(result, cnt2);
3534     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3535     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3536 
3537     // fast path : compare first 2 8-char vectors.
3538     bind(COMPARE_16_CHARS);
3539     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3540       movdqu(vec1, Address(str1, 0));
3541     } else {
3542       pmovzxbw(vec1, Address(str1, 0));
3543     }
3544     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3545     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3546 
3547     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3548       movdqu(vec1, Address(str1, adr_stride));
3549       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3550     } else {
3551       pmovzxbw(vec1, Address(str1, adr_stride1));
3552       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3553     }
3554     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3555     addl(cnt1, stride);
3556 
3557     // Compare the characters at index in cnt1
3558     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3559     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3560     subl(result, cnt2);
3561     jmp(POP_LABEL);
3562 
3563     // Setup the registers to start vector comparison loop
3564     bind(COMPARE_WIDE_VECTORS);
3565     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3566       lea(str1, Address(str1, result, scale));
3567       lea(str2, Address(str2, result, scale));
3568     } else {
3569       lea(str1, Address(str1, result, scale1));
3570       lea(str2, Address(str2, result, scale2));
3571     }
3572     subl(result, stride2);
3573     subl(cnt2, stride2);
3574     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3575     negptr(result);
3576 
3577     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
3578     bind(COMPARE_WIDE_VECTORS_LOOP);
3579 
3580 #ifdef _LP64
3581     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3582       cmpl(cnt2, stride2x2);
3583       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3584       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3585       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract 0x40
3586 
3587       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3588       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3589         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3590         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3591       } else {
3592         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3593         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3594       }
3595       kortestql(mask, mask);
3596       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3597       addptr(result, stride2x2);  // update since we already compared at this addr
3598       subl(cnt2, stride2x2);      // and sub the size too
3599       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3600 
3601       vpxor(vec1, vec1);
3602       jmpb(COMPARE_WIDE_TAIL);
3603     }//if (VM_Version::supports_avx512vlbw())
3604 #endif // _LP64
3605 
3606 
3607     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3608     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3609       vmovdqu(vec1, Address(str1, result, scale));
3610       vpxor(vec1, Address(str2, result, scale));
3611     } else {
3612       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3613       vpxor(vec1, Address(str2, result, scale2));
3614     }
3615     vptest(vec1, vec1);
3616     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3617     addptr(result, stride2);
3618     subl(cnt2, stride2);
3619     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3620     // clean upper bits of YMM registers
3621     vpxor(vec1, vec1);
3622 
3623     // compare wide vectors tail
3624     bind(COMPARE_WIDE_TAIL);
3625     testptr(result, result);
3626     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3627 
3628     movl(result, stride2);
3629     movl(cnt2, result);
3630     negptr(result);
3631     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3632 
3633     // Identify the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3634     bind(VECTOR_NOT_EQUAL);
3635     // clean upper bits of YMM registers
3636     vpxor(vec1, vec1);
3637     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3638       lea(str1, Address(str1, result, scale));
3639       lea(str2, Address(str2, result, scale));
3640     } else {
3641       lea(str1, Address(str1, result, scale1));
3642       lea(str2, Address(str2, result, scale2));
3643     }
3644     jmp(COMPARE_16_CHARS);
3645 
3646     // Compare tail chars, length between 1 and 15 chars
3647     bind(COMPARE_TAIL_LONG);
3648     movl(cnt2, result);
3649     cmpl(cnt2, stride);
3650     jcc(Assembler::less, COMPARE_SMALL_STR);
3651 
3652     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3653       movdqu(vec1, Address(str1, 0));
3654     } else {
3655       pmovzxbw(vec1, Address(str1, 0));
3656     }
3657     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3658     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3659     subptr(cnt2, stride);
3660     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3661     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3662       lea(str1, Address(str1, result, scale));
3663       lea(str2, Address(str2, result, scale));
3664     } else {
3665       lea(str1, Address(str1, result, scale1));
3666       lea(str2, Address(str2, result, scale2));
3667     }
3668     negptr(cnt2);
3669     jmpb(WHILE_HEAD_LABEL);
3670 
3671     bind(COMPARE_SMALL_STR);
3672   } else if (UseSSE42Intrinsics) {
3673     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3674     int pcmpmask = 0x19;
3675     // Setup to compare 8-char (16-byte) vectors,
3676     // start from first character again because it has aligned address.
3677     movl(result, cnt2);
3678     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3679     if (ae == StrIntrinsicNode::LL) {
3680       pcmpmask &= ~0x01;
3681     }
3682     jcc(Assembler::zero, COMPARE_TAIL);
3683     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3684       lea(str1, Address(str1, result, scale));
3685       lea(str2, Address(str2, result, scale));
3686     } else {
3687       lea(str1, Address(str1, result, scale1));
3688       lea(str2, Address(str2, result, scale2));
3689     }
3690     negptr(result);
3691 
3692     // pcmpestri
3693     //   inputs:
3694     //     vec1 - substring
3695     //     rax - negative string length (elements count)
3696     //     mem - scanned string
3697     //     rdx - string length (elements count)
3698     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3699     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3700     //   outputs:
3701     //     rcx - first mismatched element index
3702     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3703 
3704     bind(COMPARE_WIDE_VECTORS);
3705     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3706       movdqu(vec1, Address(str1, result, scale));
3707       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3708     } else {
3709       pmovzxbw(vec1, Address(str1, result, scale1));
3710       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3711     }
3712     // After pcmpestri cnt1(rcx) contains mismatched element index
3713 
3714     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3715     addptr(result, stride);
3716     subptr(cnt2, stride);
3717     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3718 
3719     // compare wide vectors tail
3720     testptr(result, result);
3721     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3722 
3723     movl(cnt2, stride);
3724     movl(result, stride);
3725     negptr(result);
3726     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3727       movdqu(vec1, Address(str1, result, scale));
3728       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3729     } else {
3730       pmovzxbw(vec1, Address(str1, result, scale1));
3731       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3732     }
3733     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3734 
3735     // Mismatched characters in the vectors
3736     bind(VECTOR_NOT_EQUAL);
3737     addptr(cnt1, result);
3738     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3739     subl(result, cnt2);
3740     jmpb(POP_LABEL);
3741 
3742     bind(COMPARE_TAIL); // limit is zero
3743     movl(cnt2, result);
3744     // Fallthru to tail compare
3745   }
3746   // Shift str2 and str1 to the end of the arrays, negate min
3747   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3748     lea(str1, Address(str1, cnt2, scale));
3749     lea(str2, Address(str2, cnt2, scale));
3750   } else {
3751     lea(str1, Address(str1, cnt2, scale1));
3752     lea(str2, Address(str2, cnt2, scale2));
3753   }
3754   decrementl(cnt2);  // first character was compared already
3755   negptr(cnt2);
3756 
3757   // Compare the rest of the elements
3758   bind(WHILE_HEAD_LABEL);
3759   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3760   subl(result, cnt1);
3761   jccb(Assembler::notZero, POP_LABEL);
3762   increment(cnt2);
3763   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3764 
3765   // Strings are equal up to min length.  Return the length difference.
3766   bind(LENGTH_DIFF_LABEL);
3767   pop(result);
3768   if (ae == StrIntrinsicNode::UU) {
3769     // Divide diff by 2 to get number of chars
3770     sarl(result, 1);
3771   }
3772   jmpb(DONE_LABEL);
3773 
3774 #ifdef _LP64
3775   if (VM_Version::supports_avx512vlbw()) {
3776 
3777     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3778 
3779     kmovql(cnt1, mask);
3780     notq(cnt1);
3781     bsfq(cnt2, cnt1);
3782     if (ae != StrIntrinsicNode::LL) {
3783       // Divide diff by 2 to get number of chars
3784       sarl(cnt2, 1);
3785     }
3786     addq(result, cnt2);
3787     if (ae == StrIntrinsicNode::LL) {
3788       load_unsigned_byte(cnt1, Address(str2, result));
3789       load_unsigned_byte(result, Address(str1, result));
3790     } else if (ae == StrIntrinsicNode::UU) {
3791       load_unsigned_short(cnt1, Address(str2, result, scale));
3792       load_unsigned_short(result, Address(str1, result, scale));
3793     } else {
3794       load_unsigned_short(cnt1, Address(str2, result, scale2));
3795       load_unsigned_byte(result, Address(str1, result, scale1));
3796     }
3797     subl(result, cnt1);
3798     jmpb(POP_LABEL);
3799   }//if (VM_Version::supports_avx512vlbw())
3800 #endif // _LP64
3801 
3802   // Discard the stored length difference
3803   bind(POP_LABEL);
3804   pop(cnt1);
3805 
3806   // That's it
3807   bind(DONE_LABEL);
3808   if(ae == StrIntrinsicNode::UL) {
3809     negl(result);
3810   }
3811 
3812 }
3813 
3814 // Search for a non-ASCII character (negative byte value) in a byte array,
3815 // and return the index of the first such character, otherwise the length
3816 // of the array segment searched.
3817 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3818 //   @IntrinsicCandidate
3819 //   public static int countPositives(byte[] ba, int off, int len) {
3820 //     for (int i = off; i < off + len; i++) {
3821 //       if (ba[i] < 0) {
3822 //         return i - off;
3823 //       }
3824 //     }
3825 //     return len;
3826 //   }
3827 void C2_MacroAssembler::count_positives(Register ary1, Register len,
3828   Register result, Register tmp1,
3829   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
3830   // rsi: byte array
3831   // rcx: len
3832   // rax: result
3833   ShortBranchVerifier sbv(this);
3834   assert_different_registers(ary1, len, result, tmp1);
3835   assert_different_registers(vec1, vec2);
3836   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
3837 
3838   movl(result, len); // copy
3839   // len == 0
3840   testl(len, len);
3841   jcc(Assembler::zero, DONE);
3842 
3843   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
3844     VM_Version::supports_avx512vlbw() &&
3845     VM_Version::supports_bmi2()) {
3846 
3847     Label test_64_loop, test_tail, BREAK_LOOP;
3848     Register tmp3_aliased = len;
3849 
3850     movl(tmp1, len);
3851     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
3852 
3853     andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
3854     andl(len, ~(64 - 1));    // vector count (in chars)
3855     jccb(Assembler::zero, test_tail);
3856 
3857     lea(ary1, Address(ary1, len, Address::times_1));
3858     negptr(len);
3859 
3860     bind(test_64_loop);
3861     // Check whether our 64 byte-sized elements contain negatives
3862     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
3863     kortestql(mask1, mask1);
3864     jcc(Assembler::notZero, BREAK_LOOP);
3865 
3866     addptr(len, 64);
3867     jccb(Assembler::notZero, test_64_loop);
3868 
3869     bind(test_tail);
3870     // bail out when there is nothing to be done
3871     testl(tmp1, -1);
3872     jcc(Assembler::zero, DONE);
3873 
3874     // ~(~0 << len) applied up to two times (for 32-bit scenario)
3875 #ifdef _LP64
3876     mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
3877     shlxq(tmp3_aliased, tmp3_aliased, tmp1);
3878     notq(tmp3_aliased);
3879     kmovql(mask2, tmp3_aliased);
3880 #else
3881     Label k_init;
3882     jmp(k_init);
3883 
3884     // We cannot read 64 bits from a general purpose register, so we move the
3885     // data required to compose 64 1's into the instruction stream.
3886     // We emit a 64-byte-wide series of elements 0..63, which is later used as
3887     // the compare target, with the tail count contained in the tmp1 register.
3888     // The result is a k register having tmp1 consecutive 1's, counting from
3889     // the least significant bit.
3890     address tmp = pc();
3891     emit_int64(0x0706050403020100);
3892     emit_int64(0x0F0E0D0C0B0A0908);
3893     emit_int64(0x1716151413121110);
3894     emit_int64(0x1F1E1D1C1B1A1918);
3895     emit_int64(0x2726252423222120);
3896     emit_int64(0x2F2E2D2C2B2A2928);
3897     emit_int64(0x3736353433323130);
3898     emit_int64(0x3F3E3D3C3B3A3938);
3899 
3900     bind(k_init);
3901     lea(len, InternalAddress(tmp));
3902     // create mask to test for negative byte inside a vector
3903     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
3904     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
3905 
3906 #endif
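         // mask2 now has its low tmp1 bits set (bit i is 1 iff i < tmp1), so the
         // masked compare below only looks at the tail bytes.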
3907     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
3908     ktestq(mask1, mask2);
3909     jcc(Assembler::zero, DONE);
3910 
3911     bind(BREAK_LOOP);
3912     // At least one byte in the last 64 bytes is negative.
3913     // Set up to look at the last 64 bytes as if they were a tail
3914     lea(ary1, Address(ary1, len, Address::times_1));
3915     addptr(result, len);
3916     // Ignore the very last byte: if all others are positive,
3917     // it must be negative, so we can skip right to the 2+1 byte
3918     // end comparison at this point
3919     orl(result, 63);
3920     movl(len, 63);
3921     // Fallthru to tail compare
3922   } else {
3923 
3924     if (UseAVX >= 2 && UseSSE >= 2) {
3925       // With AVX2, use 32-byte vector compare
3926       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
3927 
3928       // Compare 32-byte vectors
3929       testl(len, 0xffffffe0);   // vector count (in bytes)
3930       jccb(Assembler::zero, TAIL_START);
3931 
3932       andl(len, 0xffffffe0);
3933       lea(ary1, Address(ary1, len, Address::times_1));
3934       negptr(len);
3935 
3936       movl(tmp1, 0x80808080);   // create mask to test for negative bytes in vector
3937       movdl(vec2, tmp1);
3938       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
3939 
3940       bind(COMPARE_WIDE_VECTORS);
3941       vmovdqu(vec1, Address(ary1, len, Address::times_1));
3942       vptest(vec1, vec2);
3943       jccb(Assembler::notZero, BREAK_LOOP);
3944       addptr(len, 32);
3945       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3946 
3947       testl(result, 0x0000001f);   // any bytes remaining?
3948       jcc(Assembler::zero, DONE);
3949 
3950       // Quick test using the already prepared vector mask
3951       movl(len, result);
3952       andl(len, 0x0000001f);
3953       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
3954       vptest(vec1, vec2);
3955       jcc(Assembler::zero, DONE);
3956       // There are zeros, jump to the tail to determine exactly where
3957       jmpb(TAIL_START);
3958 
3959       bind(BREAK_LOOP);
3960       // At least one byte in the last 32-byte vector is negative.
3961       // Set up to look at the last 32 bytes as if they were a tail
3962       lea(ary1, Address(ary1, len, Address::times_1));
3963       addptr(result, len);
3964       // Ignore the very last byte: if all others are positive,
3965       // it must be negative, so we can skip right to the 2+1 byte
3966       // end comparison at this point
3967       orl(result, 31);
3968       movl(len, 31);
3969       // Fallthru to tail compare
3970     } else if (UseSSE42Intrinsics) {
3971       // With SSE4.2, use double quad vector compare
3972       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
3973 
3974       // Compare 16-byte vectors
3975       testl(len, 0xfffffff0);   // vector count (in bytes)
3976       jcc(Assembler::zero, TAIL_START);
3977 
3978       andl(len, 0xfffffff0);
3979       lea(ary1, Address(ary1, len, Address::times_1));
3980       negptr(len);
3981 
3982       movl(tmp1, 0x80808080);
3983       movdl(vec2, tmp1);
3984       pshufd(vec2, vec2, 0);
3985 
3986       bind(COMPARE_WIDE_VECTORS);
3987       movdqu(vec1, Address(ary1, len, Address::times_1));
3988       ptest(vec1, vec2);
3989       jccb(Assembler::notZero, BREAK_LOOP);
3990       addptr(len, 16);
3991       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3992 
3993       testl(result, 0x0000000f); // len is zero, any bytes remaining?
3994       jcc(Assembler::zero, DONE);
3995 
3996       // Quick test using the already prepared vector mask
3997       movl(len, result);
3998       andl(len, 0x0000000f);   // tail count (in bytes)
3999       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4000       ptest(vec1, vec2);
4001       jcc(Assembler::zero, DONE);
4002       jmpb(TAIL_START);
4003 
4004       bind(BREAK_LOOP);
4005       // At least one byte in the last 16-byte vector is negative.
4006       // Set up and look at the last 16 bytes as if they were a tail
4007       lea(ary1, Address(ary1, len, Address::times_1));
4008       addptr(result, len);
4009       // Ignore the very last byte: if all others are positive,
4010       // it must be negative, so we can skip right to the 2+1 byte
4011       // end comparison at this point
4012       orl(result, 15);
4013       movl(len, 15);
4014       // Fallthru to tail compare
4015     }
4016   }
4017 
4018   bind(TAIL_START);
4019   // Compare 4-byte vectors
4020   andl(len, 0xfffffffc); // vector count (in bytes)
4021   jccb(Assembler::zero, COMPARE_CHAR);
4022 
4023   lea(ary1, Address(ary1, len, Address::times_1));
4024   negptr(len);
4025 
4026   bind(COMPARE_VECTORS);
4027   movl(tmp1, Address(ary1, len, Address::times_1));
4028   andl(tmp1, 0x80808080);
4029   jccb(Assembler::notZero, TAIL_ADJUST);
4030   addptr(len, 4);
4031   jccb(Assembler::notZero, COMPARE_VECTORS);
4032 
4033   // Compare trailing char (final 2-3 bytes), if any
4034   bind(COMPARE_CHAR);
4035 
4036   testl(result, 0x2);   // tail  char
4037   jccb(Assembler::zero, COMPARE_BYTE);
4038   load_unsigned_short(tmp1, Address(ary1, 0));
4039   andl(tmp1, 0x00008080);
4040   jccb(Assembler::notZero, CHAR_ADJUST);
4041   lea(ary1, Address(ary1, 2));
4042 
4043   bind(COMPARE_BYTE);
4044   testl(result, 0x1);   // tail  byte
4045   jccb(Assembler::zero, DONE);
4046   load_unsigned_byte(tmp1, Address(ary1, 0));
4047   testl(tmp1, 0x00000080);
4048   jccb(Assembler::zero, DONE);
4049   subptr(result, 1);
4050   jmpb(DONE);
4051 
4052   bind(TAIL_ADJUST);
4053   // There are negative bytes in the last 4-byte block.
4054   // Adjust result and check the next three bytes
4055   addptr(result, len);
4056   orl(result, 3);
4057   lea(ary1, Address(ary1, len, Address::times_1));
4058   jmpb(COMPARE_CHAR);
4059 
4060   bind(CHAR_ADJUST);
4061   // We are looking at a char + optional byte tail, and found that one
4062   // of the bytes in the char is negative. Adjust the result, check the
4063   // first byte and readjust if needed.
4064   andl(result, 0xfffffffc);
4065   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4066   jccb(Assembler::notZero, DONE);
4067   addptr(result, 1);
4068 
4069   // That's it
4070   bind(DONE);
4071   if (UseAVX >= 2 && UseSSE >= 2) {
4072     // clean upper bits of YMM registers
4073     vpxor(vec1, vec1);
4074     vpxor(vec2, vec2);
4075   }
4076 }
4077 
4078 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
4079 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4080                                       Register limit, Register result, Register chr,
4081                                       XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
4082   ShortBranchVerifier sbv(this);
4083   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4084 
4085   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4086   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4087 
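       // For the array variant the checks run in this order (summary):
       // reference equality, null checks, length comparison, then the
       // element-wise content compare below.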
4088   if (is_array_equ) {
4089     // Check the input args
4090     cmpoop(ary1, ary2);
4091     jcc(Assembler::equal, TRUE_LABEL);
4092 
4093     // Need additional checks for arrays_equals.
4094     testptr(ary1, ary1);
4095     jcc(Assembler::zero, FALSE_LABEL);
4096     testptr(ary2, ary2);
4097     jcc(Assembler::zero, FALSE_LABEL);
4098 
4099     // Check the lengths
4100     movl(limit, Address(ary1, length_offset));
4101     cmpl(limit, Address(ary2, length_offset));
4102     jcc(Assembler::notEqual, FALSE_LABEL);
4103   }
4104 
4105   // count == 0
4106   testl(limit, limit);
4107   jcc(Assembler::zero, TRUE_LABEL);
4108 
4109   if (is_array_equ) {
4110     // Load array address
4111     lea(ary1, Address(ary1, base_offset));
4112     lea(ary2, Address(ary2, base_offset));
4113   }
4114 
4115   if (is_array_equ && is_char) {
4116     // arrays_equals when used for char[].
4117     shll(limit, 1);      // byte count != 0
4118   }
4119   movl(result, limit); // copy
4120 
4121   if (UseAVX >= 2) {
4122     // With AVX2, use 32-byte vector compare
4123     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4124 
4125     // Compare 32-byte vectors
4126     andl(result, 0x0000001f);  //   tail count (in bytes)
4127     andl(limit, 0xffffffe0);   // vector count (in bytes)
4128     jcc(Assembler::zero, COMPARE_TAIL);
4129 
4130     lea(ary1, Address(ary1, limit, Address::times_1));
4131     lea(ary2, Address(ary2, limit, Address::times_1));
4132     negptr(limit);
4133 
4134 #ifdef _LP64
4135     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4136       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4137 
4138       cmpl(limit, -64);
4139       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4140 
4141       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4142 
4143       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4144       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4145       kortestql(mask, mask);
4146       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4147       addptr(limit, 64);  // update since we already compared at this addr
4148       cmpl(limit, -64);
4149       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4150 
4151       // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instructions and just continue via the non-wide path:
4153       //  cmpl(limit, 0);
4154       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
4155       // But since we stopped at the points ary{1,2}+limit which are
4156       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4157       // (|limit| <= 32 and result < 32),
4158       // we may just compare the last 64 bytes.
4159       //
      addptr(result, -64);   // this is safe because we just compared bytes in this area
4161       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4162       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4163       kortestql(mask, mask);
4164       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4165 
4166       jmp(TRUE_LABEL);
4167 
4168       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4169 
4170     }//if (VM_Version::supports_avx512vlbw())
4171 #endif //_LP64
4172     bind(COMPARE_WIDE_VECTORS);
4173     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
4174     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4175     vpxor(vec1, vec2);
4176 
4177     vptest(vec1, vec1);
4178     jcc(Assembler::notZero, FALSE_LABEL);
4179     addptr(limit, 32);
4180     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4181 
4182     testl(result, result);
4183     jcc(Assembler::zero, TRUE_LABEL);
4184 
4185     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
4186     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4187     vpxor(vec1, vec2);
4188 
4189     vptest(vec1, vec1);
4190     jccb(Assembler::notZero, FALSE_LABEL);
4191     jmpb(TRUE_LABEL);
4192 
4193     bind(COMPARE_TAIL); // limit is zero
4194     movl(limit, result);
4195     // Fallthru to tail compare
4196   } else if (UseSSE42Intrinsics) {
4197     // With SSE4.2, use double quad vector compare
4198     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4199 
4200     // Compare 16-byte vectors
4201     andl(result, 0x0000000f);  //   tail count (in bytes)
4202     andl(limit, 0xfffffff0);   // vector count (in bytes)
4203     jcc(Assembler::zero, COMPARE_TAIL);
4204 
4205     lea(ary1, Address(ary1, limit, Address::times_1));
4206     lea(ary2, Address(ary2, limit, Address::times_1));
4207     negptr(limit);
4208 
4209     bind(COMPARE_WIDE_VECTORS);
4210     movdqu(vec1, Address(ary1, limit, Address::times_1));
4211     movdqu(vec2, Address(ary2, limit, Address::times_1));
4212     pxor(vec1, vec2);
4213 
4214     ptest(vec1, vec1);
4215     jcc(Assembler::notZero, FALSE_LABEL);
4216     addptr(limit, 16);
4217     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4218 
4219     testl(result, result);
4220     jcc(Assembler::zero, TRUE_LABEL);
4221 
4222     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4223     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4224     pxor(vec1, vec2);
4225 
4226     ptest(vec1, vec1);
4227     jccb(Assembler::notZero, FALSE_LABEL);
4228     jmpb(TRUE_LABEL);
4229 
4230     bind(COMPARE_TAIL); // limit is zero
4231     movl(limit, result);
4232     // Fallthru to tail compare
4233   }
4234 
4235   // Compare 4-byte vectors
4236   andl(limit, 0xfffffffc); // vector count (in bytes)
4237   jccb(Assembler::zero, COMPARE_CHAR);
4238 
4239   lea(ary1, Address(ary1, limit, Address::times_1));
4240   lea(ary2, Address(ary2, limit, Address::times_1));
4241   negptr(limit);
4242 
4243   bind(COMPARE_VECTORS);
4244   movl(chr, Address(ary1, limit, Address::times_1));
4245   cmpl(chr, Address(ary2, limit, Address::times_1));
4246   jccb(Assembler::notEqual, FALSE_LABEL);
4247   addptr(limit, 4);
4248   jcc(Assembler::notZero, COMPARE_VECTORS);
4249 
4250   // Compare trailing char (final 2 bytes), if any
4251   bind(COMPARE_CHAR);
4252   testl(result, 0x2);   // tail  char
4253   jccb(Assembler::zero, COMPARE_BYTE);
4254   load_unsigned_short(chr, Address(ary1, 0));
4255   load_unsigned_short(limit, Address(ary2, 0));
4256   cmpl(chr, limit);
4257   jccb(Assembler::notEqual, FALSE_LABEL);
4258 
4259   if (is_array_equ && is_char) {
4260     bind(COMPARE_BYTE);
4261   } else {
4262     lea(ary1, Address(ary1, 2));
4263     lea(ary2, Address(ary2, 2));
4264 
4265     bind(COMPARE_BYTE);
4266     testl(result, 0x1);   // tail  byte
4267     jccb(Assembler::zero, TRUE_LABEL);
4268     load_unsigned_byte(chr, Address(ary1, 0));
4269     load_unsigned_byte(limit, Address(ary2, 0));
4270     cmpl(chr, limit);
4271     jccb(Assembler::notEqual, FALSE_LABEL);
4272   }
4273   bind(TRUE_LABEL);
4274   movl(result, 1);   // return true
4275   jmpb(DONE);
4276 
4277   bind(FALSE_LABEL);
4278   xorl(result, result); // return false
4279 
4280   // That's it
4281   bind(DONE);
4282   if (UseAVX >= 2) {
4283     // clean upper bits of YMM registers
4284     vpxor(vec1, vec1);
4285     vpxor(vec2, vec2);
4286   }
4287 }
4288 
4289 #ifdef _LP64
4290 
4291 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4292 #define __ masm.
4293   Register dst = stub.data<0>();
4294   XMMRegister src = stub.data<1>();
4295   address target = stub.data<2>();
4296   __ bind(stub.entry());
4297   __ subptr(rsp, 8);
4298   __ movdbl(Address(rsp), src);
4299   __ call(RuntimeAddress(target));
4300   __ pop(dst);
4301   __ jmp(stub.continuation());
4302 #undef __
4303 }
4304 
4305 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4306   assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4307   assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4308 
4309   address slowpath_target;
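  // cvttss2si/cvttsd2si write the "integer indefinite" value (0x80000000 for 32-bit,
  // 0x8000000000000000 for 64-bit destinations) when the source is NaN or out of range,
  // so comparing the result against that sentinel catches every input that needs the
  // slow-path fixup stub.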
4310   if (dst_bt == T_INT) {
4311     if (src_bt == T_FLOAT) {
4312       cvttss2sil(dst, src);
4313       cmpl(dst, 0x80000000);
4314       slowpath_target = StubRoutines::x86::f2i_fixup();
4315     } else {
4316       cvttsd2sil(dst, src);
4317       cmpl(dst, 0x80000000);
4318       slowpath_target = StubRoutines::x86::d2i_fixup();
4319     }
4320   } else {
4321     if (src_bt == T_FLOAT) {
4322       cvttss2siq(dst, src);
4323       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4324       slowpath_target = StubRoutines::x86::f2l_fixup();
4325     } else {
4326       cvttsd2siq(dst, src);
4327       cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4328       slowpath_target = StubRoutines::x86::d2l_fixup();
4329     }
4330   }
4331 
4332   auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath);
4333   jcc(Assembler::equal, stub->entry());
4334   bind(stub->continuation());
4335 }
4336 
4337 #endif // _LP64
4338 
4339 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4340                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4341   switch(ideal_opc) {
4342     case Op_LShiftVS:
4343       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4344     case Op_LShiftVI:
4345       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4346     case Op_LShiftVL:
4347       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4348     case Op_RShiftVS:
4349       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4350     case Op_RShiftVI:
4351       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4352     case Op_RShiftVL:
4353       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4354     case Op_URShiftVS:
4355       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4356     case Op_URShiftVI:
4357       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4358     case Op_URShiftVL:
4359       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4360     case Op_RotateRightV:
4361       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4362     case Op_RotateLeftV:
4363       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4364     default:
4365       fatal("Unsupported masked operation"); break;
4366   }
4367 }
4368 
4369 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4370                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4371                                     bool is_varshift) {
4372   switch (ideal_opc) {
4373     case Op_AddVB:
4374       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4375     case Op_AddVS:
4376       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4377     case Op_AddVI:
4378       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4379     case Op_AddVL:
4380       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4381     case Op_AddVF:
4382       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4383     case Op_AddVD:
4384       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4385     case Op_SubVB:
4386       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4387     case Op_SubVS:
4388       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4389     case Op_SubVI:
4390       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4391     case Op_SubVL:
4392       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4393     case Op_SubVF:
4394       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4395     case Op_SubVD:
4396       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4397     case Op_MulVS:
4398       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4399     case Op_MulVI:
4400       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4401     case Op_MulVL:
4402       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4403     case Op_MulVF:
4404       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4405     case Op_MulVD:
4406       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4407     case Op_DivVF:
4408       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4409     case Op_DivVD:
4410       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4411     case Op_SqrtVF:
4412       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4413     case Op_SqrtVD:
4414       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4415     case Op_AbsVB:
4416       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4417     case Op_AbsVS:
4418       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4419     case Op_AbsVI:
4420       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4421     case Op_AbsVL:
4422       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4423     case Op_FmaVF:
4424       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4425     case Op_FmaVD:
4426       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4427     case Op_VectorRearrange:
4428       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4429     case Op_LShiftVS:
4430       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4431     case Op_LShiftVI:
4432       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4433     case Op_LShiftVL:
4434       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4435     case Op_RShiftVS:
4436       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4437     case Op_RShiftVI:
4438       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4439     case Op_RShiftVL:
4440       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4441     case Op_URShiftVS:
4442       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4443     case Op_URShiftVI:
4444       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4445     case Op_URShiftVL:
4446       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4447     case Op_RotateLeftV:
4448       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4449     case Op_RotateRightV:
4450       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4451     case Op_MaxV:
4452       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4453     case Op_MinV:
4454       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4455     case Op_XorV:
4456       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4457     case Op_OrV:
4458       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4459     case Op_AndV:
4460       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4461     default:
4462       fatal("Unsupported masked operation"); break;
4463   }
4464 }
4465 
4466 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4467                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4468   switch (ideal_opc) {
4469     case Op_AddVB:
4470       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4471     case Op_AddVS:
4472       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4473     case Op_AddVI:
4474       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4475     case Op_AddVL:
4476       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4477     case Op_AddVF:
4478       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4479     case Op_AddVD:
4480       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4481     case Op_SubVB:
4482       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4483     case Op_SubVS:
4484       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4485     case Op_SubVI:
4486       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4487     case Op_SubVL:
4488       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4489     case Op_SubVF:
4490       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4491     case Op_SubVD:
4492       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4493     case Op_MulVS:
4494       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4495     case Op_MulVI:
4496       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4497     case Op_MulVL:
4498       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4499     case Op_MulVF:
4500       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4501     case Op_MulVD:
4502       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4503     case Op_DivVF:
4504       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4505     case Op_DivVD:
4506       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4507     case Op_FmaVF:
4508       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4509     case Op_FmaVD:
4510       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4511     case Op_MaxV:
4512       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4513     case Op_MinV:
4514       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4515     case Op_XorV:
4516       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4517     case Op_OrV:
4518       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4519     case Op_AndV:
4520       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4521     default:
4522       fatal("Unsupported masked operation"); break;
4523   }
4524 }
4525 
4526 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4527                                   KRegister src1, KRegister src2) {
4528   BasicType etype = T_ILLEGAL;
4529   switch(mask_len) {
4530     case 2:
4531     case 4:
4532     case 8:  etype = T_BYTE; break;
4533     case 16: etype = T_SHORT; break;
4534     case 32: etype = T_INT; break;
4535     case 64: etype = T_LONG; break;
4536     default: fatal("Unsupported type"); break;
4537   }
4538   assert(etype != T_ILLEGAL, "");
4539   switch(ideal_opc) {
4540     case Op_AndVMask:
4541       kand(etype, dst, src1, src2); break;
4542     case Op_OrVMask:
4543       kor(etype, dst, src1, src2); break;
4544     case Op_XorVMask:
4545       kxor(etype, dst, src1, src2); break;
4546     default:
4547       fatal("Unsupported masked operation"); break;
4548   }
4549 }
4550 
4551 /*
 * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4553  * If src is NaN, the result is 0.
4554  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4555  * the result is equal to the value of Integer.MIN_VALUE.
4556  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4557  * the result is equal to the value of Integer.MAX_VALUE.
4558  */
4559 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4560                                                                    XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4561                                                                    Register rscratch, AddressLiteral float_sign_flip,
4562                                                                    int vec_enc) {
4563   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4564   Label done;
4565   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4566   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4567   vptest(xtmp2, xtmp2, vec_enc);
4568   jccb(Assembler::equal, done);
4569 
4570   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4571   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4572 
4573   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4574   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4575   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4576 
  // Recompute the mask for the remaining special values.
4578   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4579   // Extract SRC values corresponding to TRUE mask lanes.
4580   vpand(xtmp4, xtmp2, src, vec_enc);
4581   // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special
4582   // values are set.
4583   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4584 
4585   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4586   bind(done);
4587 }
4588 
4589 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4590                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4591                                                                     Register rscratch, AddressLiteral float_sign_flip,
4592                                                                     int vec_enc) {
4593   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4594   Label done;
4595   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4596   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4597   kortestwl(ktmp1, ktmp1);
4598   jccb(Assembler::equal, done);
4599 
4600   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4601   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4602   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4603 
4604   kxorwl(ktmp1, ktmp1, ktmp2);
4605   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4606   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4607   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4608   bind(done);
4609 }
4610 
4611 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4612                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4613                                                                      Register rscratch, AddressLiteral double_sign_flip,
4614                                                                      int vec_enc) {
4615   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4616 
4617   Label done;
4618   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4619   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
4620   kortestwl(ktmp1, ktmp1);
4621   jccb(Assembler::equal, done);
4622 
4623   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4624   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4625   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4626 
4627   kxorwl(ktmp1, ktmp1, ktmp2);
4628   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4629   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4630   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4631   bind(done);
4632 }
4633 
4634 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4635                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4636                                                                      Register rscratch, AddressLiteral float_sign_flip,
4637                                                                      int vec_enc) {
4638   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4639   Label done;
4640   evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4641   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4642   kortestwl(ktmp1, ktmp1);
4643   jccb(Assembler::equal, done);
4644 
4645   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4646   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4647   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4648 
4649   kxorwl(ktmp1, ktmp1, ktmp2);
4650   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4651   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4652   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4653   bind(done);
4654 }
4655 
4656 /*
 * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4658  * If src is NaN, the result is 0.
4659  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
4660  * the result is equal to the value of Long.MIN_VALUE.
4661  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
4662  * the result is equal to the value of Long.MAX_VALUE.
4663  */
4664 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4665                                                                       XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4666                                                                       Register rscratch, AddressLiteral double_sign_flip,
4667                                                                       int vec_enc) {
4668   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4669 
4670   Label done;
4671   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4672   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
4673   kortestwl(ktmp1, ktmp1);
4674   jccb(Assembler::equal, done);
4675 
4676   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4677   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4678   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4679 
4680   kxorwl(ktmp1, ktmp1, ktmp2);
4681   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4682   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4683   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4684   bind(done);
4685 }
4686 
4687 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
4688                                                              XMMRegister xtmp, int index, int vec_enc) {
  assert(vec_enc < Assembler::AVX_512bit, "");
  if (vec_enc == Assembler::AVX_256bit) {
    vextractf128_high(xtmp, src);
    vshufps(dst, src, xtmp, index, vec_enc);
  } else {
    vshufps(dst, src, zero, index, vec_enc);
  }
4696 }
4697 
4698 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4699                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
4700                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
4701   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4702 
4703   Label done;
4704   // Compare the destination lanes with float_sign_flip
4705   // value to get mask for all special values.
4706   movdqu(xtmp1, float_sign_flip, rscratch);
4707   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
4708   ptest(xtmp2, xtmp2);
4709   jccb(Assembler::equal, done);
4710 
4711   // Flip float_sign_flip to get max integer value.
4712   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
4713   pxor(xtmp1, xtmp4);
4714 
  // Set destination lanes corresponding to unordered source lanes to zero.
4716   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
4717   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
4718 
  // Shuffle the mask vector and pack the lower double word from each quadword lane.
4720   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
4721   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
4722 
  // Recompute the mask for the remaining special values.
4724   pxor(xtmp2, xtmp3);
4725   // Extract mask corresponding to non-negative source lanes.
4726   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
4727 
  // Shuffle the mask vector and pack the lower double word from each quadword lane.
4729   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
4730   pand(xtmp3, xtmp2);
4731 
  // Replace destination lanes holding the special value (0x80000000) with max int
  // if the corresponding source lane holds a +ve value.
4734   vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
4735   bind(done);
4736 }
4737 
4738 
4739 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
4740                                                    XMMRegister xtmp, Register rscratch, int vec_enc) {
4741   switch(to_elem_bt) {
4742     case T_SHORT:
4743       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
4744       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
4745       vpackusdw(dst, dst, zero, vec_enc);
4746       if (vec_enc == Assembler::AVX_256bit) {
4747         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
4748       }
4749       break;
4750     case  T_BYTE:
4751       assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
4752       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
4753       vpackusdw(dst, dst, zero, vec_enc);
4754       if (vec_enc == Assembler::AVX_256bit) {
4755         vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
4756       }
4757       vpackuswb(dst, dst, zero, vec_enc);
4758       break;
4759     default: assert(false, "%s", type2name(to_elem_bt));
4760   }
4761 }
4762 
4763 /*
 * Algorithm for vector D2L and F2I conversions:
 * a) Perform vector D2L/F2I cast.
 * b) Choose the fast path if none of the result vector lanes contains the 0x80000000 value,
 *    which signifies that the source value could be any of the special floating point
 *    values (NaN, -Inf, Inf, Max, -Min).
 * c) Set the destination to zero if the source is a NaN value.
 * d) Replace 0x80000000 with MaxInt if the source lane contains a +ve value.
4771  */
4772 
4773 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4774                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4775                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
4776   int to_elem_sz = type2aelembytes(to_elem_bt);
4777   assert(to_elem_sz <= 4, "");
4778   vcvttps2dq(dst, src, vec_enc);
4779   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
4780   if (to_elem_sz < 4) {
4781     vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4782     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
4783   }
4784 }
4785 
4786 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4787                                             XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
4788                                             Register rscratch, int vec_enc) {
4789   int to_elem_sz = type2aelembytes(to_elem_bt);
4790   assert(to_elem_sz <= 4, "");
4791   vcvttps2dq(dst, src, vec_enc);
4792   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
4793   switch(to_elem_bt) {
4794     case T_INT:
4795       break;
4796     case T_SHORT:
4797       evpmovdw(dst, dst, vec_enc);
4798       break;
4799     case T_BYTE:
4800       evpmovdb(dst, dst, vec_enc);
4801       break;
4802     default: assert(false, "%s", type2name(to_elem_bt));
4803   }
4804 }
4805 
4806 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4807                                             KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
4808                                             Register rscratch, int vec_enc) {
4809   evcvttps2qq(dst, src, vec_enc);
4810   vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
4811 }
4812 
4813 // Handling for downcasting from double to integer or sub-word types on AVX2.
4814 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4815                                            XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
4816                                            AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
4817   int to_elem_sz = type2aelembytes(to_elem_bt);
4818   assert(to_elem_sz < 8, "");
4819   vcvttpd2dq(dst, src, vec_enc);
4820   vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
4821                                               float_sign_flip, vec_enc);
4822   if (to_elem_sz < 4) {
4823     // xtmp4 holds all zero lanes.
4824     vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
4825   }
4826 }
4827 
4828 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
4829                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
4830                                             KRegister ktmp2, AddressLiteral sign_flip,
4831                                             Register rscratch, int vec_enc) {
4832   if (VM_Version::supports_avx512dq()) {
4833     evcvttpd2qq(dst, src, vec_enc);
4834     vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
4835     switch(to_elem_bt) {
4836       case T_LONG:
4837         break;
4838       case T_INT:
4839         evpmovsqd(dst, dst, vec_enc);
4840         break;
4841       case T_SHORT:
4842         evpmovsqd(dst, dst, vec_enc);
4843         evpmovdw(dst, dst, vec_enc);
4844         break;
4845       case T_BYTE:
4846         evpmovsqd(dst, dst, vec_enc);
4847         evpmovdb(dst, dst, vec_enc);
4848         break;
4849       default: assert(false, "%s", type2name(to_elem_bt));
4850     }
4851   } else {
4852     assert(type2aelembytes(to_elem_bt) <= 4, "");
4853     vcvttpd2dq(dst, src, vec_enc);
4854     vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
4855     switch(to_elem_bt) {
4856       case T_INT:
4857         break;
4858       case T_SHORT:
4859         evpmovdw(dst, dst, vec_enc);
4860         break;
4861       case T_BYTE:
4862         evpmovdb(dst, dst, vec_enc);
4863         break;
4864       default: assert(false, "%s", type2name(to_elem_bt));
4865     }
4866   }
4867 }
4868 
4869 #ifdef _LP64
4870 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
4871                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4872                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val + 0.5) operation with MXCSR.RC set to round towards -inf,
  // then restore the original MXCSR.RC mode afterwards.
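  // With RC = toward -inf the convert below behaves like floor(), so the sequence computes
  // floor(val + 0.5), e.g. (illustrative) 2.5 + 0.5 = 3.0 -> 3 and -2.5 + 0.5 = -2.0 -> -2.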
4875   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4876 
4877   mov64(tmp, julong_cast(0.5L));
4878   evpbroadcastq(xtmp1, tmp, vec_enc);
4879   vaddpd(xtmp1, src , xtmp1, vec_enc);
4880   evcvtpd2qq(dst, xtmp1, vec_enc);
4881   vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
                                                double_sign_flip, vec_enc);
4883 
4884   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4885 }
4886 
4887 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
4888                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4889                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val + 0.5) operation with MXCSR.RC set to round towards -inf,
  // then restore the original MXCSR.RC mode afterwards.
4892   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4893 
4894   movl(tmp, jint_cast(0.5));
4895   movq(xtmp1, tmp);
4896   vbroadcastss(xtmp1, xtmp1, vec_enc);
4897   vaddps(xtmp1, src , xtmp1, vec_enc);
4898   vcvtps2dq(dst, xtmp1, vec_enc);
4899   vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
4900                                               float_sign_flip, vec_enc);
4901 
4902   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4903 }
4904 
4905 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
4906                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4907                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
  // Perform the floor(val + 0.5) operation with MXCSR.RC set to round towards -inf,
  // then restore the original MXCSR.RC mode afterwards.
4910   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4911 
4912   movl(tmp, jint_cast(0.5));
4913   movq(xtmp1, tmp);
4914   vbroadcastss(xtmp1, xtmp1, vec_enc);
4915   vaddps(xtmp1, src , xtmp1, vec_enc);
4916   vcvtps2dq(dst, xtmp1, vec_enc);
4917   vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
4918 
4919   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4920 }
4921 #endif // _LP64
4922 
4923 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
4924                                              BasicType from_elem_bt, BasicType to_elem_bt) {
4925   switch (from_elem_bt) {
4926     case T_BYTE:
4927       switch (to_elem_bt) {
4928         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
4929         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
4930         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
4931         default: ShouldNotReachHere();
4932       }
4933       break;
4934     case T_SHORT:
4935       switch (to_elem_bt) {
4936         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
4937         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
4938         default: ShouldNotReachHere();
4939       }
4940       break;
4941     case T_INT:
4942       assert(to_elem_bt == T_LONG, "");
4943       vpmovzxdq(dst, src, vlen_enc);
4944       break;
4945     default:
4946       ShouldNotReachHere();
4947   }
4948 }
4949 
4950 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
4951                                            BasicType from_elem_bt, BasicType to_elem_bt) {
4952   switch (from_elem_bt) {
4953     case T_BYTE:
4954       switch (to_elem_bt) {
4955         case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
4956         case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
4957         case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
4958         default: ShouldNotReachHere();
4959       }
4960       break;
4961     case T_SHORT:
4962       switch (to_elem_bt) {
4963         case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
4964         case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
4965         default: ShouldNotReachHere();
4966       }
4967       break;
4968     case T_INT:
4969       assert(to_elem_bt == T_LONG, "");
4970       vpmovsxdq(dst, src, vlen_enc);
4971       break;
4972     default:
4973       ShouldNotReachHere();
4974   }
4975 }
4976 
4977 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
4978                                          BasicType dst_bt, BasicType src_bt, int vlen) {
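  // Mask lanes only ever hold 0 or -1, so sign-extending moves (upcast) and signed saturating
  // packs (downcast) preserve the lane values exactly; the vpermq(0x08) steps gather the packed
  // low quadwords of each 128-bit lane after the in-lane pack.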
4979   int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
4980   assert(vlen_enc != AVX_512bit, "");
4981 
4982   int dst_bt_size = type2aelembytes(dst_bt);
4983   int src_bt_size = type2aelembytes(src_bt);
4984   if (dst_bt_size > src_bt_size) {
4985     switch (dst_bt_size / src_bt_size) {
4986       case 2: vpmovsxbw(dst, src, vlen_enc); break;
4987       case 4: vpmovsxbd(dst, src, vlen_enc); break;
4988       case 8: vpmovsxbq(dst, src, vlen_enc); break;
4989       default: ShouldNotReachHere();
4990     }
4991   } else {
4992     assert(dst_bt_size < src_bt_size, "");
4993     switch (src_bt_size / dst_bt_size) {
4994       case 2: {
4995         if (vlen_enc == AVX_128bit) {
4996           vpacksswb(dst, src, src, vlen_enc);
4997         } else {
4998           vpacksswb(dst, src, src, vlen_enc);
4999           vpermq(dst, dst, 0x08, vlen_enc);
5000         }
5001         break;
5002       }
5003       case 4: {
5004         if (vlen_enc == AVX_128bit) {
5005           vpackssdw(dst, src, src, vlen_enc);
5006           vpacksswb(dst, dst, dst, vlen_enc);
5007         } else {
5008           vpackssdw(dst, src, src, vlen_enc);
5009           vpermq(dst, dst, 0x08, vlen_enc);
5010           vpacksswb(dst, dst, dst, AVX_128bit);
5011         }
5012         break;
5013       }
5014       case 8: {
5015         if (vlen_enc == AVX_128bit) {
5016           vpshufd(dst, src, 0x08, vlen_enc);
5017           vpackssdw(dst, dst, dst, vlen_enc);
5018           vpacksswb(dst, dst, dst, vlen_enc);
5019         } else {
5020           vpshufd(dst, src, 0x08, vlen_enc);
5021           vpermq(dst, dst, 0x08, vlen_enc);
5022           vpackssdw(dst, dst, dst, AVX_128bit);
5023           vpacksswb(dst, dst, dst, AVX_128bit);
5024         }
5025         break;
5026       }
5027       default: ShouldNotReachHere();
5028     }
5029   }
5030 }
5031 
5032 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5033                                    bool merge, BasicType bt, int vlen_enc) {
5034   if (bt == T_INT) {
5035     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5036   } else {
5037     assert(bt == T_LONG, "");
5038     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5039   }
5040 }
5041 
5042 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5043                                    bool merge, BasicType bt, int vlen_enc) {
5044   if (bt == T_INT) {
5045     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5046   } else {
5047     assert(bt == T_LONG, "");
5048     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5049   }
5050 }
5051 
5052 #ifdef _LP64
5053 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5054                                                Register rtmp2, XMMRegister xtmp, int mask_len,
5055                                                int vec_enc) {
5056   int index = 0;
5057   int vindex = 0;
5058   mov64(rtmp1, 0x0101010101010101L);
5059   pdepq(rtmp1, src, rtmp1);
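  // pdep deposits the low bits of src into the set-bit positions of the mask, so each of the
  // low 8 mask bits lands in the LSB of its own byte. Illustrative example: src = 0b1011
  // yields 0x0000000001000101, i.e. bytes {1, 1, 0, 1, 0, 0, 0, 0} from low to high.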
5060   if (mask_len > 8) {
5061     movq(rtmp2, src);
5062     vpxor(xtmp, xtmp, xtmp, vec_enc);
5063     movq(xtmp, rtmp1);
5064   }
5065   movq(dst, rtmp1);
5066 
5067   mask_len -= 8;
5068   while (mask_len > 0) {
5069     assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
5070     index++;
5071     if ((index % 2) == 0) {
5072       pxor(xtmp, xtmp);
5073     }
5074     mov64(rtmp1, 0x0101010101010101L);
5075     shrq(rtmp2, 8);
5076     pdepq(rtmp1, rtmp2, rtmp1);
5077     pinsrq(xtmp, rtmp1, index % 2);
5078     vindex = index / 2;
5079     if (vindex) {
      // Write the entire 16-byte vector only once both 64-bit
      // lanes have been updated, to save redundant instructions.
5082       if (index % 2) {
5083         vinsertf128(dst, dst, xtmp, vindex);
5084       }
5085     } else {
5086       vmovdqu(dst, xtmp);
5087     }
5088     mask_len -= 8;
5089   }
5090 }
5091 
5092 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5093   switch(opc) {
5094     case Op_VectorMaskTrueCount:
5095       popcntq(dst, tmp);
5096       break;
5097     case Op_VectorMaskLastTrue:
5098       if (VM_Version::supports_lzcnt()) {
5099         lzcntq(tmp, tmp);
5100         movl(dst, 63);
5101         subl(dst, tmp);
5102       } else {
5103         movl(dst, -1);
5104         bsrq(tmp, tmp);
5105         cmov32(Assembler::notZero, dst, tmp);
5106       }
5107       break;
5108     case Op_VectorMaskFirstTrue:
5109       if (VM_Version::supports_bmi1()) {
5110         if (masklen < 32) {
5111           orl(tmp, 1 << masklen);
5112           tzcntl(dst, tmp);
5113         } else if (masklen == 32) {
5114           tzcntl(dst, tmp);
5115         } else {
5116           assert(masklen == 64, "");
5117           tzcntq(dst, tmp);
5118         }
5119       } else {
5120         if (masklen < 32) {
5121           orl(tmp, 1 << masklen);
5122           bsfl(dst, tmp);
5123         } else {
5124           assert(masklen == 32 || masklen == 64, "");
5125           movl(dst, masklen);
5126           if (masklen == 32)  {
5127             bsfl(tmp, tmp);
5128           } else {
5129             bsfq(tmp, tmp);
5130           }
5131           cmov32(Assembler::notZero, dst, tmp);
5132         }
5133       }
5134       break;
5135     case Op_VectorMaskToLong:
5136       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5137       break;
5138     default: assert(false, "Unhandled mask operation");
5139   }
5140 }
5141 
5142 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5143                                               int masklen, int masksize, int vec_enc) {
5144   assert(VM_Version::supports_popcnt(), "");
5145 
  if (VM_Version::supports_avx512bw()) {
5147     kmovql(tmp, mask);
5148   } else {
5149     assert(masklen <= 16, "");
5150     kmovwl(tmp, mask);
5151   }
5152 
  // Masks generated out of partial vector comparisons/replicate/mask manipulation
  // operations need to be clipped.
5155   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5156     andq(tmp, (1 << masklen) - 1);
5157   }
5158 
5159   vector_mask_operation_helper(opc, dst, tmp, masklen);
5160 }
5161 
5162 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5163                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
5164   assert(vec_enc == AVX_128bit && VM_Version::supports_avx() ||
5165          vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4), "");
5166   assert(VM_Version::supports_popcnt(), "");
5167 
5168   bool need_clip = false;
5169   switch(bt) {
5170     case T_BOOLEAN:
      // While masks of other types contain 0 or -1, boolean masks contain lane values of 0 or 1,
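      // so negate each byte (0 - 1 = 0xFF) to move the lane value into the sign bit that
      // vpmovmskb collects.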
5172       vpxor(xtmp, xtmp, xtmp, vec_enc);
5173       vpsubb(xtmp, xtmp, mask, vec_enc);
5174       vpmovmskb(tmp, xtmp, vec_enc);
5175       need_clip = masklen < 16;
5176       break;
5177     case T_BYTE:
5178       vpmovmskb(tmp, mask, vec_enc);
5179       need_clip = masklen < 16;
5180       break;
5181     case T_SHORT:
5182       vpacksswb(xtmp, mask, mask, vec_enc);
5183       if (masklen >= 16) {
5184         vpermpd(xtmp, xtmp, 8, vec_enc);
5185       }
5186       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5187       need_clip = masklen < 16;
5188       break;
5189     case T_INT:
5190     case T_FLOAT:
5191       vmovmskps(tmp, mask, vec_enc);
5192       need_clip = masklen < 4;
5193       break;
5194     case T_LONG:
5195     case T_DOUBLE:
5196       vmovmskpd(tmp, mask, vec_enc);
5197       need_clip = masklen < 2;
5198       break;
5199     default: assert(false, "Unhandled type, %s", type2name(bt));
5200   }
5201 
  // Masks generated out of partial vector comparisons/replicate/mask manipulation
  // operations need to be clipped.
5204   if (need_clip && opc != Op_VectorMaskFirstTrue) {
5205     // need_clip implies masklen < 32
5206     andq(tmp, (1 << masklen) - 1);
5207   }
5208 
5209   vector_mask_operation_helper(opc, dst, tmp, masklen);
5210 }
5211 
5212 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5213                                              Register rtmp2, int mask_len) {
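  // Produce a mask with the low popcount(src) bits set: pext gathers one bit from an all-ones
  // source for every set bit of src. Illustrative example with mask_len = 4: src = 0b1010
  // gives pext(-1, 0b1010) = 0b0011, i.e. the two compressed lanes become the active ones.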
5214   kmov(rtmp1, src);
5215   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5216   mov64(rtmp2, -1L);
5217   pextq(rtmp2, rtmp2, rtmp1);
5218   kmov(dst, rtmp2);
5219 }
5220 
5221 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5222                                                bool merge, BasicType bt, int vec_enc) {
5223   if (opcode == Op_CompressV) {
5224     switch(bt) {
5225     case T_BYTE:
5226       evpcompressb(dst, mask, src, merge, vec_enc);
5227       break;
5228     case T_CHAR:
5229     case T_SHORT:
5230       evpcompressw(dst, mask, src, merge, vec_enc);
5231       break;
5232     case T_INT:
5233       evpcompressd(dst, mask, src, merge, vec_enc);
5234       break;
5235     case T_FLOAT:
5236       evcompressps(dst, mask, src, merge, vec_enc);
5237       break;
5238     case T_LONG:
5239       evpcompressq(dst, mask, src, merge, vec_enc);
5240       break;
5241     case T_DOUBLE:
5242       evcompresspd(dst, mask, src, merge, vec_enc);
5243       break;
5244     default:
5245       fatal("Unsupported type %s", type2name(bt));
5246       break;
5247     }
5248   } else {
5249     assert(opcode == Op_ExpandV, "");
5250     switch(bt) {
5251     case T_BYTE:
5252       evpexpandb(dst, mask, src, merge, vec_enc);
5253       break;
5254     case T_CHAR:
5255     case T_SHORT:
5256       evpexpandw(dst, mask, src, merge, vec_enc);
5257       break;
5258     case T_INT:
5259       evpexpandd(dst, mask, src, merge, vec_enc);
5260       break;
5261     case T_FLOAT:
5262       evexpandps(dst, mask, src, merge, vec_enc);
5263       break;
5264     case T_LONG:
5265       evpexpandq(dst, mask, src, merge, vec_enc);
5266       break;
5267     case T_DOUBLE:
5268       evexpandpd(dst, mask, src, merge, vec_enc);
5269       break;
5270     default:
5271       fatal("Unsupported type %s", type2name(bt));
5272       break;
5273     }
5274   }
5275 }
5276 #endif
5277 
5278 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5279                                            KRegister ktmp1, int vec_enc) {
5280   if (opcode == Op_SignumVD) {
5281     vsubpd(dst, zero, one, vec_enc);
5282     // if src < 0 ? -1 : 1
5283     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5284     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
5285     // if src == NaN, -0.0 or 0.0 return src.
5286     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5287     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5288   } else {
5289     assert(opcode == Op_SignumVF, "");
5290     vsubps(dst, zero, one, vec_enc);
5291     // if src < 0 ? -1 : 1
5292     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5293     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
5294     // if src == NaN, -0.0 or 0.0 return src.
5295     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5296     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5297   }
5298 }
5299 
5300 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5301                                           XMMRegister xtmp1, int vec_enc) {
5302   if (opcode == Op_SignumVD) {
5303     vsubpd(dst, zero, one, vec_enc);
5304     // if src < 0 ? -1 : 1
5305     vblendvpd(dst, one, dst, src, vec_enc);
5306     // if src == NaN, -0.0 or 0.0 return src.
5307     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5308     vblendvpd(dst, dst, src, xtmp1, vec_enc);
5309   } else {
5310     assert(opcode == Op_SignumVF, "");
5311     vsubps(dst, zero, one, vec_enc);
5312     // if src < 0 ? -1 : 1
5313     vblendvps(dst, one, dst, src, vec_enc);
5314     // if src == NaN, -0.0 or 0.0 return src.
5315     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5316     vblendvps(dst, dst, src, xtmp1, vec_enc);
5317   }
5318 }
5319 
5320 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5321   if (VM_Version::supports_avx512bw()) {
5322     if (mask_len > 32) {
5323       kmovql(dst, src);
5324     } else {
5325       kmovdl(dst, src);
5326       if (mask_len != 32) {
5327         kshiftrdl(dst, dst, 32 - mask_len);
5328       }
5329     }
5330   } else {
5331     assert(mask_len <= 16, "");
5332     kmovwl(dst, src);
5333     if (mask_len != 16) {
5334       kshiftrwl(dst, dst, 16 - mask_len);
5335     }
5336   }
5337 }
5338 
5339 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5340   int lane_size = type2aelembytes(bt);
5341   bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
5342   if ((is_LP64 || lane_size < 8) &&
5343       ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5344        (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
5345     movptr(rtmp, imm32);
5346     switch(lane_size) {
5347       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5348       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5349       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5350       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5351       fatal("Unsupported lane size %d", lane_size);
5352       break;
5353     }
5354   } else {
5355     movptr(rtmp, imm32);
5356     LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
5357     switch(lane_size) {
5358       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5359       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5360       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5361       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5362       fatal("Unsupported lane size %d", lane_size);
5363       break;
5364     }
5365   }
5366 }
5367 
5368 //
// The following is a lookup-table-based popcount computation algorithm:
5370 //       Index   Bit set count
5371 //     [ 0000 ->   0,
5372 //       0001 ->   1,
5373 //       0010 ->   1,
5374 //       0011 ->   2,
5375 //       0100 ->   1,
5376 //       0101 ->   2,
5377 //       0110 ->   2,
5378 //       0111 ->   3,
5379 //       1000 ->   1,
5380 //       1001 ->   2,
5381 //       1010 ->   3,
5382 //       1011 ->   3,
5383 //       1100 ->   2,
5384 //       1101 ->   3,
5385 //       1111 ->   4 ]
5386 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5387 //     shuffle indices for lookup table access.
5388 //  b. Right shift each byte of vector lane by 4 positions.
//  c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
5390 //     shuffle indices for lookup table access.
5391 //  d. Add the bitset count of upper and lower 4 bits of each byte.
//  e. Unpack double words to quad words with zero padding and compute the sum of absolute
//     differences against zero, which adds up the per-byte bit counts of each double word.
5394 //  f. Perform step e. for upper 128bit vector lane.
5395 //  g. Pack the bitset count of quadwords back to double word.
5396 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
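//
// A scalar sketch of steps a. to d. for a single byte (illustrative only, not emitted code):
//
//   static const uint8_t lut[16] = {0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4};
//   uint8_t count = lut[b & 0x0F] + lut[(b >> 4) & 0x0F];   // e.g. b = 0xF3 -> 2 + 4 = 6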
5397 
5398 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5399                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5400   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5401   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5402   vpsrlw(dst, src, 4, vec_enc);
5403   vpand(dst, dst, xtmp1, vec_enc);
5404   vpand(xtmp1, src, xtmp1, vec_enc);
5405   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5406   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5407   vpshufb(dst, xtmp2, dst, vec_enc);
5408   vpaddb(dst, dst, xtmp1, vec_enc);
5409 }
5410 
5411 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5412                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
5413   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // The following code corresponds to steps e, f, g and h of the above algorithm.
5415   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5416   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5417   vpsadbw(dst, dst, xtmp2, vec_enc);
5418   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5419   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5420   vpackuswb(dst, xtmp1, dst, vec_enc);
5421 }
5422 
5423 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5424                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5425   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5426   // Add the popcount of upper and lower bytes of word.
5427   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5428   vpsrlw(dst, xtmp1, 8, vec_enc);
5429   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5430   vpaddw(dst, dst, xtmp1, vec_enc);
5431 }
5432 
5433 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5434                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5435   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5436   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5437   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5438 }
5439 
5440 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5441                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5442   switch(bt) {
5443     case T_LONG:
5444       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5445       break;
5446     case T_INT:
5447       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5448       break;
5449     case T_CHAR:
5450     case T_SHORT:
5451       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5452       break;
5453     case T_BYTE:
5454     case T_BOOLEAN:
5455       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5456       break;
5457     default:
5458       fatal("Unsupported type %s", type2name(bt));
5459       break;
5460   }
5461 }
5462 
5463 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5464                                                       KRegister mask, bool merge, int vec_enc) {
5465   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5466   switch(bt) {
5467     case T_LONG:
5468       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5469       evpopcntq(dst, mask, src, merge, vec_enc);
5470       break;
5471     case T_INT:
5472       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5473       evpopcntd(dst, mask, src, merge, vec_enc);
5474       break;
5475     case T_CHAR:
5476     case T_SHORT:
5477       assert(VM_Version::supports_avx512_bitalg(), "");
5478       evpopcntw(dst, mask, src, merge, vec_enc);
5479       break;
5480     case T_BYTE:
5481     case T_BOOLEAN:
5482       assert(VM_Version::supports_avx512_bitalg(), "");
5483       evpopcntb(dst, mask, src, merge, vec_enc);
5484       break;
5485     default:
5486       fatal("Unsupported type %s", type2name(bt));
5487       break;
5488   }
5489 }
5490 
5491 #ifndef _LP64
5492 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
5493   assert(VM_Version::supports_avx512bw(), "");
5494   kmovdl(tmp, src);
5495   kunpckdql(dst, tmp, tmp);
5496 }
5497 #endif
5498 
5499 // The bit reversal algorithm first reverses the bits of each byte and then
5500 // performs a byte level reversal for multi-byte primitive types (short/int/long).
5501 // The algorithm uses a lookup table to obtain the reversed bit sequence
5502 // corresponding to a 4 bit value; the reversed bit sequence of a byte is
5503 // then obtained by swapping the reversed bit sequences of its upper and
5504 // lower nibbles.
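     //
     // For reference, a scalar sketch of the nibble lookup step (assuming plain C++),
     // mirroring what the vpshufb based lookups below do for every byte lane:
     //
     //   static const uint8_t REV4[16] = {0x0, 0x8, 0x4, 0xC, 0x2, 0xA, 0x6, 0xE,
     //                                    0x1, 0x9, 0x5, 0xD, 0x3, 0xB, 0x7, 0xF};
     //   uint8_t reverse_byte(uint8_t b) {               // hypothetical helper, illustration only
     //     return (uint8_t)((REV4[b & 0x0F] << 4) | REV4[b >> 4]);
     //   }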
5505 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5506                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
5507   if (VM_Version::supports_avx512vlbw()) {
5508 
5509     // Get the reverse bit sequence of lower nibble of each byte.
5510     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5511     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5512     evpandq(dst, xtmp2, src, vec_enc);
5513     vpshufb(dst, xtmp1, dst, vec_enc);
5514     vpsllq(dst, dst, 4, vec_enc);
5515 
5516     // Get the reverse bit sequence of upper nibble of each byte.
5517     vpandn(xtmp2, xtmp2, src, vec_enc);
5518     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5519     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5520 
5521     // OR the left shifted reversed bit sequence of the lower nibble with the right shifted
5522     // reversed bit sequence of the upper nibble to obtain the reversed bit sequence of each byte.
5523     evporq(xtmp2, dst, xtmp2, vec_enc);
5524     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5525 
5526   } else if (vec_enc == Assembler::AVX_512bit) {
5527     // Shift based bit reversal.
5528     assert(bt == T_LONG || bt == T_INT, "");
5529 
5530     // Swap lower and upper nibble of each byte.
5531     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
5532 
5533     // Swap two least and most significant bits of each nibble.
5534     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
5535 
5536     // Swap adjacent pair of bits.
5537     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5538     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
5539 
5540     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5541     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
5542   } else {
5543     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
5544     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5545 
5546     // Get the reverse bit sequence of lower nibble of each byte.
5547     vpand(dst, xtmp2, src, vec_enc);
5548     vpshufb(dst, xtmp1, dst, vec_enc);
5549     vpsllq(dst, dst, 4, vec_enc);
5550 
5551     // Get the reverse bit sequence of upper nibble of each byte.
5552     vpandn(xtmp2, xtmp2, src, vec_enc);
5553     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5554     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5555 
5556     // OR the left shifted reversed bit sequence of the lower nibble with the right shifted
5557     // reversed bit sequence of the upper nibble to obtain the reversed bit sequence of each byte.
5558     vpor(xtmp2, dst, xtmp2, vec_enc);
5559     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5560   }
5561 }
5562 
5563 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
5564                                                 XMMRegister xtmp, Register rscratch) {
5565   assert(VM_Version::supports_gfni(), "");
5566   assert(rscratch != noreg || always_reachable(mask), "missing");
5567 
5568   // Galois field instruction based bit reversal, as per the following algorithm:
5569   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5570   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
5571   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
5572   vector_reverse_byte(bt, dst, xtmp, vec_enc);
5573 }
5574 
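     // Swaps the two adjacent nbits-wide bit fields within every 2*nbits-wide group of each
     // lane. A scalar model of the computation below (for illustration only) is:
     //
     //   dst = ((src & bitmask) << nbits) | ((~bitmask & src) >> nbits)
     //
     // where bitmask selects the lower field of every pair, e.g. bitmask == 0x0F0F0F0F with
     // nbits == 4 swaps the two nibbles of every byte.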
5575 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
5576                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
5577   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
5578   evpandq(dst, xtmp1, src, vec_enc);
5579   vpsllq(dst, dst, nbits, vec_enc);
5580   vpandn(xtmp1, xtmp1, src, vec_enc);
5581   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
5582   evporq(dst, dst, xtmp1, vec_enc);
5583 }
5584 
5585 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5586                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5587   // Shift based bit reversal.
5588   assert(VM_Version::supports_evex(), "");
5589   switch(bt) {
5590     case T_LONG:
5591       // Swap upper and lower double word of each quad word.
5592       evprorq(xtmp1, k0, src, 32, true, vec_enc);
5593       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
5594       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5595       break;
5596     case T_INT:
5597       // Swap upper and lower word of each double word.
5598       evprord(xtmp1, k0, src, 16, true, vec_enc);
5599       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5600       break;
5601     case T_CHAR:
5602     case T_SHORT:
5603       // Swap upper and lower byte of each word.
5604       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
5605       break;
5606     case T_BYTE:
5607       evmovdquq(dst, k0, src, true, vec_enc);
5608       break;
5609     default:
5610       fatal("Unsupported type %s", type2name(bt));
5611       break;
5612   }
5613 }
5614 
5615 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5616   if (bt == T_BYTE) {
5617     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
5618       evmovdquq(dst, k0, src, true, vec_enc);
5619     } else {
5620       vmovdqu(dst, src);
5621     }
5622     return;
5623   }
5624   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
5625   // pre-computed shuffle indices.
5626   switch(bt) {
5627     case T_LONG:
5628       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
5629       break;
5630     case T_INT:
5631       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
5632       break;
5633     case T_CHAR:
5634     case T_SHORT:
5635       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
5636       break;
5637     default:
5638       fatal("Unsupported type %s", type2name(bt));
5639       break;
5640   }
5641   vpshufb(dst, src, dst, vec_enc);
5642 }
5643 
5644 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5645                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5646                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
5647   assert(is_integral_type(bt), "");
5648   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5649   assert(VM_Version::supports_avx512cd(), "");
5650   switch(bt) {
5651     case T_LONG:
5652       evplzcntq(dst, ktmp, src, merge, vec_enc);
5653       break;
5654     case T_INT:
5655       evplzcntd(dst, ktmp, src, merge, vec_enc);
5656       break;
5657     case T_SHORT:
5658       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
5659       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
5660       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
5661       vpunpckhwd(dst, xtmp1, src, vec_enc);
5662       evplzcntd(dst, ktmp, dst, merge, vec_enc);
5663       vpackusdw(dst, xtmp2, dst, vec_enc);
5664       break;
5665     case T_BYTE:
5666       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
5667       // accessing the lookup table.
5668       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
5669       // accessing the lookup table.
5670       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
5671       assert(VM_Version::supports_avx512bw(), "");
5672       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
5673       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
5674       vpand(xtmp2, dst, src, vec_enc);
5675       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5676       vpsrlw(xtmp3, src, 4, vec_enc);
5677       vpand(xtmp3, dst, xtmp3, vec_enc);
5678       vpshufb(dst, xtmp1, xtmp3, vec_enc);
5679       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
5680       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
5681       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
5682       break;
5683     default:
5684       fatal("Unsupported type %s", type2name(bt));
5685       break;
5686   }
5687 }
5688 
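     // For reference, a scalar sketch (assuming plain C++) of the nibble lookup table based
     // leading zero count used for T_BYTE in vector_count_leading_zeros_evex above and in
     // vector_count_leading_zeros_byte_avx below. The table holds the leading zero count of
     // each 4 bit value; the count of the lower nibble is added only when the upper nibble
     // is zero:
     //
     //   static const uint8_t CLZ4[16] = {4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0};
     //   int clz_byte(uint8_t b) {                       // hypothetical helper, illustration only
     //     int t2 = CLZ4[b >> 4];                        // leading zeros of the 4 MSB bits
     //     int t1 = CLZ4[b & 0x0F];                      // leading zeros of the 4 LSB bits
     //     return (b >> 4) == 0 ? t2 + t1 : t2;          // t2 == 4 when the MSB nibble is zero
     //   }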
5689 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5690                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5691   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
5692   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5693   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
5694   // accessing the lookup table.
5695   vpand(dst, xtmp2, src, vec_enc);
5696   vpshufb(dst, xtmp1, dst, vec_enc);
5697   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
5698   // accessing the lookup table.
5699   vpsrlw(xtmp3, src, 4, vec_enc);
5700   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
5701   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
5702   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
5703   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
5704   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
5705   vpaddb(dst, dst, xtmp2, vec_enc);
5706   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
5707 }
5708 
5709 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5710                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5711   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5712   // Add zero counts of lower byte and upper byte of a word if
5713   // upper byte holds a zero value.
5714   vpsrlw(xtmp3, src, 8, vec_enc);
5715   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
5716   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
5717   vpsllw(xtmp2, dst, 8, vec_enc);
5718   vpaddw(xtmp2, xtmp2, dst, vec_enc);
5719   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5720   vpsrlw(dst, dst, 8, vec_enc);
5721 }
5722 
5723 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5724                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
5725   // Since the IEEE 754 floating point format represents the mantissa in 1.x form,
5726   // the biased exponent of the float conversion can be used to compute the leading
5727   // zero count as per the following formula:
5728   // LZCNT = 32 - ((biased_exp - 127) + 1)
5729   // Special handling is needed for zero, Max_Int and negative source values.
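       //
       // A scalar sketch of the idea (assuming plain C++; memcpy from <cstring> is used to
       // reinterpret the float bits), valid when the int to float conversion is exact,
       // e.g. for 0 < x < 2^24; the code below additionally handles zero, Max_Int and
       // negative inputs:
       //
       //   int clz_via_float(uint32_t x) {               // hypothetical helper, illustration only
       //     float f = (float) x;
       //     uint32_t bits;
       //     memcpy(&bits, &f, sizeof(bits));            // reinterpret the float as raw bits
       //     int biased_exp = (bits >> 23) & 0xFF;       // extract the biased exponent
       //     return 32 - ((biased_exp - 127) + 1);
       //   }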
5730 
5731   // Broadcast 0xFF
5732   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
5733   vpsrld(xtmp1, xtmp1, 24, vec_enc);
5734 
5735   // Extract biased exponent.
5736   vcvtdq2ps(dst, src, vec_enc);
5737   vpsrld(dst, dst, 23, vec_enc);
5738   vpand(dst, dst, xtmp1, vec_enc);
5739 
5740   // Broadcast 127.
5741   vpsrld(xtmp1, xtmp1, 1, vec_enc);
5742   // Exponent = biased_exp - 127
5743   vpsubd(dst, dst, xtmp1, vec_enc);
5744 
5745   // Exponent = Exponent  + 1
5746   vpsrld(xtmp3, xtmp1, 6, vec_enc);
5747   vpaddd(dst, dst, xtmp3, vec_enc);
5748 
5749   // Replace -ve exponent with zero, exponent is -ve when src
5750   // lane contains a zero value.
5751   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5752   vblendvps(dst, dst, xtmp2, dst, vec_enc);
5753 
5754   // Rematerialize broadcast 32.
5755   vpslld(xtmp1, xtmp3, 5, vec_enc);
5756   // Exponent is 32 if corresponding source lane contains max_int value.
5757   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
5758   // LZCNT = 32 - exponent
5759   vpsubd(dst, xtmp1, dst, vec_enc);
5760 
5761   // Replace LZCNT with a value 1 if corresponding source lane
5762   // contains max_int value.
5763   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
5764 
5765   // Replace biased_exp with 0 if source lane value is less than zero.
5766   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5767   vblendvps(dst, dst, xtmp2, src, vec_enc);
5768 }
5769 
5770 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5771                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5772   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5773   // Add zero counts of lower word and upper word of a double word if
5774   // upper word holds a zero value.
5775   vpsrld(xtmp3, src, 16, vec_enc);
5776   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
5777   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
5778   vpslld(xtmp2, dst, 16, vec_enc);
5779   vpaddd(xtmp2, xtmp2, dst, vec_enc);
5780   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5781   vpsrld(dst, dst, 16, vec_enc);
5782   // Add zero counts of lower doubleword and upper doubleword of a
5783   // quadword if upper doubleword holds a zero value.
5784   vpsrlq(xtmp3, src, 32, vec_enc);
5785   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
5786   vpsllq(xtmp2, dst, 32, vec_enc);
5787   vpaddq(xtmp2, xtmp2, dst, vec_enc);
5788   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5789   vpsrlq(dst, dst, 32, vec_enc);
5790 }
5791 
5792 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
5793                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5794                                                        Register rtmp, int vec_enc) {
5795   assert(is_integral_type(bt), "unexpected type");
5796   assert(vec_enc < Assembler::AVX_512bit, "");
5797   switch(bt) {
5798     case T_LONG:
5799       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5800       break;
5801     case T_INT:
5802       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
5803       break;
5804     case T_SHORT:
5805       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5806       break;
5807     case T_BYTE:
5808       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5809       break;
5810     default:
5811       fatal("Unsupported type %s", type2name(bt));
5812       break;
5813   }
5814 }
5815 
5816 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
5817   switch(bt) {
5818     case T_BYTE:
5819       vpsubb(dst, src1, src2, vec_enc);
5820       break;
5821     case T_SHORT:
5822       vpsubw(dst, src1, src2, vec_enc);
5823       break;
5824     case T_INT:
5825       vpsubd(dst, src1, src2, vec_enc);
5826       break;
5827     case T_LONG:
5828       vpsubq(dst, src1, src2, vec_enc);
5829       break;
5830     default:
5831       fatal("Unsupported type %s", type2name(bt));
5832       break;
5833   }
5834 }
5835 
5836 // Trailing zero count computation is based on the leading zero count operation as per
5837 // the following equation. All AVX3 targets support the AVX512CD feature, which offers
5838 // a direct vector instruction to compute the leading zero count.
5839 //      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
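     //
     // A scalar sketch of the same equation for a 32 bit lane (assuming a GCC/Clang style
     // __builtin_clz for the scalar leading zero count); (x - 1) & ~x sets exactly the bits
     // below the lowest set bit of x, and all bits when x == 0:
     //
     //   int ctz32(uint32_t x) {                         // hypothetical helper, illustration only
     //     uint32_t m = (x - 1) & ~x;
     //     int clz = (m == 0) ? 32 : __builtin_clz(m);   // guard: __builtin_clz(0) is undefined
     //     return 32 - clz;
     //   }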
5840 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5841                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5842                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
5843   assert(is_integral_type(bt), "");
5844   // xtmp = -1
5845   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
5846   // xtmp = xtmp + src
5847   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
5848   // xtmp = xtmp & ~src
5849   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
5850   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
5851   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
5852   vpsub(bt, dst, xtmp4, dst, vec_enc);
5853 }
5854 
5855 // Trailing zero count computation for AVX2 targets is based on the popcount operation as per the following equation:
5856 //      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
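     //
     // A scalar sketch for a 32 bit lane (assuming a GCC/Clang style __builtin_popcount);
     // x | -x sets the lowest set bit of x and every bit above it, so its popcount is
     // 32 - CTZ:
     //
     //   int ctz32_popc(uint32_t x) {                    // hypothetical helper, illustration only
     //     return 32 - __builtin_popcount(x | (0u - x));
     //   }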
5857 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5858                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5859   assert(is_integral_type(bt), "");
5860   // xtmp = 0
5861   vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
5862   // xtmp = 0 - src
5863   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
5864   // xtmp = xtmp | src
5865   vpor(xtmp3, xtmp3, src, vec_enc);
5866   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
5867   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
5868   vpsub(bt, dst, xtmp1, dst, vec_enc);
5869 }
5870 
5871 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
5872   Label done;
5873   Label neg_divisor_fastpath;
5874   cmpl(divisor, 0);
5875   jccb(Assembler::less, neg_divisor_fastpath);
5876   xorl(rdx, rdx);
5877   divl(divisor);
5878   jmpb(done);
5879   bind(neg_divisor_fastpath);
5880   // Fastpath for divisor < 0:
5881   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
5882   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
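       //
       // A scalar sketch of this fastpath (assuming 32 bit unsigned arithmetic): when the
       // divisor has its sign bit set, the unsigned quotient can only be 0 or 1, and it is
       // 1 exactly when dividend >= divisor as unsigned values:
       //
       //   uint32_t q = (dividend & ~(dividend - divisor)) >> 31;   // logical shift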
5883   movl(rdx, rax);
5884   subl(rdx, divisor);
5885   if (VM_Version::supports_bmi1()) {
5886     andnl(rax, rdx, rax);
5887   } else {
5888     notl(rdx);
5889     andl(rax, rdx);
5890   }
5891   shrl(rax, 31);
5892   bind(done);
5893 }
5894 
5895 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
5896   Label done;
5897   Label neg_divisor_fastpath;
5898   cmpl(divisor, 0);
5899   jccb(Assembler::less, neg_divisor_fastpath);
5900   xorl(rdx, rdx);
5901   divl(divisor);
5902   jmpb(done);
5903   bind(neg_divisor_fastpath);
5904   // Fastpath when divisor < 0:
5905   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
5906   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
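       //
       // A scalar sketch of the remainder fastpath (assuming 32 bit unsigned arithmetic):
       // the arithmetic shift turns the 0/1 quotient into an all zeros / all ones mask,
       // which selects whether divisor is subtracted once:
       //
       //   uint32_t q_mask = (uint32_t)((int32_t)(dividend & ~(dividend - divisor)) >> 31);
       //   uint32_t r      = dividend - (q_mask & divisor);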
5907   movl(rdx, rax);
5908   subl(rax, divisor);
5909   if (VM_Version::supports_bmi1()) {
5910     andnl(rax, rax, rdx);
5911   } else {
5912     notl(rax);
5913     andl(rax, rdx);
5914   }
5915   sarl(rax, 31);
5916   andl(rax, divisor);
5917   subl(rdx, rax);
5918   bind(done);
5919 }
5920 
5921 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
5922   Label done;
5923   Label neg_divisor_fastpath;
5924 
5925   cmpl(divisor, 0);
5926   jccb(Assembler::less, neg_divisor_fastpath);
5927   xorl(rdx, rdx);
5928   divl(divisor);
5929   jmpb(done);
5930   bind(neg_divisor_fastpath);
5931   // Fastpath for divisor < 0:
5932   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
5933   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
5934   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
5935   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
5936   movl(rdx, rax);
5937   subl(rax, divisor);
5938   if (VM_Version::supports_bmi1()) {
5939     andnl(rax, rax, rdx);
5940   } else {
5941     notl(rax);
5942     andl(rax, rdx);
5943   }
5944   movl(tmp, rax);
5945   shrl(rax, 31); // quotient
5946   sarl(tmp, 31);
5947   andl(tmp, divisor);
5948   subl(rdx, tmp); // remainder
5949   bind(done);
5950 }
5951 
5952 #ifdef _LP64
5953 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
5954                                  XMMRegister xtmp2, Register rtmp) {
5955   if (VM_Version::supports_gfni()) {
5956     // Galois field instruction based bit reversal, as per the following algorithm:
5957     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5958     mov64(rtmp, 0x8040201008040201L);
5959     movq(xtmp1, src);
5960     movq(xtmp2, rtmp);
5961     gf2p8affineqb(xtmp1, xtmp2, 0);
5962     movq(dst, xtmp1);
5963   } else {
5964     // Swap even and odd numbered bits.
5965     movl(rtmp, src);
5966     andl(rtmp, 0x55555555);
5967     shll(rtmp, 1);
5968     movl(dst, src);
5969     andl(dst, 0xAAAAAAAA);
5970     shrl(dst, 1);
5971     orl(dst, rtmp);
5972 
5973     // Swap LSB and MSB 2 bits of each nibble.
5974     movl(rtmp, dst);
5975     andl(rtmp, 0x33333333);
5976     shll(rtmp, 2);
5977     andl(dst, 0xCCCCCCCC);
5978     shrl(dst, 2);
5979     orl(dst, rtmp);
5980 
5981     // Swap LSB and MSB 4 bits of each byte.
5982     movl(rtmp, dst);
5983     andl(rtmp, 0x0F0F0F0F);
5984     shll(rtmp, 4);
5985     andl(dst, 0xF0F0F0F0);
5986     shrl(dst, 4);
5987     orl(dst, rtmp);
5988   }
5989   bswapl(dst);
5990 }
5991 
5992 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
5993                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
5994   if (VM_Version::supports_gfni()) {
5995     // Galois field instruction based bit reversal, as per the following algorithm:
5996     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5997     mov64(rtmp1, 0x8040201008040201L);
5998     movq(xtmp1, src);
5999     movq(xtmp2, rtmp1);
6000     gf2p8affineqb(xtmp1, xtmp2, 0);
6001     movq(dst, xtmp1);
6002   } else {
6003     // Swap even and odd numbered bits.
6004     movq(rtmp1, src);
6005     mov64(rtmp2, 0x5555555555555555L);
6006     andq(rtmp1, rtmp2);
6007     shlq(rtmp1, 1);
6008     movq(dst, src);
6009     notq(rtmp2);
6010     andq(dst, rtmp2);
6011     shrq(dst, 1);
6012     orq(dst, rtmp1);
6013 
6014     // Swap LSB and MSB 2 bits of each nibble.
6015     movq(rtmp1, dst);
6016     mov64(rtmp2, 0x3333333333333333L);
6017     andq(rtmp1, rtmp2);
6018     shlq(rtmp1, 2);
6019     notq(rtmp2);
6020     andq(dst, rtmp2);
6021     shrq(dst, 2);
6022     orq(dst, rtmp1);
6023 
6024     // Swap LSB and MSB 4 bits of each byte.
6025     movq(rtmp1, dst);
6026     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6027     andq(rtmp1, rtmp2);
6028     shlq(rtmp1, 4);
6029     notq(rtmp2);
6030     andq(dst, rtmp2);
6031     shrq(dst, 4);
6032     orq(dst, rtmp1);
6033   }
6034   bswapq(dst);
6035 }
6036 
6037 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6038   Label done;
6039   Label neg_divisor_fastpath;
6040   cmpq(divisor, 0);
6041   jccb(Assembler::less, neg_divisor_fastpath);
6042   xorl(rdx, rdx);
6043   divq(divisor);
6044   jmpb(done);
6045   bind(neg_divisor_fastpath);
6046   // Fastpath for divisor < 0:
6047   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6048   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6049   movq(rdx, rax);
6050   subq(rdx, divisor);
6051   if (VM_Version::supports_bmi1()) {
6052     andnq(rax, rdx, rax);
6053   } else {
6054     notq(rdx);
6055     andq(rax, rdx);
6056   }
6057   shrq(rax, 63);
6058   bind(done);
6059 }
6060 
6061 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6062   Label done;
6063   Label neg_divisor_fastpath;
6064   cmpq(divisor, 0);
6065   jccb(Assembler::less, neg_divisor_fastpath);
6066   xorq(rdx, rdx);
6067   divq(divisor);
6068   jmp(done);
6069   bind(neg_divisor_fastpath);
6070   // Fastpath when divisor < 0:
6071   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6072   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6073   movq(rdx, rax);
6074   subq(rax, divisor);
6075   if (VM_Version::supports_bmi1()) {
6076     andnq(rax, rax, rdx);
6077   } else {
6078     notq(rax);
6079     andq(rax, rdx);
6080   }
6081   sarq(rax, 63);
6082   andq(rax, divisor);
6083   subq(rdx, rax);
6084   bind(done);
6085 }
6086 
6087 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6088   Label done;
6089   Label neg_divisor_fastpath;
6090   cmpq(divisor, 0);
6091   jccb(Assembler::less, neg_divisor_fastpath);
6092   xorq(rdx, rdx);
6093   divq(divisor);
6094   jmp(done);
6095   bind(neg_divisor_fastpath);
6096   // Fastpath for divisor < 0:
6097   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6098   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6099   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6100   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6101   movq(rdx, rax);
6102   subq(rax, divisor);
6103   if (VM_Version::supports_bmi1()) {
6104     andnq(rax, rax, rdx);
6105   } else {
6106     notq(rax);
6107     andq(rax, rdx);
6108   }
6109   movq(tmp, rax);
6110   shrq(rax, 63); // quotient
6111   sarq(tmp, 63);
6112   andq(tmp, divisor);
6113   subq(rdx, tmp); // remainder
6114   bind(done);
6115 }
6116 #endif
6117 
6118 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6119                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6120                                         int vlen_enc) {
6121   assert(VM_Version::supports_avx512bw(), "");
6122   // Byte shuffles are in-lane operations and the indices are determined using
6123   // the lower 4 bits of each shuffle lane, thus all shuffle indices are
6124   // effectively normalized to the index range 0-15. As a result, indices that
6125   // differ by a multiple of 16 select the same relative position within a
6126   // 128 bit lane, i.e. shuffle indices 16, 32 and 48 all select the 0th
6127   // element of their respective 128 bit lanes.
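       //
       // A scalar model (for illustration, assuming a 512 bit vector and shuffle indices in
       // the range 0..63) of what the four masked shuffle rounds below compute, i.e. a
       // cross lane byte rearrange:
       //
       //   for (int i = 0; i < 64; i++) {
       //     dst[i] = src[shuffle[i] & 0x3F];
       //   }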
6128   movl(rtmp, 16);
6129   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6130 
6131   // Compute a mask for the shuffle vector by comparing indices with the expression INDEX < 16.
6132   // Broadcast the first 128 bit lane across the entire vector, shuffle the vector lanes using
6133   // the original shuffle indices and move the shuffled lanes corresponding to true
6134   // mask bits to the destination vector.
6135   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6136   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6137   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6138 
6139   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6140   // and broadcasting second 128 bit lane.
6141   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6142   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6143   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6144   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6145   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6146 
6147   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6148   // and broadcasting third 128 bit lane.
6149   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
6150   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6151   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
6152   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6153   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6154 
6155   // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
6156   // and broadcasting fourth 128 bit lane.
6157   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
6158   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6159   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
6160   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6161   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6162 }
6163 
6164 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6165                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6166   if (vlen_enc == AVX_128bit) {
6167     vpermilps(dst, src, shuffle, vlen_enc);
6168   } else if (bt == T_INT) {
6169     vpermd(dst, shuffle, src, vlen_enc);
6170   } else {
6171     assert(bt == T_FLOAT, "");
6172     vpermps(dst, shuffle, src, vlen_enc);
6173   }
6174 }