1 /*
   2  * Copyright (c) 2020, 2022, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "gc/shared/barrierSet.hpp"
  29 #include "gc/shared/barrierSetAssembler.hpp"
  30 #include "oops/methodData.hpp"
  31 #include "opto/c2_MacroAssembler.hpp"
  32 #include "opto/intrinsicnode.hpp"
  33 #include "opto/output.hpp"
  34 #include "opto/opcodes.hpp"
  35 #include "opto/subnode.hpp"
  36 #include "runtime/objectMonitor.hpp"
  37 #include "runtime/stubRoutines.hpp"
  38 
  39 #ifdef PRODUCT
  40 #define BLOCK_COMMENT(str) /* nothing */
  41 #define STOP(error) stop(error)
  42 #else
  43 #define BLOCK_COMMENT(str) block_comment(str)
  44 #define STOP(error) block_comment(error); stop(error)
  45 #endif
  46 
  47 // C2 compiled method's prolog code.
  48 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
  49 
  50   // WARNING: Initial instruction MUST be 5 bytes or longer so that
  51   // NativeJump::patch_verified_entry will be able to patch out the entry
  52   // code safely. The push to verify stack depth is ok at 5 bytes,
  53   // the frame allocation can be either 3 or 6 bytes. So if we don't do
  54   // stack bang then we must use the 6 byte frame allocation even if
  55   // we have no frame. :-(
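  // (For reference: on 32-bit, "sub esp, imm8" encodes in 3 bytes and
  // "sub esp, imm32" in 6 bytes; the REX-prefixed 64-bit forms are one byte
  // longer each.  subptr_imm32 below forces the imm32 form so that the
  // patchable entry region stays at least 5 bytes.)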
  56   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  57 
  58   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  59   // Remove word for return addr
  60   framesize -= wordSize;
  61   stack_bang_size -= wordSize;
  62 
  63   // Calls to C2R adapters often do not accept exceptional returns.
  64   // We require that their callers must bang for them.  But be careful, because
  65   // some VM calls (such as call site linkage) can use several kilobytes of
  // stack.  The stack safety zone should account for that.
  67   // See bugs 4446381, 4468289, 4497237.
  68   if (stack_bang_size > 0) {
  69     generate_stack_overflow_check(stack_bang_size);
  70 
    // We always push rbp so that on return to the interpreter rbp will be
    // restored correctly and we can correct the stack.
  73     push(rbp);
  74     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  75     if (PreserveFramePointer) {
  76       mov(rbp, rsp);
  77     }
  78     // Remove word for ebp
  79     framesize -= wordSize;
  80 
  81     // Create frame
  82     if (framesize) {
  83       subptr(rsp, framesize);
  84     }
  85   } else {
  86     // Create frame (force generation of a 4 byte immediate value)
  87     subptr_imm32(rsp, framesize);
  88 
  89     // Save RBP register now.
  90     framesize -= wordSize;
  91     movptr(Address(rsp, framesize), rbp);
  92     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  93     if (PreserveFramePointer) {
  94       movptr(rbp, rsp);
  95       if (framesize > 0) {
  96         addptr(rbp, framesize);
  97       }
  98     }
  99   }
 100 
 101   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
 102     framesize -= wordSize;
 103     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 104   }
 105 
 106 #ifndef _LP64
 107   // If method sets FPU control word do it now
 108   if (fp_mode_24b) {
 109     fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
 110   }
 111   if (UseSSE >= 2 && VerifyFPU) {
 112     verify_FPU(0, "FPU stack must be clean on entry");
 113   }
 114 #endif
 115 
 116 #ifdef ASSERT
 117   if (VerifyStackAtCalls) {
 118     Label L;
 119     push(rax);
 120     mov(rax, rsp);
 121     andptr(rax, StackAlignmentInBytes-1);
 122     cmpptr(rax, StackAlignmentInBytes-wordSize);
 123     pop(rax);
 124     jcc(Assembler::equal, L);
 125     STOP("Stack is not properly aligned!");
 126     bind(L);
 127   }
 128 #endif
 129 
 130   if (!is_stub) {
 131     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
#ifdef _LP64
 133     if (BarrierSet::barrier_set()->barrier_set_nmethod() != NULL) {
 134       // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 135       Label dummy_slow_path;
 136       Label dummy_continuation;
 137       Label* slow_path = &dummy_slow_path;
 138       Label* continuation = &dummy_continuation;
 139       if (!Compile::current()->output()->in_scratch_emit_size()) {
 140         // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 141         C2EntryBarrierStub* stub = Compile::current()->output()->entry_barrier_table()->add_entry_barrier();
 142         slow_path = &stub->slow_path();
 143         continuation = &stub->continuation();
 144       }
 145       bs->nmethod_entry_barrier(this, slow_path, continuation);
 146     }
 147 #else
 148     // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
 149     bs->nmethod_entry_barrier(this, NULL /* slow_path */, NULL /* continuation */);
 150 #endif
 151   }
 152 }
 153 
 154 void C2_MacroAssembler::emit_entry_barrier_stub(C2EntryBarrierStub* stub) {
 155   bind(stub->slow_path());
 156   call(RuntimeAddress(StubRoutines::x86::method_entry_barrier()));
 157   jmp(stub->continuation(), false /* maybe_short */);
 158 }
 159 
 160 int C2_MacroAssembler::entry_barrier_stub_size() {
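  // Size budget for the code emitted by emit_entry_barrier_stub above:
  // presumably a 5-byte call rel32 plus a 5-byte (deliberately non-short) jmp rel32.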
 161   return 10;
 162 }
 163 
 164 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 165   switch (vlen_in_bytes) {
 166     case  4: // fall-through
 167     case  8: // fall-through
 168     case 16: return Assembler::AVX_128bit;
 169     case 32: return Assembler::AVX_256bit;
 170     case 64: return Assembler::AVX_512bit;
 171 
 172     default: {
 173       ShouldNotReachHere();
 174       return Assembler::AVX_NoVec;
 175     }
 176   }
 177 }
 178 
 179 #if INCLUDE_RTM_OPT
 180 
 181 // Update rtm_counters based on abort status
 182 // input: abort_status
 183 //        rtm_counters (RTMLockingCounters*)
 184 // flags are killed
 185 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
 186 
 187   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
 188   if (PrintPreciseRTMLockingStatistics) {
 189     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
 190       Label check_abort;
 191       testl(abort_status, (1<<i));
 192       jccb(Assembler::equal, check_abort);
 193       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
 194       bind(check_abort);
 195     }
 196   }
 197 }
 198 
// Branch if ((random & (count-1)) != 0); count must be a power of two (2^n)
 200 // tmp, scr and flags are killed
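// Example: with count == 64 (callers pass RTMTotalCountIncrRate), the low six TSC
// bits are non-zero for roughly 63 of every 64 calls, so the branch is taken and the
// caller's counter update is skipped that often; this is a cheap sampling scheme,
// assuming the low TSC bits are close to uniformly distributed.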
 201 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
 202   assert(tmp == rax, "");
 203   assert(scr == rdx, "");
 204   rdtsc(); // modifies EDX:EAX
 205   andptr(tmp, count-1);
 206   jccb(Assembler::notZero, brLabel);
 207 }
 208 
 209 // Perform abort ratio calculation, set no_rtm bit if high ratio
 210 // input:  rtm_counters_Reg (RTMLockingCounters* address)
 211 // tmpReg, rtm_counters_Reg and flags are killed
 212 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
 213                                                     Register rtm_counters_Reg,
 214                                                     RTMLockingCounters* rtm_counters,
 215                                                     Metadata* method_data) {
 216   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
 217 
 218   if (RTMLockingCalculationDelay > 0) {
 219     // Delay calculation
 220     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()));
 221     testptr(tmpReg, tmpReg);
 222     jccb(Assembler::equal, L_done);
 223   }
 224   // Abort ratio calculation only if abort_count > RTMAbortThreshold
 225   //   Aborted transactions = abort_count * 100
 226   //   All transactions = total_count *  RTMTotalCountIncrRate
 227   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
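  //
  // Worked example, assuming RTMAbortRatio == 50 and RTMTotalCountIncrRate == 64
  // (see the RTM flag definitions for the actual defaults): total_count is only
  // incremented on ~1/64 of transactions, so total_count * 64 estimates the true
  // transaction count, and the no_rtm bit is set once
  //   abort_count * 100 >= (total_count * 64) * 50,
  // i.e. once roughly half of all transactions are aborting.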
 228 
 229   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
 230   cmpptr(tmpReg, RTMAbortThreshold);
 231   jccb(Assembler::below, L_check_always_rtm2);
 232   imulptr(tmpReg, tmpReg, 100);
 233 
 234   Register scrReg = rtm_counters_Reg;
 235   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 236   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
 237   imulptr(scrReg, scrReg, RTMAbortRatio);
 238   cmpptr(tmpReg, scrReg);
 239   jccb(Assembler::below, L_check_always_rtm1);
 240   if (method_data != NULL) {
 241     // set rtm_state to "no rtm" in MDO
 242     mov_metadata(tmpReg, method_data);
 243     lock();
 244     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
 245   }
 246   jmpb(L_done);
 247   bind(L_check_always_rtm1);
 248   // Reload RTMLockingCounters* address
 249   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 250   bind(L_check_always_rtm2);
 251   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 252   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
 253   jccb(Assembler::below, L_done);
 254   if (method_data != NULL) {
 255     // set rtm_state to "always rtm" in MDO
 256     mov_metadata(tmpReg, method_data);
 257     lock();
 258     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
 259   }
 260   bind(L_done);
 261 }
 262 
 263 // Update counters and perform abort ratio calculation
 264 // input:  abort_status_Reg
 265 // rtm_counters_Reg, flags are killed
 266 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
 267                                       Register rtm_counters_Reg,
 268                                       RTMLockingCounters* rtm_counters,
 269                                       Metadata* method_data,
 270                                       bool profile_rtm) {
 271 
 272   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 273   // update rtm counters based on rax value at abort
 274   // reads abort_status_Reg, updates flags
 275   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 276   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
 277   if (profile_rtm) {
 278     // Save abort status because abort_status_Reg is used by following code.
 279     if (RTMRetryCount > 0) {
 280       push(abort_status_Reg);
 281     }
 282     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 283     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
 284     // restore abort status
 285     if (RTMRetryCount > 0) {
 286       pop(abort_status_Reg);
 287     }
 288   }
 289 }
 290 
 291 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
 292 // inputs: retry_count_Reg
 293 //       : abort_status_Reg
 294 // output: retry_count_Reg decremented by 1
 295 // flags are killed
 296 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
 297   Label doneRetry;
 298   assert(abort_status_Reg == rax, "");
 299   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
 300   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
 301   // if reason is in 0x6 and retry count != 0 then retry
 302   andptr(abort_status_Reg, 0x6);
 303   jccb(Assembler::zero, doneRetry);
 304   testl(retry_count_Reg, retry_count_Reg);
 305   jccb(Assembler::zero, doneRetry);
 306   pause();
 307   decrementl(retry_count_Reg);
 308   jmp(retryLabel);
 309   bind(doneRetry);
 310 }
 311 
 312 // Spin and retry if lock is busy,
 313 // inputs: box_Reg (monitor address)
 314 //       : retry_count_Reg
 315 // output: retry_count_Reg decremented by 1
 316 //       : clear z flag if retry count exceeded
 317 // tmp_Reg, scr_Reg, flags are killed
 318 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
 319                                                Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
 320   Label SpinLoop, SpinExit, doneRetry;
 321   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
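  // Note: OM_OFFSET_NO_MONITOR_VALUE_TAG folds the -markWord::monitor_value untagging
  // into the field offset, so the tagged ObjectMonitor* held in box_Reg can be used
  // directly as the base address below.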
 322 
 323   testl(retry_count_Reg, retry_count_Reg);
 324   jccb(Assembler::zero, doneRetry);
 325   decrementl(retry_count_Reg);
 326   movptr(scr_Reg, RTMSpinLoopCount);
 327 
 328   bind(SpinLoop);
 329   pause();
 330   decrementl(scr_Reg);
 331   jccb(Assembler::lessEqual, SpinExit);
 332   movptr(tmp_Reg, Address(box_Reg, owner_offset));
 333   testptr(tmp_Reg, tmp_Reg);
 334   jccb(Assembler::notZero, SpinLoop);
 335 
 336   bind(SpinExit);
 337   jmp(retryLabel);
 338   bind(doneRetry);
 339   incrementl(retry_count_Reg); // clear z flag
 340 }
 341 
 342 // Use RTM for normal stack locks
 343 // Input: objReg (object to lock)
 344 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
 345                                          Register retry_on_abort_count_Reg,
 346                                          RTMLockingCounters* stack_rtm_counters,
 347                                          Metadata* method_data, bool profile_rtm,
 348                                          Label& DONE_LABEL, Label& IsInflated) {
 349   assert(UseRTMForStackLocks, "why call this otherwise?");
 350   assert(tmpReg == rax, "");
 351   assert(scrReg == rdx, "");
 352   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 353 
 354   if (RTMRetryCount > 0) {
 355     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 356     bind(L_rtm_retry);
 357   }
 358   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 359   testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral
 360   jcc(Assembler::notZero, IsInflated);
 361 
 362   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 363     Label L_noincrement;
 364     if (RTMTotalCountIncrRate > 1) {
 365       // tmpReg, scrReg and flags are killed
 366       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 367     }
 368     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
 369     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
 370     bind(L_noincrement);
 371   }
 372   xbegin(L_on_abort);
 373   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
 374   andptr(tmpReg, markWord::lock_mask_in_place);     // look at 2 lock bits
 375   cmpptr(tmpReg, markWord::unlocked_value);         // bits = 01 unlocked
 376   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
 377 
 378   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 379   if (UseRTMXendForLockBusy) {
 380     xend();
 381     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
 382     jmp(L_decrement_retry);
 383   }
 384   else {
 385     xabort(0);
 386   }
 387   bind(L_on_abort);
 388   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 389     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
 390   }
 391   bind(L_decrement_retry);
 392   if (RTMRetryCount > 0) {
 393     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 394     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 395   }
 396 }
 397 
 398 // Use RTM for inflating locks
 399 // inputs: objReg (object to lock)
 400 //         boxReg (on-stack box address (displaced header location) - KILLED)
 401 //         tmpReg (ObjectMonitor address + markWord::monitor_value)
 402 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
 403                                             Register scrReg, Register retry_on_busy_count_Reg,
 404                                             Register retry_on_abort_count_Reg,
 405                                             RTMLockingCounters* rtm_counters,
 406                                             Metadata* method_data, bool profile_rtm,
 407                                             Label& DONE_LABEL) {
 408   assert(UseRTMLocking, "why call this otherwise?");
 409   assert(tmpReg == rax, "");
 410   assert(scrReg == rdx, "");
 411   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 412   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 413 
 414   movptr(boxReg, tmpReg); // Save ObjectMonitor address
 415 
 416   if (RTMRetryCount > 0) {
 417     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
 418     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 419     bind(L_rtm_retry);
 420   }
 421   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 422     Label L_noincrement;
 423     if (RTMTotalCountIncrRate > 1) {
 424       // tmpReg, scrReg and flags are killed
 425       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 426     }
 427     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 428     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
 429     bind(L_noincrement);
 430   }
 431   xbegin(L_on_abort);
 432   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 433   movptr(tmpReg, Address(tmpReg, owner_offset));
 434   testptr(tmpReg, tmpReg);
 435   jcc(Assembler::zero, DONE_LABEL);
 436   if (UseRTMXendForLockBusy) {
 437     xend();
 438     jmp(L_decrement_retry);
 439   }
 440   else {
 441     xabort(0);
 442   }
 443   bind(L_on_abort);
 444   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 445   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 446     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
 447   }
 448   if (RTMRetryCount > 0) {
 449     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 450     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 451   }
 452 
  movptr(tmpReg, Address(boxReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jccb(Assembler::notZero, L_decrement_retry);
 456 
 457   // Appears unlocked - try to swing _owner from null to non-null.
 458   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 459 #ifdef _LP64
 460   Register threadReg = r15_thread;
 461 #else
 462   get_thread(scrReg);
 463   Register threadReg = scrReg;
 464 #endif
 465   lock();
 466   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
 467 
 468   if (RTMRetryCount > 0) {
 469     // success done else retry
    jccb(Assembler::equal, DONE_LABEL);
 471     bind(L_decrement_retry);
 472     // Spin and retry if lock is busy.
 473     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
 474   }
 475   else {
 476     bind(L_decrement_retry);
 477   }
 478 }
 479 
 480 #endif //  INCLUDE_RTM_OPT
 481 
 482 // fast_lock and fast_unlock used by C2
 483 
 484 // Because the transitions from emitted code to the runtime
 485 // monitorenter/exit helper stubs are so slow it's critical that
 486 // we inline both the stack-locking fast path and the inflated fast path.
 487 //
 488 // See also: cmpFastLock and cmpFastUnlock.
 489 //
 490 // What follows is a specialized inline transliteration of the code
 491 // in enter() and exit(). If we're concerned about I$ bloat another
 492 // option would be to emit TrySlowEnter and TrySlowExit methods
 493 // at startup-time.  These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 495 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 496 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 497 // In practice, however, the # of lock sites is bounded and is usually small.
 498 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP, since
// the helper routines would be called from multiple synchronization sites.
 502 //
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
 504 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 505 // to those specialized methods.  That'd give us a mostly platform-independent
 506 // implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
 508 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 509 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 510 // (b) explicit barriers or fence operations.
 511 //
 512 // TODO:
 513 //
 514 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 515 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 516 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 517 //    the lock operators would typically be faster than reifying Self.
 518 //
 519 // *  Ideally I'd define the primitives as:
 520 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 521 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 522 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with rather awkward and brittle register assignments below.
 524 //    Furthermore the register assignments are overconstrained, possibly resulting in
 525 //    sub-optimal code near the synchronization site.
 526 //
 527 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 528 //    Alternately, use a better sp-proximity test.
 529 //
 530 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 531 //    Either one is sufficient to uniquely identify a thread.
 532 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 533 //
 534 // *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty, avoiding
//    the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 537 //
 538 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 539 //    But beware of excessive branch density on AMD Opterons.
 540 //
 541 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 542 //    or failure of the fast path.  If the fast path fails then we pass
 543 //    control to the slow path, typically in C.  In fast_lock and
 544 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 545 //    will emit a conditional branch immediately after the node.
 546 //    So we have branches to branches and lots of ICC.ZF games.
 547 //    Instead, it might be better to have C2 pass a "FailureLabel"
 548 //    into fast_lock and fast_unlock.  In the case of success, control
 549 //    will drop through the node.  ICC.ZF is undefined at exit.
 550 //    In the case of failure, the node will branch directly to the
 551 //    FailureLabel
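//
// A rough sketch of how C2 consumes this ZF protocol (the real matching lives in
// cmpFastLock/cmpFastUnlock in the .ad files; the shape below is illustrative only):
//
//   fast_lock(obj, box, rax, scr, ...)   // leaves ZF == 1 on success, ZF == 0 on failure
//   jne   slow_case                      // ZF == 0 -> call the runtime monitorenter helper
//   ...                                  // ZF == 1 -> lock acquired inline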
 552 
 553 
 554 // obj: object to lock
 555 // box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
 557 // scr: tmp -- KILLED
 558 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 559                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
 560                                  RTMLockingCounters* rtm_counters,
 561                                  RTMLockingCounters* stack_rtm_counters,
 562                                  Metadata* method_data,
 563                                  bool use_rtm, bool profile_rtm) {
 564   // Ensure the register assignments are disjoint
 565   assert(tmpReg == rax, "");
 566 
 567   if (use_rtm) {
 568     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
 569   } else {
 570     assert(cx2Reg == noreg, "");
 571     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg);
 572   }
 573 
 574   // Possible cases that we'll encounter in fast_lock
 575   // ------------------------------------------------
 576   // * Inflated
 577   //    -- unlocked
 578   //    -- Locked
 579   //       = by self
 580   //       = by other
 581   // * neutral
 582   // * stack-locked
 583   //    -- by self
 584   //       = sp-proximity test hits
 585   //       = sp-proximity test generates false-negative
 586   //    -- by other
 587   //
 588 
 589   Label IsInflated, DONE_LABEL, slow_path, NO_COUNT, COUNT;
 590 
 591   if (DiagnoseSyncOnValueBasedClasses != 0) {
 592     load_klass(tmpReg, objReg, cx1Reg);
 593     movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
 594     testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
 595     jcc(Assembler::notZero, DONE_LABEL);
 596   }
 597 
 598 #if INCLUDE_RTM_OPT
 599   if (UseRTMForStackLocks && use_rtm) {
 600     assert(!UseHeavyMonitors, "+UseHeavyMonitors and +UseRTMForStackLocks are mutually exclusive");
 601     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
 602                       stack_rtm_counters, method_data, profile_rtm,
 603                       DONE_LABEL, IsInflated);
 604   }
 605 #endif // INCLUDE_RTM_OPT
 606 
 607   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 608   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 609   jccb(Assembler::notZero, IsInflated);
 610 
 611   if (!UseHeavyMonitors) {
 612     fast_lock_impl(objReg, tmpReg, thread, scrReg, cx1Reg, slow_path);
 613     xorptr(rax, rax); // Set ZF = 1 (success)
 614     jmp(COUNT);
 615   }
 616   bind(slow_path);
 617   // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 618   testptr(objReg, objReg);
 619   jmp(DONE_LABEL);
 620 
 621   bind(IsInflated);
  // The object is inflated. tmpReg contains the ObjectMonitor* + markWord::monitor_value
 623 
 624 #if INCLUDE_RTM_OPT
 625   // Use the same RTM locking code in 32- and 64-bit VM.
 626   if (use_rtm) {
 627     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
 628                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
 629   } else {
 630 #endif // INCLUDE_RTM_OPT
 631 
 632 #ifndef _LP64
 633   // The object is inflated.
 634 
 635   // boxReg refers to the on-stack BasicLock in the current frame.
 636   // We'd like to write:
 637   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
  // This is convenient but results in a ST-before-CAS penalty.  The following CAS suffers
 639   // additional latency as we have another ST in the store buffer that must drain.
 640 
 641   // avoid ST-before-CAS
 642   // register juggle because we need tmpReg for cmpxchgptr below
 643   movptr(scrReg, boxReg);
 644   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 645 
 646   // Optimistic form: consider XORL tmpReg,tmpReg
 647   movptr(tmpReg, NULL_WORD);
 648 
 649   // Appears unlocked - try to swing _owner from null to non-null.
 650   // Ideally, I'd manifest "Self" with get_thread and then attempt
 651   // to CAS the register containing Self into m->Owner.
 652   // But we don't have enough registers, so instead we can either try to CAS
 653   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 654   // we later store "Self" into m->Owner.  Transiently storing a stack address
 655   // (rsp or the address of the box) into  m->owner is harmless.
 656   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 657   lock();
 658   cmpxchgptr(thread, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 659   // If we weren't able to swing _owner from NULL to the thread
 660   // then take the slow path.
 661   jccb  (Assembler::notZero, NO_COUNT);
 662   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
 663 
 664   // If the CAS fails we can either retry or pass control to the slow path.
 665   // We use the latter tactic.
 666   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 667   // If the CAS was successful ...
 668   //   Self has acquired the lock
 669   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 670   // Intentional fall-through into DONE_LABEL ...
 671 #else // _LP64
 672   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 673   movq(scrReg, tmpReg);
 674   xorq(tmpReg, tmpReg);
 675   lock();
 676   cmpxchgptr(thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 677   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 678   jccb(Assembler::equal, COUNT);          // CAS above succeeded; propagate ZF = 1 (success)
 679 
 680   cmpptr(thread, rax);                     // Check if we are already the owner (recursive lock)
 681   jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
 682   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 683   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 684 #endif // _LP64
 685 #if INCLUDE_RTM_OPT
 686   } // use_rtm()
 687 #endif
 688   // DONE_LABEL is a hot target - we'd really like to place it at the
 689   // start of cache line by padding with NOPs.
 690   // See the AMD and Intel software optimization manuals for the
 691   // most efficient "long" NOP encodings.
 692   // Unfortunately none of our alignment mechanisms suffice.
 693   bind(DONE_LABEL);
 694 
 695   // ZFlag == 1 count in fast path
 696   // ZFlag == 0 count in slow path
 697   jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0
 698 
 699   bind(COUNT);
 700   // Count monitors in fast path
 701 #ifndef _LP64
 702   get_thread(tmpReg);
 703   incrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
 704 #else // _LP64
 705   incrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 706 #endif
 707 
 708   xorl(tmpReg, tmpReg); // Set ZF == 1
 709 
 710   bind(NO_COUNT);
 711 
 712   // At NO_COUNT the icc ZFlag is set as follows ...
 713   // fast_unlock uses the same protocol.
 714   // ZFlag == 1 -> Success
 715   // ZFlag == 0 -> Failure - force control through the slow path
 716 }
 717 
 718 // obj: object to unlock
 719 // box: box address (displaced header location), killed.  Must be EAX.
 720 // tmp: killed, cannot be obj nor box.
 721 //
 722 // Some commentary on balanced locking:
 723 //
 724 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 725 // Methods that don't have provably balanced locking are forced to run in the
 726 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 727 // The interpreter provides two properties:
 728 // I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame).  Recall that the
 730 //      interpreter maintains an on-stack list of locks currently held by
 731 //      a frame.
 732 // I2:  If a method attempts to unlock an object that is not held by the
//      frame, the interpreter throws IMSX.
 734 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
 736 // B() doesn't have provably balanced locking so it runs in the interpreter.
 737 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 738 // is still locked by A().
 739 //
 740 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 741 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 742 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 743 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
 744 // Arguably given that the spec legislates the JNI case as undefined our implementation
 745 // could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
 747 // A perfectly viable alternative is to elide the owner check except when
 748 // Xcheck:jni is enabled.
 749 
 750 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
 751   assert(boxReg == rax, "");
 752   assert_different_registers(objReg, boxReg, tmpReg);
 753 
 754   Label DONE_LABEL, Stacked, CheckSucc, COUNT, NO_COUNT;
 755 
 756 #if INCLUDE_RTM_OPT
 757   if (UseRTMForStackLocks && use_rtm) {
 758     assert(!UseHeavyMonitors, "+UseHeavyMonitors and +UseRTMForStackLocks are mutually exclusive");
 759     Label L_regular_unlock;
 760     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
 761     andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
 762     cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
 763     jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
 764     xend();                                                           // otherwise end...
 765     jmp(DONE_LABEL);                                                  // ... and we're done
 766     bind(L_regular_unlock);
 767   }
 768 #endif
 769 
 770   movptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
 771   if (!UseHeavyMonitors) {
 772     testptr(boxReg, markWord::monitor_value);
 773     jcc(Assembler::zero, Stacked);
 774 
 775     // If the owner is ANONYMOUS, we need to fix it - in the slow-path.
 776     Label L;
 777     cmpptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t) (intptr_t) ANONYMOUS_OWNER);
 778     jccb(Assembler::notEqual, L);
 779     testptr(objReg, objReg); // Clear ZF to indicate failure at DONE_LABEL.
 780     jmp(DONE_LABEL);
 781     bind(L);
 782   }
 783 
 784   // It's inflated.
 785 #if INCLUDE_RTM_OPT
 786   if (use_rtm) {
 787     Label L_regular_inflated_unlock;
 788     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 789     movptr(tmpReg, Address(boxReg, owner_offset));
 790     testptr(tmpReg, tmpReg);
 791     jccb(Assembler::notZero, L_regular_inflated_unlock);
 792     xend();
 793     jmp(DONE_LABEL);
 794     bind(L_regular_inflated_unlock);
 795   }
 796 #endif
 797 
 798   // Despite our balanced locking property we still check that m->_owner == Self
 799   // as java routines or native JNI code called by this thread might
 800   // have released the lock.
 801   // Refer to the comments in synchronizer.cpp for how we might encode extra
 802   // state in _succ so we can avoid fetching EntryList|cxq.
 803   //
 804   // If there's no contention try a 1-0 exit.  That is, exit without
 805   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 806   // we detect and recover from the race that the 1-0 exit admits.
 807   //
 808   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 809   // before it STs null into _owner, releasing the lock.  Updates
 810   // to data protected by the critical section must be visible before
 811   // we drop the lock (and thus before any other thread could acquire
 812   // the lock and observe the fields protected by the lock).
  // IA32's memory model keeps stores in program order (TSO), so STs are ordered with respect to
 814   // each other and there's no need for an explicit barrier (fence).
 815   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
 816 #ifndef _LP64
 817   // Note that we could employ various encoding schemes to reduce
 818   // the number of loads below (currently 4) to just 2 or 3.
 819   // Refer to the comments in synchronizer.cpp.
 820   // In practice the chain of fetches doesn't seem to impact performance, however.
 821   xorptr(tmpReg, tmpReg);
 822   orptr(tmpReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 823   jccb  (Assembler::notZero, DONE_LABEL);
 824   movptr(tmpReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 825   orptr(tmpReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 826   jccb  (Assembler::notZero, DONE_LABEL);
 827   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 828   jmpb  (DONE_LABEL);
 829 #else // _LP64
 830   // It's inflated
 831   Label LNotRecursive, LSuccess, LGoSlowPath;
 832 
 833   cmpptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 834   jccb(Assembler::equal, LNotRecursive);
 835 
 836   // Recursive inflated unlock
 837   decq(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 838   jmpb(LSuccess);
 839 
 840   bind(LNotRecursive);
 841   movptr(tmpReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 842   orptr(tmpReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 843   jccb  (Assembler::notZero, CheckSucc);
 844   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 845   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
 846   jmpb  (DONE_LABEL);
 847 
 848   // Try to avoid passing control into the slow_path ...
 849   bind  (CheckSucc);
 850 
 851   // The following optional optimization can be elided if necessary
 852   // Effectively: if (succ == null) goto slow path
 853   // The code reduces the window for a race, however,
 854   // and thus benefits performance.
 855   cmpptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
 856   jccb  (Assembler::zero, LGoSlowPath);
 857 
 858   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 859   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
 860 
 861   // Memory barrier/fence
 862   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
 863   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
 864   // This is faster on Nehalem and AMD Shanghai/Barcelona.
 865   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
 866   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
 867   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
 868   lock(); addl(Address(rsp, 0), 0);
 869 
 870   cmpptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
 871   jccb  (Assembler::notZero, LSuccess);
 872 
 873   mov(tmpReg, boxReg);
 874   xorptr(boxReg, boxReg);
 875 
 876   // Rare inopportune interleaving - race.
 877   // The successor vanished in the small window above.
 878   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
 879   // We need to ensure progress and succession.
 880   // Try to reacquire the lock.
 881   // If that fails then the new owner is responsible for succession and this
 882   // thread needs to take no further action and can exit via the fast path (success).
 883   // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generate more
  // coherence traffic on the lock *and* artificially extend the critical section
  // length by virtue of passing control into the slow path.
 887 
 888   // box is really RAX -- the following CMPXCHG depends on that binding
 889   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
 890   lock();
 891   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 892   // There's no successor so we tried to regrab the lock.
 893   // If that didn't work, then another thread grabbed the
 894   // lock so we're done (and exit was a success).
 895   jccb  (Assembler::notEqual, LSuccess);
 896   // Intentional fall-through into slow path
 897 
 898   bind  (LGoSlowPath);
 899   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 900   jmpb  (DONE_LABEL);
 901 
 902   bind  (LSuccess);
 903   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 904   jmpb  (DONE_LABEL);
 905 
 906 #endif
 907   if (!UseHeavyMonitors) {
 908     bind(Stacked);
 909     // Mark-word must be 00 now, try to swing it back to 01 (unlocked)
 910     fast_unlock_impl(objReg, boxReg, tmpReg, DONE_LABEL);
 911     xorptr(rax, rax); // Set ZF = 1 (success)
 912   }
 913   bind(DONE_LABEL);
 914 
 915   // ZFlag == 1 count in fast path
 916   // ZFlag == 0 count in slow path
 917   jccb(Assembler::notZero, NO_COUNT);
 918 
 919   bind(COUNT);
 920   // Count monitors in fast path
 921 #ifndef _LP64
 922   get_thread(tmpReg);
 923   decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
 924 #else // _LP64
 925   decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 926 #endif
 927 
 928   xorl(tmpReg, tmpReg); // Set ZF == 1
 929 
 930   bind(NO_COUNT);
 931 }
 932 
 933 //-------------------------------------------------------------------------------------------
 934 // Generic instructions support for use in .ad files C2 code generation
 935 
 936 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 937   if (dst != src) {
 938     movdqu(dst, src);
 939   }
 940   if (opcode == Op_AbsVD) {
 941     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 942   } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 944     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 945   }
 946 }
 947 
 948 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 949   if (opcode == Op_AbsVD) {
 950     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
 951   } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 953     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
 954   }
 955 }
 956 
 957 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
 958   if (dst != src) {
 959     movdqu(dst, src);
 960   }
 961   if (opcode == Op_AbsVF) {
 962     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
 963   } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 965     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
 966   }
 967 }
 968 
 969 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 970   if (opcode == Op_AbsVF) {
 971     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
 972   } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 974     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
 975   }
 976 }
 977 
 978 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
 979   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
 980   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
 981 
 982   if (opcode == Op_MinV) {
 983     if (elem_bt == T_BYTE) {
 984       pminsb(dst, src);
 985     } else if (elem_bt == T_SHORT) {
 986       pminsw(dst, src);
 987     } else if (elem_bt == T_INT) {
 988       pminsd(dst, src);
 989     } else {
 990       assert(elem_bt == T_LONG, "required");
 991       assert(tmp == xmm0, "required");
 992       assert_different_registers(dst, src, tmp);
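      // xmm0 = (dst > src) ? all-ones : all-zeros per 64-bit lane; blendvpd (with its
      // implicit xmm0 mask) then copies src into exactly those lanes, leaving
      // dst = min(dst, src).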
 993       movdqu(xmm0, dst);
 994       pcmpgtq(xmm0, src);
 995       blendvpd(dst, src);  // xmm0 as mask
 996     }
 997   } else { // opcode == Op_MaxV
 998     if (elem_bt == T_BYTE) {
 999       pmaxsb(dst, src);
1000     } else if (elem_bt == T_SHORT) {
1001       pmaxsw(dst, src);
1002     } else if (elem_bt == T_INT) {
1003       pmaxsd(dst, src);
1004     } else {
1005       assert(elem_bt == T_LONG, "required");
1006       assert(tmp == xmm0, "required");
1007       assert_different_registers(dst, src, tmp);
1008       movdqu(xmm0, src);
1009       pcmpgtq(xmm0, dst);
1010       blendvpd(dst, src);  // xmm0 as mask
1011     }
1012   }
1013 }
1014 
1015 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
1016                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
1017                                  int vlen_enc) {
1018   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1019 
1020   if (opcode == Op_MinV) {
1021     if (elem_bt == T_BYTE) {
1022       vpminsb(dst, src1, src2, vlen_enc);
1023     } else if (elem_bt == T_SHORT) {
1024       vpminsw(dst, src1, src2, vlen_enc);
1025     } else if (elem_bt == T_INT) {
1026       vpminsd(dst, src1, src2, vlen_enc);
1027     } else {
1028       assert(elem_bt == T_LONG, "required");
1029       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1030         vpminsq(dst, src1, src2, vlen_enc);
1031       } else {
1032         assert_different_registers(dst, src1, src2);
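        // dst = (src1 > src2) ? all-ones : all-zeros per lane; vblendvpd then picks
        // src2 where that mask is set and src1 elsewhere, i.e. the per-lane minimum.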
1033         vpcmpgtq(dst, src1, src2, vlen_enc);
1034         vblendvpd(dst, src1, src2, dst, vlen_enc);
1035       }
1036     }
1037   } else { // opcode == Op_MaxV
1038     if (elem_bt == T_BYTE) {
1039       vpmaxsb(dst, src1, src2, vlen_enc);
1040     } else if (elem_bt == T_SHORT) {
1041       vpmaxsw(dst, src1, src2, vlen_enc);
1042     } else if (elem_bt == T_INT) {
1043       vpmaxsd(dst, src1, src2, vlen_enc);
1044     } else {
1045       assert(elem_bt == T_LONG, "required");
1046       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1047         vpmaxsq(dst, src1, src2, vlen_enc);
1048       } else {
1049         assert_different_registers(dst, src1, src2);
1050         vpcmpgtq(dst, src1, src2, vlen_enc);
1051         vblendvpd(dst, src2, src1, dst, vlen_enc);
1052       }
1053     }
1054   }
1055 }
1056 
1057 // Float/Double min max
1058 
1059 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
1060                                    XMMRegister dst, XMMRegister a, XMMRegister b,
1061                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1062                                    int vlen_enc) {
1063   assert(UseAVX > 0, "required");
1064   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1065          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1066   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1067   assert_different_registers(a, b, tmp, atmp, btmp);
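  // Note on the blend sequence below: the first two blends use a sign bit as the
  // selector so that, for a +/-0.0 pair, the desired zero (-0.0 for min, +0.0 for
  // max) lands in the second operand of vminps/vminpd/vmaxps/vmaxpd; those
  // instructions return the second operand for equal-valued or unordered inputs,
  // which gives the Java semantics min(-0.0, +0.0) == -0.0 and
  // max(-0.0, +0.0) == +0.0.  The trailing UNORD_Q compare-and-blend then
  // re-injects any NaN input that the min/max itself would have dropped.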
1068 
1069   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1070   bool is_double_word = is_double_word_type(elem_bt);
1071 
1072   if (!is_double_word && is_min) {
1073     vblendvps(atmp, a, b, a, vlen_enc);
1074     vblendvps(btmp, b, a, a, vlen_enc);
1075     vminps(tmp, atmp, btmp, vlen_enc);
1076     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1077     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
1078   } else if (!is_double_word && !is_min) {
1079     vblendvps(btmp, b, a, b, vlen_enc);
1080     vblendvps(atmp, a, b, b, vlen_enc);
1081     vmaxps(tmp, atmp, btmp, vlen_enc);
1082     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1083     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
1084   } else if (is_double_word && is_min) {
1085     vblendvpd(atmp, a, b, a, vlen_enc);
1086     vblendvpd(btmp, b, a, a, vlen_enc);
1087     vminpd(tmp, atmp, btmp, vlen_enc);
1088     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1089     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
1090   } else {
1091     assert(is_double_word && !is_min, "sanity");
1092     vblendvpd(btmp, b, a, b, vlen_enc);
1093     vblendvpd(atmp, a, b, b, vlen_enc);
1094     vmaxpd(tmp, atmp, btmp, vlen_enc);
1095     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1096     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
1097   }
1098 }
1099 
1100 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1101                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1102                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1103                                     int vlen_enc) {
1104   assert(UseAVX > 2, "required");
1105   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1106          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1107   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1108   assert_different_registers(dst, a, b, atmp, btmp);
1109 
1110   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1111   bool is_double_word = is_double_word_type(elem_bt);
1112   bool merge = true;
1113 
1114   if (!is_double_word && is_min) {
1115     evpmovd2m(ktmp, a, vlen_enc);
1116     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1117     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1118     vminps(dst, atmp, btmp, vlen_enc);
1119     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1120     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1121   } else if (!is_double_word && !is_min) {
1122     evpmovd2m(ktmp, b, vlen_enc);
1123     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1124     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1125     vmaxps(dst, atmp, btmp, vlen_enc);
1126     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1127     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1128   } else if (is_double_word && is_min) {
1129     evpmovq2m(ktmp, a, vlen_enc);
1130     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1131     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1132     vminpd(dst, atmp, btmp, vlen_enc);
1133     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1134     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1135   } else {
1136     assert(is_double_word && !is_min, "sanity");
1137     evpmovq2m(ktmp, b, vlen_enc);
1138     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1139     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1140     vmaxpd(dst, atmp, btmp, vlen_enc);
1141     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1142     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1143   }
1144 }
1145 
1146 // Float/Double signum
1147 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1148   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1149 
1150   Label DONE_LABEL;
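  // Flag usage: ucomiss/ucomisd set ZF=PF=CF=1 for an unordered (NaN) compare,
  // ZF=1 with PF=CF=0 for equality, and clear all three when dst > zero.  The
  // movflt/movdbl loads of 'one' do not modify EFLAGS, so the later 'above' test
  // still reflects the compare against zero.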
1151 
1152   if (opcode == Op_SignumF) {
1153     assert(UseSSE > 0, "required");
1154     ucomiss(dst, zero);
1155     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1156     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1157     movflt(dst, one);
1158     jcc(Assembler::above, DONE_LABEL);
1159     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1160   } else if (opcode == Op_SignumD) {
1161     assert(UseSSE > 1, "required");
1162     ucomisd(dst, zero);
1163     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1164     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1165     movdbl(dst, one);
1166     jcc(Assembler::above, DONE_LABEL);
1167     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1168   }
1169 
1170   bind(DONE_LABEL);
1171 }
1172 
1173 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1174   if (sign) {
1175     pmovsxbw(dst, src);
1176   } else {
1177     pmovzxbw(dst, src);
1178   }
1179 }
1180 
1181 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1182   if (sign) {
1183     vpmovsxbw(dst, src, vector_len);
1184   } else {
1185     vpmovzxbw(dst, src, vector_len);
1186   }
1187 }
1188 
1189 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1190   if (sign) {
1191     vpmovsxbd(dst, src, vector_len);
1192   } else {
1193     vpmovzxbd(dst, src, vector_len);
1194   }
1195 }
1196 
1197 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1198   if (sign) {
1199     vpmovsxwd(dst, src, vector_len);
1200   } else {
1201     vpmovzxwd(dst, src, vector_len);
1202   }
1203 }
1204 
1205 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1206                                      int shift, int vector_len) {
1207   if (opcode == Op_RotateLeftV) {
1208     if (etype == T_INT) {
1209       evprold(dst, src, shift, vector_len);
1210     } else {
1211       assert(etype == T_LONG, "expected type T_LONG");
1212       evprolq(dst, src, shift, vector_len);
1213     }
1214   } else {
1215     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1216     if (etype == T_INT) {
1217       evprord(dst, src, shift, vector_len);
1218     } else {
1219       assert(etype == T_LONG, "expected type T_LONG");
1220       evprorq(dst, src, shift, vector_len);
1221     }
1222   }
1223 }
1224 
1225 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1226                                      XMMRegister shift, int vector_len) {
1227   if (opcode == Op_RotateLeftV) {
1228     if (etype == T_INT) {
1229       evprolvd(dst, src, shift, vector_len);
1230     } else {
1231       assert(etype == T_LONG, "expected type T_LONG");
1232       evprolvq(dst, src, shift, vector_len);
1233     }
1234   } else {
1235     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1236     if (etype == T_INT) {
1237       evprorvd(dst, src, shift, vector_len);
1238     } else {
1239       assert(etype == T_LONG, "expected type T_LONG");
1240       evprorvq(dst, src, shift, vector_len);
1241     }
1242   }
1243 }
1244 
1245 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1246   if (opcode == Op_RShiftVI) {
1247     psrad(dst, shift);
1248   } else if (opcode == Op_LShiftVI) {
1249     pslld(dst, shift);
1250   } else {
1251     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1252     psrld(dst, shift);
1253   }
1254 }
1255 
1256 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1257   switch (opcode) {
1258     case Op_RShiftVI:  psrad(dst, shift); break;
1259     case Op_LShiftVI:  pslld(dst, shift); break;
1260     case Op_URShiftVI: psrld(dst, shift); break;
1261 
1262     default: assert(false, "%s", NodeClassNames[opcode]);
1263   }
1264 }
1265 
1266 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1267   if (opcode == Op_RShiftVI) {
1268     vpsrad(dst, nds, shift, vector_len);
1269   } else if (opcode == Op_LShiftVI) {
1270     vpslld(dst, nds, shift, vector_len);
1271   } else {
1272     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1273     vpsrld(dst, nds, shift, vector_len);
1274   }
1275 }
1276 
1277 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1278   switch (opcode) {
1279     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1280     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1281     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1282 
1283     default: assert(false, "%s", NodeClassNames[opcode]);
1284   }
1285 }
1286 
1287 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1288   switch (opcode) {
1289     case Op_RShiftVB:  // fall-through
1290     case Op_RShiftVS:  psraw(dst, shift); break;
1291 
1292     case Op_LShiftVB:  // fall-through
1293     case Op_LShiftVS:  psllw(dst, shift);   break;
1294 
1295     case Op_URShiftVS: // fall-through
1296     case Op_URShiftVB: psrlw(dst, shift);  break;
1297 
1298     default: assert(false, "%s", NodeClassNames[opcode]);
1299   }
1300 }
1301 
1302 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1303   switch (opcode) {
1304     case Op_RShiftVB:  // fall-through
1305     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1306 
1307     case Op_LShiftVB:  // fall-through
1308     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1309 
1310     case Op_URShiftVS: // fall-through
1311     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1312 
1313     default: assert(false, "%s", NodeClassNames[opcode]);
1314   }
1315 }
1316 
1317 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1318   switch (opcode) {
1319     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1320     case Op_LShiftVL:  psllq(dst, shift); break;
1321     case Op_URShiftVL: psrlq(dst, shift); break;
1322 
1323     default: assert(false, "%s", NodeClassNames[opcode]);
1324   }
1325 }
1326 
1327 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1328   if (opcode == Op_RShiftVL) {
1329     psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1330   } else if (opcode == Op_LShiftVL) {
1331     psllq(dst, shift);
1332   } else {
1333     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1334     psrlq(dst, shift);
1335   }
1336 }
1337 
1338 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1339   switch (opcode) {
1340     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1341     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1342     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1343 
1344     default: assert(false, "%s", NodeClassNames[opcode]);
1345   }
1346 }
1347 
1348 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1349   if (opcode == Op_RShiftVL) {
1350     evpsraq(dst, nds, shift, vector_len);
1351   } else if (opcode == Op_LShiftVL) {
1352     vpsllq(dst, nds, shift, vector_len);
1353   } else {
1354     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1355     vpsrlq(dst, nds, shift, vector_len);
1356   }
1357 }
1358 
1359 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1360   switch (opcode) {
1361     case Op_RShiftVB:  // fall-through
1362     case Op_RShiftVS:  // fall-through
1363     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1364 
1365     case Op_LShiftVB:  // fall-through
1366     case Op_LShiftVS:  // fall-through
1367     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1368 
1369     case Op_URShiftVB: // fall-through
1370     case Op_URShiftVS: // fall-through
1371     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1372 
1373     default: assert(false, "%s", NodeClassNames[opcode]);
1374   }
1375 }
1376 
1377 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1378   switch (opcode) {
1379     case Op_RShiftVB:  // fall-through
1380     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1381 
1382     case Op_LShiftVB:  // fall-through
1383     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1384 
1385     case Op_URShiftVB: // fall-through
1386     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1387 
1388     default: assert(false, "%s", NodeClassNames[opcode]);
1389   }
1390 }
1391 
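// Variable arithmetic right shift of longs. AVX-512 provides evpsravq; AVX2 has no
// variable 64-bit arithmetic right shift, so it is emulated with logical shifts using
// the identity (with m = 0x8000000000000000 >>> shift):
//   sra(x, shift) == ((x >>> shift) ^ m) - m
// which is what the vpsrlvq/vpxor/vpsubq sequence below computes per lane.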
1392 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1393   assert(UseAVX >= 2, "required");
1394   switch (opcode) {
1395     case Op_RShiftVL: {
1396       if (UseAVX > 2) {
1397         assert(tmp == xnoreg, "not used");
1398         if (!VM_Version::supports_avx512vl()) {
1399           vlen_enc = Assembler::AVX_512bit;
1400         }
1401         evpsravq(dst, src, shift, vlen_enc);
1402       } else {
1403         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1404         vpsrlvq(dst, src, shift, vlen_enc);
1405         vpsrlvq(tmp, tmp, shift, vlen_enc);
1406         vpxor(dst, dst, tmp, vlen_enc);
1407         vpsubq(dst, dst, tmp, vlen_enc);
1408       }
1409       break;
1410     }
1411     case Op_LShiftVL: {
1412       assert(tmp == xnoreg, "not used");
1413       vpsllvq(dst, src, shift, vlen_enc);
1414       break;
1415     }
1416     case Op_URShiftVL: {
1417       assert(tmp == xnoreg, "not used");
1418       vpsrlvq(dst, src, shift, vlen_enc);
1419       break;
1420     }
1421     default: assert(false, "%s", NodeClassNames[opcode]);
1422   }
1423 }
1424 
1425 // Variable shift src by shift using vtmp as a TEMP, giving word result in dst
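// The byte elements are sign-/zero-extended to ints, shifted as ints with varshiftd,
// masked back to the low 8 bits, and packed so each result ends up in a word lane.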
1426 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1427   assert(opcode == Op_LShiftVB ||
1428          opcode == Op_RShiftVB ||
1429          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1430   bool sign = (opcode != Op_URShiftVB);
1431   assert(vector_len == 0, "required");
1432   vextendbd(sign, dst, src, 1);
1433   vpmovzxbd(vtmp, shift, 1);
1434   varshiftd(opcode, dst, dst, vtmp, 1);
1435   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1436   vextracti128_high(vtmp, dst);
1437   vpackusdw(dst, dst, vtmp, 0);
1438 }
1439 
1440 // Variable shift src by shift using vtmp as a TEMP, giving byte result in dst
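// Same approach as varshiftbw above, but extending only to words (the AVX-512
// evpsllvw/evpsravw/evpsrlvw forms) and packing the results back down to bytes.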
1441 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1442   assert(opcode == Op_LShiftVB ||
1443          opcode == Op_RShiftVB ||
1444          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1445   bool sign = (opcode != Op_URShiftVB);
1446   int ext_vector_len = vector_len + 1;
1447   vextendbw(sign, dst, src, ext_vector_len);
1448   vpmovzxbw(vtmp, shift, ext_vector_len);
1449   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1450   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1451   if (vector_len == 0) {
1452     vextracti128_high(vtmp, dst);
1453     vpackuswb(dst, dst, vtmp, vector_len);
1454   } else {
1455     vextracti64x4_high(vtmp, dst);
1456     vpackuswb(dst, dst, vtmp, vector_len);
1457     vpermq(dst, dst, 0xD8, vector_len);
1458   }
1459 }
1460 
1461 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1462   switch(typ) {
1463     case T_BYTE:
1464       pinsrb(dst, val, idx);
1465       break;
1466     case T_SHORT:
1467       pinsrw(dst, val, idx);
1468       break;
1469     case T_INT:
1470       pinsrd(dst, val, idx);
1471       break;
1472     case T_LONG:
1473       pinsrq(dst, val, idx);
1474       break;
1475     default:
1476       assert(false,"Should not reach here.");
1477       break;
1478   }
1479 }
1480 
1481 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1482   switch(typ) {
1483     case T_BYTE:
1484       vpinsrb(dst, src, val, idx);
1485       break;
1486     case T_SHORT:
1487       vpinsrw(dst, src, val, idx);
1488       break;
1489     case T_INT:
1490       vpinsrd(dst, src, val, idx);
1491       break;
1492     case T_LONG:
1493       vpinsrq(dst, src, val, idx);
1494       break;
1495     default:
1496       assert(false,"Should not reach here.");
1497       break;
1498   }
1499 }
1500 
1501 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1502   switch(typ) {
1503     case T_INT:
1504       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1505       break;
1506     case T_FLOAT:
1507       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1508       break;
1509     case T_LONG:
1510       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1511       break;
1512     case T_DOUBLE:
1513       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1514       break;
1515     default:
1516       assert(false,"Should not reach here.");
1517       break;
1518   }
1519 }
1520 
1521 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1522   switch(typ) {
1523     case T_INT:
1524       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1525       break;
1526     case T_FLOAT:
1527       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1528       break;
1529     case T_LONG:
1530       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1531       break;
1532     case T_DOUBLE:
1533       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1534       break;
1535     default:
1536       assert(false,"Should not reach here.");
1537       break;
1538   }
1539 }
1540 
1541 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1542   switch(typ) {
1543     case T_INT:
1544       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1545       break;
1546     case T_FLOAT:
1547       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1548       break;
1549     case T_LONG:
1550       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1551       break;
1552     case T_DOUBLE:
1553       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1554       break;
1555     default:
1556       assert(false,"Should not reach here.");
1557       break;
1558   }
1559 }
1560 
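// Expand a vector of 0/1 values into a full per-element bit mask: 0 - 1 == 0xFF in each
// byte lane, which is then sign-extended to the element width given by elem_bt.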
1561 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1562   if (vlen_in_bytes <= 16) {
1563     pxor (dst, dst);
1564     psubb(dst, src);
1565     switch (elem_bt) {
1566       case T_BYTE:   /* nothing to do */ break;
1567       case T_SHORT:  pmovsxbw(dst, dst); break;
1568       case T_INT:    pmovsxbd(dst, dst); break;
1569       case T_FLOAT:  pmovsxbd(dst, dst); break;
1570       case T_LONG:   pmovsxbq(dst, dst); break;
1571       case T_DOUBLE: pmovsxbq(dst, dst); break;
1572 
1573       default: assert(false, "%s", type2name(elem_bt));
1574     }
1575   } else {
1576     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1577     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1578 
1579     vpxor (dst, dst, dst, vlen_enc);
1580     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1581 
1582     switch (elem_bt) {
1583       case T_BYTE:   /* nothing to do */            break;
1584       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1585       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1586       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1587       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1588       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1589 
1590       default: assert(false, "%s", type2name(elem_bt));
1591     }
1592   }
1593 }
1594 
1595 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1596   if (novlbwdq) {
1597     vpmovsxbd(xtmp, src, vlen_enc);
1598     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1599             Assembler::eq, true, vlen_enc, noreg);
1600   } else {
1601     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1602     vpsubb(xtmp, xtmp, src, vlen_enc);
1603     evpmovb2m(dst, xtmp, vlen_enc);
1604   }
1605 }
1606 
1607 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) {
1608   switch (vlen_in_bytes) {
1609     case 4:  movdl(dst, src);   break;
1610     case 8:  movq(dst, src);    break;
1611     case 16: movdqu(dst, src);  break;
1612     case 32: vmovdqu(dst, src); break;
1613     case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1614     default: ShouldNotReachHere();
1615   }
1616 }
1617 
1618 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1619   assert(rscratch != noreg || always_reachable(src), "missing");
1620 
1621   if (reachable(src)) {
1622     load_vector(dst, as_Address(src), vlen_in_bytes);
1623   } else {
1624     lea(rscratch, src);
1625     load_vector(dst, Address(rscratch, 0), vlen_in_bytes);
1626   }
1627 }
1628 
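// Broadcast a constant element from memory to every lane, picking the best broadcast the
// current ISA level offers (AVX2 vpbroadcastq/vpbroadcastd, AVX1 vbroadcastsd/vbroadcastss
// or vmovddup, SSE3 movddup, otherwise movq plus punpcklqdq).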
1629 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1630   int vlen_enc = vector_length_encoding(vlen);
1631   if (VM_Version::supports_avx()) {
1632     if (bt == T_LONG) {
1633       if (VM_Version::supports_avx2()) {
1634         vpbroadcastq(dst, src, vlen_enc);
1635       } else {
1636         vmovddup(dst, src, vlen_enc);
1637       }
1638     } else if (bt == T_DOUBLE) {
1639       if (vlen_enc != Assembler::AVX_128bit) {
1640         vbroadcastsd(dst, src, vlen_enc, noreg);
1641       } else {
1642         vmovddup(dst, src, vlen_enc);
1643       }
1644     } else {
1645       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1646         vpbroadcastd(dst, src, vlen_enc);
1647       } else {
1648         vbroadcastss(dst, src, vlen_enc);
1649       }
1650     }
1651   } else if (VM_Version::supports_sse3()) {
1652     movddup(dst, src);
1653   } else {
1654     movq(dst, src);
1655     if (vlen == 16) {
1656       punpcklqdq(dst, dst);
1657     }
1658   }
1659 }
1660 
1661 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes) {
1662   ExternalAddress addr(StubRoutines::x86::vector_iota_indices());
1663   if (vlen_in_bytes <= 4) {
1664     movdl(dst, addr);
1665   } else if (vlen_in_bytes == 8) {
1666     movq(dst, addr);
1667   } else if (vlen_in_bytes == 16) {
1668     movdqu(dst, addr, noreg);
1669   } else if (vlen_in_bytes == 32) {
1670     vmovdqu(dst, addr, noreg);
1671   } else {
1672     assert(vlen_in_bytes == 64, "%d", vlen_in_bytes);
1673     evmovdqub(dst, k0, addr, false /*merge*/, Assembler::AVX_512bit, noreg);
1674   }
1675 }
1676 
1677 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
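// The general shape of these routines: repeatedly fold the upper half of the vector into
// the lower half with the reduction operation (vextract* followed by reduce_operation_*),
// halving the width each step. The integral reductions then combine the surviving lane
// with the scalar input src1 and move the result to the GPR dst; the FP reductions
// accumulate into the XMM dst directly.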
1678 
1679 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1680   int vector_len = Assembler::AVX_128bit;
1681 
1682   switch (opcode) {
1683     case Op_AndReductionV:  pand(dst, src); break;
1684     case Op_OrReductionV:   por (dst, src); break;
1685     case Op_XorReductionV:  pxor(dst, src); break;
1686     case Op_MinReductionV:
1687       switch (typ) {
1688         case T_BYTE:        pminsb(dst, src); break;
1689         case T_SHORT:       pminsw(dst, src); break;
1690         case T_INT:         pminsd(dst, src); break;
1691         case T_LONG:        assert(UseAVX > 2, "required");
1692                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1693         default:            assert(false, "wrong type");
1694       }
1695       break;
1696     case Op_MaxReductionV:
1697       switch (typ) {
1698         case T_BYTE:        pmaxsb(dst, src); break;
1699         case T_SHORT:       pmaxsw(dst, src); break;
1700         case T_INT:         pmaxsd(dst, src); break;
1701         case T_LONG:        assert(UseAVX > 2, "required");
1702                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1703         default:            assert(false, "wrong type");
1704       }
1705       break;
1706     case Op_AddReductionVF: addss(dst, src); break;
1707     case Op_AddReductionVD: addsd(dst, src); break;
1708     case Op_AddReductionVI:
1709       switch (typ) {
1710         case T_BYTE:        paddb(dst, src); break;
1711         case T_SHORT:       paddw(dst, src); break;
1712         case T_INT:         paddd(dst, src); break;
1713         default:            assert(false, "wrong type");
1714       }
1715       break;
1716     case Op_AddReductionVL: paddq(dst, src); break;
1717     case Op_MulReductionVF: mulss(dst, src); break;
1718     case Op_MulReductionVD: mulsd(dst, src); break;
1719     case Op_MulReductionVI:
1720       switch (typ) {
1721         case T_SHORT:       pmullw(dst, src); break;
1722         case T_INT:         pmulld(dst, src); break;
1723         default:            assert(false, "wrong type");
1724       }
1725       break;
1726     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1727                             vpmullq(dst, dst, src, vector_len); break;
1728     default:                assert(false, "wrong opcode");
1729   }
1730 }
1731 
1732 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1733   int vector_len = Assembler::AVX_256bit;
1734 
1735   switch (opcode) {
1736     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1737     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1738     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1739     case Op_MinReductionV:
1740       switch (typ) {
1741         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1742         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1743         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1744         case T_LONG:        assert(UseAVX > 2, "required");
1745                             vpminsq(dst, src1, src2, vector_len); break;
1746         default:            assert(false, "wrong type");
1747       }
1748       break;
1749     case Op_MaxReductionV:
1750       switch (typ) {
1751         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1752         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1753         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1754         case T_LONG:        assert(UseAVX > 2, "required");
1755                             vpmaxsq(dst, src1, src2, vector_len); break;
1756         default:            assert(false, "wrong type");
1757       }
1758       break;
1759     case Op_AddReductionVI:
1760       switch (typ) {
1761         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1762         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1763         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1764         default:            assert(false, "wrong type");
1765       }
1766       break;
1767     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1768     case Op_MulReductionVI:
1769       switch (typ) {
1770         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1771         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1772         default:            assert(false, "wrong type");
1773       }
1774       break;
1775     case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;
1776     default:                assert(false, "wrong opcode");
1777   }
1778 }
1779 
1780 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1781                                   XMMRegister dst, XMMRegister src,
1782                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1783   switch (opcode) {
1784     case Op_AddReductionVF:
1785     case Op_MulReductionVF:
1786       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1787       break;
1788 
1789     case Op_AddReductionVD:
1790     case Op_MulReductionVD:
1791       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1792       break;
1793 
1794     default: assert(false, "wrong opcode");
1795   }
1796 }
1797 
1798 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1799                              Register dst, Register src1, XMMRegister src2,
1800                              XMMRegister vtmp1, XMMRegister vtmp2) {
1801   switch (vlen) {
1802     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1803     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1804     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1805     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1806 
1807     default: assert(false, "wrong vector length");
1808   }
1809 }
1810 
1811 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1812                              Register dst, Register src1, XMMRegister src2,
1813                              XMMRegister vtmp1, XMMRegister vtmp2) {
1814   switch (vlen) {
1815     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1816     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1817     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1818     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1819 
1820     default: assert(false, "wrong vector length");
1821   }
1822 }
1823 
1824 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1825                              Register dst, Register src1, XMMRegister src2,
1826                              XMMRegister vtmp1, XMMRegister vtmp2) {
1827   switch (vlen) {
1828     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1829     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1830     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1831     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1832 
1833     default: assert(false, "wrong vector length");
1834   }
1835 }
1836 
1837 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1838                              Register dst, Register src1, XMMRegister src2,
1839                              XMMRegister vtmp1, XMMRegister vtmp2) {
1840   switch (vlen) {
1841     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1842     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1843     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1844     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1845 
1846     default: assert(false, "wrong vector length");
1847   }
1848 }
1849 
1850 #ifdef _LP64
1851 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1852                              Register dst, Register src1, XMMRegister src2,
1853                              XMMRegister vtmp1, XMMRegister vtmp2) {
1854   switch (vlen) {
1855     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1856     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1857     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1858 
1859     default: assert(false, "wrong vector length");
1860   }
1861 }
1862 #endif // _LP64
1863 
1864 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1865   switch (vlen) {
1866     case 2:
1867       assert(vtmp2 == xnoreg, "");
1868       reduce2F(opcode, dst, src, vtmp1);
1869       break;
1870     case 4:
1871       assert(vtmp2 == xnoreg, "");
1872       reduce4F(opcode, dst, src, vtmp1);
1873       break;
1874     case 8:
1875       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1876       break;
1877     case 16:
1878       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1879       break;
1880     default: assert(false, "wrong vector length");
1881   }
1882 }
1883 
1884 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1885   switch (vlen) {
1886     case 2:
1887       assert(vtmp2 == xnoreg, "");
1888       reduce2D(opcode, dst, src, vtmp1);
1889       break;
1890     case 4:
1891       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1892       break;
1893     case 8:
1894       reduce8D(opcode, dst, src, vtmp1, vtmp2);
1895       break;
1896     default: assert(false, "wrong vector length");
1897   }
1898 }
1899 
1900 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1901   if (opcode == Op_AddReductionVI) {
1902     if (vtmp1 != src2) {
1903       movdqu(vtmp1, src2);
1904     }
1905     phaddd(vtmp1, vtmp1);
1906   } else {
1907     pshufd(vtmp1, src2, 0x1);
1908     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1909   }
1910   movdl(vtmp2, src1);
1911   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1912   movdl(dst, vtmp1);
1913 }
1914 
1915 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1916   if (opcode == Op_AddReductionVI) {
1917     if (vtmp1 != src2) {
1918       movdqu(vtmp1, src2);
1919     }
1920     phaddd(vtmp1, src2);
1921     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1922   } else {
1923     pshufd(vtmp2, src2, 0xE);
1924     reduce_operation_128(T_INT, opcode, vtmp2, src2);
1925     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1926   }
1927 }
1928 
1929 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1930   if (opcode == Op_AddReductionVI) {
1931     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
1932     vextracti128_high(vtmp2, vtmp1);
1933     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
1934     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1935   } else {
1936     vextracti128_high(vtmp1, src2);
1937     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1938     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1939   }
1940 }
1941 
1942 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1943   vextracti64x4_high(vtmp2, src2);
1944   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
1945   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1946 }
1947 
1948 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1949   pshufd(vtmp2, src2, 0x1);
1950   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1951   movdqu(vtmp1, vtmp2);
1952   psrldq(vtmp1, 2);
1953   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1954   movdqu(vtmp2, vtmp1);
1955   psrldq(vtmp2, 1);
1956   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1957   movdl(vtmp2, src1);
1958   pmovsxbd(vtmp1, vtmp1);
1959   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1960   pextrb(dst, vtmp1, 0x0);
1961   movsbl(dst, dst);
1962 }
1963 
1964 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1965   pshufd(vtmp1, src2, 0xE);
1966   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
1967   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1968 }
1969 
1970 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1971   vextracti128_high(vtmp2, src2);
1972   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1973   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1974 }
1975 
1976 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1977   vextracti64x4_high(vtmp1, src2);
1978   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
1979   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1980 }
1981 
1982 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1983   pmovsxbw(vtmp2, src2);
1984   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1985 }
1986 
1987 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1988   if (UseAVX > 1) {
1989     int vector_len = Assembler::AVX_256bit;
1990     vpmovsxbw(vtmp1, src2, vector_len);
1991     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1992   } else {
1993     pmovsxbw(vtmp2, src2);
1994     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1995     pshufd(vtmp2, src2, 0x1);
1996     pmovsxbw(vtmp2, src2);
1997     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1998   }
1999 }
2000 
2001 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2002   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2003     int vector_len = Assembler::AVX_512bit;
2004     vpmovsxbw(vtmp1, src2, vector_len);
2005     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2006   } else {
2007     assert(UseAVX >= 2,"Should not reach here.");
2008     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2009     vextracti128_high(vtmp2, src2);
2010     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2011   }
2012 }
2013 
2014 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2015   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2016   vextracti64x4_high(vtmp2, src2);
2017   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2018 }
2019 
2020 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2021   if (opcode == Op_AddReductionVI) {
2022     if (vtmp1 != src2) {
2023       movdqu(vtmp1, src2);
2024     }
2025     phaddw(vtmp1, vtmp1);
2026     phaddw(vtmp1, vtmp1);
2027   } else {
2028     pshufd(vtmp2, src2, 0x1);
2029     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2030     movdqu(vtmp1, vtmp2);
2031     psrldq(vtmp1, 2);
2032     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2033   }
2034   movdl(vtmp2, src1);
2035   pmovsxwd(vtmp1, vtmp1);
2036   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2037   pextrw(dst, vtmp1, 0x0);
2038   movswl(dst, dst);
2039 }
2040 
2041 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2042   if (opcode == Op_AddReductionVI) {
2043     if (vtmp1 != src2) {
2044       movdqu(vtmp1, src2);
2045     }
2046     phaddw(vtmp1, src2);
2047   } else {
2048     pshufd(vtmp1, src2, 0xE);
2049     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2050   }
2051   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2052 }
2053 
2054 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2055   if (opcode == Op_AddReductionVI) {
2056     int vector_len = Assembler::AVX_256bit;
2057     vphaddw(vtmp2, src2, src2, vector_len);
2058     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2059   } else {
2060     vextracti128_high(vtmp2, src2);
2061     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2062   }
2063   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2064 }
2065 
2066 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2067   int vector_len = Assembler::AVX_256bit;
2068   vextracti64x4_high(vtmp1, src2);
2069   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2070   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2071 }
2072 
2073 #ifdef _LP64
2074 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2075   pshufd(vtmp2, src2, 0xE);
2076   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2077   movdq(vtmp1, src1);
2078   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2079   movdq(dst, vtmp1);
2080 }
2081 
2082 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2083   vextracti128_high(vtmp1, src2);
2084   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2085   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2086 }
2087 
2088 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2089   vextracti64x4_high(vtmp2, src2);
2090   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2091   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2092 }
2093 
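// Build a k-register mask with the low 'len' bits set, e.g. len == 5 gives 0b11111:
// bzhi clears all bits of an all-ones value from position 'len' upwards.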
2094 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
2095   mov64(temp, -1L);
2096   bzhiq(temp, temp, len);
2097   kmovql(dst, temp);
2098 }
2099 #endif // _LP64
2100 
2101 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2102   reduce_operation_128(T_FLOAT, opcode, dst, src);
2103   pshufd(vtmp, src, 0x1);
2104   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2105 }
2106 
2107 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2108   reduce2F(opcode, dst, src, vtmp);
2109   pshufd(vtmp, src, 0x2);
2110   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2111   pshufd(vtmp, src, 0x3);
2112   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2113 }
2114 
2115 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2116   reduce4F(opcode, dst, src, vtmp2);
2117   vextractf128_high(vtmp2, src);
2118   reduce4F(opcode, dst, vtmp2, vtmp1);
2119 }
2120 
2121 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2122   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2123   vextracti64x4_high(vtmp1, src);
2124   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2125 }
2126 
2127 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2128   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2129   pshufd(vtmp, src, 0xE);
2130   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2131 }
2132 
2133 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2134   reduce2D(opcode, dst, src, vtmp2);
2135   vextractf128_high(vtmp2, src);
2136   reduce2D(opcode, dst, vtmp2, vtmp1);
2137 }
2138 
2139 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2140   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2141   vextracti64x4_high(vtmp1, src);
2142   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2143 }
2144 
2145 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2146   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2147 }
2148 
2149 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2150   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2151 }
2152 
2153 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2154                                  int vec_enc) {
2155   switch(elem_bt) {
2156     case T_INT:
2157     case T_FLOAT:
2158       vmaskmovps(dst, src, mask, vec_enc);
2159       break;
2160     case T_LONG:
2161     case T_DOUBLE:
2162       vmaskmovpd(dst, src, mask, vec_enc);
2163       break;
2164     default:
2165       fatal("Unsupported type %s", type2name(elem_bt));
2166       break;
2167   }
2168 }
2169 
2170 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2171                                  int vec_enc) {
2172   switch(elem_bt) {
2173     case T_INT:
2174     case T_FLOAT:
2175       vmaskmovps(dst, src, mask, vec_enc);
2176       break;
2177     case T_LONG:
2178     case T_DOUBLE:
2179       vmaskmovpd(dst, src, mask, vec_enc);
2180       break;
2181     default:
2182       fatal("Unsupported type %s", type2name(elem_bt));
2183       break;
2184   }
2185 }
2186 
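// Min/max reduction for floats: at each step the upper half of the remaining vector
// (or, within a 128-bit lane, the neighbouring elements selected by vpermilps) is folded
// into the lower half with vminmax_fp, halving the problem until one element is left;
// if is_dst_valid, the running result already in dst is folded in as a final step.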
2187 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2188                                           XMMRegister dst, XMMRegister src,
2189                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2190                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2191   int permconst[] = {1, 14};
2192   XMMRegister wsrc = src;
2193   XMMRegister wdst = xmm_0;
2194   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2195 
2196   int vlen_enc = Assembler::AVX_128bit;
2197   if (vlen == 16) {
2198     vlen_enc = Assembler::AVX_256bit;
2199   }
2200 
2201   for (int i = log2(vlen) - 1; i >=0; i--) {
2202     if (i == 0 && !is_dst_valid) {
2203       wdst = dst;
2204     }
2205     if (i == 3) {
2206       vextracti64x4_high(wtmp, wsrc);
2207     } else if (i == 2) {
2208       vextracti128_high(wtmp, wsrc);
2209     } else { // i = [0,1]
2210       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2211     }
2212     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2213     wsrc = wdst;
2214     vlen_enc = Assembler::AVX_128bit;
2215   }
2216   if (is_dst_valid) {
2217     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2218   }
2219 }
2220 
2221 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2222                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2223                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2224   XMMRegister wsrc = src;
2225   XMMRegister wdst = xmm_0;
2226   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2227   int vlen_enc = Assembler::AVX_128bit;
2228   if (vlen == 8) {
2229     vlen_enc = Assembler::AVX_256bit;
2230   }
2231   for (int i = log2(vlen) - 1; i >=0; i--) {
2232     if (i == 0 && !is_dst_valid) {
2233       wdst = dst;
2234     }
2235     if (i == 1) {
2236       vextracti128_high(wtmp, wsrc);
2237     } else if (i == 2) {
2238       vextracti64x4_high(wtmp, wsrc);
2239     } else {
2240       assert(i == 0, "%d", i);
2241       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2242     }
2243     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2244     wsrc = wdst;
2245     vlen_enc = Assembler::AVX_128bit;
2246   }
2247   if (is_dst_valid) {
2248     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2249   }
2250 }
2251 
2252 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2253   switch (bt) {
2254     case T_BYTE:  pextrb(dst, src, idx); break;
2255     case T_SHORT: pextrw(dst, src, idx); break;
2256     case T_INT:   pextrd(dst, src, idx); break;
2257     case T_LONG:  pextrq(dst, src, idx); break;
2258 
2259     default:
2260       assert(false,"Should not reach here.");
2261       break;
2262   }
2263 }
2264 
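// Map a flat element index onto (128-bit lane, index within lane). For example, with
// T_INT (4-byte elements) elemindex 6 lands in lane 1 at position 2. Lanes above the
// first are extracted into dst; for lane 0 the source register is returned directly.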
2265 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2266   int esize =  type2aelembytes(typ);
2267   int elem_per_lane = 16/esize;
2268   int lane = elemindex / elem_per_lane;
2269   int eindex = elemindex % elem_per_lane;
2270 
2271   if (lane >= 2) {
2272     assert(UseAVX > 2, "required");
2273     vextractf32x4(dst, src, lane & 3);
2274     return dst;
2275   } else if (lane > 0) {
2276     assert(UseAVX > 0, "required");
2277     vextractf128(dst, src, lane);
2278     return dst;
2279   } else {
2280     return src;
2281   }
2282 }
2283 
2284 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2285   int esize =  type2aelembytes(typ);
2286   int elem_per_lane = 16/esize;
2287   int eindex = elemindex % elem_per_lane;
2288   assert(is_integral_type(typ),"required");
2289 
2290   if (eindex == 0) {
2291     if (typ == T_LONG) {
2292       movq(dst, src);
2293     } else {
2294       movdl(dst, src);
2295       if (typ == T_BYTE)
2296         movsbl(dst, dst);
2297       else if (typ == T_SHORT)
2298         movswl(dst, dst);
2299     }
2300   } else {
2301     extract(typ, dst, src, eindex);
2302   }
2303 }
2304 
2305 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2306   int esize =  type2aelembytes(typ);
2307   int elem_per_lane = 16/esize;
2308   int eindex = elemindex % elem_per_lane;
2309   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2310 
2311   if (eindex == 0) {
2312     movq(dst, src);
2313   } else {
2314     if (typ == T_FLOAT) {
2315       if (UseAVX == 0) {
2316         movdqu(dst, src);
2317         shufps(dst, dst, eindex);
2318       } else {
2319         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2320       }
2321     } else {
2322       if (UseAVX == 0) {
2323         movdqu(dst, src);
2324         psrldq(dst, eindex*esize);
2325       } else {
2326         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2327       }
2328       movq(dst, dst);
2329     }
2330   }
2331   // Zero upper bits
2332   if (typ == T_FLOAT) {
2333     if (UseAVX == 0) {
2334       assert(vtmp != xnoreg, "required.");
2335       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2336       pand(dst, vtmp);
2337     } else {
2338       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2339     }
2340   }
2341 }
2342 
2343 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2344   switch(typ) {
2345     case T_BYTE:
2346     case T_BOOLEAN:
2347       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2348       break;
2349     case T_SHORT:
2350     case T_CHAR:
2351       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2352       break;
2353     case T_INT:
2354     case T_FLOAT:
2355       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2356       break;
2357     case T_LONG:
2358     case T_DOUBLE:
2359       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2360       break;
2361     default:
2362       assert(false,"Should not reach here.");
2363       break;
2364   }
2365 }
2366 
2367 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2368   assert(rscratch != noreg || always_reachable(src2), "missing");
2369 
2370   switch(typ) {
2371     case T_BOOLEAN:
2372     case T_BYTE:
2373       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2374       break;
2375     case T_CHAR:
2376     case T_SHORT:
2377       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2378       break;
2379     case T_INT:
2380     case T_FLOAT:
2381       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2382       break;
2383     case T_LONG:
2384     case T_DOUBLE:
2385       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2386       break;
2387     default:
2388       assert(false,"Should not reach here.");
2389       break;
2390   }
2391 }
2392 
2393 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2394   switch(typ) {
2395     case T_BYTE:
2396       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2397       break;
2398     case T_SHORT:
2399       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2400       break;
2401     case T_INT:
2402     case T_FLOAT:
2403       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2404       break;
2405     case T_LONG:
2406     case T_DOUBLE:
2407       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2408       break;
2409     default:
2410       assert(false,"Should not reach here.");
2411       break;
2412   }
2413 }
2414 
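// Flag-setting vector test: for vectors up to 32 bytes this relies on (v)ptest, with
// 4- and 8-byte inputs first widened to 128 bits via pshufd. There is no 512-bit ptest,
// so 64-byte vectors are compared bytewise into a k-mask and tested with
// ktestql/kortestql instead.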
2415 void C2_MacroAssembler::vectortest(int bt, int vlen, XMMRegister src1, XMMRegister src2,
2416                                    XMMRegister vtmp1, XMMRegister vtmp2, KRegister mask) {
2417   switch(vlen) {
2418     case 4:
2419       assert(vtmp1 != xnoreg, "required.");
2420       // Broadcast lower 32 bits to 128 bits before ptest
2421       pshufd(vtmp1, src1, 0x0);
2422       if (bt == BoolTest::overflow) {
2423         assert(vtmp2 != xnoreg, "required.");
2424         pshufd(vtmp2, src2, 0x0);
2425       } else {
2426         assert(vtmp2 == xnoreg, "required.");
2427         vtmp2 = src2;
2428       }
2429       ptest(vtmp1, vtmp2);
2430      break;
2431     case 8:
2432       assert(vtmp1 != xnoreg, "required.");
2433       // Broadcast lower 64 bits to 128 bits before ptest
2434       pshufd(vtmp1, src1, 0x4);
2435       if (bt == BoolTest::overflow) {
2436         assert(vtmp2 != xnoreg, "required.");
2437         pshufd(vtmp2, src2, 0x4);
2438       } else {
2439         assert(vtmp2 == xnoreg, "required.");
2440         vtmp2 = src2;
2441       }
2442       ptest(vtmp1, vtmp2);
2443      break;
2444     case 16:
2445       assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2446       ptest(src1, src2);
2447       break;
2448     case 32:
2449       assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2450       vptest(src1, src2, Assembler::AVX_256bit);
2451       break;
2452     case 64:
2453       {
2454         assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2455         evpcmpeqb(mask, src1, src2, Assembler::AVX_512bit);
2456         if (bt == BoolTest::ne) {
2457           ktestql(mask, mask);
2458         } else {
2459           assert(bt == BoolTest::overflow, "required");
2460           kortestql(mask, mask);
2461         }
2462       }
2463       break;
2464     default:
2465       assert(false,"Should not reach here.");
2466       break;
2467   }
2468 }
2469 
2470 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2471   assert(UseAVX >= 2, "required");
2472 #ifdef ASSERT
2473   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2474   bool is_bw_supported = VM_Version::supports_avx512bw();
2475   if (is_bw && !is_bw_supported) {
2476     assert(vlen_enc != Assembler::AVX_512bit, "required");
2477     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2478            "XMM register should be 0-15");
2479   }
2480 #endif // ASSERT
2481   switch (elem_bt) {
2482     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2483     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2484     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2485     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2486     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2487     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2488     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2489   }
2490 }
2491 
2492 #ifdef _LP64
2493 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2494   assert(UseAVX >= 2, "required");
2495   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2496   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2497   if ((UseAVX > 2) &&
2498       (!is_bw || VM_Version::supports_avx512bw()) &&
2499       (!is_vl || VM_Version::supports_avx512vl())) {
2500     switch (elem_bt) {
2501       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2502       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2503       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2504       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2505       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2506     }
2507   } else {
2508     assert(vlen_enc != Assembler::AVX_512bit, "required");
2509     assert((dst->encoding() < 16),"XMM register should be 0-15");
2510     switch (elem_bt) {
2511       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2512       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2513       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2514       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2515       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2516       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2517       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2518     }
2519   }
2520 }
2521 #endif
2522 
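// Sign-extending byte -> {short, int, long, float, double} vector conversions. The FP
// cases go through an int vector first; for T_DOUBLE the vpmovsxbd is done at half the
// target width because vcvtdq2pd doubles the element size.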
2523 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2524   switch (to_elem_bt) {
2525     case T_SHORT:
2526       vpmovsxbw(dst, src, vlen_enc);
2527       break;
2528     case T_INT:
2529       vpmovsxbd(dst, src, vlen_enc);
2530       break;
2531     case T_FLOAT:
2532       vpmovsxbd(dst, src, vlen_enc);
2533       vcvtdq2ps(dst, dst, vlen_enc);
2534       break;
2535     case T_LONG:
2536       vpmovsxbq(dst, src, vlen_enc);
2537       break;
2538     case T_DOUBLE: {
2539       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2540       vpmovsxbd(dst, src, mid_vlen_enc);
2541       vcvtdq2pd(dst, dst, vlen_enc);
2542       break;
2543     }
2544     default:
2545       fatal("Unsupported type %s", type2name(to_elem_bt));
2546       break;
2547   }
2548 }
2549 
2550 //-------------------------------------------------------------------------------------------
2551 
2552 // IndexOf for constant substrings with size >= 8 chars
2553 // which don't need to be loaded through the stack.
2554 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2555                                          Register cnt1, Register cnt2,
2556                                          int int_cnt2,  Register result,
2557                                          XMMRegister vec, Register tmp,
2558                                          int ae) {
2559   ShortBranchVerifier sbv(this);
2560   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2561   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2562 
2563   // This method uses the pcmpestri instruction with bound registers
2564   //   inputs:
2565   //     xmm - substring
2566   //     rax - substring length (elements count)
2567   //     mem - scanned string
2568   //     rdx - string length (elements count)
2569   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2570   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2571   //   outputs:
2572   //     rcx - matched index in string
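  //   flag outputs (equal-ordered mode):
  //     CF - at least one (possibly partial) match was found; rcx holds its index
  //     OF - a match starts at element 0, i.e. the whole loaded vector matched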
2573   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2574   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2575   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2576   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2577   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2578 
2579   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2580         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2581         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2582 
2583   // Note, inline_string_indexOf() generates checks:
2584   // if (substr.count > string.count) return -1;
2585   // if (substr.count == 0) return 0;
2586   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2587 
2588   // Load substring.
2589   if (ae == StrIntrinsicNode::UL) {
2590     pmovzxbw(vec, Address(str2, 0));
2591   } else {
2592     movdqu(vec, Address(str2, 0));
2593   }
2594   movl(cnt2, int_cnt2);
2595   movptr(result, str1); // string addr
2596 
2597   if (int_cnt2 > stride) {
2598     jmpb(SCAN_TO_SUBSTR);
2599 
2600     // Reload substr for rescan; this code
2601     // is executed only for large substrings (> 8 chars)
2602     bind(RELOAD_SUBSTR);
2603     if (ae == StrIntrinsicNode::UL) {
2604       pmovzxbw(vec, Address(str2, 0));
2605     } else {
2606       movdqu(vec, Address(str2, 0));
2607     }
2608     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2609 
2610     bind(RELOAD_STR);
2611     // We came here after the beginning of the substring was
2612     // matched but the rest of it was not, so we need to search
2613     // again. Start from the next element after the previous match.
2614 
2615     // cnt2 is the number of remaining substring elements and
2616     // cnt1 is the number of remaining string elements when the compare failed.
2617     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2618     subl(cnt1, cnt2);
2619     addl(cnt1, int_cnt2);
2620     movl(cnt2, int_cnt2); // Now restore cnt2
2621 
2622     decrementl(cnt1);     // Shift to next element
2623     cmpl(cnt1, cnt2);
2624     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2625 
2626     addptr(result, (1<<scale1));
2627 
2628   } // (int_cnt2 > 8)
2629 
2630   // Scan string for start of substr in 16-byte vectors
2631   bind(SCAN_TO_SUBSTR);
2632   pcmpestri(vec, Address(result, 0), mode);
2633   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2634   subl(cnt1, stride);
2635   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2636   cmpl(cnt1, cnt2);
2637   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2638   addptr(result, 16);
2639   jmpb(SCAN_TO_SUBSTR);
2640 
2641   // Found a potential substr
2642   bind(FOUND_CANDIDATE);
2643   // Matched whole vector if first element matched (tmp(rcx) == 0).
2644   if (int_cnt2 == stride) {
2645     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2646   } else { // int_cnt2 > 8
2647     jccb(Assembler::overflow, FOUND_SUBSTR);
2648   }
2649   // After pcmpestri tmp(rcx) contains matched element index
2650   // Compute start addr of substr
2651   lea(result, Address(result, tmp, scale1));
2652 
2653   // Make sure string is still long enough
2654   subl(cnt1, tmp);
2655   cmpl(cnt1, cnt2);
2656   if (int_cnt2 == stride) {
2657     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2658   } else { // int_cnt2 > 8
2659     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2660   }
2661   // Left less than substring.
2662 
2663   bind(RET_NOT_FOUND);
2664   movl(result, -1);
2665   jmp(EXIT);
2666 
2667   if (int_cnt2 > stride) {
2668     // This code is optimized for the case where the whole substring
2669     // matches once its head matches.
2670     bind(MATCH_SUBSTR_HEAD);
2671     pcmpestri(vec, Address(result, 0), mode);
2672     // Reload only the string if it does not match
2673     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2674 
2675     Label CONT_SCAN_SUBSTR;
2676     // Compare the rest of substring (> 8 chars).
2677     bind(FOUND_SUBSTR);
2678     // First 8 chars are already matched.
2679     negptr(cnt2);
2680     addptr(cnt2, stride);
2681 
2682     bind(SCAN_SUBSTR);
2683     subl(cnt1, stride);
2684     cmpl(cnt2, -stride); // Do not read beyond substring
2685     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2686     // Back-up strings to avoid reading beyond substring:
2687     // cnt1 = cnt1 - cnt2 + 8
2688     addl(cnt1, cnt2); // cnt2 is negative
2689     addl(cnt1, stride);
2690     movl(cnt2, stride); negptr(cnt2);
2691     bind(CONT_SCAN_SUBSTR);
2692     if (int_cnt2 < (int)G) {
2693       int tail_off1 = int_cnt2<<scale1;
2694       int tail_off2 = int_cnt2<<scale2;
2695       if (ae == StrIntrinsicNode::UL) {
2696         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2697       } else {
2698         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2699       }
2700       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2701     } else {
2702       // calculate index in register to avoid integer overflow (int_cnt2*2)
2703       movl(tmp, int_cnt2);
2704       addptr(tmp, cnt2);
2705       if (ae == StrIntrinsicNode::UL) {
2706         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2707       } else {
2708         movdqu(vec, Address(str2, tmp, scale2, 0));
2709       }
2710       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2711     }
2712     // Need to reload string pointers if the whole vector did not match
2713     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2714     addptr(cnt2, stride);
2715     jcc(Assembler::negative, SCAN_SUBSTR);
2716     // Fall through if found full substring
2717 
2718   } // (int_cnt2 > 8)
2719 
2720   bind(RET_FOUND);
2721   // Found result if we matched full small substring.
2722   // Compute substr offset
2723   subptr(result, str1);
2724   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2725     shrl(result, 1); // index
2726   }
2727   bind(EXIT);
2728 
2729 } // string_indexofC8
2730 
2731 // Small strings are loaded through the stack if they cross a page boundary.
2732 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2733                                        Register cnt1, Register cnt2,
2734                                        int int_cnt2,  Register result,
2735                                        XMMRegister vec, Register tmp,
2736                                        int ae) {
2737   ShortBranchVerifier sbv(this);
2738   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2739   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2740 
2741   //
2742   // int_cnt2 is the length of a small (< 8 chars) constant substring
2743   // or (-1) for a non-constant substring, in which case its length
2744   // is in the cnt2 register.
2745   //
2746   // Note, inline_string_indexOf() generates checks:
2747   // if (substr.count > string.count) return -1;
2748   // if (substr.count == 0) return 0;
2749   //
2750   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2751   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2752   // This method uses the pcmpestri instruction with bound registers
2753   //   inputs:
2754   //     xmm - substring
2755   //     rax - substring length (elements count)
2756   //     mem - scanned string
2757   //     rdx - string length (elements count)
2758   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2759   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2760   //   outputs:
2761   //     rcx - matched index in string
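       //   flags:
       //     CF - set when a (possibly partial) match of the substring is found
       //          in the scanned 16-byte chunk
       //     OF - set when the match starts at element 0 of the chunk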
2762   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2763   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2764   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2765   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2766 
2767   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2768         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2769         FOUND_CANDIDATE;
2770 
2771   { //========================================================
2772     // We don't know where these strings are located
2773     // and we can't read beyond them. Load them through stack.
2774     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2775 
2776     movptr(tmp, rsp); // save old SP
2777 
2778     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2779       if (int_cnt2 == (1>>scale2)) { // One byte
2780         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2781         load_unsigned_byte(result, Address(str2, 0));
2782         movdl(vec, result); // move 32 bits
2783       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2784         // Not enough room in a 32-bit VM for the 16-byte backward load used
             // below (12-byte header + 3 bytes = 15 < 16): read 4 bytes starting
             // one byte early and shift out the extra header byte.
2785         movl(result, Address(str2, -1));
2786         shrl(result, 8);
2787         movdl(vec, result); // move 32 bits
2788       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2789         load_unsigned_short(result, Address(str2, 0));
2790         movdl(vec, result); // move 32 bits
2791       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2792         movdl(vec, Address(str2, 0)); // move 32 bits
2793       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2794         movq(vec, Address(str2, 0));  // move 64 bits
2795       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
2796         // Array header size is 12 bytes in 32-bit VM
2797         // + 6 bytes for 3 chars == 18 bytes,
2798         // enough space to load vec and shift.
2799         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2800         if (ae == StrIntrinsicNode::UL) {
2801           int tail_off = int_cnt2-8;
2802           pmovzxbw(vec, Address(str2, tail_off));
2803           psrldq(vec, -2*tail_off);
2804         }
2805         else {
2806           int tail_off = int_cnt2*(1<<scale2);
2807           movdqu(vec, Address(str2, tail_off-16));
2808           psrldq(vec, 16-tail_off);
2809         }
2810       }
2811     } else { // not constant substring
2812       cmpl(cnt2, stride);
2813       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2814 
2815       // We can read beyond the string if str+16 does not cross a page boundary,
2816       // since heaps are aligned and mapped by pages.
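           // The check below computes str2 & (page_size-1) and compares it with
           // page_size-16: if the offset within the page is <= page_size-16, a
           // 16-byte load starting at str2 cannot touch the next page.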
2817       assert(os::vm_page_size() < (int)G, "default page should be small");
2818       movl(result, str2); // We need only low 32 bits
2819       andl(result, (os::vm_page_size()-1));
2820       cmpl(result, (os::vm_page_size()-16));
2821       jccb(Assembler::belowEqual, CHECK_STR);
2822 
2823       // Move small strings to the stack to allow loading 16 bytes into vec.
2824       subptr(rsp, 16);
2825       int stk_offset = wordSize-(1<<scale2);
2826       push(cnt2);
2827 
2828       bind(COPY_SUBSTR);
2829       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2830         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2831         movb(Address(rsp, cnt2, scale2, stk_offset), result);
2832       } else if (ae == StrIntrinsicNode::UU) {
2833         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2834         movw(Address(rsp, cnt2, scale2, stk_offset), result);
2835       }
2836       decrement(cnt2);
2837       jccb(Assembler::notZero, COPY_SUBSTR);
2838 
2839       pop(cnt2);
2840       movptr(str2, rsp);  // New substring address
2841     } // non constant
2842 
2843     bind(CHECK_STR);
2844     cmpl(cnt1, stride);
2845     jccb(Assembler::aboveEqual, BIG_STRINGS);
2846 
2847     // Check cross page boundary.
2848     movl(result, str1); // We need only low 32 bits
2849     andl(result, (os::vm_page_size()-1));
2850     cmpl(result, (os::vm_page_size()-16));
2851     jccb(Assembler::belowEqual, BIG_STRINGS);
2852 
2853     subptr(rsp, 16);
2854     int stk_offset = -(1<<scale1);
2855     if (int_cnt2 < 0) { // not constant
2856       push(cnt2);
2857       stk_offset += wordSize;
2858     }
2859     movl(cnt2, cnt1);
2860 
2861     bind(COPY_STR);
2862     if (ae == StrIntrinsicNode::LL) {
2863       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
2864       movb(Address(rsp, cnt2, scale1, stk_offset), result);
2865     } else {
2866       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
2867       movw(Address(rsp, cnt2, scale1, stk_offset), result);
2868     }
2869     decrement(cnt2);
2870     jccb(Assembler::notZero, COPY_STR);
2871 
2872     if (int_cnt2 < 0) { // not constant
2873       pop(cnt2);
2874     }
2875     movptr(str1, rsp);  // New string address
2876 
2877     bind(BIG_STRINGS);
2878     // Load substring.
2879     if (int_cnt2 < 0) { // -1
2880       if (ae == StrIntrinsicNode::UL) {
2881         pmovzxbw(vec, Address(str2, 0));
2882       } else {
2883         movdqu(vec, Address(str2, 0));
2884       }
2885       push(cnt2);       // substr count
2886       push(str2);       // substr addr
2887       push(str1);       // string addr
2888     } else {
2889       // Small (< 8 chars) constant substrings are loaded already.
2890       movl(cnt2, int_cnt2);
2891     }
2892     push(tmp);  // original SP
2893 
2894   } // Finished loading
2895 
2896   //========================================================
2897   // Start search
2898   //
2899 
2900   movptr(result, str1); // string addr
2901 
2902   if (int_cnt2  < 0) {  // Only for non constant substring
2903     jmpb(SCAN_TO_SUBSTR);
2904 
2905     // SP saved at sp+0
2906     // String saved at sp+1*wordSize
2907     // Substr saved at sp+2*wordSize
2908     // Substr count saved at sp+3*wordSize
2909 
2910     // Reload substr for rescan; this code is
2911     // executed only for large substrings (> 8 chars).
2912     bind(RELOAD_SUBSTR);
2913     movptr(str2, Address(rsp, 2*wordSize));
2914     movl(cnt2, Address(rsp, 3*wordSize));
2915     if (ae == StrIntrinsicNode::UL) {
2916       pmovzxbw(vec, Address(str2, 0));
2917     } else {
2918       movdqu(vec, Address(str2, 0));
2919     }
2920     // We came here after the beginning of the substring was
2921     // matched but the rest of it was not, so we need to search
2922     // again. Start from the next element after the previous match.
2923     subptr(str1, result); // Restore counter
2924     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2925       shrl(str1, 1);
2926     }
2927     addl(cnt1, str1);
2928     decrementl(cnt1);   // Shift to next element
2929     cmpl(cnt1, cnt2);
2930     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2931 
2932     addptr(result, (1<<scale1));
2933   } // non constant
2934 
2935   // Scan string for start of substr in 16-byte vectors
2936   bind(SCAN_TO_SUBSTR);
2937   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2938   pcmpestri(vec, Address(result, 0), mode);
2939   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2940   subl(cnt1, stride);
2941   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2942   cmpl(cnt1, cnt2);
2943   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2944   addptr(result, 16);
2945 
2946   bind(ADJUST_STR);
2947   cmpl(cnt1, stride); // Do not read beyond string
2948   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2949   // Back-up string to avoid reading beyond string.
2950   lea(result, Address(result, cnt1, scale1, -16));
2951   movl(cnt1, stride);
2952   jmpb(SCAN_TO_SUBSTR);
2953 
2954   // Found a potential substr
2955   bind(FOUND_CANDIDATE);
2956   // After pcmpestri tmp(rcx) contains matched element index
2957 
2958   // Make sure string is still long enough
2959   subl(cnt1, tmp);
2960   cmpl(cnt1, cnt2);
2961   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
2962   // Left less than substring.
2963 
2964   bind(RET_NOT_FOUND);
2965   movl(result, -1);
2966   jmp(CLEANUP);
2967 
2968   bind(FOUND_SUBSTR);
2969   // Compute start addr of substr
2970   lea(result, Address(result, tmp, scale1));
2971   if (int_cnt2 > 0) { // Constant substring
2972     // Repeat search for small substring (< 8 chars)
2973     // from new point without reloading substring.
2974     // Have to check that we don't read beyond string.
2975     cmpl(tmp, stride-int_cnt2);
2976     jccb(Assembler::greater, ADJUST_STR);
2977     // Fall through if matched whole substring.
2978   } else { // non constant
2979     assert(int_cnt2 == -1, "should be != 0");
2980 
2981     addl(tmp, cnt2);
2982     // Found result if we matched whole substring.
2983     cmpl(tmp, stride);
2984     jcc(Assembler::lessEqual, RET_FOUND);
2985 
2986     // Repeat search for small substring (<= 8 chars)
2987     // from new point 'str1' without reloading substring.
2988     cmpl(cnt2, stride);
2989     // Have to check that we don't read beyond string.
2990     jccb(Assembler::lessEqual, ADJUST_STR);
2991 
2992     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
2993     // Compare the rest of substring (> 8 chars).
2994     movptr(str1, result);
2995 
2996     cmpl(tmp, cnt2);
2997     // First 8 chars are already matched.
2998     jccb(Assembler::equal, CHECK_NEXT);
2999 
3000     bind(SCAN_SUBSTR);
3001     pcmpestri(vec, Address(str1, 0), mode);
3002     // Need to reload string pointers if the whole vector did not match
3003     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3004 
3005     bind(CHECK_NEXT);
3006     subl(cnt2, stride);
3007     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3008     addptr(str1, 16);
3009     if (ae == StrIntrinsicNode::UL) {
3010       addptr(str2, 8);
3011     } else {
3012       addptr(str2, 16);
3013     }
3014     subl(cnt1, stride);
3015     cmpl(cnt2, stride); // Do not read beyond substring
3016     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3017     // Back-up strings to avoid reading beyond substring.
3018 
3019     if (ae == StrIntrinsicNode::UL) {
3020       lea(str2, Address(str2, cnt2, scale2, -8));
3021       lea(str1, Address(str1, cnt2, scale1, -16));
3022     } else {
3023       lea(str2, Address(str2, cnt2, scale2, -16));
3024       lea(str1, Address(str1, cnt2, scale1, -16));
3025     }
3026     subl(cnt1, cnt2);
3027     movl(cnt2, stride);
3028     addl(cnt1, stride);
3029     bind(CONT_SCAN_SUBSTR);
3030     if (ae == StrIntrinsicNode::UL) {
3031       pmovzxbw(vec, Address(str2, 0));
3032     } else {
3033       movdqu(vec, Address(str2, 0));
3034     }
3035     jmp(SCAN_SUBSTR);
3036 
3037     bind(RET_FOUND_LONG);
3038     movptr(str1, Address(rsp, wordSize));
3039   } // non constant
3040 
3041   bind(RET_FOUND);
3042   // Compute substr offset
3043   subptr(result, str1);
3044   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3045     shrl(result, 1); // index
3046   }
3047   bind(CLEANUP);
3048   pop(rsp); // restore SP
3049 
3050 } // string_indexof
3051 
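     // Find the first occurrence of a char value in a UTF-16 char sequence and return
     // its index, or -1 if it is not present. Approximate Java reference for the emitted
     // code (a sketch only; names mirror the register arguments, not the library source):
     //   static int indexOfChar(char[] str1, int cnt1, int ch) {
     //     for (int i = 0; i < cnt1; i++) {
     //       if (str1[i] == ch) return i;
     //     }
     //     return -1;
     //   }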
3052 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3053                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3054   ShortBranchVerifier sbv(this);
3055   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3056 
3057   int stride = 8;
3058 
3059   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3060         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3061         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3062         FOUND_SEQ_CHAR, DONE_LABEL;
3063 
3064   movptr(result, str1);
3065   if (UseAVX >= 2) {
3066     cmpl(cnt1, stride);
3067     jcc(Assembler::less, SCAN_TO_CHAR);
3068     cmpl(cnt1, 2*stride);
3069     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3070     movdl(vec1, ch);
3071     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3072     vpxor(vec2, vec2);
3073     movl(tmp, cnt1);
3074     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3075     andl(cnt1,0x0000000F);  //tail count (in chars)
3076 
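         // vec1 holds the search char broadcast to all 16 word lanes and vec2 stays
         // all-zero; vptest(vec2, vec3) sets CF only when vec3 (the vpcmpeqw result)
         // is all-zero, so carryClear below means at least one lane matched.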
3077     bind(SCAN_TO_16_CHAR_LOOP);
3078     vmovdqu(vec3, Address(result, 0));
3079     vpcmpeqw(vec3, vec3, vec1, 1);
3080     vptest(vec2, vec3);
3081     jcc(Assembler::carryClear, FOUND_CHAR);
3082     addptr(result, 32);
3083     subl(tmp, 2*stride);
3084     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3085     jmp(SCAN_TO_8_CHAR);
3086     bind(SCAN_TO_8_CHAR_INIT);
3087     movdl(vec1, ch);
3088     pshuflw(vec1, vec1, 0x00);
3089     pshufd(vec1, vec1, 0);
3090     pxor(vec2, vec2);
3091   }
3092   bind(SCAN_TO_8_CHAR);
3093   cmpl(cnt1, stride);
3094   jcc(Assembler::less, SCAN_TO_CHAR);
3095   if (UseAVX < 2) {
3096     movdl(vec1, ch);
3097     pshuflw(vec1, vec1, 0x00);
3098     pshufd(vec1, vec1, 0);
3099     pxor(vec2, vec2);
3100   }
3101   movl(tmp, cnt1);
3102   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3103   andl(cnt1,0x00000007);  //tail count (in chars)
3104 
3105   bind(SCAN_TO_8_CHAR_LOOP);
3106   movdqu(vec3, Address(result, 0));
3107   pcmpeqw(vec3, vec1);
3108   ptest(vec2, vec3);
3109   jcc(Assembler::carryClear, FOUND_CHAR);
3110   addptr(result, 16);
3111   subl(tmp, stride);
3112   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3113   bind(SCAN_TO_CHAR);
3114   testl(cnt1, cnt1);
3115   jcc(Assembler::zero, RET_NOT_FOUND);
3116   bind(SCAN_TO_CHAR_LOOP);
3117   load_unsigned_short(tmp, Address(result, 0));
3118   cmpl(ch, tmp);
3119   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3120   addptr(result, 2);
3121   subl(cnt1, 1);
3122   jccb(Assembler::zero, RET_NOT_FOUND);
3123   jmp(SCAN_TO_CHAR_LOOP);
3124 
3125   bind(RET_NOT_FOUND);
3126   movl(result, -1);
3127   jmpb(DONE_LABEL);
3128 
3129   bind(FOUND_CHAR);
3130   if (UseAVX >= 2) {
3131     vpmovmskb(tmp, vec3);
3132   } else {
3133     pmovmskb(tmp, vec3);
3134   }
3135   bsfl(ch, tmp);
3136   addptr(result, ch);
3137 
3138   bind(FOUND_SEQ_CHAR);
3139   subptr(result, str1);
3140   shrl(result, 1);
3141 
3142   bind(DONE_LABEL);
3143 } // string_indexof_char
3144 
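     // Latin-1 variant of the search above: find the first occurrence of a byte value
     // and return its index, or -1 if it is not present. Approximate Java reference
     // (a sketch only; names mirror the register arguments):
     //   static int indexOfLatin1Char(byte[] str1, int cnt1, int ch) {
     //     for (int i = 0; i < cnt1; i++) {
     //       if ((str1[i] & 0xff) == ch) return i;
     //     }
     //     return -1;
     //   }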
3145 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3146                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3147   ShortBranchVerifier sbv(this);
3148   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3149 
3150   int stride = 16;
3151 
3152   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3153         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3154         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3155         FOUND_SEQ_CHAR, DONE_LABEL;
3156 
3157   movptr(result, str1);
3158   if (UseAVX >= 2) {
3159     cmpl(cnt1, stride);
3160     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3161     cmpl(cnt1, stride*2);
3162     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3163     movdl(vec1, ch);
3164     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3165     vpxor(vec2, vec2);
3166     movl(tmp, cnt1);
3167     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3168     andl(cnt1,0x0000001F);  //tail count (in chars)
3169 
3170     bind(SCAN_TO_32_CHAR_LOOP);
3171     vmovdqu(vec3, Address(result, 0));
3172     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3173     vptest(vec2, vec3);
3174     jcc(Assembler::carryClear, FOUND_CHAR);
3175     addptr(result, 32);
3176     subl(tmp, stride*2);
3177     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3178     jmp(SCAN_TO_16_CHAR);
3179 
3180     bind(SCAN_TO_16_CHAR_INIT);
3181     movdl(vec1, ch);
3182     pxor(vec2, vec2);
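         // pshufb with an all-zero shuffle mask (vec2) broadcasts byte 0 of vec1,
         // i.e. the search byte, to all 16 byte lanes.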
3183     pshufb(vec1, vec2);
3184   }
3185 
3186   bind(SCAN_TO_16_CHAR);
3187   cmpl(cnt1, stride);
3188   jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
3189   if (UseAVX < 2) {
3190     movdl(vec1, ch);
3191     pxor(vec2, vec2);
3192     pshufb(vec1, vec2);
3193   }
3194   movl(tmp, cnt1);
3195   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3196   andl(cnt1,0x0000000F);  //tail count (in bytes)
3197 
3198   bind(SCAN_TO_16_CHAR_LOOP);
3199   movdqu(vec3, Address(result, 0));
3200   pcmpeqb(vec3, vec1);
3201   ptest(vec2, vec3);
3202   jcc(Assembler::carryClear, FOUND_CHAR);
3203   addptr(result, 16);
3204   subl(tmp, stride);
3205   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); // last 16 items...
3206 
3207   bind(SCAN_TO_CHAR_INIT);
3208   testl(cnt1, cnt1);
3209   jcc(Assembler::zero, RET_NOT_FOUND);
3210   bind(SCAN_TO_CHAR_LOOP);
3211   load_unsigned_byte(tmp, Address(result, 0));
3212   cmpl(ch, tmp);
3213   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3214   addptr(result, 1);
3215   subl(cnt1, 1);
3216   jccb(Assembler::zero, RET_NOT_FOUND);
3217   jmp(SCAN_TO_CHAR_LOOP);
3218 
3219   bind(RET_NOT_FOUND);
3220   movl(result, -1);
3221   jmpb(DONE_LABEL);
3222 
3223   bind(FOUND_CHAR);
3224   if (UseAVX >= 2) {
3225     vpmovmskb(tmp, vec3);
3226   } else {
3227     pmovmskb(tmp, vec3);
3228   }
3229   bsfl(ch, tmp);
3230   addptr(result, ch);
3231 
3232   bind(FOUND_SEQ_CHAR);
3233   subptr(result, str1);
3234 
3235   bind(DONE_LABEL);
3236 } // stringL_indexof_char
3237 
3238 // helper function for string_compare
3239 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3240                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3241                                            Address::ScaleFactor scale2, Register index, int ae) {
3242   if (ae == StrIntrinsicNode::LL) {
3243     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3244     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3245   } else if (ae == StrIntrinsicNode::UU) {
3246     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3247     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3248   } else {
3249     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3250     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3251   }
3252 }
3253 
3254 // Compare strings, used for char[] and byte[].
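     // Result convention (as implemented below): elements are compared up to
     // min(cnt1, cnt2); on the first mismatch the difference of the mismatching
     // elements is returned, otherwise the difference of the element counts
     // (for UL the sign of the result is flipped at the end).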
3255 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3256                                        Register cnt1, Register cnt2, Register result,
3257                                        XMMRegister vec1, int ae, KRegister mask) {
3258   ShortBranchVerifier sbv(this);
3259   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3260   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3261   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3262   int stride2x2 = 0x40;
3263   Address::ScaleFactor scale = Address::no_scale;
3264   Address::ScaleFactor scale1 = Address::no_scale;
3265   Address::ScaleFactor scale2 = Address::no_scale;
3266 
3267   if (ae != StrIntrinsicNode::LL) {
3268     stride2x2 = 0x20;
3269   }
3270 
3271   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3272     shrl(cnt2, 1);
3273   }
3274   // Compute the minimum of the string lengths and push the
3275   // difference of the string lengths onto the stack.
3276   // A conditional move selects the minimum into cnt2.
3277   movl(result, cnt1);
3278   subl(cnt1, cnt2);
3279   push(cnt1);
3280   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3281 
3282   // Is the minimum length zero?
3283   testl(cnt2, cnt2);
3284   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3285   if (ae == StrIntrinsicNode::LL) {
3286     // Load first bytes
3287     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3288     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3289   } else if (ae == StrIntrinsicNode::UU) {
3290     // Load first characters
3291     load_unsigned_short(result, Address(str1, 0));
3292     load_unsigned_short(cnt1, Address(str2, 0));
3293   } else {
3294     load_unsigned_byte(result, Address(str1, 0));
3295     load_unsigned_short(cnt1, Address(str2, 0));
3296   }
3297   subl(result, cnt1);
3298   jcc(Assembler::notZero,  POP_LABEL);
3299 
3300   if (ae == StrIntrinsicNode::UU) {
3301     // Divide length by 2 to get number of chars
3302     shrl(cnt2, 1);
3303   }
3304   cmpl(cnt2, 1);
3305   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3306 
3307   // Check if the strings start at the same location and set up scale and stride
3308   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3309     cmpptr(str1, str2);
3310     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3311     if (ae == StrIntrinsicNode::LL) {
3312       scale = Address::times_1;
3313       stride = 16;
3314     } else {
3315       scale = Address::times_2;
3316       stride = 8;
3317     }
3318   } else {
3319     scale1 = Address::times_1;
3320     scale2 = Address::times_2;
3321     // scale not used
3322     stride = 8;
3323   }
3324 
3325   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3326     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3327     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3328     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3329     Label COMPARE_TAIL_LONG;
3330     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3331 
3332     int pcmpmask = 0x19;
3333     if (ae == StrIntrinsicNode::LL) {
3334       pcmpmask &= ~0x01;
3335     }
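         // 0x19 selects the 'equal each' aggregation with negated result on unsigned
         // words; clearing bit 0 above switches the element size to unsigned bytes for LL.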
3336 
3337     // Set up to compare 16-char (32-byte) vectors; start from the first
3338     // character again because it has an aligned address.
3339     if (ae == StrIntrinsicNode::LL) {
3340       stride2 = 32;
3341     } else {
3342       stride2 = 16;
3343     }
3344     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3345       adr_stride = stride << scale;
3346     } else {
3347       adr_stride1 = 8;  //stride << scale1;
3348       adr_stride2 = 16; //stride << scale2;
3349     }
3350 
3351     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3352     // rax and rdx are used by pcmpestri as element counters
3353     movl(result, cnt2);
3354     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3355     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3356 
3357     // Fast path: compare the first two 8-char vectors.
3358     bind(COMPARE_16_CHARS);
3359     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3360       movdqu(vec1, Address(str1, 0));
3361     } else {
3362       pmovzxbw(vec1, Address(str1, 0));
3363     }
3364     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3365     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3366 
3367     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3368       movdqu(vec1, Address(str1, adr_stride));
3369       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3370     } else {
3371       pmovzxbw(vec1, Address(str1, adr_stride1));
3372       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3373     }
3374     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3375     addl(cnt1, stride);
3376 
3377     // Compare the characters at index in cnt1
3378     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3379     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3380     subl(result, cnt2);
3381     jmp(POP_LABEL);
3382 
3383     // Setup the registers to start vector comparison loop
3384     bind(COMPARE_WIDE_VECTORS);
3385     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3386       lea(str1, Address(str1, result, scale));
3387       lea(str2, Address(str2, result, scale));
3388     } else {
3389       lea(str1, Address(str1, result, scale1));
3390       lea(str2, Address(str2, result, scale2));
3391     }
3392     subl(result, stride2);
3393     subl(cnt2, stride2);
3394     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3395     negptr(result);
3396 
3397     // In a loop, compare 16 chars (32 bytes) at once using (vpxor+vptest)
3398     bind(COMPARE_WIDE_VECTORS_LOOP);
3399 
3400 #ifdef _LP64
3401     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3402       cmpl(cnt2, stride2x2);
3403       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3404       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3405       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3406 
3407       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3408       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3409         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3410         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3411       } else {
3412         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3413         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3414       }
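           // kortestql sets CF only when the 64-bit mask is all ones (every byte
           // pair compared equal); aboveEqual (CF == 0) therefore means a mismatch.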
3415       kortestql(mask, mask);
3416       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3417       addptr(result, stride2x2);  // update since we already compared at this addr
3418       subl(cnt2, stride2x2);      // and sub the size too
3419       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3420 
3421       vpxor(vec1, vec1);
3422       jmpb(COMPARE_WIDE_TAIL);
3423     }//if (VM_Version::supports_avx512vlbw())
3424 #endif // _LP64
3425 
3426 
3427     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3428     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3429       vmovdqu(vec1, Address(str1, result, scale));
3430       vpxor(vec1, Address(str2, result, scale));
3431     } else {
3432       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3433       vpxor(vec1, Address(str2, result, scale2));
3434     }
3435     vptest(vec1, vec1);
3436     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3437     addptr(result, stride2);
3438     subl(cnt2, stride2);
3439     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3440     // clean upper bits of YMM registers
3441     vpxor(vec1, vec1);
3442 
3443     // compare wide vectors tail
3444     bind(COMPARE_WIDE_TAIL);
3445     testptr(result, result);
3446     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3447 
3448     movl(result, stride2);
3449     movl(cnt2, result);
3450     negptr(result);
3451     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3452 
3453     // Identify the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3454     bind(VECTOR_NOT_EQUAL);
3455     // clean upper bits of YMM registers
3456     vpxor(vec1, vec1);
3457     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3458       lea(str1, Address(str1, result, scale));
3459       lea(str2, Address(str2, result, scale));
3460     } else {
3461       lea(str1, Address(str1, result, scale1));
3462       lea(str2, Address(str2, result, scale2));
3463     }
3464     jmp(COMPARE_16_CHARS);
3465 
3466     // Compare tail chars, length between 1 and 15 chars
3467     bind(COMPARE_TAIL_LONG);
3468     movl(cnt2, result);
3469     cmpl(cnt2, stride);
3470     jcc(Assembler::less, COMPARE_SMALL_STR);
3471 
3472     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3473       movdqu(vec1, Address(str1, 0));
3474     } else {
3475       pmovzxbw(vec1, Address(str1, 0));
3476     }
3477     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3478     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3479     subptr(cnt2, stride);
3480     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3481     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3482       lea(str1, Address(str1, result, scale));
3483       lea(str2, Address(str2, result, scale));
3484     } else {
3485       lea(str1, Address(str1, result, scale1));
3486       lea(str2, Address(str2, result, scale2));
3487     }
3488     negptr(cnt2);
3489     jmpb(WHILE_HEAD_LABEL);
3490 
3491     bind(COMPARE_SMALL_STR);
3492   } else if (UseSSE42Intrinsics) {
3493     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3494     int pcmpmask = 0x19;
3495     // Set up to compare 8-char (16-byte) vectors; start from the first
3496     // character again because it has an aligned address.
3497     movl(result, cnt2);
3498     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3499     if (ae == StrIntrinsicNode::LL) {
3500       pcmpmask &= ~0x01;
3501     }
3502     jcc(Assembler::zero, COMPARE_TAIL);
3503     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3504       lea(str1, Address(str1, result, scale));
3505       lea(str2, Address(str2, result, scale));
3506     } else {
3507       lea(str1, Address(str1, result, scale1));
3508       lea(str2, Address(str2, result, scale2));
3509     }
3510     negptr(result);
3511 
3512     // pcmpestri
3513     //   inputs:
3514     //     vec1- substring
3515     //     rax - negative string length (elements count)
3516     //     mem - scanned string
3517     //     rdx - string length (elements count)
3518     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3519     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3520     //   outputs:
3521     //     rcx - first mismatched element index
3522     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3523 
3524     bind(COMPARE_WIDE_VECTORS);
3525     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3526       movdqu(vec1, Address(str1, result, scale));
3527       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3528     } else {
3529       pmovzxbw(vec1, Address(str1, result, scale1));
3530       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3531     }
3532     // After pcmpestri cnt1(rcx) contains mismatched element index
3533 
3534     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3535     addptr(result, stride);
3536     subptr(cnt2, stride);
3537     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3538 
3539     // compare wide vectors tail
3540     testptr(result, result);
3541     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3542 
3543     movl(cnt2, stride);
3544     movl(result, stride);
3545     negptr(result);
3546     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3547       movdqu(vec1, Address(str1, result, scale));
3548       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3549     } else {
3550       pmovzxbw(vec1, Address(str1, result, scale1));
3551       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3552     }
3553     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3554 
3555     // Mismatched characters in the vectors
3556     bind(VECTOR_NOT_EQUAL);
3557     addptr(cnt1, result);
3558     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3559     subl(result, cnt2);
3560     jmpb(POP_LABEL);
3561 
3562     bind(COMPARE_TAIL); // limit is zero
3563     movl(cnt2, result);
3564     // Fallthru to tail compare
3565   }
3566   // Shift str2 and str1 to the end of the arrays, negate min
3567   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3568     lea(str1, Address(str1, cnt2, scale));
3569     lea(str2, Address(str2, cnt2, scale));
3570   } else {
3571     lea(str1, Address(str1, cnt2, scale1));
3572     lea(str2, Address(str2, cnt2, scale2));
3573   }
3574   decrementl(cnt2);  // first character was compared already
3575   negptr(cnt2);
3576 
3577   // Compare the rest of the elements
3578   bind(WHILE_HEAD_LABEL);
3579   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3580   subl(result, cnt1);
3581   jccb(Assembler::notZero, POP_LABEL);
3582   increment(cnt2);
3583   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3584 
3585   // Strings are equal up to min length.  Return the length difference.
3586   bind(LENGTH_DIFF_LABEL);
3587   pop(result);
3588   if (ae == StrIntrinsicNode::UU) {
3589     // Divide diff by 2 to get number of chars
3590     sarl(result, 1);
3591   }
3592   jmpb(DONE_LABEL);
3593 
3594 #ifdef _LP64
3595   if (VM_Version::supports_avx512vlbw()) {
3596 
3597     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3598 
3599     kmovql(cnt1, mask);
3600     notq(cnt1);
3601     bsfq(cnt2, cnt1);
3602     if (ae != StrIntrinsicNode::LL) {
3603       // Divide diff by 2 to get number of chars
3604       sarl(cnt2, 1);
3605     }
3606     addq(result, cnt2);
3607     if (ae == StrIntrinsicNode::LL) {
3608       load_unsigned_byte(cnt1, Address(str2, result));
3609       load_unsigned_byte(result, Address(str1, result));
3610     } else if (ae == StrIntrinsicNode::UU) {
3611       load_unsigned_short(cnt1, Address(str2, result, scale));
3612       load_unsigned_short(result, Address(str1, result, scale));
3613     } else {
3614       load_unsigned_short(cnt1, Address(str2, result, scale2));
3615       load_unsigned_byte(result, Address(str1, result, scale1));
3616     }
3617     subl(result, cnt1);
3618     jmpb(POP_LABEL);
3619   }//if (VM_Version::supports_avx512vlbw())
3620 #endif // _LP64
3621 
3622   // Discard the stored length difference
3623   bind(POP_LABEL);
3624   pop(cnt1);
3625 
3626   // That's it
3627   bind(DONE_LABEL);
3628   if (ae == StrIntrinsicNode::UL) {
3629     negl(result);
3630   }
3631 
3632 }
3633 
3634 // Search for a non-ASCII character (negative byte value) in a byte array;
3635 // return the index of the first such character, otherwise the length
3636 // of the array segment searched.
3637 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3638 //   @IntrinsicCandidate
3639 //   public static int countPositives(byte[] ba, int off, int len) {
3640 //     for (int i = off; i < off + len; i++) {
3641 //       if (ba[i] < 0) {
3642 //         return i - off;
3643 //       }
3644 //     }
3645 //     return len;
3646 //   }
3647 void C2_MacroAssembler::count_positives(Register ary1, Register len,
3648   Register result, Register tmp1,
3649   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
3650   // rsi: byte array
3651   // rcx: len
3652   // rax: result
3653   ShortBranchVerifier sbv(this);
3654   assert_different_registers(ary1, len, result, tmp1);
3655   assert_different_registers(vec1, vec2);
3656   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
3657 
3658   movl(result, len); // copy
3659   // len == 0
3660   testl(len, len);
3661   jcc(Assembler::zero, DONE);
3662 
3663   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
3664     VM_Version::supports_avx512vlbw() &&
3665     VM_Version::supports_bmi2()) {
3666 
3667     Label test_64_loop, test_tail, BREAK_LOOP;
3668     Register tmp3_aliased = len;
3669 
3670     movl(tmp1, len);
3671     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
3672 
3673     andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
3674     andl(len, ~(64 - 1));    // vector count (in chars)
3675     jccb(Assembler::zero, test_tail);
3676 
3677     lea(ary1, Address(ary1, len, Address::times_1));
3678     negptr(len);
3679 
3680     bind(test_64_loop);
3681     // Check whether our 64 elements of size byte contain negatives
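         // vec2 is all-zero; evpcmpgtb sets a mask bit for every byte b where 0 > b,
         // i.e. for every negative byte in the 64-byte chunk.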
3682     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
3683     kortestql(mask1, mask1);
3684     jcc(Assembler::notZero, BREAK_LOOP);
3685 
3686     addptr(len, 64);
3687     jccb(Assembler::notZero, test_64_loop);
3688 
3689     bind(test_tail);
3690     // bail out when there is nothing to be done
3691     testl(tmp1, -1);
3692     jcc(Assembler::zero, DONE);
3693 
3694     // ~(~0 << len) applied up to two times (for 32-bit scenario)
3695 #ifdef _LP64
3696     mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
3697     shlxq(tmp3_aliased, tmp3_aliased, tmp1);
3698     notq(tmp3_aliased);
3699     kmovql(mask2, tmp3_aliased);
3700 #else
3701     Label k_init;
3702     jmp(k_init);
3703 
3704     // We cannot load 64 bits into a general purpose register on 32-bit, so we
3705     // place the data needed to compose the mask into the instruction stream.
3706     // We emit a 64-byte series of the values 0..63 which is later used as a
3707     // compare target against the tail count contained in the tmp1 register.
3708     // The result is a k register with tmp1 consecutive 1 bits, counting from
3709     // the least significant bit.
3710     address tmp = pc();
3711     emit_int64(0x0706050403020100);
3712     emit_int64(0x0F0E0D0C0B0A0908);
3713     emit_int64(0x1716151413121110);
3714     emit_int64(0x1F1E1D1C1B1A1918);
3715     emit_int64(0x2726252423222120);
3716     emit_int64(0x2F2E2D2C2B2A2928);
3717     emit_int64(0x3736353433323130);
3718     emit_int64(0x3F3E3D3C3B3A3938);
3719 
3720     bind(k_init);
3721     lea(len, InternalAddress(tmp));
3722     // create mask to test for negative byte inside a vector
3723     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
3724     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
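         // vec1 now holds the tail count in every byte lane; comparing it (signed >)
         // against the constants 0..63 emitted above sets exactly the low 'tail count'
         // bits of mask2, mirroring the shlx/not sequence used on 64-bit.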
3725 
3726 #endif
3727     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
3728     ktestq(mask1, mask2);
3729     jcc(Assembler::zero, DONE);
3730 
3731     bind(BREAK_LOOP);
3732     // At least one byte in the last 64 bytes is negative.
3733     // Set up to look at the last 64 bytes as if they were a tail
3734     lea(ary1, Address(ary1, len, Address::times_1));
3735     addptr(result, len);
3736     // Ignore the very last byte: if all others are positive,
3737     // it must be negative, so we can skip right to the 2+1 byte
3738     // end comparison at this point
3739     orl(result, 63);
3740     movl(len, 63);
3741     // Fallthru to tail compare
3742   } else {
3743 
3744     if (UseAVX >= 2 && UseSSE >= 2) {
3745       // With AVX2, use 32-byte vector compare
3746       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
3747 
3748       // Compare 32-byte vectors
3749       testl(len, 0xffffffe0);   // vector count (in bytes)
3750       jccb(Assembler::zero, TAIL_START);
3751 
3752       andl(len, 0xffffffe0);
3753       lea(ary1, Address(ary1, len, Address::times_1));
3754       negptr(len);
3755 
3756       movl(tmp1, 0x80808080);   // create mask to test for negative bytes (sign bit set) in vector
3757       movdl(vec2, tmp1);
3758       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
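           // In the loop below, vptest(vec1, vec2) sets ZF only when (vec1 AND vec2)
           // is zero, i.e. when no loaded byte has its sign bit (0x80) set; notZero
           // therefore means a negative byte was found in this 32-byte chunk.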
3759 
3760       bind(COMPARE_WIDE_VECTORS);
3761       vmovdqu(vec1, Address(ary1, len, Address::times_1));
3762       vptest(vec1, vec2);
3763       jccb(Assembler::notZero, BREAK_LOOP);
3764       addptr(len, 32);
3765       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3766 
3767       testl(result, 0x0000001f);   // any bytes remaining?
3768       jcc(Assembler::zero, DONE);
3769 
3770       // Quick test using the already prepared vector mask
3771       movl(len, result);
3772       andl(len, 0x0000001f);
3773       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
3774       vptest(vec1, vec2);
3775       jcc(Assembler::zero, DONE);
3776       // There are zeros, jump to the tail to determine exactly where
3777       jmpb(TAIL_START);
3778 
3779       bind(BREAK_LOOP);
3780       // At least one byte in the last 32-byte vector is negative.
3781       // Set up to look at the last 32 bytes as if they were a tail
3782       lea(ary1, Address(ary1, len, Address::times_1));
3783       addptr(result, len);
3784       // Ignore the very last byte: if all others are positive,
3785       // it must be negative, so we can skip right to the 2+1 byte
3786       // end comparison at this point
3787       orl(result, 31);
3788       movl(len, 31);
3789       // Fallthru to tail compare
3790     } else if (UseSSE42Intrinsics) {
3791       // With SSE4.2, use double quad vector compare
3792       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
3793 
3794       // Compare 16-byte vectors
3795       testl(len, 0xfffffff0);   // vector count (in bytes)
3796       jcc(Assembler::zero, TAIL_START);
3797 
3798       andl(len, 0xfffffff0);
3799       lea(ary1, Address(ary1, len, Address::times_1));
3800       negptr(len);
3801 
3802       movl(tmp1, 0x80808080);
3803       movdl(vec2, tmp1);
3804       pshufd(vec2, vec2, 0);
3805 
3806       bind(COMPARE_WIDE_VECTORS);
3807       movdqu(vec1, Address(ary1, len, Address::times_1));
3808       ptest(vec1, vec2);
3809       jccb(Assembler::notZero, BREAK_LOOP);
3810       addptr(len, 16);
3811       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3812 
3813       testl(result, 0x0000000f); // len is zero, any bytes remaining?
3814       jcc(Assembler::zero, DONE);
3815 
3816       // Quick test using the already prepared vector mask
3817       movl(len, result);
3818       andl(len, 0x0000000f);   // tail count (in bytes)
3819       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
3820       ptest(vec1, vec2);
3821       jcc(Assembler::zero, DONE);
3822       jmpb(TAIL_START);
3823 
3824       bind(BREAK_LOOP);
3825       // At least one byte in the last 16-byte vector is negative.
3826       // Set up and look at the last 16 bytes as if they were a tail
3827       lea(ary1, Address(ary1, len, Address::times_1));
3828       addptr(result, len);
3829       // Ignore the very last byte: if all others are positive,
3830       // it must be negative, so we can skip right to the 2+1 byte
3831       // end comparison at this point
3832       orl(result, 15);
3833       movl(len, 15);
3834       // Fallthru to tail compare
3835     }
3836   }
3837 
3838   bind(TAIL_START);
3839   // Compare 4-byte vectors
3840   andl(len, 0xfffffffc); // vector count (in bytes)
3841   jccb(Assembler::zero, COMPARE_CHAR);
3842 
3843   lea(ary1, Address(ary1, len, Address::times_1));
3844   negptr(len);
3845 
3846   bind(COMPARE_VECTORS);
3847   movl(tmp1, Address(ary1, len, Address::times_1));
3848   andl(tmp1, 0x80808080);
3849   jccb(Assembler::notZero, TAIL_ADJUST);
3850   addptr(len, 4);
3851   jccb(Assembler::notZero, COMPARE_VECTORS);
3852 
3853   // Compare trailing char (final 2-3 bytes), if any
3854   bind(COMPARE_CHAR);
3855 
3856   testl(result, 0x2);   // tail  char
3857   jccb(Assembler::zero, COMPARE_BYTE);
3858   load_unsigned_short(tmp1, Address(ary1, 0));
3859   andl(tmp1, 0x00008080);
3860   jccb(Assembler::notZero, CHAR_ADJUST);
3861   lea(ary1, Address(ary1, 2));
3862 
3863   bind(COMPARE_BYTE);
3864   testl(result, 0x1);   // tail  byte
3865   jccb(Assembler::zero, DONE);
3866   load_unsigned_byte(tmp1, Address(ary1, 0));
3867   testl(tmp1, 0x00000080);
3868   jccb(Assembler::zero, DONE);
3869   subptr(result, 1);
3870   jmpb(DONE);
3871 
3872   bind(TAIL_ADJUST);
3873   // There is a negative byte in the last 4-byte block.
3874   // Adjust result and check the next three bytes
3875   addptr(result, len);
3876   orl(result, 3);
3877   lea(ary1, Address(ary1, len, Address::times_1));
3878   jmpb(COMPARE_CHAR);
3879 
3880   bind(CHAR_ADJUST);
3881   // We are looking at a char + optional byte tail, and found that one
3882   // of the bytes in the char is negative. Adjust the result, check the
3883   // first byte and readjust if needed.
3884   andl(result, 0xfffffffc);
3885   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
3886   jccb(Assembler::notZero, DONE);
3887   addptr(result, 1);
3888 
3889   // That's it
3890   bind(DONE);
3891   if (UseAVX >= 2 && UseSSE >= 2) {
3892     // clean upper bits of YMM registers
3893     vpxor(vec1, vec1);
3894     vpxor(vec2, vec2);
3895   }
3896 }
3897 
3898 // Compare char[] or byte[] arrays (aligned to 4 bytes) or substrings.
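     // When 'is_array_equ' is true, ary1/ary2 are array oops and the usual identity,
     // null and length checks are emitted before the element compare; otherwise the
     // callers pass raw element addresses with the element count already in 'limit'.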
3899 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
3900                                       Register limit, Register result, Register chr,
3901                                       XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
3902   ShortBranchVerifier sbv(this);
3903   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
3904 
3905   int length_offset  = arrayOopDesc::length_offset_in_bytes();
3906   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
3907 
3908   if (is_array_equ) {
3909     // Check the input args
3910     cmpoop(ary1, ary2);
3911     jcc(Assembler::equal, TRUE_LABEL);
3912 
3913     // Need additional checks for arrays_equals.
3914     testptr(ary1, ary1);
3915     jcc(Assembler::zero, FALSE_LABEL);
3916     testptr(ary2, ary2);
3917     jcc(Assembler::zero, FALSE_LABEL);
3918 
3919     // Check the lengths
3920     movl(limit, Address(ary1, length_offset));
3921     cmpl(limit, Address(ary2, length_offset));
3922     jcc(Assembler::notEqual, FALSE_LABEL);
3923   }
3924 
3925   // count == 0
3926   testl(limit, limit);
3927   jcc(Assembler::zero, TRUE_LABEL);
3928 
3929   if (is_array_equ) {
3930     // Load array address
3931     lea(ary1, Address(ary1, base_offset));
3932     lea(ary2, Address(ary2, base_offset));
3933   }
3934 
3935   if (is_array_equ && is_char) {
3936     // arrays_equals when used for char[].
3937     shll(limit, 1);      // byte count != 0
3938   }
3939   movl(result, limit); // copy
3940 
3941   if (UseAVX >= 2) {
3942     // With AVX2, use 32-byte vector compare
3943     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3944 
3945     // Compare 32-byte vectors
3946     andl(result, 0x0000001f);  //   tail count (in bytes)
3947     andl(limit, 0xffffffe0);   // vector count (in bytes)
3948     jcc(Assembler::zero, COMPARE_TAIL);
3949 
3950     lea(ary1, Address(ary1, limit, Address::times_1));
3951     lea(ary2, Address(ary2, limit, Address::times_1));
3952     negptr(limit);
3953 
3954 #ifdef _LP64
3955     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3956       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
3957 
3958       cmpl(limit, -64);
3959       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3960 
3961       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3962 
3963       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
3964       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
3965       kortestql(mask, mask);
3966       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
3967       addptr(limit, 64);  // update since we already compared at this addr
3968       cmpl(limit, -64);
3969       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3970 
3971       // At this point we may still need to compare -limit+result bytes.
3972       // We could execute the next two instructions and just continue via the non-wide path:
3973       //  cmpl(limit, 0);
3974       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
3975       // But since we stopped at the points ary{1,2}+limit which are
3976       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
3977       // (|limit| <= 32 and result < 32),
3978       // we may just compare the last 64 bytes.
3979       //
3980       addptr(result, -64);   // it is safe because we just came from this area
3981       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
3982       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
3983       kortestql(mask, mask);
3984       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
3985 
3986       jmp(TRUE_LABEL);
3987 
3988       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3989 
3990     }//if (VM_Version::supports_avx512vlbw())
3991 #endif //_LP64
3992     bind(COMPARE_WIDE_VECTORS);
3993     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
3994     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
3995     vpxor(vec1, vec2);
3996 
3997     vptest(vec1, vec1);
3998     jcc(Assembler::notZero, FALSE_LABEL);
3999     addptr(limit, 32);
4000     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4001 
4002     testl(result, result);
4003     jcc(Assembler::zero, TRUE_LABEL);
4004 
4005     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
4006     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4007     vpxor(vec1, vec2);
4008 
4009     vptest(vec1, vec1);
4010     jccb(Assembler::notZero, FALSE_LABEL);
4011     jmpb(TRUE_LABEL);
4012 
4013     bind(COMPARE_TAIL); // limit is zero
4014     movl(limit, result);
4015     // Fallthru to tail compare
4016   } else if (UseSSE42Intrinsics) {
4017     // With SSE4.2, use double quad vector compare
4018     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4019 
4020     // Compare 16-byte vectors
4021     andl(result, 0x0000000f);  //   tail count (in bytes)
4022     andl(limit, 0xfffffff0);   // vector count (in bytes)
4023     jcc(Assembler::zero, COMPARE_TAIL);
4024 
4025     lea(ary1, Address(ary1, limit, Address::times_1));
4026     lea(ary2, Address(ary2, limit, Address::times_1));
4027     negptr(limit);
4028 
4029     bind(COMPARE_WIDE_VECTORS);
4030     movdqu(vec1, Address(ary1, limit, Address::times_1));
4031     movdqu(vec2, Address(ary2, limit, Address::times_1));
4032     pxor(vec1, vec2);
4033 
4034     ptest(vec1, vec1);
4035     jcc(Assembler::notZero, FALSE_LABEL);
4036     addptr(limit, 16);
4037     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4038 
4039     testl(result, result);
4040     jcc(Assembler::zero, TRUE_LABEL);
4041 
4042     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4043     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4044     pxor(vec1, vec2);
4045 
4046     ptest(vec1, vec1);
4047     jccb(Assembler::notZero, FALSE_LABEL);
4048     jmpb(TRUE_LABEL);
4049 
4050     bind(COMPARE_TAIL); // limit is zero
4051     movl(limit, result);
4052     // Fallthru to tail compare
4053   }
4054 
4055   // Compare 4-byte vectors
4056   andl(limit, 0xfffffffc); // vector count (in bytes)
4057   jccb(Assembler::zero, COMPARE_CHAR);
4058 
4059   lea(ary1, Address(ary1, limit, Address::times_1));
4060   lea(ary2, Address(ary2, limit, Address::times_1));
4061   negptr(limit);
4062 
4063   bind(COMPARE_VECTORS);
4064   movl(chr, Address(ary1, limit, Address::times_1));
4065   cmpl(chr, Address(ary2, limit, Address::times_1));
4066   jccb(Assembler::notEqual, FALSE_LABEL);
4067   addptr(limit, 4);
4068   jcc(Assembler::notZero, COMPARE_VECTORS);
4069 
4070   // Compare trailing char (final 2 bytes), if any
4071   bind(COMPARE_CHAR);
4072   testl(result, 0x2);   // tail  char
4073   jccb(Assembler::zero, COMPARE_BYTE);
4074   load_unsigned_short(chr, Address(ary1, 0));
4075   load_unsigned_short(limit, Address(ary2, 0));
4076   cmpl(chr, limit);
4077   jccb(Assembler::notEqual, FALSE_LABEL);
4078 
4079   if (is_array_equ && is_char) {
4080     bind(COMPARE_BYTE);
4081   } else {
4082     lea(ary1, Address(ary1, 2));
4083     lea(ary2, Address(ary2, 2));
4084 
4085     bind(COMPARE_BYTE);
4086     testl(result, 0x1);   // tail  byte
4087     jccb(Assembler::zero, TRUE_LABEL);
4088     load_unsigned_byte(chr, Address(ary1, 0));
4089     load_unsigned_byte(limit, Address(ary2, 0));
4090     cmpl(chr, limit);
4091     jccb(Assembler::notEqual, FALSE_LABEL);
4092   }
4093   bind(TRUE_LABEL);
4094   movl(result, 1);   // return true
4095   jmpb(DONE);
4096 
4097   bind(FALSE_LABEL);
4098   xorl(result, result); // return false
4099 
4100   // That's it
4101   bind(DONE);
4102   if (UseAVX >= 2) {
4103     // clean upper bits of YMM registers
4104     vpxor(vec1, vec1);
4105     vpxor(vec2, vec2);
4106   }
4107 }
4108 
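     // Dispatch helper for AVX-512 masked vector operations: maps a C2 ideal opcode to
     // the corresponding EVEX-encoded instruction, applying the opmask in 'mask'.
     // 'merge' selects merge-masking (unselected lanes keep their dst value) versus
     // zero-masking, as with EVEX masking in general.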
4109 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4110                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4111   switch(ideal_opc) {
4112     case Op_LShiftVS:
4113       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4114     case Op_LShiftVI:
4115       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4116     case Op_LShiftVL:
4117       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4118     case Op_RShiftVS:
4119       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4120     case Op_RShiftVI:
4121       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4122     case Op_RShiftVL:
4123       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4124     case Op_URShiftVS:
4125       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4126     case Op_URShiftVI:
4127       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4128     case Op_URShiftVL:
4129       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4130     case Op_RotateRightV:
4131       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4132     case Op_RotateLeftV:
4133       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4134     default:
4135       fatal("Unsupported masked operation"); break;
4136   }
4137 }
4138 
4139 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4140                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4141                                     bool is_varshift) {
4142   switch (ideal_opc) {
4143     case Op_AddVB:
4144       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4145     case Op_AddVS:
4146       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4147     case Op_AddVI:
4148       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4149     case Op_AddVL:
4150       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4151     case Op_AddVF:
4152       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4153     case Op_AddVD:
4154       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4155     case Op_SubVB:
4156       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4157     case Op_SubVS:
4158       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4159     case Op_SubVI:
4160       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4161     case Op_SubVL:
4162       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4163     case Op_SubVF:
4164       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4165     case Op_SubVD:
4166       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4167     case Op_MulVS:
4168       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4169     case Op_MulVI:
4170       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4171     case Op_MulVL:
4172       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4173     case Op_MulVF:
4174       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4175     case Op_MulVD:
4176       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4177     case Op_DivVF:
4178       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4179     case Op_DivVD:
4180       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4181     case Op_SqrtVF:
4182       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4183     case Op_SqrtVD:
4184       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4185     case Op_AbsVB:
4186       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4187     case Op_AbsVS:
4188       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4189     case Op_AbsVI:
4190       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4191     case Op_AbsVL:
4192       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4193     case Op_FmaVF:
4194       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4195     case Op_FmaVD:
4196       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4197     case Op_VectorRearrange:
4198       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4199     case Op_LShiftVS:
4200       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4201     case Op_LShiftVI:
4202       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4203     case Op_LShiftVL:
4204       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4205     case Op_RShiftVS:
4206       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4207     case Op_RShiftVI:
4208       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4209     case Op_RShiftVL:
4210       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4211     case Op_URShiftVS:
4212       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4213     case Op_URShiftVI:
4214       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4215     case Op_URShiftVL:
4216       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4217     case Op_RotateLeftV:
4218       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4219     case Op_RotateRightV:
4220       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4221     case Op_MaxV:
4222       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4223     case Op_MinV:
4224       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4225     case Op_XorV:
4226       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4227     case Op_OrV:
4228       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4229     case Op_AndV:
4230       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4231     default:
4232       fatal("Unsupported masked operation"); break;
4233   }
4234 }
4235 
4236 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4237                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4238   switch (ideal_opc) {
4239     case Op_AddVB:
4240       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4241     case Op_AddVS:
4242       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4243     case Op_AddVI:
4244       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4245     case Op_AddVL:
4246       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4247     case Op_AddVF:
4248       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4249     case Op_AddVD:
4250       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4251     case Op_SubVB:
4252       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4253     case Op_SubVS:
4254       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4255     case Op_SubVI:
4256       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4257     case Op_SubVL:
4258       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4259     case Op_SubVF:
4260       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4261     case Op_SubVD:
4262       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4263     case Op_MulVS:
4264       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4265     case Op_MulVI:
4266       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4267     case Op_MulVL:
4268       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4269     case Op_MulVF:
4270       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4271     case Op_MulVD:
4272       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4273     case Op_DivVF:
4274       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4275     case Op_DivVD:
4276       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4277     case Op_FmaVF:
4278       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4279     case Op_FmaVD:
4280       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4281     case Op_MaxV:
4282       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4283     case Op_MinV:
4284       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4285     case Op_XorV:
4286       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4287     case Op_OrV:
4288       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4289     case Op_AndV:
4290       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4291     default:
4292       fatal("Unsupported masked operation"); break;
4293   }
4294 }
4295 
4296 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4297                                   KRegister src1, KRegister src2) {
4298   BasicType etype = T_ILLEGAL;
4299   switch(mask_len) {
4300     case 2:
4301     case 4:
4302     case 8:  etype = T_BYTE; break;
4303     case 16: etype = T_SHORT; break;
4304     case 32: etype = T_INT; break;
4305     case 64: etype = T_LONG; break;
4306     default: fatal("Unsupported type"); break;
4307   }
4308   assert(etype != T_ILLEGAL, "");
4309   switch(ideal_opc) {
4310     case Op_AndVMask:
4311       kand(etype, dst, src1, src2); break;
4312     case Op_OrVMask:
4313       kor(etype, dst, src1, src2); break;
4314     case Op_XorVMask:
4315       kxor(etype, dst, src1, src2); break;
4316     default:
4317       fatal("Unsupported masked operation"); break;
4318   }
4319 }
4320 
4321 /*
4322  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4323  * If src is NaN, the result is 0.
4324  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4325  * the result is equal to the value of Integer.MIN_VALUE.
4326  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4327  * the result is equal to the value of Integer.MAX_VALUE.
4328  */
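     // A minimal scalar sketch of the per-lane semantics described above
     // (illustrative only, not part of the build), using the HotSpot jint/jfloat
     // typedefs and the min_jint/max_jint constants:
     //
     //   static jint f2i_java(jfloat f) {
     //     if (f != f)                 return 0;         // NaN
     //     if (f <= (jfloat)min_jint)  return min_jint;  // -Inf and underflow
     //     if (f >= (jfloat)max_jint)  return max_jint;  // +Inf and overflow
     //     return (jint)f;                               // in-range: truncate
     //   }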
4329 void C2_MacroAssembler::vector_cast_float_special_cases_avx(XMMRegister dst, XMMRegister src, AddressLiteral float_sign_flip, int vec_enc,
4330                                                             XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4331                                                             Register rscratch) {
4332   Label done;
4333   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4334   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4335   vptest(xtmp2, xtmp2, vec_enc);
4336   jccb(Assembler::equal, done);
4337 
4338   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4339   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4340 
4341   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4342   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4343   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4344 
4345   // Recompute the mask for remaining special value.
4346   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4347   // Extract SRC values corresponding to TRUE mask lanes.
4348   vpand(xtmp4, xtmp2, src, vec_enc);
4349   // Flip mask bits so that the MSB of MASK lanes corresponding to positive
4350   // special values is set.
4351   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4352 
4353   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4354   bind(done);
4355 }
4356 
4357 void C2_MacroAssembler::vector_cast_float_special_cases_evex(XMMRegister dst, XMMRegister src, AddressLiteral float_sign_flip, int vec_enc,
4358                                                              XMMRegister xtmp1, XMMRegister xtmp2,
4359                                                              KRegister ktmp1, KRegister ktmp2,
4360                                                              Register rscratch) {
4361   Label done;
4362   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4363   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4364   kortestwl(ktmp1, ktmp1);
4365   jccb(Assembler::equal, done);
4366 
4367   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4368   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4369   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4370 
4371   kxorwl(ktmp1, ktmp1, ktmp2);
4372   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4373   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4374   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4375   bind(done);
4376 }
4377 
4378 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src,
4379                                                                      AddressLiteral double_sign_flip, int vec_enc,
4380                                                                      XMMRegister xtmp1, XMMRegister xtmp2,
4381                                                                      KRegister ktmp1, KRegister ktmp2,
4382                                                                      Register rscratch) {
4383   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4384 
4385   Label done;
4386   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4387   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
4388   kortestwl(ktmp1, ktmp1);
4389   jccb(Assembler::equal, done);
4390 
4391   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4392   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4393   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4394 
4395   kxorwl(ktmp1, ktmp1, ktmp2);
4396   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4397   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4398   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4399   bind(done);
4400 }
4401 
4402 /*
4403  * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4404  * If src is NaN, the result is 0.
4405  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
4406  * the result is equal to the value of Long.MIN_VALUE.
4407  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
4408  * the result is equal to the value of Long.MAX_VALUE.
4409  */
4410 void C2_MacroAssembler::vector_cast_double_special_cases_evex(XMMRegister dst, XMMRegister src,
4411                                                               AddressLiteral double_sign_flip, int vec_enc,
4412                                                               XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4413                                                               Register rscratch) {
4414   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4415 
4416   Label done;
4417   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4418   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
4419   kortestwl(ktmp1, ktmp1);
4420   jccb(Assembler::equal, done);
4421 
4422   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4423   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4424   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4425 
4426   kxorwl(ktmp1, ktmp1, ktmp2);
4427   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4428   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4429   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4430   bind(done);
4431 }
4432 
4433 /*
4434  * Algorithm for vector D2L and F2I conversions:
4435  * a) Perform vector D2L/F2I cast.
4436  * b) Choose the fast path if none of the result vector lanes contains the value 0x80000000.
4437  *    A lane holding 0x80000000 signifies that the source value could be any of the special
4438  *    floating point values (NaN, -Inf, Inf, Max, -Min).
4439  * c) Set the destination lane to zero if the source lane is NaN.
4440  * d) Replace 0x80000000 with MaxInt if the source lane contains a positive value.
4441  */
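     // Worked example of the fast-path check (illustrative): vcvttps2dq on the lanes
     // {1.5f, NaN, +Inf, -3.1e9f} yields {1, 0x80000000, 0x80000000, 0x80000000}.
     // Only lanes equal to the 0x80000000 sentinel can stem from a special value, so the
     // fix-up code is skipped whenever no lane matches it; otherwise NaN lanes are zeroed
     // and positive out-of-range lanes are rewritten, giving {1, 0, MaxInt, MinInt}.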
4442 
4443 void C2_MacroAssembler::vector_castD2L_evex(XMMRegister dst, XMMRegister src, AddressLiteral double_sign_flip, int vec_enc,
4444                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, Register rscratch) {
4445   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4446 
4447   evcvttpd2qq(dst, src, vec_enc);
4448   vector_cast_double_special_cases_evex(dst, src, double_sign_flip, vec_enc,
4449                                         xtmp1, xtmp2, ktmp1, ktmp2, rscratch);
4450 }
4451 
4452 void C2_MacroAssembler::vector_castF2I_avx(XMMRegister dst, XMMRegister src, AddressLiteral float_sign_flip, int vec_enc,
4453                                            XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, Register rscratch) {
4454   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4455 
4456   vcvttps2dq(dst, src, vec_enc);
4457   vector_cast_float_special_cases_avx(dst, src, float_sign_flip, vec_enc,
4458                                       xtmp1, xtmp2, xtmp3, xtmp4, rscratch);
4459 }
4460 
4461 void C2_MacroAssembler::vector_castF2I_evex(XMMRegister dst, XMMRegister src, AddressLiteral float_sign_flip, int vec_enc,
4462                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, Register rscratch) {
4463   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4464 
4465   vcvttps2dq(dst, src, vec_enc);
4466   vector_cast_float_special_cases_evex(dst, src, float_sign_flip, vec_enc,
4467                                        xtmp1, xtmp2, ktmp1, ktmp2, rscratch);
4468 }
4469 
4470 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, AddressLiteral float_sign_flip, int vec_enc,
4471                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, Register rscratch) {
4472   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4473 
4474   evcvttps2qq(dst, src, vec_enc);
4475   vector_cast_float_to_long_special_cases_evex(dst, src, float_sign_flip, vec_enc,
4476                                                xtmp1, xtmp2, ktmp1, ktmp2, rscratch);
4477 }
4478 
4479 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, AddressLiteral double_sign_flip, int vec_enc,
4480                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, Register rscratch) {
4481   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4482 
4483   vector_castD2L_evex(dst, src, double_sign_flip, vec_enc,
4484                       xtmp1, xtmp2, ktmp1, ktmp2, rscratch);
4485   if (to_elem_bt != T_LONG) {
4486     switch(to_elem_bt) {
4487       case T_INT:
4488         evpmovsqd(dst, dst, vec_enc);
4489         break;
4490       case T_SHORT:
4491         evpmovsqd(dst, dst, vec_enc);
4492         evpmovdw(dst, dst, vec_enc);
4493         break;
4494       case T_BYTE:
4495         evpmovsqd(dst, dst, vec_enc);
4496         evpmovdb(dst, dst, vec_enc);
4497         break;
4498       default: assert(false, "%s", type2name(to_elem_bt));
4499     }
4500   }
4501 }
4502 
4503 #ifdef _LP64
4504 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
4505                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4506                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
4507   // Perform the floor(val + 0.5) operation with MXCSR.RC set to round-towards -inf,
4508   // then restore the original MXCSR.RC mode afterwards.
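       // For example (illustrative), with RC = round-towards -inf:
       //   floor( 2.5 + 0.5) =  3   and   floor(-2.5 + 0.5) = -2
       // which matches Math.round() for in-range inputs; special values are fixed up
       // afterwards by vector_cast_double_special_cases_evex().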
4509   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4510 
4511   mov64(tmp, julong_cast(0.5L));
4512   evpbroadcastq(xtmp1, tmp, vec_enc);
4513   vaddpd(xtmp1, src, xtmp1, vec_enc);
4514   evcvtpd2qq(dst, xtmp1, vec_enc);
4515   vector_cast_double_special_cases_evex(dst, src, double_sign_flip, vec_enc,
4516                                         xtmp1, xtmp2, ktmp1, ktmp2, tmp);
4517 
4518   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4519 }
4520 
4521 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
4522                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4523                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
4524   // Perform the floor(val + 0.5) operation with MXCSR.RC set to round-towards -inf,
4525   // then restore the original MXCSR.RC mode afterwards.
4526   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4527 
4528   movl(tmp, jint_cast(0.5));
4529   movq(xtmp1, tmp);
4530   vbroadcastss(xtmp1, xtmp1, vec_enc);
4531   vaddps(xtmp1, src, xtmp1, vec_enc);
4532   vcvtps2dq(dst, xtmp1, vec_enc);
4533   vector_cast_float_special_cases_evex(dst, src, float_sign_flip, vec_enc,
4534                                        xtmp1, xtmp2, ktmp1, ktmp2, tmp);
4535 
4536   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4537 }
4538 
4539 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
4540                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4541                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
4542   // Perform the floor(val + 0.5) operation with MXCSR.RC set to round-towards -inf,
4543   // then restore the original MXCSR.RC mode afterwards.
4544   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4545 
4546   movl(tmp, jint_cast(0.5));
4547   movq(xtmp1, tmp);
4548   vbroadcastss(xtmp1, xtmp1, vec_enc);
4549   vaddps(xtmp1, src, xtmp1, vec_enc);
4550   vcvtps2dq(dst, xtmp1, vec_enc);
4551   vector_cast_float_special_cases_avx(dst, src, float_sign_flip, vec_enc,
4552                                       xtmp1, xtmp2, xtmp3, xtmp4, tmp);
4553 
4554   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4555 }
4556 #endif // _LP64
4557 
4558 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
4559                                              BasicType from_elem_bt, BasicType to_elem_bt) {
4560   switch (from_elem_bt) {
4561     case T_BYTE:
4562       switch (to_elem_bt) {
4563         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
4564         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
4565         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
4566         default: ShouldNotReachHere();
4567       }
4568       break;
4569     case T_SHORT:
4570       switch (to_elem_bt) {
4571         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
4572         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
4573         default: ShouldNotReachHere();
4574       }
4575       break;
4576     case T_INT:
4577       assert(to_elem_bt == T_LONG, "");
4578       vpmovzxdq(dst, src, vlen_enc);
4579       break;
4580     default:
4581       ShouldNotReachHere();
4582   }
4583 }
4584 
4585 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
4586                                    bool merge, BasicType bt, int vlen_enc) {
4587   if (bt == T_INT) {
4588     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
4589   } else {
4590     assert(bt == T_LONG, "");
4591     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
4592   }
4593 }
4594 
4595 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
4596                                    bool merge, BasicType bt, int vlen_enc) {
4597   if (bt == T_INT) {
4598     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
4599   } else {
4600     assert(bt == T_LONG, "");
4601     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
4602   }
4603 }
4604 
4605 #ifdef _LP64
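     // Expands the low mask_len bits of src into a byte-per-lane vector mask.
     // pdepq with the pattern 0x0101010101010101 deposits bit i of the source into
     // bit 8*i, so every mask bit becomes a 0x00/0x01 byte. For example
     // (illustrative), src = 0b1011 expands to the byte sequence 01 01 00 01 00 ...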
4606 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
4607                                                Register rtmp2, XMMRegister xtmp, int mask_len,
4608                                                int vec_enc) {
4609   int index = 0;
4610   int vindex = 0;
4611   mov64(rtmp1, 0x0101010101010101L);
4612   pdepq(rtmp1, src, rtmp1);
4613   if (mask_len > 8) {
4614     movq(rtmp2, src);
4615     vpxor(xtmp, xtmp, xtmp, vec_enc);
4616     movq(xtmp, rtmp1);
4617   }
4618   movq(dst, rtmp1);
4619 
4620   mask_len -= 8;
4621   while (mask_len > 0) {
4622     assert((mask_len & 0x7) == 0, "mask must be multiple of 8");
4623     index++;
4624     if ((index % 2) == 0) {
4625       pxor(xtmp, xtmp);
4626     }
4627     mov64(rtmp1, 0x0101010101010101L);
4628     shrq(rtmp2, 8);
4629     pdepq(rtmp1, rtmp2, rtmp1);
4630     pinsrq(xtmp, rtmp1, index % 2);
4631     vindex = index / 2;
4632     if (vindex) {
4633       // Write the entire 16 byte vector only when both 64 bit
4634       // lanes have been updated, to avoid redundant instructions.
4635       if (index % 2) {
4636         vinsertf128(dst, dst, xtmp, vindex);
4637       }
4638     } else {
4639       vmovdqu(dst, xtmp);
4640     }
4641     mask_len -= 8;
4642   }
4643 }
4644 
4645 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
4646   switch(opc) {
4647     case Op_VectorMaskTrueCount:
4648       popcntq(dst, tmp);
4649       break;
4650     case Op_VectorMaskLastTrue:
4651       if (VM_Version::supports_lzcnt()) {
4652         lzcntq(tmp, tmp);
4653         movl(dst, 63);
4654         subl(dst, tmp);
4655       } else {
4656         movl(dst, -1);
4657         bsrq(tmp, tmp);
4658         cmov32(Assembler::notZero, dst, tmp);
4659       }
4660       break;
4661     case Op_VectorMaskFirstTrue:
4662       if (VM_Version::supports_bmi1()) {
4663         if (masklen < 32) {
4664           orl(tmp, 1 << masklen);
4665           tzcntl(dst, tmp);
4666         } else if (masklen == 32) {
4667           tzcntl(dst, tmp);
4668         } else {
4669           assert(masklen == 64, "");
4670           tzcntq(dst, tmp);
4671         }
4672       } else {
4673         if (masklen < 32) {
4674           orl(tmp, 1 << masklen);
4675           bsfl(dst, tmp);
4676         } else {
4677           assert(masklen == 32 || masklen == 64, "");
4678           movl(dst, masklen);
4679           if (masklen == 32)  {
4680             bsfl(tmp, tmp);
4681           } else {
4682             bsfq(tmp, tmp);
4683           }
4684           cmov32(Assembler::notZero, dst, tmp);
4685         }
4686       }
4687       break;
4688     case Op_VectorMaskToLong:
4689       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
4690       break;
4691     default: assert(false, "Unhandled mask operation");
4692   }
4693 }
4694 
4695 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
4696                                               int masklen, int masksize, int vec_enc) {
4697   assert(VM_Version::supports_popcnt(), "");
4698 
4699   if (VM_Version::supports_avx512bw()) {
4700     kmovql(tmp, mask);
4701   } else {
4702     assert(masklen <= 16, "");
4703     kmovwl(tmp, mask);
4704   }
4705 
4706   // A mask generated by partial vector comparison/replicate/mask manipulation
4707   // operations needs to be clipped.
4708   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
4709     andq(tmp, (1 << masklen) - 1);
4710   }
4711 
4712   vector_mask_operation_helper(opc, dst, tmp, masklen);
4713 }
4714 
4715 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
4716                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
4717   assert(vec_enc == AVX_128bit && VM_Version::supports_avx() ||
4718          vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4), "");
4719   assert(VM_Version::supports_popcnt(), "");
4720 
4721   bool need_clip = false;
4722   switch(bt) {
4723     case T_BOOLEAN:
4724       // While masks of other types contain 0 or -1, boolean masks contain lane values of 0 or 1
4725       vpxor(xtmp, xtmp, xtmp, vec_enc);
4726       vpsubb(xtmp, xtmp, mask, vec_enc);
4727       vpmovmskb(tmp, xtmp, vec_enc);
4728       need_clip = masklen < 16;
4729       break;
4730     case T_BYTE:
4731       vpmovmskb(tmp, mask, vec_enc);
4732       need_clip = masklen < 16;
4733       break;
4734     case T_SHORT:
4735       vpacksswb(xtmp, mask, mask, vec_enc);
4736       if (masklen >= 16) {
4737         vpermpd(xtmp, xtmp, 8, vec_enc);
4738       }
4739       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
4740       need_clip = masklen < 16;
4741       break;
4742     case T_INT:
4743     case T_FLOAT:
4744       vmovmskps(tmp, mask, vec_enc);
4745       need_clip = masklen < 4;
4746       break;
4747     case T_LONG:
4748     case T_DOUBLE:
4749       vmovmskpd(tmp, mask, vec_enc);
4750       need_clip = masklen < 2;
4751       break;
4752     default: assert(false, "Unhandled type, %s", type2name(bt));
4753   }
4754 
4755   // A mask generated by partial vector comparison/replicate/mask manipulation
4756   // operations needs to be clipped.
4757   if (need_clip && opc != Op_VectorMaskFirstTrue) {
4758     // need_clip implies masklen < 32
4759     andq(tmp, (1 << masklen) - 1);
4760   }
4761 
4762   vector_mask_operation_helper(opc, dst, tmp, masklen);
4763 }
4764 
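     // Compresses the set bits of the low mask_len bits of src towards bit 0:
     // pextq extracts the bits of an all-ones value at the positions selected by the
     // mask, so the destination receives popcount(mask) contiguous low bits. For
     // example (illustrative), a mask of 0b1010 compresses to 0b0011.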
4765 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
4766                                              Register rtmp2, int mask_len) {
4767   kmov(rtmp1, src);
4768   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
4769   mov64(rtmp2, -1L);
4770   pextq(rtmp2, rtmp2, rtmp1);
4771   kmov(dst, rtmp2);
4772 }
4773 
4774 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
4775                                                bool merge, BasicType bt, int vec_enc) {
4776   if (opcode == Op_CompressV) {
4777     switch(bt) {
4778     case T_BYTE:
4779       evpcompressb(dst, mask, src, merge, vec_enc);
4780       break;
4781     case T_CHAR:
4782     case T_SHORT:
4783       evpcompressw(dst, mask, src, merge, vec_enc);
4784       break;
4785     case T_INT:
4786       evpcompressd(dst, mask, src, merge, vec_enc);
4787       break;
4788     case T_FLOAT:
4789       evcompressps(dst, mask, src, merge, vec_enc);
4790       break;
4791     case T_LONG:
4792       evpcompressq(dst, mask, src, merge, vec_enc);
4793       break;
4794     case T_DOUBLE:
4795       evcompresspd(dst, mask, src, merge, vec_enc);
4796       break;
4797     default:
4798       fatal("Unsupported type %s", type2name(bt));
4799       break;
4800     }
4801   } else {
4802     assert(opcode == Op_ExpandV, "");
4803     switch(bt) {
4804     case T_BYTE:
4805       evpexpandb(dst, mask, src, merge, vec_enc);
4806       break;
4807     case T_CHAR:
4808     case T_SHORT:
4809       evpexpandw(dst, mask, src, merge, vec_enc);
4810       break;
4811     case T_INT:
4812       evpexpandd(dst, mask, src, merge, vec_enc);
4813       break;
4814     case T_FLOAT:
4815       evexpandps(dst, mask, src, merge, vec_enc);
4816       break;
4817     case T_LONG:
4818       evpexpandq(dst, mask, src, merge, vec_enc);
4819       break;
4820     case T_DOUBLE:
4821       evexpandpd(dst, mask, src, merge, vec_enc);
4822       break;
4823     default:
4824       fatal("Unsupported type %s", type2name(bt));
4825       break;
4826     }
4827   }
4828 }
4829 #endif
4830 
4831 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
4832                                            KRegister ktmp1, int vec_enc) {
4833   if (opcode == Op_SignumVD) {
4834     vsubpd(dst, zero, one, vec_enc);
4835     // dst = (src < 0) ? -1 : 1
4836     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
4837     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
4838     // if src is NaN, -0.0 or 0.0, return src.
4839     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
4840     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
4841   } else {
4842     assert(opcode == Op_SignumVF, "");
4843     vsubps(dst, zero, one, vec_enc);
4844     // dst = (src < 0) ? -1 : 1
4845     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
4846     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
4847     // if src is NaN, -0.0 or 0.0, return src.
4848     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
4849     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
4850   }
4851 }
4852 
4853 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
4854                                           XMMRegister xtmp1, int vec_enc) {
4855   if (opcode == Op_SignumVD) {
4856     vsubpd(dst, zero, one, vec_enc);
4857     // dst = (src < 0) ? -1 : 1
4858     vblendvpd(dst, one, dst, src, vec_enc);
4859     // if src is NaN, -0.0 or 0.0, return src.
4860     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
4861     vblendvpd(dst, dst, src, xtmp1, vec_enc);
4862   } else {
4863     assert(opcode == Op_SignumVF, "");
4864     vsubps(dst, zero, one, vec_enc);
4865     // dst = (src < 0) ? -1 : 1
4866     vblendvps(dst, one, dst, src, vec_enc);
4867     // if src is NaN, -0.0 or 0.0, return src.
4868     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
4869     vblendvps(dst, dst, src, xtmp1, vec_enc);
4870   }
4871 }
4872 
4873 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
4874   if (VM_Version::supports_avx512bw()) {
4875     if (mask_len > 32) {
4876       kmovql(dst, src);
4877     } else {
4878       kmovdl(dst, src);
4879       if (mask_len != 32) {
4880         kshiftrdl(dst, dst, 32 - mask_len);
4881       }
4882     }
4883   } else {
4884     assert(mask_len <= 16, "");
4885     kmovwl(dst, src);
4886     if (mask_len != 16) {
4887       kshiftrwl(dst, dst, 16 - mask_len);
4888     }
4889   }
4890 }
4891 
4892 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
4893   int lane_size = type2aelembytes(bt);
4894   bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
4895   if ((is_LP64 || lane_size < 8) &&
4896       ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
4897        (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
4898     movptr(rtmp, imm32);
4899     switch(lane_size) {
4900       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
4901       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
4902       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
4903       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
4904       fatal("Unsupported lane size %d", lane_size);
4905       break;
4906     }
4907   } else {
4908     movptr(rtmp, imm32);
4909     LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
4910     switch(lane_size) {
4911       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
4912       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
4913       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
4914       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
4915       fatal("Unsupported lane size %d", lane_size);
4916       break;
4917     }
4918   }
4919 }
4920 
4921 //
4922 // Following is the lookup table based popcount computation algorithm:
4923 //       Index   Bit set count
4924 //     [ 0000 ->   0,
4925 //       0001 ->   1,
4926 //       0010 ->   1,
4927 //       0011 ->   2,
4928 //       0100 ->   1,
4929 //       0101 ->   2,
4930 //       0110 ->   2,
4931 //       0111 ->   3,
4932 //       1000 ->   1,
4933 //       1001 ->   2,
4934 //       1010 ->   2,
4935 //       1011 ->   3,
4936 //       1100 ->   2,
4937 //       1101 ->   3,
     //       1110 ->   3,
4938 //       1111 ->   4 ]
4939 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
4940 //     shuffle indices for lookup table access.
4941 //  b. Right shift each byte of vector lane by 4 positions.
4942 //  c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
4943 //     shuffle indices for lookup table access.
4944 //  d. Add the bitset count of upper and lower 4 bits of each byte.
4945 //  e. Unpack double words to quad words and compute the sum of absolute differences of the
4946 //     bitset counts of all the bytes of a quadword.
4947 //  f. Perform step e. for upper 128bit vector lane.
4948 //  g. Pack the bitset count of quadwords back to double word.
4949 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
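     //
     //  A scalar sketch of steps a-d for one byte (illustrative only, not part of
     //  the build):
     //
     //    static const uint8_t lut[16] = {0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4};
     //    static int popcount_byte(uint8_t b) {
     //      return lut[b & 0x0F] + lut[b >> 4];
     //    }
     //
     //  e.g. popcount_byte(0xB3) = lut[0x3] + lut[0xB] = 2 + 3 = 5.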
4950 
4951 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4952                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
4953   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
4954   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
4955   vpsrlw(dst, src, 4, vec_enc);
4956   vpand(dst, dst, xtmp1, vec_enc);
4957   vpand(xtmp1, src, xtmp1, vec_enc);
4958   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
4959   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
4960   vpshufb(dst, xtmp2, dst, vec_enc);
4961   vpaddb(dst, dst, xtmp1, vec_enc);
4962 }
4963 
4964 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4965                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
4966   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
4967   // Following code is as per steps e,f,g and h of above algorithm.
4968   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4969   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
4970   vpsadbw(dst, dst, xtmp2, vec_enc);
4971   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
4972   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
4973   vpackuswb(dst, xtmp1, dst, vec_enc);
4974 }
4975 
4976 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4977                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
4978   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
4979   // Add the popcount of upper and lower bytes of word.
4980   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
4981   vpsrlw(dst, xtmp1, 8, vec_enc);
4982   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
4983   vpaddw(dst, dst, xtmp1, vec_enc);
4984 }
4985 
4986 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4987                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
4988   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
4989   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4990   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
4991 }
4992 
4993 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4994                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
4995   switch(bt) {
4996     case T_LONG:
4997       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
4998       break;
4999     case T_INT:
5000       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5001       break;
5002     case T_CHAR:
5003     case T_SHORT:
5004       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5005       break;
5006     case T_BYTE:
5007     case T_BOOLEAN:
5008       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5009       break;
5010     default:
5011       fatal("Unsupported type %s", type2name(bt));
5012       break;
5013   }
5014 }
5015 
5016 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5017                                                       KRegister mask, bool merge, int vec_enc) {
5018   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5019   switch(bt) {
5020     case T_LONG:
5021       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5022       evpopcntq(dst, mask, src, merge, vec_enc);
5023       break;
5024     case T_INT:
5025       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5026       evpopcntd(dst, mask, src, merge, vec_enc);
5027       break;
5028     case T_CHAR:
5029     case T_SHORT:
5030       assert(VM_Version::supports_avx512_bitalg(), "");
5031       evpopcntw(dst, mask, src, merge, vec_enc);
5032       break;
5033     case T_BYTE:
5034     case T_BOOLEAN:
5035       assert(VM_Version::supports_avx512_bitalg(), "");
5036       evpopcntb(dst, mask, src, merge, vec_enc);
5037       break;
5038     default:
5039       fatal("Unsupported type %s", type2name(bt));
5040       break;
5041   }
5042 }
5043 
5044 #ifndef _LP64
5045 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
5046   assert(VM_Version::supports_avx512bw(), "");
5047   kmovdl(tmp, src);
5048   kunpckdql(dst, tmp, tmp);
5049 }
5050 #endif
5051 
5052 // The bit reversal algorithm first reverses the bits of each byte, followed by
5053 // a byte level reversal for multi-byte primitive types (short/int/long).
5054 // The algorithm performs a lookup table access to get the reverse bit sequence
5055 // corresponding to a 4 bit value. Thus the reverse bit sequence of a byte
5056 // is obtained by swapping the reverse bit sequences of the upper and lower
5057 // nibbles of the byte.
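     //
     // A scalar sketch of the per-byte step (illustrative only, not part of the build):
     //
     //   static const uint8_t rev4[16] = {0x0,0x8,0x4,0xC,0x2,0xA,0x6,0xE,
     //                                    0x1,0x9,0x5,0xD,0x3,0xB,0x7,0xF};
     //   static uint8_t reverse_byte_bits(uint8_t b) {
     //     return (uint8_t)((rev4[b & 0x0F] << 4) | rev4[b >> 4]);
     //   }
     //
     // e.g. reverse_byte_bits(0xC2 /* 11000010 */) = 0x43 /* 01000011 */.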
5058 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5059                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
5060   if (VM_Version::supports_avx512vlbw()) {
5061 
5062     // Get the reverse bit sequence of lower nibble of each byte.
5063     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5064     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5065     vpandq(dst, xtmp2, src, vec_enc);
5066     vpshufb(dst, xtmp1, dst, vec_enc);
5067     vpsllq(dst, dst, 4, vec_enc);
5068 
5069     // Get the reverse bit sequence of upper nibble of each byte.
5070     vpandn(xtmp2, xtmp2, src, vec_enc);
5071     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5072     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5073 
5074     // Perform a logical OR between the left shifted reverse bit sequence of the lower nibble and the
5075     // right shifted reverse bit sequence of the upper nibble to obtain the reverse bit sequence of each byte.
5076     vporq(xtmp2, dst, xtmp2, vec_enc);
5077     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5078 
5079   } else if (vec_enc == Assembler::AVX_512bit) {
5080     // Shift based bit reversal.
5081     assert(bt == T_LONG || bt == T_INT, "");
5082 
5083     // Swap lower and upper nibble of each byte.
5084     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
5085 
5086     // Swap two least and most significant bits of each nibble.
5087     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
5088 
5089     // Swap adjacent pair of bits.
5090     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5091     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
5092 
5093     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5094     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
5095   } else {
5096     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
5097     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5098 
5099     // Get the reverse bit sequence of lower nibble of each byte.
5100     vpand(dst, xtmp2, src, vec_enc);
5101     vpshufb(dst, xtmp1, dst, vec_enc);
5102     vpsllq(dst, dst, 4, vec_enc);
5103 
5104     // Get the reverse bit sequence of upper nibble of each byte.
5105     vpandn(xtmp2, xtmp2, src, vec_enc);
5106     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5107     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5108 
5109     // Perform a logical OR between the left shifted reverse bit sequence of the lower nibble and the
5110     // right shifted reverse bit sequence of the upper nibble to obtain the reverse bit sequence of each byte.
5111     vpor(xtmp2, dst, xtmp2, vec_enc);
5112     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5113   }
5114 }
5115 
5116 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
5117                                                 XMMRegister xtmp, Register rscratch) {
5118   assert(VM_Version::supports_gfni(), "");
5119   assert(rscratch != noreg || always_reachable(mask), "missing");
5120 
5121   // Galois field instruction based bit reversal based on following algorithm.
5122   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5123   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
5124   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
5125   vector_reverse_byte(bt, dst, xtmp, vec_enc);
5126 }
5127 
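     // Exchanges the adjacent nbits-wide bit groups selected by bitmask within every
     // lane, i.e. dst = ((src & bitmask) << nbits) | ((src & ~bitmask) >> nbits).
     // For example (illustrative), nbits = 4 with bitmask 0x0F0F0F0F swaps the two
     // nibbles of every byte.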
5128 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
5129                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
5130   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
5131   vpandq(dst, xtmp1, src, vec_enc);
5132   vpsllq(dst, dst, nbits, vec_enc);
5133   vpandn(xtmp1, xtmp1, src, vec_enc);
5134   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
5135   vporq(dst, dst, xtmp1, vec_enc);
5136 }
5137 
5138 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5139                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5140   // Shift based bit reversal.
5141   assert(VM_Version::supports_evex(), "");
5142   switch(bt) {
5143     case T_LONG:
5144       // Swap upper and lower double word of each quad word.
5145       evprorq(xtmp1, k0, src, 32, true, vec_enc);
5146       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
5147       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5148       break;
5149     case T_INT:
5150       // Swap upper and lower word of each double word.
5151       evprord(xtmp1, k0, src, 16, true, vec_enc);
5152       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5153       break;
5154     case T_CHAR:
5155     case T_SHORT:
5156       // Swap upper and lower byte of each word.
5157       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
5158       break;
5159     case T_BYTE:
5160       evmovdquq(dst, k0, src, true, vec_enc);
5161       break;
5162     default:
5163       fatal("Unsupported type %s", type2name(bt));
5164       break;
5165   }
5166 }
5167 
5168 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5169   if (bt == T_BYTE) {
5170     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
5171       evmovdquq(dst, k0, src, true, vec_enc);
5172     } else {
5173       vmovdqu(dst, src);
5174     }
5175     return;
5176   }
5177   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
5178   // pre-computed shuffle indices.
5179   switch(bt) {
5180     case T_LONG:
5181       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
5182       break;
5183     case T_INT:
5184       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
5185       break;
5186     case T_CHAR:
5187     case T_SHORT:
5188       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
5189       break;
5190     default:
5191       fatal("Unsupported type %s", type2name(bt));
5192       break;
5193   }
5194   vpshufb(dst, src, dst, vec_enc);
5195 }
5196 
5197 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5198                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5199                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
5200   assert(is_integral_type(bt), "");
5201   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5202   assert(VM_Version::supports_avx512cd(), "");
5203   switch(bt) {
5204     case T_LONG:
5205       evplzcntq(dst, ktmp, src, merge, vec_enc);
5206       break;
5207     case T_INT:
5208       evplzcntd(dst, ktmp, src, merge, vec_enc);
5209       break;
5210     case T_SHORT:
5211       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
5212       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
5213       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
5214       vpunpckhwd(dst, xtmp1, src, vec_enc);
5215       evplzcntd(dst, ktmp, dst, merge, vec_enc);
5216       vpackusdw(dst, xtmp2, dst, vec_enc);
5217       break;
5218     case T_BYTE:
5219       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
5220       // accessing the lookup table.
5221       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
5222       // accessing the lookup table.
5223       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
5224       assert(VM_Version::supports_avx512bw(), "");
5225       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
5226       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
5227       vpand(xtmp2, dst, src, vec_enc);
5228       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5229       vpsrlw(xtmp3, src, 4, vec_enc);
5230       vpand(xtmp3, dst, xtmp3, vec_enc);
5231       vpshufb(dst, xtmp1, xtmp3, vec_enc);
5232       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
5233       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
5234       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
5235       break;
5236     default:
5237       fatal("Unsupported type %s", type2name(bt));
5238       break;
5239   }
5240 }
5241 
5242 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5243                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5244   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
5245   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5246   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
5247   // accessing the lookup table.
5248   vpand(dst, xtmp2, src, vec_enc);
5249   vpshufb(dst, xtmp1, dst, vec_enc);
5250   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
5251   // accessing the lookup table.
5252   vpsrlw(xtmp3, src, 4, vec_enc);
5253   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
5254   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
5255   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
5256   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
5257   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
5258   vpaddb(dst, dst, xtmp2, vec_enc);
5259   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
5260 }
5261 
5262 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5263                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5264   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5265   // Add zero counts of lower byte and upper byte of a word if
5266   // upper byte holds a zero value.
5267   vpsrlw(xtmp3, src, 8, vec_enc);
5268   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
5269   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
5270   vpsllw(xtmp2, dst, 8, vec_enc);
5271   vpaddw(xtmp2, xtmp2, dst, vec_enc);
5272   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5273   vpsrlw(dst, dst, 8, vec_enc);
5274 }
5275 
5276 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5277                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
5278   // Since the IEEE 754 floating point format represents the mantissa in normalized 1.x form,
5279   // the biased exponent can be used to compute the leading zero count as per the
5280   // following formula:
5281   //   LZCNT = 32 - ((biased_exp - 127) + 1)
5282   // Special handling has been introduced for zero, max_int and negative source values.
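       // Worked example (illustrative): src = 12 converts to 12.0f, whose biased
       // exponent is 130; (130 - 127) + 1 = 4 significant bits, so
       // LZCNT = 32 - 4 = 28, which equals clz(12).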
5283 
5284   // Broadcast 0xFF
5285   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
5286   vpsrld(xtmp1, xtmp1, 24, vec_enc);
5287 
5288   // Extract biased exponent.
5289   vcvtdq2ps(dst, src, vec_enc);
5290   vpsrld(dst, dst, 23, vec_enc);
5291   vpand(dst, dst, xtmp1, vec_enc);
5292 
5293   // Broadcast 127.
5294   vpsrld(xtmp1, xtmp1, 1, vec_enc);
5295   // Exponent = biased_exp - 127
5296   vpsubd(dst, dst, xtmp1, vec_enc);
5297 
5298   // Exponent = Exponent  + 1
5299   vpsrld(xtmp3, xtmp1, 6, vec_enc);
5300   vpaddd(dst, dst, xtmp3, vec_enc);
5301 
5302   // Replace a negative exponent with zero; the exponent is negative when the src
5303   // lane contains a zero value.
5304   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5305   vblendvps(dst, dst, xtmp2, dst, vec_enc);
5306 
5307   // Rematerialize broadcast 32.
5308   vpslld(xtmp1, xtmp3, 5, vec_enc);
5309   // Exponent is 32 if corresponding source lane contains max_int value.
5310   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
5311   // LZCNT = 32 - exponent
5312   vpsubd(dst, xtmp1, dst, vec_enc);
5313 
5314   // Replace LZCNT with the value 1 if the corresponding source lane
5315   // contains the max_int value.
5316   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
5317 
5318   // Replace LZCNT with 0 if the source lane value is negative.
5319   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5320   vblendvps(dst, dst, xtmp2, src, vec_enc);
5321 }
5322 
5323 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5324                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5325   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5326   // Add zero counts of lower word and upper word of a double word if
5327   // upper word holds a zero value.
5328   vpsrld(xtmp3, src, 16, vec_enc);
5329   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
5330   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
5331   vpslld(xtmp2, dst, 16, vec_enc);
5332   vpaddd(xtmp2, xtmp2, dst, vec_enc);
5333   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5334   vpsrld(dst, dst, 16, vec_enc);
5335   // Add zero counts of lower doubleword and upper doubleword of a
5336   // quadword if upper doubleword holds a zero value.
5337   vpsrlq(xtmp3, src, 32, vec_enc);
5338   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
5339   vpsllq(xtmp2, dst, 32, vec_enc);
5340   vpaddq(xtmp2, xtmp2, dst, vec_enc);
5341   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5342   vpsrlq(dst, dst, 32, vec_enc);
5343 }
5344 
5345 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
5346                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5347                                                        Register rtmp, int vec_enc) {
5348   assert(is_integral_type(bt), "unexpected type");
5349   assert(vec_enc < Assembler::AVX_512bit, "");
5350   switch(bt) {
5351     case T_LONG:
5352       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5353       break;
5354     case T_INT:
5355       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
5356       break;
5357     case T_SHORT:
5358       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5359       break;
5360     case T_BYTE:
5361       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5362       break;
5363     default:
5364       fatal("Unsupported type %s", type2name(bt));
5365       break;
5366   }
5367 }
5368 
5369 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
5370   switch(bt) {
5371     case T_BYTE:
5372       vpsubb(dst, src1, src2, vec_enc);
5373       break;
5374     case T_SHORT:
5375       vpsubw(dst, src1, src2, vec_enc);
5376       break;
5377     case T_INT:
5378       vpsubd(dst, src1, src2, vec_enc);
5379       break;
5380     case T_LONG:
5381       vpsubq(dst, src1, src2, vec_enc);
5382       break;
5383     default:
5384       fatal("Unsupported type %s", type2name(bt));
5385       break;
5386   }
5387 }
5388 
5389 // Trailing zero count computation is based on the leading zero count operation as per
5390 // the following equation. All AVX3 targets support the AVX512CD feature, which offers a
5391 // direct vector instruction to compute the leading zero count.
5392 //      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
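     //      e.g. (illustrative, 32-bit lanes) x = 0b101000: (x - 1) & ~x = 0b111,
     //      CLZ = 29, so CTZ = 32 - 29 = 3.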
5393 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5394                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5395                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
5396   assert(is_integral_type(bt), "");
5397   // xtmp = -1
5398   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
5399   // xtmp = xtmp + src
5400   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
5401   // xtmp = xtmp & ~src
5402   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
5403   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
5404   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
5405   vpsub(bt, dst, xtmp4, dst, vec_enc);
5406 }
5407 
5408 // Trailing zero count computation for AVX2 targets is based on the popcount operation, as per the following equation.
5409 //      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
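     // A worked scalar check (assuming a 32-bit lane): for x = 12 = 0b1100, -x = 0xFFFFFFF4,
     // x | -x = 0xFFFFFFFC, POPC = 30 and CTZ = 32 - 30 = 2; for x = 0, x | -x = 0, POPC = 0
     // and CTZ = 32.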
5410 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5411                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5412   assert(is_integral_type(bt), "");
5413   // xtmp3 = 0
5414   vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
5415   // xtmp3 = 0 - src = -src
5416   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
5417   // xtmp3 = xtmp3 | src = src | -src
5418   vpor(xtmp3, xtmp3, src, vec_enc);
5419   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
5420   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
5421   vpsub(bt, dst, xtmp1, dst, vec_enc);
5422 }
5423 
5424 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
5425   Label done;
5426   Label neg_divisor_fastpath;
5427   cmpl(divisor, 0);
5428   jccb(Assembler::less, neg_divisor_fastpath);
5429   xorl(rdx, rdx);
5430   divl(divisor);
5431   jmpb(done);
5432   bind(neg_divisor_fastpath);
5433   // Fastpath for divisor < 0:
5434   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
5435   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
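       // Worked example (hypothetical values): a negative divisor is >= 2^31 unsigned, so the
       // unsigned quotient can only be 0 or 1. For dividend = 0x80000005, divisor = 0x80000000:
       // dividend - divisor = 5, dividend & ~5 = 0x80000000, and the logical shift by 31 gives
       // quotient 1; for dividend = 0x7FFFFFFF the masked value is 0 and the quotient is 0.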
5436   movl(rdx, rax);
5437   subl(rdx, divisor);
5438   if (VM_Version::supports_bmi1()) {
5439     andnl(rax, rdx, rax);
5440   } else {
5441     notl(rdx);
5442     andl(rax, rdx);
5443   }
5444   shrl(rax, 31);
5445   bind(done);
5446 }
5447 
5448 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
5449   Label done;
5450   Label neg_divisor_fastpath;
5451   cmpl(divisor, 0);
5452   jccb(Assembler::less, neg_divisor_fastpath);
5453   xorl(rdx, rdx);
5454   divl(divisor);
5455   jmpb(done);
5456   bind(neg_divisor_fastpath);
5457   // Fastpath when divisor < 0:
5458   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
5459   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
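       // Worked example (hypothetical values): for dividend = 0x80000005, divisor = 0x80000000,
       // dividend & ~(dividend - divisor) = 0x80000000; the arithmetic shift by 31 produces an
       // all-ones mask, so divisor is subtracted once and the remainder is 5. For
       // dividend = 0x7FFFFFFF the mask is 0 and the remainder is the dividend itself.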
5460   movl(rdx, rax);
5461   subl(rax, divisor);
5462   if (VM_Version::supports_bmi1()) {
5463     andnl(rax, rax, rdx);
5464   } else {
5465     notl(rax);
5466     andl(rax, rdx);
5467   }
5468   sarl(rax, 31);
5469   andl(rax, divisor);
5470   subl(rdx, rax);
5471   bind(done);
5472 }
5473 
5474 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
5475   Label done;
5476   Label neg_divisor_fastpath;
5477 
5478   cmpl(divisor, 0);
5479   jccb(Assembler::less, neg_divisor_fastpath);
5480   xorl(rdx, rdx);
5481   divl(divisor);
5482   jmpb(done);
5483   bind(neg_divisor_fastpath);
5484   // Fastpath for divisor < 0:
5485   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
5486   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
5487   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
5488   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
5489   movl(rdx, rax);
5490   subl(rax, divisor);
5491   if (VM_Version::supports_bmi1()) {
5492     andnl(rax, rax, rdx);
5493   } else {
5494     notl(rax);
5495     andl(rax, rdx);
5496   }
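       // Keep a copy of the masked value: the logical shift below extracts the quotient bit,
       // while the arithmetic shift on the copy rebuilds the all-ones/zero mask used to
       // correct the remainder.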
5497   movl(tmp, rax);
5498   shrl(rax, 31); // quotient
5499   sarl(tmp, 31);
5500   andl(tmp, divisor);
5501   subl(rdx, tmp); // remainder
5502   bind(done);
5503 }
5504 
5505 #ifdef _LP64
5506 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
5507                                  XMMRegister xtmp2, Register rtmp) {
5508   if (VM_Version::supports_gfni()) {
5509     // Bit reversal via a Galois field affine transformation, based on the following algorithm:
5510     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5511     mov64(rtmp, 0x8040201008040201L);
5512     movq(xtmp1, src);
5513     movq(xtmp2, rtmp);
5514     gf2p8affineqb(xtmp1, xtmp2, 0);
5515     movq(dst, xtmp1);
5516   } else {
5517     // Swap even and odd numbered bits.
5518     movl(rtmp, src);
5519     andl(rtmp, 0x55555555);
5520     shll(rtmp, 1);
5521     movl(dst, src);
5522     andl(dst, 0xAAAAAAAA);
5523     shrl(dst, 1);
5524     orl(dst, rtmp);
5525 
5526     // Swap the lower and upper 2 bits of each nibble.
5527     movl(rtmp, dst);
5528     andl(rtmp, 0x33333333);
5529     shll(rtmp, 2);
5530     andl(dst, 0xCCCCCCCC);
5531     shrl(dst, 2);
5532     orl(dst, rtmp);
5533 
5534     // Swap the lower and upper 4 bits of each byte.
5535     movl(rtmp, dst);
5536     andl(rtmp, 0x0F0F0F0F);
5537     shll(rtmp, 4);
5538     andl(dst, 0xF0F0F0F0);
5539     shrl(dst, 4);
5540     orl(dst, rtmp);
5541   }
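       // In either path dst now holds src with the bit order of each byte reversed; bswapl
       // reverses the byte order to complete the full 32-bit bit reversal.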
5542   bswapl(dst);
5543 }
5544 
5545 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
5546                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
5547   if (VM_Version::supports_gfni()) {
5548     // Bit reversal via a Galois field affine transformation, based on the following algorithm:
5549     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5550     mov64(rtmp1, 0x8040201008040201L);
5551     movq(xtmp1, src);
5552     movq(xtmp2, rtmp1);
5553     gf2p8affineqb(xtmp1, xtmp2, 0);
5554     movq(dst, xtmp1);
5555   } else {
5556     // Swap even and odd numbered bits.
5557     movq(rtmp1, src);
5558     mov64(rtmp2, 0x5555555555555555L);
5559     andq(rtmp1, rtmp2);
5560     shlq(rtmp1, 1);
5561     movq(dst, src);
5562     notq(rtmp2);
5563     andq(dst, rtmp2);
5564     shrq(dst, 1);
5565     orq(dst, rtmp1);
5566 
5567     // Swap the lower and upper 2 bits of each nibble.
5568     movq(rtmp1, dst);
5569     mov64(rtmp2, 0x3333333333333333L);
5570     andq(rtmp1, rtmp2);
5571     shlq(rtmp1, 2);
5572     notq(rtmp2);
5573     andq(dst, rtmp2);
5574     shrq(dst, 2);
5575     orq(dst, rtmp1);
5576 
5577     // Swap the lower and upper 4 bits of each byte.
5578     movq(rtmp1, dst);
5579     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
5580     andq(rtmp1, rtmp2);
5581     shlq(rtmp1, 4);
5582     notq(rtmp2);
5583     andq(dst, rtmp2);
5584     shrq(dst, 4);
5585     orq(dst, rtmp1);
5586   }
5587   bswapq(dst);
5588 }
5589 
5590 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
5591   Label done;
5592   Label neg_divisor_fastpath;
5593   cmpq(divisor, 0);
5594   jccb(Assembler::less, neg_divisor_fastpath);
5595   xorl(rdx, rdx);
5596   divq(divisor);
5597   jmpb(done);
5598   bind(neg_divisor_fastpath);
5599   // Fastpath for divisor < 0:
5600   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
5601   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
5602   movq(rdx, rax);
5603   subq(rdx, divisor);
5604   if (VM_Version::supports_bmi1()) {
5605     andnq(rax, rdx, rax);
5606   } else {
5607     notq(rdx);
5608     andq(rax, rdx);
5609   }
5610   shrq(rax, 63);
5611   bind(done);
5612 }
5613 
5614 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
5615   Label done;
5616   Label neg_divisor_fastpath;
5617   cmpq(divisor, 0);
5618   jccb(Assembler::less, neg_divisor_fastpath);
5619   xorq(rdx, rdx);
5620   divq(divisor);
5621   jmp(done);
5622   bind(neg_divisor_fastpath);
5623   // Fastpath when divisor < 0:
5624   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
5625   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
5626   movq(rdx, rax);
5627   subq(rax, divisor);
5628   if (VM_Version::supports_bmi1()) {
5629     andnq(rax, rax, rdx);
5630   } else {
5631     notq(rax);
5632     andq(rax, rdx);
5633   }
5634   sarq(rax, 63);
5635   andq(rax, divisor);
5636   subq(rdx, rax);
5637   bind(done);
5638 }
5639 
5640 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
5641   Label done;
5642   Label neg_divisor_fastpath;
5643   cmpq(divisor, 0);
5644   jccb(Assembler::less, neg_divisor_fastpath);
5645   xorq(rdx, rdx);
5646   divq(divisor);
5647   jmp(done);
5648   bind(neg_divisor_fastpath);
5649   // Fastpath for divisor < 0:
5650   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
5651   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
5652   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
5653   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
5654   movq(rdx, rax);
5655   subq(rax, divisor);
5656   if (VM_Version::supports_bmi1()) {
5657     andnq(rax, rax, rdx);
5658   } else {
5659     notq(rax);
5660     andq(rax, rdx);
5661   }
5662   movq(tmp, rax);
5663   shrq(rax, 63); // quotient
5664   sarq(tmp, 63);
5665   andq(tmp, divisor);
5666   subq(rdx, tmp); // remainder
5667   bind(done);
5668 }
5669 #endif
5670 
5671 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
5672                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
5673                                         int vlen_enc) {
5674   assert(VM_Version::supports_avx512bw(), "");
5675   // Byte shuffles (vpshufb) are in-lane operations: the index for each byte is taken
5676   // from the lower 4 bits of the corresponding shuffle lane, so every shuffle index is
5677   // effectively normalized to the range 0-15 within its 128-bit lane. Consequently,
5678   // indices that differ by a multiple of 16 select the same relative position within a
5679   // 128-bit lane, e.g. shuffle indices 16, 32 and 48 all select the first byte of their
5680   // respective 128-bit lanes.
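       // Worked example (hypothetical index): for a 512-bit vector, shuffle index 35 must select
       // source byte 35, which lives in the third 128-bit lane (bytes 32-47). A plain shuffle
       // would use 35 & 0xF = 3 within the wrong lane, so the code below broadcasts that source
       // lane across the vector, shuffles it, and keeps the result only where 32 <= INDEX < 48.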
5681   movl(rtmp, 16);
5682   evpbroadcastb(xtmp1, rtmp, vlen_enc);
5683 
5684   // Compute a mask over the shuffle vector using the comparison INDEX < 16, broadcast
5685   // the first 128-bit lane of src across the entire vector, shuffle it using the
5686   // original shuffle indices, and move the shuffled bytes selected by the true mask
5687   // bits into the destination vector.
5688   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
5689   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
5690   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
5691 
5692   // Repeat the above steps with the comparison expression INDEX >= 16 && INDEX < 32,
5693   // broadcasting the second 128-bit lane.
5694   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
5695   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
5696   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
5697   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
5698   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
5699 
5700   // Repeat the above steps with the comparison expression INDEX >= 32 && INDEX < 48,
5701   // broadcasting the third 128-bit lane.
5702   evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
5703   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
5704   evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
5705   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
5706   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
5707 
5708   // Repeat the above steps with the comparison expression INDEX >= 48 && INDEX < 64,
5709   // broadcasting the fourth 128-bit lane.
5710   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
5711   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
5712   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
5713   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
5714   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
5715 }
5716