1 /*
   2  * Copyright (c) 2020, 2022, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "gc/shared/barrierSet.hpp"
  29 #include "gc/shared/barrierSetAssembler.hpp"
  30 #include "oops/methodData.hpp"
  31 #include "opto/c2_MacroAssembler.hpp"
  32 #include "opto/intrinsicnode.hpp"
  33 #include "opto/output.hpp"
  34 #include "opto/opcodes.hpp"
  35 #include "opto/subnode.hpp"
  36 #include "runtime/objectMonitor.hpp"
  37 #include "runtime/stubRoutines.hpp"
  38 
  39 #ifdef PRODUCT
  40 #define BLOCK_COMMENT(str) /* nothing */
  41 #define STOP(error) stop(error)
  42 #else
  43 #define BLOCK_COMMENT(str) block_comment(str)
  44 #define STOP(error) block_comment(error); stop(error)
  45 #endif
  46 
  47 // C2 compiled method's prolog code.
  48 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
  49 
  50   // WARNING: Initial instruction MUST be 5 bytes or longer so that
  51   // NativeJump::patch_verified_entry will be able to patch out the entry
   52   // code safely. The push to verify stack depth is ok at 5 bytes;
  53   // the frame allocation can be either 3 or 6 bytes. So if we don't do
  54   // stack bang then we must use the 6 byte frame allocation even if
  55   // we have no frame. :-(
  56   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
  57 
  58   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  59   // Remove word for return addr
  60   framesize -= wordSize;
  61   stack_bang_size -= wordSize;
  62 
   63   // Calls to C2R adapters often do not accept exceptional returns.
   64   // We require that their callers bang for them.  But be careful, because
   65   // some VM calls (such as call site linkage) can use several kilobytes of
   66   // stack.  The stack safety zone should account for that.
  67   // See bugs 4446381, 4468289, 4497237.
  68   if (stack_bang_size > 0) {
  69     generate_stack_overflow_check(stack_bang_size);
  70 
   71     // We always push rbp so that on return to the interpreter rbp will be
   72     // restored correctly and we can correct the stack.
  73     push(rbp);
  74     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  75     if (PreserveFramePointer) {
  76       mov(rbp, rsp);
  77     }
  78     // Remove word for ebp
  79     framesize -= wordSize;
  80 
  81     // Create frame
  82     if (framesize) {
  83       subptr(rsp, framesize);
  84     }
  85   } else {
  86     // Create frame (force generation of a 4 byte immediate value)
  87     subptr_imm32(rsp, framesize);
  88 
  89     // Save RBP register now.
  90     framesize -= wordSize;
  91     movptr(Address(rsp, framesize), rbp);
  92     // Save caller's stack pointer into RBP if the frame pointer is preserved.
  93     if (PreserveFramePointer) {
  94       movptr(rbp, rsp);
  95       if (framesize > 0) {
  96         addptr(rbp, framesize);
  97       }
  98     }
  99   }
 100 
 101   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
 102     framesize -= wordSize;
 103     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 104   }
 105 
 106 #ifndef _LP64
 107   // If method sets FPU control word do it now
 108   if (fp_mode_24b) {
 109     fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
 110   }
 111   if (UseSSE >= 2 && VerifyFPU) {
 112     verify_FPU(0, "FPU stack must be clean on entry");
 113   }
 114 #endif
 115 
 116 #ifdef ASSERT
 117   if (VerifyStackAtCalls) {
 118     Label L;
 119     push(rax);
 120     mov(rax, rsp);
 121     andptr(rax, StackAlignmentInBytes-1);
 122     cmpptr(rax, StackAlignmentInBytes-wordSize);
 123     pop(rax);
 124     jcc(Assembler::equal, L);
 125     STOP("Stack is not properly aligned!");
 126     bind(L);
 127   }
 128 #endif
 129 
 130   if (!is_stub) {
 131     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  132 #ifdef _LP64
 133     if (BarrierSet::barrier_set()->barrier_set_nmethod() != NULL) {
 134       // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
 135       Label dummy_slow_path;
 136       Label dummy_continuation;
 137       Label* slow_path = &dummy_slow_path;
 138       Label* continuation = &dummy_continuation;
 139       if (!Compile::current()->output()->in_scratch_emit_size()) {
 140         // Use real labels from actual stub when not emitting code for the purpose of measuring its size
 141         C2EntryBarrierStub* stub = Compile::current()->output()->entry_barrier_table()->add_entry_barrier();
 142         slow_path = &stub->slow_path();
 143         continuation = &stub->continuation();
 144       }
 145       bs->nmethod_entry_barrier(this, slow_path, continuation);
 146     }
 147 #else
 148     // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
 149     bs->nmethod_entry_barrier(this, NULL /* slow_path */, NULL /* continuation */);
 150 #endif
 151   }
 152 }
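
// Worked example (illustrative) of the prolog arithmetic in verified_entry(),
// assuming a 64-bit build (wordSize == 8), framesize == 32 and no stack bang:
// after the return-address word is removed the prolog emits
// subptr_imm32(rsp, 24), then stores rbp at [rsp + 16], so the saved rbp sits
// directly below the return address at [rsp + 24].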
 153 
 154 void C2_MacroAssembler::emit_entry_barrier_stub(C2EntryBarrierStub* stub) {
 155   bind(stub->slow_path());
 156   call(RuntimeAddress(StubRoutines::x86::method_entry_barrier()));
 157   jmp(stub->continuation(), false /* maybe_short */);
 158 }
 159 
  160 int C2_MacroAssembler::entry_barrier_stub_size() {
  161   return 10; // call rel32 (5 bytes) + jmp rel32 (5 bytes) emitted by emit_entry_barrier_stub()
  162 }
 163 
 164 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
 165   switch (vlen_in_bytes) {
 166     case  4: // fall-through
 167     case  8: // fall-through
 168     case 16: return Assembler::AVX_128bit;
 169     case 32: return Assembler::AVX_256bit;
 170     case 64: return Assembler::AVX_512bit;
 171 
 172     default: {
 173       ShouldNotReachHere();
 174       return Assembler::AVX_NoVec;
 175     }
 176   }
 177 }
 178 
 179 #if INCLUDE_RTM_OPT
 180 
 181 // Update rtm_counters based on abort status
 182 // input: abort_status
 183 //        rtm_counters (RTMLockingCounters*)
 184 // flags are killed
 185 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
 186 
 187   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
 188   if (PrintPreciseRTMLockingStatistics) {
 189     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
 190       Label check_abort;
 191       testl(abort_status, (1<<i));
 192       jccb(Assembler::equal, check_abort);
 193       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
 194       bind(check_abort);
 195     }
 196   }
 197 }
 198 
 199 // Branch if (random & (count-1) != 0), count is 2^n
 200 // tmp, scr and flags are killed
 201 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
 202   assert(tmp == rax, "");
 203   assert(scr == rdx, "");
 204   rdtsc(); // modifies EDX:EAX
 205   andptr(tmp, count-1);
 206   jccb(Assembler::notZero, brLabel);
 207 }
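
// A minimal illustrative sketch (not part of the generated code): the sampling
// predicate that branch_on_random_using_rdtsc() computes, written in plain C++.
// "tsc_low" is a hypothetical stand-in for the low 32 bits RDTSC leaves in EAX.
// Because count is 2^n, masking with count-1 selects one of count buckets, so
// the branch is taken roughly (count-1)/count of the time and the counter
// increment is skipped.
static inline bool rtm_sampling_skipped(juint tsc_low, int count) {
  return (tsc_low & (count - 1)) != 0; // non-zero => branch to brLabel
}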
 208 
 209 // Perform abort ratio calculation, set no_rtm bit if high ratio
 210 // input:  rtm_counters_Reg (RTMLockingCounters* address)
 211 // tmpReg, rtm_counters_Reg and flags are killed
 212 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
 213                                                     Register rtm_counters_Reg,
 214                                                     RTMLockingCounters* rtm_counters,
 215                                                     Metadata* method_data) {
 216   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
 217 
 218   if (RTMLockingCalculationDelay > 0) {
 219     // Delay calculation
 220     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()));
 221     testptr(tmpReg, tmpReg);
 222     jccb(Assembler::equal, L_done);
 223   }
 224   // Abort ratio calculation only if abort_count > RTMAbortThreshold
 225   //   Aborted transactions = abort_count * 100
 226   //   All transactions = total_count *  RTMTotalCountIncrRate
 227   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
 228 
 229   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
 230   cmpptr(tmpReg, RTMAbortThreshold);
 231   jccb(Assembler::below, L_check_always_rtm2);
 232   imulptr(tmpReg, tmpReg, 100);
 233 
 234   Register scrReg = rtm_counters_Reg;
 235   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 236   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
 237   imulptr(scrReg, scrReg, RTMAbortRatio);
 238   cmpptr(tmpReg, scrReg);
 239   jccb(Assembler::below, L_check_always_rtm1);
 240   if (method_data != NULL) {
 241     // set rtm_state to "no rtm" in MDO
 242     mov_metadata(tmpReg, method_data);
 243     lock();
 244     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
 245   }
 246   jmpb(L_done);
 247   bind(L_check_always_rtm1);
 248   // Reload RTMLockingCounters* address
 249   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 250   bind(L_check_always_rtm2);
 251   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 252   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
 253   jccb(Assembler::below, L_done);
 254   if (method_data != NULL) {
 255     // set rtm_state to "always rtm" in MDO
 256     mov_metadata(tmpReg, method_data);
 257     lock();
 258     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
 259   }
 260   bind(L_done);
 261 }
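
// Plain C++ rendering of the decision above, for orientation only; it is not
// called by the VM. "abort_count" and "total_count" are stand-ins for the
// RTMLockingCounters fields read via abort_count_offset()/total_count_offset().
static inline bool rtm_abort_ratio_exceeded(uintx abort_count, uintx total_count) {
  if (abort_count < (uintx)RTMAbortThreshold) {
    return false;                                        // not enough samples yet
  }
  uintx aborted = abort_count * 100;                     // aborted transactions
  uintx all     = total_count * RTMTotalCountIncrRate;   // all transactions
  return aborted >= all * RTMAbortRatio;                 // high ratio => set no_rtm
}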
 262 
 263 // Update counters and perform abort ratio calculation
 264 // input:  abort_status_Reg
 265 // rtm_counters_Reg, flags are killed
 266 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
 267                                       Register rtm_counters_Reg,
 268                                       RTMLockingCounters* rtm_counters,
 269                                       Metadata* method_data,
 270                                       bool profile_rtm) {
 271 
 272   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 273   // update rtm counters based on rax value at abort
 274   // reads abort_status_Reg, updates flags
 275   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 276   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
 277   if (profile_rtm) {
 278     // Save abort status because abort_status_Reg is used by following code.
 279     if (RTMRetryCount > 0) {
 280       push(abort_status_Reg);
 281     }
 282     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 283     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
 284     // restore abort status
 285     if (RTMRetryCount > 0) {
 286       pop(abort_status_Reg);
 287     }
 288   }
 289 }
 290 
 291 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
 292 // inputs: retry_count_Reg
 293 //       : abort_status_Reg
 294 // output: retry_count_Reg decremented by 1
 295 // flags are killed
 296 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
 297   Label doneRetry;
 298   assert(abort_status_Reg == rax, "");
 299   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
  300   // 0x6 = retryable abort (0x2) | memory conflict (0x4)
 301   // if reason is in 0x6 and retry count != 0 then retry
 302   andptr(abort_status_Reg, 0x6);
 303   jccb(Assembler::zero, doneRetry);
 304   testl(retry_count_Reg, retry_count_Reg);
 305   jccb(Assembler::zero, doneRetry);
 306   pause();
 307   decrementl(retry_count_Reg);
 308   jmp(retryLabel);
 309   bind(doneRetry);
 310 }
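
// For orientation only (illustrative, not generated code): the retry test made
// in rtm_retry_lock_on_abort() over the RTM abort status returned in EAX.
// Bit 0x2 means the abort is retryable and bit 0x4 means a memory conflict;
// either one justifies another attempt while the retry budget lasts.
static inline bool rtm_should_retry_on_abort(juint abort_status, int retries_left) {
  return (abort_status & 0x6) != 0 && retries_left != 0;
}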
 311 
 312 // Spin and retry if lock is busy,
 313 // inputs: box_Reg (monitor address)
 314 //       : retry_count_Reg
 315 // output: retry_count_Reg decremented by 1
 316 //       : clear z flag if retry count exceeded
 317 // tmp_Reg, scr_Reg, flags are killed
 318 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
 319                                                Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
 320   Label SpinLoop, SpinExit, doneRetry;
 321   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 322 
 323   testl(retry_count_Reg, retry_count_Reg);
 324   jccb(Assembler::zero, doneRetry);
 325   decrementl(retry_count_Reg);
 326   movptr(scr_Reg, RTMSpinLoopCount);
 327 
 328   bind(SpinLoop);
 329   pause();
 330   decrementl(scr_Reg);
 331   jccb(Assembler::lessEqual, SpinExit);
 332   movptr(tmp_Reg, Address(box_Reg, owner_offset));
 333   testptr(tmp_Reg, tmp_Reg);
 334   jccb(Assembler::notZero, SpinLoop);
 335 
 336   bind(SpinExit);
 337   jmp(retryLabel);
 338   bind(doneRetry);
 339   incrementl(retry_count_Reg); // clear z flag
 340 }
 341 
 342 // Use RTM for normal stack locks
 343 // Input: objReg (object to lock)
 344 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
 345                                          Register retry_on_abort_count_Reg,
 346                                          RTMLockingCounters* stack_rtm_counters,
 347                                          Metadata* method_data, bool profile_rtm,
 348                                          Label& DONE_LABEL, Label& IsInflated) {
 349   assert(UseRTMForStackLocks, "why call this otherwise?");
 350   assert(tmpReg == rax, "");
 351   assert(scrReg == rdx, "");
 352   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 353 
 354   if (RTMRetryCount > 0) {
 355     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 356     bind(L_rtm_retry);
 357   }
 358   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 359   testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral
 360   jcc(Assembler::notZero, IsInflated);
 361 
 362   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 363     Label L_noincrement;
 364     if (RTMTotalCountIncrRate > 1) {
 365       // tmpReg, scrReg and flags are killed
 366       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 367     }
 368     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
 369     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
 370     bind(L_noincrement);
 371   }
 372   xbegin(L_on_abort);
 373   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
 374   andptr(tmpReg, markWord::lock_mask_in_place);     // look at 2 lock bits
 375   cmpptr(tmpReg, markWord::unlocked_value);         // bits = 01 unlocked
 376   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
 377 
 378   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 379   if (UseRTMXendForLockBusy) {
 380     xend();
 381     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
 382     jmp(L_decrement_retry);
 383   }
 384   else {
 385     xabort(0);
 386   }
 387   bind(L_on_abort);
 388   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 389     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
 390   }
 391   bind(L_decrement_retry);
 392   if (RTMRetryCount > 0) {
 393     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 394     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 395   }
 396 }
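
// Illustrative helpers (not used by the assembler): how the mark word tests in
// rtm_stack_locking() decode. A set monitor bit means the lock is inflated,
// while 01 in the two low lock bits means the object is unlocked/neutral and
// eligible for the transactional fast path.
static inline bool markword_is_inflated(uintptr_t mark) {
  return (mark & markWord::monitor_value) != 0;
}
static inline bool markword_is_neutral(uintptr_t mark) {
  return (mark & markWord::lock_mask_in_place) == markWord::unlocked_value;
}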
 397 
  398 // Use RTM for inflated locks
 399 // inputs: objReg (object to lock)
 400 //         boxReg (on-stack box address (displaced header location) - KILLED)
 401 //         tmpReg (ObjectMonitor address + markWord::monitor_value)
 402 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
 403                                             Register scrReg, Register retry_on_busy_count_Reg,
 404                                             Register retry_on_abort_count_Reg,
 405                                             RTMLockingCounters* rtm_counters,
 406                                             Metadata* method_data, bool profile_rtm,
 407                                             Label& DONE_LABEL) {
 408   assert(UseRTMLocking, "why call this otherwise?");
 409   assert(tmpReg == rax, "");
 410   assert(scrReg == rdx, "");
 411   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 412   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 413 
 414   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 415   movptr(boxReg, tmpReg); // Save ObjectMonitor address
 416 
 417   if (RTMRetryCount > 0) {
 418     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
 419     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 420     bind(L_rtm_retry);
 421   }
 422   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 423     Label L_noincrement;
 424     if (RTMTotalCountIncrRate > 1) {
 425       // tmpReg, scrReg and flags are killed
 426       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 427     }
 428     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 429     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
 430     bind(L_noincrement);
 431   }
 432   xbegin(L_on_abort);
 433   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 434   movptr(tmpReg, Address(tmpReg, owner_offset));
 435   testptr(tmpReg, tmpReg);
 436   jcc(Assembler::zero, DONE_LABEL);
 437   if (UseRTMXendForLockBusy) {
 438     xend();
 439     jmp(L_decrement_retry);
 440   }
 441   else {
 442     xabort(0);
 443   }
 444   bind(L_on_abort);
 445   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 446   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 447     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
 448   }
 449   if (RTMRetryCount > 0) {
 450     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 451     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 452   }
 453 
 454   movptr(tmpReg, Address(boxReg, owner_offset)) ;
 455   testptr(tmpReg, tmpReg) ;
 456   jccb(Assembler::notZero, L_decrement_retry) ;
 457 
 458   // Appears unlocked - try to swing _owner from null to non-null.
 459   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 460 #ifdef _LP64
 461   Register threadReg = r15_thread;
 462 #else
 463   get_thread(scrReg);
 464   Register threadReg = scrReg;
 465 #endif
 466   lock();
 467   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
 468 
 469   if (RTMRetryCount > 0) {
 470     // success done else retry
 471     jccb(Assembler::equal, DONE_LABEL) ;
 472     bind(L_decrement_retry);
 473     // Spin and retry if lock is busy.
 474     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
 475   }
 476   else {
 477     bind(L_decrement_retry);
 478   }
 479 }
 480 
 481 #endif //  INCLUDE_RTM_OPT
 482 
 483 // fast_lock and fast_unlock used by C2
 484 
 485 // Because the transitions from emitted code to the runtime
 486 // monitorenter/exit helper stubs are so slow it's critical that
 487 // we inline both the stack-locking fast path and the inflated fast path.
 488 //
 489 // See also: cmpFastLock and cmpFastUnlock.
 490 //
 491 // What follows is a specialized inline transliteration of the code
 492 // in enter() and exit(). If we're concerned about I$ bloat another
 493 // option would be to emit TrySlowEnter and TrySlowExit methods
 494 // at startup-time.  These methods would accept arguments as
  495 // (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 496 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
 497 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 498 // In practice, however, the # of lock sites is bounded and is usually small.
  499 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
  500 // branch mispredictions if the processor uses simple bimodal branch
  501 // predictors keyed by EIP, since the helper routines would be called from
  502 // multiple synchronization sites.
 503 //
  504 // An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
 505 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 506 // to those specialized methods.  That'd give us a mostly platform-independent
 507 // implementation that the JITs could optimize and inline at their pleasure.
  508 // Done correctly, the only time we'd need to cross to native code would be
 509 // to park() or unpark() threads.  We'd also need a few more unsafe operators
 510 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
 511 // (b) explicit barriers or fence operations.
 512 //
 513 // TODO:
 514 //
 515 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
 516 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
 517 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 518 //    the lock operators would typically be faster than reifying Self.
 519 //
 520 // *  Ideally I'd define the primitives as:
 521 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 522 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 523 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
  524 //    Instead, we're stuck with the rather awkward and brittle register assignments below.
 525 //    Furthermore the register assignments are overconstrained, possibly resulting in
 526 //    sub-optimal code near the synchronization site.
 527 //
 528 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 529 //    Alternately, use a better sp-proximity test.
 530 //
 531 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 532 //    Either one is sufficient to uniquely identify a thread.
 533 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 534 //
 535 // *  Intrinsify notify() and notifyAll() for the common cases where the
 536 //    object is locked by the calling thread but the waitlist is empty.
  537 //    This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 538 //
 539 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 540 //    But beware of excessive branch density on AMD Opterons.
 541 //
 542 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
 543 //    or failure of the fast path.  If the fast path fails then we pass
 544 //    control to the slow path, typically in C.  In fast_lock and
 545 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
 546 //    will emit a conditional branch immediately after the node.
 547 //    So we have branches to branches and lots of ICC.ZF games.
 548 //    Instead, it might be better to have C2 pass a "FailureLabel"
 549 //    into fast_lock and fast_unlock.  In the case of success, control
 550 //    will drop through the node.  ICC.ZF is undefined at exit.
 551 //    In the case of failure, the node will branch directly to the
 552 //    FailureLabel
 553 
 554 
 555 // obj: object to lock
 556 // box: on-stack box address (displaced header location) - KILLED
  557 // rax: tmp -- KILLED
 558 // scr: tmp -- KILLED
 559 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 560                                  Register scrReg, Register cx1Reg, Register cx2Reg,
 561                                  RTMLockingCounters* rtm_counters,
 562                                  RTMLockingCounters* stack_rtm_counters,
 563                                  Metadata* method_data,
 564                                  bool use_rtm, bool profile_rtm) {
 565   // Ensure the register assignments are disjoint
 566   assert(tmpReg == rax, "");
 567 
 568   if (use_rtm) {
 569     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
 570   } else {
 571     assert(cx2Reg == noreg, "");
 572     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 573   }
 574 
 575   // Possible cases that we'll encounter in fast_lock
 576   // ------------------------------------------------
 577   // * Inflated
 578   //    -- unlocked
 579   //    -- Locked
 580   //       = by self
 581   //       = by other
 582   // * neutral
 583   // * stack-locked
 584   //    -- by self
 585   //       = sp-proximity test hits
 586   //       = sp-proximity test generates false-negative
 587   //    -- by other
 588   //
 589 
 590   Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;
 591 
 592   if (DiagnoseSyncOnValueBasedClasses != 0) {
 593     load_klass(tmpReg, objReg, cx1Reg);
 594     movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
 595     testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
 596     jcc(Assembler::notZero, DONE_LABEL);
 597   }
 598 
 599 #if INCLUDE_RTM_OPT
 600   if (UseRTMForStackLocks && use_rtm) {
 601     assert(!UseHeavyMonitors, "+UseHeavyMonitors and +UseRTMForStackLocks are mutually exclusive");
 602     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
 603                       stack_rtm_counters, method_data, profile_rtm,
 604                       DONE_LABEL, IsInflated);
 605   }
 606 #endif // INCLUDE_RTM_OPT
 607 
 608   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 609   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
 610   jccb(Assembler::notZero, IsInflated);
 611 
 612   if (!UseHeavyMonitors) {
 613     // Attempt stack-locking ...
 614     orptr (tmpReg, markWord::unlocked_value);
 615     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 616     lock();
 617     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 618     jcc(Assembler::equal, COUNT);           // Success
 619 
 620     // Recursive locking.
 621     // The object is stack-locked: markword contains stack pointer to BasicLock.
 622     // Locked by current thread if difference with current SP is less than one page.
 623     subptr(tmpReg, rsp);
  624     // The next instruction sets ZFlag == 1 (success) if the difference is less than one page.
 625     andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
 626     movptr(Address(boxReg, 0), tmpReg);
 627   } else {
 628     // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
 629     testptr(objReg, objReg);
 630   }
 631   jmp(DONE_LABEL);
 632 
 633   bind(IsInflated);
  634   // The object is inflated. tmpReg contains the ObjectMonitor* + markWord::monitor_value
 635 
 636 #if INCLUDE_RTM_OPT
 637   // Use the same RTM locking code in 32- and 64-bit VM.
 638   if (use_rtm) {
 639     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
 640                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
 641   } else {
 642 #endif // INCLUDE_RTM_OPT
 643 
 644 #ifndef _LP64
 645   // The object is inflated.
 646 
 647   // boxReg refers to the on-stack BasicLock in the current frame.
 648   // We'd like to write:
 649   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
  650   // This is convenient but results in a ST-before-CAS penalty.  The following CAS suffers
 651   // additional latency as we have another ST in the store buffer that must drain.
 652 
 653   // avoid ST-before-CAS
 654   // register juggle because we need tmpReg for cmpxchgptr below
 655   movptr(scrReg, boxReg);
 656   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 657 
 658   // Optimistic form: consider XORL tmpReg,tmpReg
 659   movptr(tmpReg, NULL_WORD);
 660 
 661   // Appears unlocked - try to swing _owner from null to non-null.
 662   // Ideally, I'd manifest "Self" with get_thread and then attempt
 663   // to CAS the register containing Self into m->Owner.
 664   // But we don't have enough registers, so instead we can either try to CAS
 665   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 666   // we later store "Self" into m->Owner.  Transiently storing a stack address
 667   // (rsp or the address of the box) into  m->owner is harmless.
 668   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 669   lock();
 670   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 671   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 672   // If we weren't able to swing _owner from NULL to the BasicLock
 673   // then take the slow path.
 674   jccb  (Assembler::notZero, NO_COUNT);
 675   // update _owner from BasicLock to thread
 676   get_thread (scrReg);                    // beware: clobbers ICCs
 677   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
 678   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
 679 
 680   // If the CAS fails we can either retry or pass control to the slow path.
 681   // We use the latter tactic.
 682   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 683   // If the CAS was successful ...
 684   //   Self has acquired the lock
 685   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 686   // Intentional fall-through into DONE_LABEL ...
 687 #else // _LP64
 688   // It's inflated and we use scrReg for ObjectMonitor* in this section.
 689   movq(scrReg, tmpReg);
 690   xorq(tmpReg, tmpReg);
 691   lock();
 692   cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 693   // Unconditionally set box->_displaced_header = markWord::unused_mark().
 694   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 695   movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
 696   // Propagate ICC.ZF from CAS above into DONE_LABEL.
 697   jccb(Assembler::equal, COUNT);          // CAS above succeeded; propagate ZF = 1 (success)
 698 
 699   cmpptr(r15_thread, rax);                // Check if we are already the owner (recursive lock)
 700   jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
 701   incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 702   xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
 703 #endif // _LP64
 704 #if INCLUDE_RTM_OPT
 705   } // use_rtm()
 706 #endif
 707   // DONE_LABEL is a hot target - we'd really like to place it at the
 708   // start of cache line by padding with NOPs.
 709   // See the AMD and Intel software optimization manuals for the
 710   // most efficient "long" NOP encodings.
 711   // Unfortunately none of our alignment mechanisms suffice.
 712   bind(DONE_LABEL);
 713 
 714   // ZFlag == 1 count in fast path
 715   // ZFlag == 0 count in slow path
 716   jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0
 717 
 718   bind(COUNT);
 719   // Count monitors in fast path
 720 #ifndef _LP64
 721   get_thread(tmpReg);
 722   incrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
 723 #else // _LP64
 724   incrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 725 #endif
 726 
 727   xorl(tmpReg, tmpReg); // Set ZF == 1
 728 
 729   bind(NO_COUNT);
 730 
 731   // At NO_COUNT the icc ZFlag is set as follows ...
 732   // fast_unlock uses the same protocol.
 733   // ZFlag == 1 -> Success
 734   // ZFlag == 0 -> Failure - force control through the slow path
 735 }
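
// Orientation sketch (illustrative only, not used by the VM): the first,
// non-recursive stack-locking attempt that fast_lock() emits, expressed over
// the object's mark word. "cas_ptr" is a hypothetical compare-and-swap helper
// standing in for the LOCK CMPXCHG above, and "box" stands for the on-stack
// BasicLock whose first word holds the displaced header.
static inline bool stack_lock_fast_path(volatile uintptr_t* mark_addr, uintptr_t* box,
                                        uintptr_t (*cas_ptr)(volatile uintptr_t*, uintptr_t, uintptr_t)) {
  uintptr_t mark = *mark_addr;
  if (mark & markWord::monitor_value) {
    return false;                        // inflated: fall through to the monitor path
  }
  uintptr_t unlocked = mark | markWord::unlocked_value;
  *box = unlocked;                       // anticipate a successful CAS
  // Try to install a pointer to the box as the new (stack-locked) mark word.
  return cas_ptr(mark_addr, unlocked, (uintptr_t)box) == unlocked;
}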
 736 
 737 // obj: object to unlock
 738 // box: box address (displaced header location), killed.  Must be EAX.
 739 // tmp: killed, cannot be obj nor box.
 740 //
 741 // Some commentary on balanced locking:
 742 //
 743 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
 744 // Methods that don't have provably balanced locking are forced to run in the
 745 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 746 // The interpreter provides two properties:
 747 // I1:  At return-time the interpreter automatically and quietly unlocks any
  748 //      objects acquired by the current activation (frame).  Recall that the
 749 //      interpreter maintains an on-stack list of locks currently held by
 750 //      a frame.
  751 // I2:  If a method attempts to unlock an object that is not held by
  752 //      the frame, the interpreter throws IMSX (IllegalMonitorStateException).
 753 //
  754 // Let's say A(), which has provably balanced locking, acquires O and then calls B().
 755 // B() doesn't have provably balanced locking so it runs in the interpreter.
 756 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 757 // is still locked by A().
 758 //
 759 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 760 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 761 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 762 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
  763 // Arguably, given that the spec legislates the JNI case as undefined, our implementation
  764 // could reasonably *avoid* checking the owner in fast_unlock().
  765 // In the interest of performance we elide the m->Owner == Self check in unlock.
 766 // A perfectly viable alternative is to elide the owner check except when
 767 // Xcheck:jni is enabled.
 768 
 769 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
 770   assert(boxReg == rax, "");
 771   assert_different_registers(objReg, boxReg, tmpReg);
 772 
 773   Label DONE_LABEL, Stacked, CheckSucc, COUNT, NO_COUNT;
 774 
 775 #if INCLUDE_RTM_OPT
 776   if (UseRTMForStackLocks && use_rtm) {
 777     assert(!UseHeavyMonitors, "+UseHeavyMonitors and +UseRTMForStackLocks are mutually exclusive");
 778     Label L_regular_unlock;
 779     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
 780     andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
 781     cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
 782     jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
 783     xend();                                                           // otherwise end...
 784     jmp(DONE_LABEL);                                                  // ... and we're done
 785     bind(L_regular_unlock);
 786   }
 787 #endif
 788 
 789   if (!UseHeavyMonitors) {
 790     cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
 791     jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
 792   }
 793   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
 794   if (!UseHeavyMonitors) {
 795     testptr(tmpReg, markWord::monitor_value);                         // Inflated?
 796     jccb   (Assembler::zero, Stacked);
 797   }
 798 
 799   // It's inflated.
 800 #if INCLUDE_RTM_OPT
 801   if (use_rtm) {
 802     Label L_regular_inflated_unlock;
 803     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 804     movptr(boxReg, Address(tmpReg, owner_offset));
 805     testptr(boxReg, boxReg);
 806     jccb(Assembler::notZero, L_regular_inflated_unlock);
 807     xend();
 808     jmpb(DONE_LABEL);
 809     bind(L_regular_inflated_unlock);
 810   }
 811 #endif
 812 
 813   // Despite our balanced locking property we still check that m->_owner == Self
 814   // as java routines or native JNI code called by this thread might
 815   // have released the lock.
 816   // Refer to the comments in synchronizer.cpp for how we might encode extra
 817   // state in _succ so we can avoid fetching EntryList|cxq.
 818   //
 819   // If there's no contention try a 1-0 exit.  That is, exit without
 820   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 821   // we detect and recover from the race that the 1-0 exit admits.
 822   //
 823   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
 824   // before it STs null into _owner, releasing the lock.  Updates
 825   // to data protected by the critical section must be visible before
 826   // we drop the lock (and thus before any other thread could acquire
 827   // the lock and observe the fields protected by the lock).
  828   // IA32's memory model is TSO, so STs are ordered with respect to
 829   // each other and there's no need for an explicit barrier (fence).
 830   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
 831 #ifndef _LP64
 832   get_thread (boxReg);
 833 
 834   // Note that we could employ various encoding schemes to reduce
 835   // the number of loads below (currently 4) to just 2 or 3.
 836   // Refer to the comments in synchronizer.cpp.
 837   // In practice the chain of fetches doesn't seem to impact performance, however.
 838   xorptr(boxReg, boxReg);
 839   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 840   jccb  (Assembler::notZero, DONE_LABEL);
 841   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 842   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 843   jccb  (Assembler::notZero, CheckSucc);
 844   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 845   jmpb  (DONE_LABEL);
 846 
 847   bind (Stacked);
 848   // It's not inflated and it's not recursively stack-locked.
 849   // It must be stack-locked.
 850   // Try to reset the header to displaced header.
 851   // The "box" value on the stack is stable, so we can reload
 852   // and be assured we observe the same value as above.
 853   movptr(tmpReg, Address(boxReg, 0));
 854   lock();
 855   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
  856   // Intentional fall-through into DONE_LABEL
 857 
 858   // DONE_LABEL is a hot target - we'd really like to place it at the
 859   // start of cache line by padding with NOPs.
 860   // See the AMD and Intel software optimization manuals for the
 861   // most efficient "long" NOP encodings.
 862   // Unfortunately none of our alignment mechanisms suffice.
 863   bind (CheckSucc);
 864 #else // _LP64
 865   // It's inflated
 866   Label LNotRecursive, LSuccess, LGoSlowPath;
 867 
 868   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
 869   jccb(Assembler::equal, LNotRecursive);
 870 
 871   // Recursive inflated unlock
 872   decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 873   jmpb(LSuccess);
 874 
 875   bind(LNotRecursive);
 876   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 877   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 878   jccb  (Assembler::notZero, CheckSucc);
 879   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 880   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 881   jmpb  (DONE_LABEL);
 882 
 883   // Try to avoid passing control into the slow_path ...
 884   bind  (CheckSucc);
 885 
 886   // The following optional optimization can be elided if necessary
 887   // Effectively: if (succ == null) goto slow path
 888   // The code reduces the window for a race, however,
 889   // and thus benefits performance.
 890   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 891   jccb  (Assembler::zero, LGoSlowPath);
 892 
 893   xorptr(boxReg, boxReg);
 894   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
 895   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 896 
 897   // Memory barrier/fence
 898   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
 899   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
 900   // This is faster on Nehalem and AMD Shanghai/Barcelona.
 901   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
 902   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
 903   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
 904   lock(); addl(Address(rsp, 0), 0);
 905 
 906   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
 907   jccb  (Assembler::notZero, LSuccess);
 908 
 909   // Rare inopportune interleaving - race.
 910   // The successor vanished in the small window above.
 911   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
 912   // We need to ensure progress and succession.
 913   // Try to reacquire the lock.
 914   // If that fails then the new owner is responsible for succession and this
 915   // thread needs to take no further action and can exit via the fast path (success).
 916   // If the re-acquire succeeds then pass control into the slow path.
  917   // As implemented, this latter mode is horrible because we generate more
  918   // coherence traffic on the lock *and* artificially extend the critical section
  919   // length by virtue of passing control into the slow path.
 920 
 921   // box is really RAX -- the following CMPXCHG depends on that binding
 922   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
 923   lock();
 924   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 925   // There's no successor so we tried to regrab the lock.
 926   // If that didn't work, then another thread grabbed the
 927   // lock so we're done (and exit was a success).
 928   jccb  (Assembler::notEqual, LSuccess);
 929   // Intentional fall-through into slow path
 930 
 931   bind  (LGoSlowPath);
 932   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 933   jmpb  (DONE_LABEL);
 934 
 935   bind  (LSuccess);
 936   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 937   jmpb  (DONE_LABEL);
 938 
 939   if (!UseHeavyMonitors) {
 940     bind  (Stacked);
 941     movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 942     lock();
 943     cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 944   }
 945 #endif
 946   bind(DONE_LABEL);
 947 
 948   // ZFlag == 1 count in fast path
 949   // ZFlag == 0 count in slow path
 950   jccb(Assembler::notZero, NO_COUNT);
 951 
 952   bind(COUNT);
 953   // Count monitors in fast path
 954 #ifndef _LP64
 955   get_thread(tmpReg);
 956   decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
 957 #else // _LP64
 958   decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
 959 #endif
 960 
 961   xorl(tmpReg, tmpReg); // Set ZF == 1
 962 
 963   bind(NO_COUNT);
 964 }
 965 
 966 //-------------------------------------------------------------------------------------------
 967 // Generic instructions support for use in .ad files C2 code generation
 968 
 969 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
 970   if (dst != src) {
 971     movdqu(dst, src);
 972   }
 973   if (opcode == Op_AbsVD) {
 974     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
 975   } else {
  976     assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 977     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
 978   }
 979 }
 980 
 981 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
 982   if (opcode == Op_AbsVD) {
 983     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
 984   } else {
  985     assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
 986     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
 987   }
 988 }
 989 
 990 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
 991   if (dst != src) {
 992     movdqu(dst, src);
 993   }
 994   if (opcode == Op_AbsVF) {
 995     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
 996   } else {
  997     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
 998     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
 999   }
1000 }
1001 
1002 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
1003   if (opcode == Op_AbsVF) {
1004     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
1005   } else {
 1006     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
1007     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
1008   }
1009 }
1010 
1011 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
1012   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1013   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
1014 
1015   if (opcode == Op_MinV) {
1016     if (elem_bt == T_BYTE) {
1017       pminsb(dst, src);
1018     } else if (elem_bt == T_SHORT) {
1019       pminsw(dst, src);
1020     } else if (elem_bt == T_INT) {
1021       pminsd(dst, src);
1022     } else {
1023       assert(elem_bt == T_LONG, "required");
1024       assert(tmp == xmm0, "required");
1025       assert_different_registers(dst, src, tmp);
1026       movdqu(xmm0, dst);
1027       pcmpgtq(xmm0, src);
1028       blendvpd(dst, src);  // xmm0 as mask
1029     }
1030   } else { // opcode == Op_MaxV
1031     if (elem_bt == T_BYTE) {
1032       pmaxsb(dst, src);
1033     } else if (elem_bt == T_SHORT) {
1034       pmaxsw(dst, src);
1035     } else if (elem_bt == T_INT) {
1036       pmaxsd(dst, src);
1037     } else {
1038       assert(elem_bt == T_LONG, "required");
1039       assert(tmp == xmm0, "required");
1040       assert_different_registers(dst, src, tmp);
1041       movdqu(xmm0, src);
1042       pcmpgtq(xmm0, dst);
1043       blendvpd(dst, src);  // xmm0 as mask
1044     }
1045   }
1046 }
1047 
1048 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
1049                                  XMMRegister dst, XMMRegister src1, XMMRegister src2,
1050                                  int vlen_enc) {
1051   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1052 
1053   if (opcode == Op_MinV) {
1054     if (elem_bt == T_BYTE) {
1055       vpminsb(dst, src1, src2, vlen_enc);
1056     } else if (elem_bt == T_SHORT) {
1057       vpminsw(dst, src1, src2, vlen_enc);
1058     } else if (elem_bt == T_INT) {
1059       vpminsd(dst, src1, src2, vlen_enc);
1060     } else {
1061       assert(elem_bt == T_LONG, "required");
1062       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1063         vpminsq(dst, src1, src2, vlen_enc);
1064       } else {
1065         assert_different_registers(dst, src1, src2);
1066         vpcmpgtq(dst, src1, src2, vlen_enc);
1067         vblendvpd(dst, src1, src2, dst, vlen_enc);
1068       }
1069     }
1070   } else { // opcode == Op_MaxV
1071     if (elem_bt == T_BYTE) {
1072       vpmaxsb(dst, src1, src2, vlen_enc);
1073     } else if (elem_bt == T_SHORT) {
1074       vpmaxsw(dst, src1, src2, vlen_enc);
1075     } else if (elem_bt == T_INT) {
1076       vpmaxsd(dst, src1, src2, vlen_enc);
1077     } else {
1078       assert(elem_bt == T_LONG, "required");
1079       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1080         vpmaxsq(dst, src1, src2, vlen_enc);
1081       } else {
1082         assert_different_registers(dst, src1, src2);
1083         vpcmpgtq(dst, src1, src2, vlen_enc);
1084         vblendvpd(dst, src2, src1, dst, vlen_enc);
1085       }
1086     }
1087   }
1088 }
1089 
1090 // Float/Double min max
1091 
1092 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
1093                                    XMMRegister dst, XMMRegister a, XMMRegister b,
1094                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1095                                    int vlen_enc) {
1096   assert(UseAVX > 0, "required");
1097   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1098          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1099   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1100   assert_different_registers(a, b, tmp, atmp, btmp);
1101 
1102   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1103   bool is_double_word = is_double_word_type(elem_bt);
1104 
1105   if (!is_double_word && is_min) {
1106     vblendvps(atmp, a, b, a, vlen_enc);
1107     vblendvps(btmp, b, a, a, vlen_enc);
1108     vminps(tmp, atmp, btmp, vlen_enc);
1109     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1110     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
1111   } else if (!is_double_word && !is_min) {
1112     vblendvps(btmp, b, a, b, vlen_enc);
1113     vblendvps(atmp, a, b, b, vlen_enc);
1114     vmaxps(tmp, atmp, btmp, vlen_enc);
1115     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1116     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
1117   } else if (is_double_word && is_min) {
1118     vblendvpd(atmp, a, b, a, vlen_enc);
1119     vblendvpd(btmp, b, a, a, vlen_enc);
1120     vminpd(tmp, atmp, btmp, vlen_enc);
1121     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1122     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
1123   } else {
1124     assert(is_double_word && !is_min, "sanity");
1125     vblendvpd(btmp, b, a, b, vlen_enc);
1126     vblendvpd(atmp, a, b, b, vlen_enc);
1127     vmaxpd(tmp, atmp, btmp, vlen_enc);
1128     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1129     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
1130   }
1131 }
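
// Scalar reference semantics (illustrative only) for what the blend/min/cmp
// sequences above implement in the is_min case: unlike a bare (v)minps, Java's
// Math.min must return -0.0f for min(-0.0f, +0.0f) and must propagate a NaN
// from either input. jint_cast is HotSpot's existing float-to-bits helper.
static inline float java_float_min_reference(float a, float b) {
  if (a != a) return a;                     // a is NaN
  if (b != b) return b;                     // b is NaN
  if (a == 0.0f && b == 0.0f) {
    // equal as values; answer is -0.0f if either operand carries the sign bit
    return (jint_cast(a) | jint_cast(b)) < 0 ? -0.0f : 0.0f;
  }
  return a < b ? a : b;
}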
1132 
1133 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1134                                     XMMRegister dst, XMMRegister a, XMMRegister b,
1135                                     KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1136                                     int vlen_enc) {
1137   assert(UseAVX > 2, "required");
1138   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1139          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1140   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1141   assert_different_registers(dst, a, b, atmp, btmp);
1142 
1143   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1144   bool is_double_word = is_double_word_type(elem_bt);
1145   bool merge = true;
1146 
1147   if (!is_double_word && is_min) {
1148     evpmovd2m(ktmp, a, vlen_enc);
1149     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1150     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1151     vminps(dst, atmp, btmp, vlen_enc);
1152     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1153     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1154   } else if (!is_double_word && !is_min) {
1155     evpmovd2m(ktmp, b, vlen_enc);
1156     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1157     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1158     vmaxps(dst, atmp, btmp, vlen_enc);
1159     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1160     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1161   } else if (is_double_word && is_min) {
1162     evpmovq2m(ktmp, a, vlen_enc);
1163     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1164     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1165     vminpd(dst, atmp, btmp, vlen_enc);
1166     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1167     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1168   } else {
1169     assert(is_double_word && !is_min, "sanity");
1170     evpmovq2m(ktmp, b, vlen_enc);
1171     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1172     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1173     vmaxpd(dst, atmp, btmp, vlen_enc);
1174     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1175     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1176   }
1177 }
1178 
1179 // Float/Double signum
1180 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1181   assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1182 
1183   Label DONE_LABEL;
1184 
1185   if (opcode == Op_SignumF) {
1186     assert(UseSSE > 0, "required");
1187     ucomiss(dst, zero);
1188     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1189     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1190     movflt(dst, one);
1191     jcc(Assembler::above, DONE_LABEL);
1192     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1193   } else if (opcode == Op_SignumD) {
1194     assert(UseSSE > 1, "required");
1195     ucomisd(dst, zero);
1196     jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1197     jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
1198     movdbl(dst, one);
1199     jcc(Assembler::above, DONE_LABEL);
1200     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1201   }
1202 
1203   bind(DONE_LABEL);
1204 }
1205 
1206 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1207   if (sign) {
1208     pmovsxbw(dst, src);
1209   } else {
1210     pmovzxbw(dst, src);
1211   }
1212 }
1213 
1214 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1215   if (sign) {
1216     vpmovsxbw(dst, src, vector_len);
1217   } else {
1218     vpmovzxbw(dst, src, vector_len);
1219   }
1220 }
1221 
1222 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1223   if (sign) {
1224     vpmovsxbd(dst, src, vector_len);
1225   } else {
1226     vpmovzxbd(dst, src, vector_len);
1227   }
1228 }
1229 
1230 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1231   if (sign) {
1232     vpmovsxwd(dst, src, vector_len);
1233   } else {
1234     vpmovzxwd(dst, src, vector_len);
1235   }
1236 }
1237 
1238 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1239                                      int shift, int vector_len) {
1240   if (opcode == Op_RotateLeftV) {
1241     if (etype == T_INT) {
1242       evprold(dst, src, shift, vector_len);
1243     } else {
1244       assert(etype == T_LONG, "expected type T_LONG");
1245       evprolq(dst, src, shift, vector_len);
1246     }
1247   } else {
1248     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1249     if (etype == T_INT) {
1250       evprord(dst, src, shift, vector_len);
1251     } else {
1252       assert(etype == T_LONG, "expected type T_LONG");
1253       evprorq(dst, src, shift, vector_len);
1254     }
1255   }
1256 }
1257 
1258 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1259                                      XMMRegister shift, int vector_len) {
1260   if (opcode == Op_RotateLeftV) {
1261     if (etype == T_INT) {
1262       evprolvd(dst, src, shift, vector_len);
1263     } else {
1264       assert(etype == T_LONG, "expected type T_LONG");
1265       evprolvq(dst, src, shift, vector_len);
1266     }
1267   } else {
1268     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1269     if (etype == T_INT) {
1270       evprorvd(dst, src, shift, vector_len);
1271     } else {
1272       assert(etype == T_LONG, "expected type T_LONG");
1273       evprorvq(dst, src, shift, vector_len);
1274     }
1275   }
1276 }
1277 
1278 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1279   if (opcode == Op_RShiftVI) {
1280     psrad(dst, shift);
1281   } else if (opcode == Op_LShiftVI) {
1282     pslld(dst, shift);
1283   } else {
1284     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1285     psrld(dst, shift);
1286   }
1287 }
1288 
1289 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1290   switch (opcode) {
1291     case Op_RShiftVI:  psrad(dst, shift); break;
1292     case Op_LShiftVI:  pslld(dst, shift); break;
1293     case Op_URShiftVI: psrld(dst, shift); break;
1294 
1295     default: assert(false, "%s", NodeClassNames[opcode]);
1296   }
1297 }
1298 
1299 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1300   if (opcode == Op_RShiftVI) {
1301     vpsrad(dst, nds, shift, vector_len);
1302   } else if (opcode == Op_LShiftVI) {
1303     vpslld(dst, nds, shift, vector_len);
1304   } else {
1305     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1306     vpsrld(dst, nds, shift, vector_len);
1307   }
1308 }
1309 
1310 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1311   switch (opcode) {
1312     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1313     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1314     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1315 
1316     default: assert(false, "%s", NodeClassNames[opcode]);
1317   }
1318 }
1319 
1320 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1321   switch (opcode) {
1322     case Op_RShiftVB:  // fall-through
1323     case Op_RShiftVS:  psraw(dst, shift); break;
1324 
1325     case Op_LShiftVB:  // fall-through
1326     case Op_LShiftVS:  psllw(dst, shift);   break;
1327 
1328     case Op_URShiftVS: // fall-through
1329     case Op_URShiftVB: psrlw(dst, shift);  break;
1330 
1331     default: assert(false, "%s", NodeClassNames[opcode]);
1332   }
1333 }
1334 
1335 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1336   switch (opcode) {
1337     case Op_RShiftVB:  // fall-through
1338     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1339 
1340     case Op_LShiftVB:  // fall-through
1341     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1342 
1343     case Op_URShiftVS: // fall-through
1344     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1345 
1346     default: assert(false, "%s", NodeClassNames[opcode]);
1347   }
1348 }
1349 
1350 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1351   switch (opcode) {
1352     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1353     case Op_LShiftVL:  psllq(dst, shift); break;
1354     case Op_URShiftVL: psrlq(dst, shift); break;
1355 
1356     default: assert(false, "%s", NodeClassNames[opcode]);
1357   }
1358 }
1359 
1360 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1361   if (opcode == Op_RShiftVL) {
1362     psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1363   } else if (opcode == Op_LShiftVL) {
1364     psllq(dst, shift);
1365   } else {
1366     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1367     psrlq(dst, shift);
1368   }
1369 }
1370 
1371 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1372   switch (opcode) {
1373     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1374     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1375     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1376 
1377     default: assert(false, "%s", NodeClassNames[opcode]);
1378   }
1379 }
1380 
1381 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1382   if (opcode == Op_RShiftVL) {
1383     evpsraq(dst, nds, shift, vector_len);
1384   } else if (opcode == Op_LShiftVL) {
1385     vpsllq(dst, nds, shift, vector_len);
1386   } else {
1387     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1388     vpsrlq(dst, nds, shift, vector_len);
1389   }
1390 }
1391 
1392 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1393   switch (opcode) {
1394     case Op_RShiftVB:  // fall-through
1395     case Op_RShiftVS:  // fall-through
1396     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1397 
1398     case Op_LShiftVB:  // fall-through
1399     case Op_LShiftVS:  // fall-through
1400     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1401 
1402     case Op_URShiftVB: // fall-through
1403     case Op_URShiftVS: // fall-through
1404     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1405 
1406     default: assert(false, "%s", NodeClassNames[opcode]);
1407   }
1408 }
1409 
1410 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1411   switch (opcode) {
1412     case Op_RShiftVB:  // fall-through
1413     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1414 
1415     case Op_LShiftVB:  // fall-through
1416     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1417 
1418     case Op_URShiftVB: // fall-through
1419     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1420 
1421     default: assert(false, "%s", NodeClassNames[opcode]);
1422   }
1423 }
1424 
1425 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1426   assert(UseAVX >= 2, "required");
1427   switch (opcode) {
1428     case Op_RShiftVL: {
1429       if (UseAVX > 2) {
1430         assert(tmp == xnoreg, "not used");
1431         if (!VM_Version::supports_avx512vl()) {
1432           vlen_enc = Assembler::AVX_512bit;
1433         }
1434         evpsravq(dst, src, shift, vlen_enc);
1435       } else {
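                 // No variable 64-bit arithmetic shift before AVX-512: emulate it as
                 // sra(x, n) == (srl(x, n) ^ m) - m with m = srl(0x8000000000000000, n),
                 // using the sign-mask constant from the stub routines.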
1436         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1437         vpsrlvq(dst, src, shift, vlen_enc);
1438         vpsrlvq(tmp, tmp, shift, vlen_enc);
1439         vpxor(dst, dst, tmp, vlen_enc);
1440         vpsubq(dst, dst, tmp, vlen_enc);
1441       }
1442       break;
1443     }
1444     case Op_LShiftVL: {
1445       assert(tmp == xnoreg, "not used");
1446       vpsllvq(dst, src, shift, vlen_enc);
1447       break;
1448     }
1449     case Op_URShiftVL: {
1450       assert(tmp == xnoreg, "not used");
1451       vpsrlvq(dst, src, shift, vlen_enc);
1452       break;
1453     }
1454     default: assert(false, "%s", NodeClassNames[opcode]);
1455   }
1456 }
1457 
1458 // Variable shift src by shift using vtmp as a TEMP, giving word result in dst
1459 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1460   assert(opcode == Op_LShiftVB ||
1461          opcode == Op_RShiftVB ||
1462          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1463   bool sign = (opcode != Op_URShiftVB);
1464   assert(vector_len == 0, "required");
1465   vextendbd(sign, dst, src, 1);
1466   vpmovzxbd(vtmp, shift, 1);
1467   varshiftd(opcode, dst, dst, vtmp, 1);
1468   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1469   vextracti128_high(vtmp, dst);
1470   vpackusdw(dst, dst, vtmp, 0);
1471 }
1472 
1473 // Variable shift src by shift using vtmp as a TEMP, giving byte result in dst
1474 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1475   assert(opcode == Op_LShiftVB ||
1476          opcode == Op_RShiftVB ||
1477          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1478   bool sign = (opcode != Op_URShiftVB);
1479   int ext_vector_len = vector_len + 1;
1480   vextendbw(sign, dst, src, ext_vector_len);
1481   vpmovzxbw(vtmp, shift, ext_vector_len);
1482   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1483   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1484   if (vector_len == 0) {
1485     vextracti128_high(vtmp, dst);
1486     vpackuswb(dst, dst, vtmp, vector_len);
1487   } else {
1488     vextracti64x4_high(vtmp, dst);
1489     vpackuswb(dst, dst, vtmp, vector_len);
1490     vpermq(dst, dst, 0xD8, vector_len);
1491   }
1492 }
1493 
1494 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1495   switch(typ) {
1496     case T_BYTE:
1497       pinsrb(dst, val, idx);
1498       break;
1499     case T_SHORT:
1500       pinsrw(dst, val, idx);
1501       break;
1502     case T_INT:
1503       pinsrd(dst, val, idx);
1504       break;
1505     case T_LONG:
1506       pinsrq(dst, val, idx);
1507       break;
1508     default:
1509       assert(false,"Should not reach here.");
1510       break;
1511   }
1512 }
1513 
1514 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1515   switch(typ) {
1516     case T_BYTE:
1517       vpinsrb(dst, src, val, idx);
1518       break;
1519     case T_SHORT:
1520       vpinsrw(dst, src, val, idx);
1521       break;
1522     case T_INT:
1523       vpinsrd(dst, src, val, idx);
1524       break;
1525     case T_LONG:
1526       vpinsrq(dst, src, val, idx);
1527       break;
1528     default:
1529       assert(false,"Should not reach here.");
1530       break;
1531   }
1532 }
1533 
1534 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1535   switch(typ) {
1536     case T_INT:
1537       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1538       break;
1539     case T_FLOAT:
1540       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1541       break;
1542     case T_LONG:
1543       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1544       break;
1545     case T_DOUBLE:
1546       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1547       break;
1548     default:
1549       assert(false,"Should not reach here.");
1550       break;
1551   }
1552 }
1553 
1554 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1555   switch(typ) {
1556     case T_INT:
1557       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1558       break;
1559     case T_FLOAT:
1560       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1561       break;
1562     case T_LONG:
1563       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1564       break;
1565     case T_DOUBLE:
1566       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1567       break;
1568     default:
1569       assert(false,"Should not reach here.");
1570       break;
1571   }
1572 }
1573 
1574 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1575   switch(typ) {
1576     case T_INT:
1577       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1578       break;
1579     case T_FLOAT:
1580       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1581       break;
1582     case T_LONG:
1583       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1584       break;
1585     case T_DOUBLE:
1586       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1587       break;
1588     default:
1589       assert(false,"Should not reach here.");
1590       break;
1591   }
1592 }
1593 
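     // Expand a vector of boolean bytes (0 or 1 per element) into a full element-width
     // mask: dst = 0 - src yields 0x00/0xFF per byte, which is then sign-extended to the
     // element size of elem_bt.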
1594 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1595   if (vlen_in_bytes <= 16) {
1596     pxor (dst, dst);
1597     psubb(dst, src);
1598     switch (elem_bt) {
1599       case T_BYTE:   /* nothing to do */ break;
1600       case T_SHORT:  pmovsxbw(dst, dst); break;
1601       case T_INT:    pmovsxbd(dst, dst); break;
1602       case T_FLOAT:  pmovsxbd(dst, dst); break;
1603       case T_LONG:   pmovsxbq(dst, dst); break;
1604       case T_DOUBLE: pmovsxbq(dst, dst); break;
1605 
1606       default: assert(false, "%s", type2name(elem_bt));
1607     }
1608   } else {
1609     assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1610     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1611 
1612     vpxor (dst, dst, dst, vlen_enc);
1613     vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1614 
1615     switch (elem_bt) {
1616       case T_BYTE:   /* nothing to do */            break;
1617       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1618       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1619       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1620       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1621       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1622 
1623       default: assert(false, "%s", type2name(elem_bt));
1624     }
1625   }
1626 }
1627 
1628 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1629   if (novlbwdq) {
1630     vpmovsxbd(xtmp, src, vlen_enc);
1631     evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1632             Assembler::eq, true, vlen_enc, noreg);
1633   } else {
1634     vpxor(xtmp, xtmp, xtmp, vlen_enc);
1635     vpsubb(xtmp, xtmp, src, vlen_enc);
1636     evpmovb2m(dst, xtmp, vlen_enc);
1637   }
1638 }
1639 
1640 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) {
1641   switch (vlen_in_bytes) {
1642     case 4:  movdl(dst, src);   break;
1643     case 8:  movq(dst, src);    break;
1644     case 16: movdqu(dst, src);  break;
1645     case 32: vmovdqu(dst, src); break;
1646     case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1647     default: ShouldNotReachHere();
1648   }
1649 }
1650 
1651 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1652   assert(rscratch != noreg || always_reachable(src), "missing");
1653 
1654   if (reachable(src)) {
1655     load_vector(dst, as_Address(src), vlen_in_bytes);
1656   } else {
1657     lea(rscratch, src);
1658     load_vector(dst, Address(rscratch, 0), vlen_in_bytes);
1659   }
1660 }
1661 
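     // Broadcast a constant read from memory, using the widest broadcast form the CPU
     // supports: AVX2 vpbroadcastq/vpbroadcastd, AVX vbroadcastsd/vbroadcastss or vmovddup,
     // and on pre-AVX hardware SSE3 movddup or movq followed by punpcklqdq.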
1662 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1663   int vlen_enc = vector_length_encoding(vlen);
1664   if (VM_Version::supports_avx()) {
1665     if (bt == T_LONG) {
1666       if (VM_Version::supports_avx2()) {
1667         vpbroadcastq(dst, src, vlen_enc);
1668       } else {
1669         vmovddup(dst, src, vlen_enc);
1670       }
1671     } else if (bt == T_DOUBLE) {
1672       if (vlen_enc != Assembler::AVX_128bit) {
1673         vbroadcastsd(dst, src, vlen_enc, noreg);
1674       } else {
1675         vmovddup(dst, src, vlen_enc);
1676       }
1677     } else {
1678       if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1679         vpbroadcastd(dst, src, vlen_enc);
1680       } else {
1681         vbroadcastss(dst, src, vlen_enc);
1682       }
1683     }
1684   } else if (VM_Version::supports_sse3()) {
1685     movddup(dst, src);
1686   } else {
1687     movq(dst, src);
1688     if (vlen == 16) {
1689       punpcklqdq(dst, dst);
1690     }
1691   }
1692 }
1693 
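     // Load the first vlen_in_bytes entries of the iota index table (0, 1, 2, ...).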
1694 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes) {
1695   ExternalAddress addr(StubRoutines::x86::vector_iota_indices());
1696   if (vlen_in_bytes <= 4) {
1697     movdl(dst, addr);
1698   } else if (vlen_in_bytes == 8) {
1699     movq(dst, addr);
1700   } else if (vlen_in_bytes == 16) {
1701     movdqu(dst, addr, noreg);
1702   } else if (vlen_in_bytes == 32) {
1703     vmovdqu(dst, addr, noreg);
1704   } else {
1705     assert(vlen_in_bytes == 64, "%d", vlen_in_bytes);
1706     evmovdqub(dst, k0, addr, false /*merge*/, Assembler::AVX_512bit, noreg);
1707   }
1708 }
1709 
1710 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
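     // reduce_operation_128 accumulates in place (dst = dst <op> src); the 256-bit
     // variant below is three-operand (dst = src1 <op> src2).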
1711 
1712 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1713   int vector_len = Assembler::AVX_128bit;
1714 
1715   switch (opcode) {
1716     case Op_AndReductionV:  pand(dst, src); break;
1717     case Op_OrReductionV:   por (dst, src); break;
1718     case Op_XorReductionV:  pxor(dst, src); break;
1719     case Op_MinReductionV:
1720       switch (typ) {
1721         case T_BYTE:        pminsb(dst, src); break;
1722         case T_SHORT:       pminsw(dst, src); break;
1723         case T_INT:         pminsd(dst, src); break;
1724         case T_LONG:        assert(UseAVX > 2, "required");
1725                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1726         default:            assert(false, "wrong type");
1727       }
1728       break;
1729     case Op_MaxReductionV:
1730       switch (typ) {
1731         case T_BYTE:        pmaxsb(dst, src); break;
1732         case T_SHORT:       pmaxsw(dst, src); break;
1733         case T_INT:         pmaxsd(dst, src); break;
1734         case T_LONG:        assert(UseAVX > 2, "required");
1735                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1736         default:            assert(false, "wrong type");
1737       }
1738       break;
1739     case Op_AddReductionVF: addss(dst, src); break;
1740     case Op_AddReductionVD: addsd(dst, src); break;
1741     case Op_AddReductionVI:
1742       switch (typ) {
1743         case T_BYTE:        paddb(dst, src); break;
1744         case T_SHORT:       paddw(dst, src); break;
1745         case T_INT:         paddd(dst, src); break;
1746         default:            assert(false, "wrong type");
1747       }
1748       break;
1749     case Op_AddReductionVL: paddq(dst, src); break;
1750     case Op_MulReductionVF: mulss(dst, src); break;
1751     case Op_MulReductionVD: mulsd(dst, src); break;
1752     case Op_MulReductionVI:
1753       switch (typ) {
1754         case T_SHORT:       pmullw(dst, src); break;
1755         case T_INT:         pmulld(dst, src); break;
1756         default:            assert(false, "wrong type");
1757       }
1758       break;
1759     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1760                             vpmullq(dst, dst, src, vector_len); break;
1761     default:                assert(false, "wrong opcode");
1762   }
1763 }
1764 
1765 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1766   int vector_len = Assembler::AVX_256bit;
1767 
1768   switch (opcode) {
1769     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1770     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1771     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1772     case Op_MinReductionV:
1773       switch (typ) {
1774         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1775         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1776         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1777         case T_LONG:        assert(UseAVX > 2, "required");
1778                             vpminsq(dst, src1, src2, vector_len); break;
1779         default:            assert(false, "wrong type");
1780       }
1781       break;
1782     case Op_MaxReductionV:
1783       switch (typ) {
1784         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1785         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1786         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1787         case T_LONG:        assert(UseAVX > 2, "required");
1788                             vpmaxsq(dst, src1, src2, vector_len); break;
1789         default:            assert(false, "wrong type");
1790       }
1791       break;
1792     case Op_AddReductionVI:
1793       switch (typ) {
1794         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1795         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1796         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1797         default:            assert(false, "wrong type");
1798       }
1799       break;
1800     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1801     case Op_MulReductionVI:
1802       switch (typ) {
1803         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1804         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1805         default:            assert(false, "wrong type");
1806       }
1807       break;
1808     case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;
1809     default:                assert(false, "wrong opcode");
1810   }
1811 }
1812 
1813 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1814                                   XMMRegister dst, XMMRegister src,
1815                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1816   switch (opcode) {
1817     case Op_AddReductionVF:
1818     case Op_MulReductionVF:
1819       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1820       break;
1821 
1822     case Op_AddReductionVD:
1823     case Op_MulReductionVD:
1824       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1825       break;
1826 
1827     default: assert(false, "wrong opcode");
1828   }
1829 }
1830 
1831 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1832                              Register dst, Register src1, XMMRegister src2,
1833                              XMMRegister vtmp1, XMMRegister vtmp2) {
1834   switch (vlen) {
1835     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1836     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1837     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1838     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1839 
1840     default: assert(false, "wrong vector length");
1841   }
1842 }
1843 
1844 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1845                              Register dst, Register src1, XMMRegister src2,
1846                              XMMRegister vtmp1, XMMRegister vtmp2) {
1847   switch (vlen) {
1848     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1849     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1850     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1851     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1852 
1853     default: assert(false, "wrong vector length");
1854   }
1855 }
1856 
1857 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1858                              Register dst, Register src1, XMMRegister src2,
1859                              XMMRegister vtmp1, XMMRegister vtmp2) {
1860   switch (vlen) {
1861     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1862     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1863     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1864     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1865 
1866     default: assert(false, "wrong vector length");
1867   }
1868 }
1869 
1870 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1871                              Register dst, Register src1, XMMRegister src2,
1872                              XMMRegister vtmp1, XMMRegister vtmp2) {
1873   switch (vlen) {
1874     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1875     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1876     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1877     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1878 
1879     default: assert(false, "wrong vector length");
1880   }
1881 }
1882 
1883 #ifdef _LP64
1884 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1885                              Register dst, Register src1, XMMRegister src2,
1886                              XMMRegister vtmp1, XMMRegister vtmp2) {
1887   switch (vlen) {
1888     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1889     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1890     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1891 
1892     default: assert(false, "wrong vector length");
1893   }
1894 }
1895 #endif // _LP64
1896 
1897 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1898   switch (vlen) {
1899     case 2:
1900       assert(vtmp2 == xnoreg, "");
1901       reduce2F(opcode, dst, src, vtmp1);
1902       break;
1903     case 4:
1904       assert(vtmp2 == xnoreg, "");
1905       reduce4F(opcode, dst, src, vtmp1);
1906       break;
1907     case 8:
1908       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1909       break;
1910     case 16:
1911       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1912       break;
1913     default: assert(false, "wrong vector length");
1914   }
1915 }
1916 
1917 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1918   switch (vlen) {
1919     case 2:
1920       assert(vtmp2 == xnoreg, "");
1921       reduce2D(opcode, dst, src, vtmp1);
1922       break;
1923     case 4:
1924       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1925       break;
1926     case 8:
1927       reduce8D(opcode, dst, src, vtmp1, vtmp2);
1928       break;
1929     default: assert(false, "wrong vector length");
1930   }
1931 }
1932 
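     // Reduce two ints: addition uses a horizontal add (phaddd); other ops shuffle the
     // second element down and combine, then the scalar src1 is folded into the result.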
1933 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1934   if (opcode == Op_AddReductionVI) {
1935     if (vtmp1 != src2) {
1936       movdqu(vtmp1, src2);
1937     }
1938     phaddd(vtmp1, vtmp1);
1939   } else {
1940     pshufd(vtmp1, src2, 0x1);
1941     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1942   }
1943   movdl(vtmp2, src1);
1944   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1945   movdl(dst, vtmp1);
1946 }
1947 
1948 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1949   if (opcode == Op_AddReductionVI) {
1950     if (vtmp1 != src2) {
1951       movdqu(vtmp1, src2);
1952     }
1953     phaddd(vtmp1, src2);
1954     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1955   } else {
1956     pshufd(vtmp2, src2, 0xE);
1957     reduce_operation_128(T_INT, opcode, vtmp2, src2);
1958     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1959   }
1960 }
1961 
1962 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1963   if (opcode == Op_AddReductionVI) {
1964     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
1965     vextracti128_high(vtmp2, vtmp1);
1966     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
1967     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1968   } else {
1969     vextracti128_high(vtmp1, src2);
1970     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1971     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1972   }
1973 }
1974 
1975 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1976   vextracti64x4_high(vtmp2, src2);
1977   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
1978   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1979 }
1980 
1981 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1982   pshufd(vtmp2, src2, 0x1);
1983   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1984   movdqu(vtmp1, vtmp2);
1985   psrldq(vtmp1, 2);
1986   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1987   movdqu(vtmp2, vtmp1);
1988   psrldq(vtmp2, 1);
1989   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1990   movdl(vtmp2, src1);
1991   pmovsxbd(vtmp1, vtmp1);
1992   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1993   pextrb(dst, vtmp1, 0x0);
1994   movsbl(dst, dst);
1995 }
1996 
1997 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1998   pshufd(vtmp1, src2, 0xE);
1999   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2000   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2001 }
2002 
2003 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2004   vextracti128_high(vtmp2, src2);
2005   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2006   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2007 }
2008 
2009 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2010   vextracti64x4_high(vtmp1, src2);
2011   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2012   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2013 }
2014 
2015 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2016   pmovsxbw(vtmp2, src2);
2017   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2018 }
2019 
2020 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2021   if (UseAVX > 1) {
2022     int vector_len = Assembler::AVX_256bit;
2023     vpmovsxbw(vtmp1, src2, vector_len);
2024     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2025   } else {
2026     pmovsxbw(vtmp2, src2);
2027     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2028     pshufd(vtmp2, src2, 0xE);   // bring the upper 8 bytes of src2 into the low half
2029     pmovsxbw(vtmp2, vtmp2);     // and widen them to 8 shorts for the second pass
2030     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2031   }
2032 }
2033 
2034 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2035   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2036     int vector_len = Assembler::AVX_512bit;
2037     vpmovsxbw(vtmp1, src2, vector_len);
2038     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2039   } else {
2040     assert(UseAVX >= 2,"Should not reach here.");
2041     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2042     vextracti128_high(vtmp2, src2);
2043     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2044   }
2045 }
2046 
2047 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2048   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2049   vextracti64x4_high(vtmp2, src2);
2050   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2051 }
2052 
2053 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2054   if (opcode == Op_AddReductionVI) {
2055     if (vtmp1 != src2) {
2056       movdqu(vtmp1, src2);
2057     }
2058     phaddw(vtmp1, vtmp1);
2059     phaddw(vtmp1, vtmp1);
2060   } else {
2061     pshufd(vtmp2, src2, 0x1);
2062     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2063     movdqu(vtmp1, vtmp2);
2064     psrldq(vtmp1, 2);
2065     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2066   }
2067   movdl(vtmp2, src1);
2068   pmovsxwd(vtmp1, vtmp1);
2069   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2070   pextrw(dst, vtmp1, 0x0);
2071   movswl(dst, dst);
2072 }
2073 
2074 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2075   if (opcode == Op_AddReductionVI) {
2076     if (vtmp1 != src2) {
2077       movdqu(vtmp1, src2);
2078     }
2079     phaddw(vtmp1, src2);
2080   } else {
2081     pshufd(vtmp1, src2, 0xE);
2082     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2083   }
2084   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2085 }
2086 
2087 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2088   if (opcode == Op_AddReductionVI) {
2089     int vector_len = Assembler::AVX_256bit;
2090     vphaddw(vtmp2, src2, src2, vector_len);
2091     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2092   } else {
2093     vextracti128_high(vtmp2, src2);
2094     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2095   }
2096   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2097 }
2098 
2099 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2100   int vector_len = Assembler::AVX_256bit;
2101   vextracti64x4_high(vtmp1, src2);
2102   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2103   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2104 }
2105 
2106 #ifdef _LP64
2107 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2108   pshufd(vtmp2, src2, 0xE);
2109   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2110   movdq(vtmp1, src1);
2111   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2112   movdq(dst, vtmp1);
2113 }
2114 
2115 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2116   vextracti128_high(vtmp1, src2);
2117   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2118   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2119 }
2120 
2121 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2122   vextracti64x4_high(vtmp2, src2);
2123   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2124   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2125 }
2126 
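     // Build an opmask with the lowest 'len' bits set: temp = -1, clear the bits at and
     // above 'len' with bzhiq, then move the result into the mask register.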
2127 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
2128   mov64(temp, -1L);
2129   bzhiq(temp, temp, len);
2130   kmovql(dst, temp);
2131 }
2132 #endif // _LP64
2133 
2134 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2135   reduce_operation_128(T_FLOAT, opcode, dst, src);
2136   pshufd(vtmp, src, 0x1);
2137   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2138 }
2139 
2140 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2141   reduce2F(opcode, dst, src, vtmp);
2142   pshufd(vtmp, src, 0x2);
2143   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2144   pshufd(vtmp, src, 0x3);
2145   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2146 }
2147 
2148 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2149   reduce4F(opcode, dst, src, vtmp2);
2150   vextractf128_high(vtmp2, src);
2151   reduce4F(opcode, dst, vtmp2, vtmp1);
2152 }
2153 
2154 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2155   reduce8F(opcode, dst, src, vtmp1, vtmp2);
2156   vextracti64x4_high(vtmp1, src);
2157   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2158 }
2159 
2160 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2161   reduce_operation_128(T_DOUBLE, opcode, dst, src);
2162   pshufd(vtmp, src, 0xE);
2163   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2164 }
2165 
2166 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2167   reduce2D(opcode, dst, src, vtmp2);
2168   vextractf128_high(vtmp2, src);
2169   reduce2D(opcode, dst, vtmp2, vtmp1);
2170 }
2171 
2172 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2173   reduce4D(opcode, dst, src, vtmp1, vtmp2);
2174   vextracti64x4_high(vtmp1, src);
2175   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2176 }
2177 
2178 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2179   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2180 }
2181 
2182 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2183   MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2184 }
2185 
2186 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2187                                  int vec_enc) {
2188   switch(elem_bt) {
2189     case T_INT:
2190     case T_FLOAT:
2191       vmaskmovps(dst, src, mask, vec_enc);
2192       break;
2193     case T_LONG:
2194     case T_DOUBLE:
2195       vmaskmovpd(dst, src, mask, vec_enc);
2196       break;
2197     default:
2198       fatal("Unsupported type %s", type2name(elem_bt));
2199       break;
2200   }
2201 }
2202 
2203 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2204                                  int vec_enc) {
2205   switch(elem_bt) {
2206     case T_INT:
2207     case T_FLOAT:
2208       vmaskmovps(dst, src, mask, vec_enc);
2209       break;
2210     case T_LONG:
2211     case T_DOUBLE:
2212       vmaskmovpd(dst, src, mask, vec_enc);
2213       break;
2214     default:
2215       fatal("Unsupported type %s", type2name(elem_bt));
2216       break;
2217   }
2218 }
2219 
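     // Min/Max reduction for float vectors: repeatedly fold the upper half onto the lower
     // half (extract or permute) and combine with vminmax_fp until a single lane remains;
     // if dst holds a valid partial result it is folded in at the end.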
2220 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2221                                           XMMRegister dst, XMMRegister src,
2222                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2223                                           XMMRegister xmm_0, XMMRegister xmm_1) {
2224   int permconst[] = {1, 14};
2225   XMMRegister wsrc = src;
2226   XMMRegister wdst = xmm_0;
2227   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2228 
2229   int vlen_enc = Assembler::AVX_128bit;
2230   if (vlen == 16) {
2231     vlen_enc = Assembler::AVX_256bit;
2232   }
2233 
2234   for (int i = log2(vlen) - 1; i >=0; i--) {
2235     if (i == 0 && !is_dst_valid) {
2236       wdst = dst;
2237     }
2238     if (i == 3) {
2239       vextracti64x4_high(wtmp, wsrc);
2240     } else if (i == 2) {
2241       vextracti128_high(wtmp, wsrc);
2242     } else { // i = [0,1]
2243       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2244     }
2245     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2246     wsrc = wdst;
2247     vlen_enc = Assembler::AVX_128bit;
2248   }
2249   if (is_dst_valid) {
2250     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2251   }
2252 }
2253 
2254 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2255                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2256                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2257   XMMRegister wsrc = src;
2258   XMMRegister wdst = xmm_0;
2259   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2260   int vlen_enc = Assembler::AVX_128bit;
2261   if (vlen == 8) {
2262     vlen_enc = Assembler::AVX_256bit;
2263   }
2264   for (int i = log2(vlen) - 1; i >=0; i--) {
2265     if (i == 0 && !is_dst_valid) {
2266       wdst = dst;
2267     }
2268     if (i == 1) {
2269       vextracti128_high(wtmp, wsrc);
2270     } else if (i == 2) {
2271       vextracti64x4_high(wtmp, wsrc);
2272     } else {
2273       assert(i == 0, "%d", i);
2274       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2275     }
2276     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2277     wsrc = wdst;
2278     vlen_enc = Assembler::AVX_128bit;
2279   }
2280   if (is_dst_valid) {
2281     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2282   }
2283 }
2284 
2285 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2286   switch (bt) {
2287     case T_BYTE:  pextrb(dst, src, idx); break;
2288     case T_SHORT: pextrw(dst, src, idx); break;
2289     case T_INT:   pextrd(dst, src, idx); break;
2290     case T_LONG:  pextrq(dst, src, idx); break;
2291 
2292     default:
2293       assert(false,"Should not reach here.");
2294       break;
2295   }
2296 }
2297 
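     // Return the register holding the 128-bit lane that contains element 'elemindex':
     // lane 0 is src itself, higher lanes are extracted into dst first.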
2298 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2299   int esize =  type2aelembytes(typ);
2300   int elem_per_lane = 16/esize;
2301   int lane = elemindex / elem_per_lane;
2302   int eindex = elemindex % elem_per_lane;
2303 
2304   if (lane >= 2) {
2305     assert(UseAVX > 2, "required");
2306     vextractf32x4(dst, src, lane & 3);
2307     return dst;
2308   } else if (lane > 0) {
2309     assert(UseAVX > 0, "required");
2310     vextractf128(dst, src, lane);
2311     return dst;
2312   } else {
2313     return src;
2314   }
2315 }
2316 
2317 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2318   int esize =  type2aelembytes(typ);
2319   int elem_per_lane = 16/esize;
2320   int eindex = elemindex % elem_per_lane;
2321   assert(is_integral_type(typ),"required");
2322 
2323   if (eindex == 0) {
2324     if (typ == T_LONG) {
2325       movq(dst, src);
2326     } else {
2327       movdl(dst, src);
2328       if (typ == T_BYTE)
2329         movsbl(dst, dst);
2330       else if (typ == T_SHORT)
2331         movswl(dst, dst);
2332     }
2333   } else {
2334     extract(typ, dst, src, eindex);
2335   }
2336 }
2337 
2338 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2339   int esize =  type2aelembytes(typ);
2340   int elem_per_lane = 16/esize;
2341   int eindex = elemindex % elem_per_lane;
2342   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2343 
2344   if (eindex == 0) {
2345     movq(dst, src);
2346   } else {
2347     if (typ == T_FLOAT) {
2348       if (UseAVX == 0) {
2349         movdqu(dst, src);
2350         shufps(dst, dst, eindex);
2351       } else {
2352         vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2353       }
2354     } else {
2355       if (UseAVX == 0) {
2356         movdqu(dst, src);
2357         psrldq(dst, eindex*esize);
2358       } else {
2359         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2360       }
2361       movq(dst, dst);
2362     }
2363   }
2364   // Zero upper bits
2365   if (typ == T_FLOAT) {
2366     if (UseAVX == 0) {
2367       assert(vtmp != xnoreg, "required.");
2368       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2369       pand(dst, vtmp);
2370     } else {
2371       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2372     }
2373   }
2374 }
2375 
2376 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2377   switch(typ) {
2378     case T_BYTE:
2379     case T_BOOLEAN:
2380       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2381       break;
2382     case T_SHORT:
2383     case T_CHAR:
2384       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2385       break;
2386     case T_INT:
2387     case T_FLOAT:
2388       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2389       break;
2390     case T_LONG:
2391     case T_DOUBLE:
2392       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2393       break;
2394     default:
2395       assert(false,"Should not reach here.");
2396       break;
2397   }
2398 }
2399 
2400 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2401   assert(rscratch != noreg || always_reachable(src2), "missing");
2402 
2403   switch(typ) {
2404     case T_BOOLEAN:
2405     case T_BYTE:
2406       evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2407       break;
2408     case T_CHAR:
2409     case T_SHORT:
2410       evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2411       break;
2412     case T_INT:
2413     case T_FLOAT:
2414       evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2415       break;
2416     case T_LONG:
2417     case T_DOUBLE:
2418       evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2419       break;
2420     default:
2421       assert(false,"Should not reach here.");
2422       break;
2423   }
2424 }
2425 
2426 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2427   switch(typ) {
2428     case T_BYTE:
2429       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2430       break;
2431     case T_SHORT:
2432       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2433       break;
2434     case T_INT:
2435     case T_FLOAT:
2436       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2437       break;
2438     case T_LONG:
2439     case T_DOUBLE:
2440       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2441       break;
2442     default:
2443       assert(false,"Should not reach here.");
2444       break;
2445   }
2446 }
2447 
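     // Vector test used for mask/boolean checks: sub-128-bit inputs are broadcast and
     // ptest-ed, 256-bit inputs use vptest, and 512-bit inputs are compared byte-wise
     // into an opmask that is then ktest/kortest-ed according to the BoolTest condition.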
2448 void C2_MacroAssembler::vectortest(int bt, int vlen, XMMRegister src1, XMMRegister src2,
2449                                    XMMRegister vtmp1, XMMRegister vtmp2, KRegister mask) {
2450   switch(vlen) {
2451     case 4:
2452       assert(vtmp1 != xnoreg, "required.");
2453       // Broadcast lower 32 bits to 128 bits before ptest
2454       pshufd(vtmp1, src1, 0x0);
2455       if (bt == BoolTest::overflow) {
2456         assert(vtmp2 != xnoreg, "required.");
2457         pshufd(vtmp2, src2, 0x0);
2458       } else {
2459         assert(vtmp2 == xnoreg, "required.");
2460         vtmp2 = src2;
2461       }
2462       ptest(vtmp1, vtmp2);
2463      break;
2464     case 8:
2465       assert(vtmp1 != xnoreg, "required.");
2466       // Broadcast lower 64 bits to 128 bits before ptest
2467       pshufd(vtmp1, src1, 0x4);
2468       if (bt == BoolTest::overflow) {
2469         assert(vtmp2 != xnoreg, "required.");
2470         pshufd(vtmp2, src2, 0x4);
2471       } else {
2472         assert(vtmp2 == xnoreg, "required.");
2473         vtmp2 = src2;
2474       }
2475       ptest(vtmp1, vtmp2);
2476      break;
2477     case 16:
2478       assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2479       ptest(src1, src2);
2480       break;
2481     case 32:
2482       assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2483       vptest(src1, src2, Assembler::AVX_256bit);
2484       break;
2485     case 64:
2486       {
2487         assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2488         evpcmpeqb(mask, src1, src2, Assembler::AVX_512bit);
2489         if (bt == BoolTest::ne) {
2490           ktestql(mask, mask);
2491         } else {
2492           assert(bt == BoolTest::overflow, "required");
2493           kortestql(mask, mask);
2494         }
2495       }
2496       break;
2497     default:
2498       assert(false,"Should not reach here.");
2499       break;
2500   }
2501 }
2502 
2503 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2504   assert(UseAVX >= 2, "required");
2505 #ifdef ASSERT
2506   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2507   bool is_bw_supported = VM_Version::supports_avx512bw();
2508   if (is_bw && !is_bw_supported) {
2509     assert(vlen_enc != Assembler::AVX_512bit, "required");
2510     assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2511            "XMM register should be 0-15");
2512   }
2513 #endif // ASSERT
2514   switch (elem_bt) {
2515     case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2516     case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2517     case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2518     case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2519     case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2520     case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2521     default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2522   }
2523 }
2524 
2525 #ifdef _LP64
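     // Broadcast a GPR value into every vector element: use the EVEX GPR-source broadcasts
     // when AVX-512 (with BW/VL as required) is available, otherwise move the value into an
     // XMM register first and use the AVX2/AVX broadcast forms.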
2526 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2527   assert(UseAVX >= 2, "required");
2528   bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2529   bool is_vl = vlen_enc != Assembler::AVX_512bit;
2530   if ((UseAVX > 2) &&
2531       (!is_bw || VM_Version::supports_avx512bw()) &&
2532       (!is_vl || VM_Version::supports_avx512vl())) {
2533     switch (elem_bt) {
2534       case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2535       case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2536       case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2537       case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2538       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2539     }
2540   } else {
2541     assert(vlen_enc != Assembler::AVX_512bit, "required");
2542     assert((dst->encoding() < 16),"XMM register should be 0-15");
2543     switch (elem_bt) {
2544       case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2545       case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2546       case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2547       case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2548       case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2549       case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2550       default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2551     }
2552   }
2553 }
2554 #endif
2555 
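     // Widen vector byte elements to to_elem_bt via sign extension; for T_FLOAT and
     // T_DOUBLE the bytes are first widened to ints and then converted with vcvtdq2ps/pd.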
2556 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2557   switch (to_elem_bt) {
2558     case T_SHORT:
2559       vpmovsxbw(dst, src, vlen_enc);
2560       break;
2561     case T_INT:
2562       vpmovsxbd(dst, src, vlen_enc);
2563       break;
2564     case T_FLOAT:
2565       vpmovsxbd(dst, src, vlen_enc);
2566       vcvtdq2ps(dst, dst, vlen_enc);
2567       break;
2568     case T_LONG:
2569       vpmovsxbq(dst, src, vlen_enc);
2570       break;
2571     case T_DOUBLE: {
2572       int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2573       vpmovsxbd(dst, src, mid_vlen_enc);
2574       vcvtdq2pd(dst, dst, vlen_enc);
2575       break;
2576     }
2577     default:
2578       fatal("Unsupported type %s", type2name(to_elem_bt));
2579       break;
2580   }
2581 }
2582 
2583 //-------------------------------------------------------------------------------------------
2584 
2585 // IndexOf for constant substrings with size >= 8 chars
2586 // which don't need to be loaded through stack.
2587 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2588                                          Register cnt1, Register cnt2,
2589                                          int int_cnt2,  Register result,
2590                                          XMMRegister vec, Register tmp,
2591                                          int ae) {
2592   ShortBranchVerifier sbv(this);
2593   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2594   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2595 
2596   // This method uses the pcmpestri instruction with bound registers
2597   //   inputs:
2598   //     xmm - substring
2599   //     rax - substring length (elements count)
2600   //     mem - scanned string
2601   //     rdx - string length (elements count)
2602   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2603   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2604   //   outputs:
2605   //     rcx - matched index in string
2606   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2607   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2608   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2609   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2610   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2611 
2612   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2613         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2614         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2615 
2616   // Note, inline_string_indexOf() generates checks:
2617   // if (substr.count > string.count) return -1;
2618   // if (substr.count == 0) return 0;
2619   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2620 
2621   // Load substring.
2622   if (ae == StrIntrinsicNode::UL) {
2623     pmovzxbw(vec, Address(str2, 0));
2624   } else {
2625     movdqu(vec, Address(str2, 0));
2626   }
2627   movl(cnt2, int_cnt2);
2628   movptr(result, str1); // string addr
2629 
2630   if (int_cnt2 > stride) {
2631     jmpb(SCAN_TO_SUBSTR);
2632 
2633     // Reload substr for rescan; this code
2634     // is executed only for large substrings (> 8 chars).
2635     bind(RELOAD_SUBSTR);
2636     if (ae == StrIntrinsicNode::UL) {
2637       pmovzxbw(vec, Address(str2, 0));
2638     } else {
2639       movdqu(vec, Address(str2, 0));
2640     }
2641     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2642 
2643     bind(RELOAD_STR);
2644     // We came here after the beginning of the substring was
2645     // matched but the rest of it was not, so we need to search
2646     // again. Start from the next element after the previous match.
2647 
2648     // cnt2 is the number of remaining substring elements and
2649     // cnt1 is the number of remaining string elements when the compare failed.
2650     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2651     subl(cnt1, cnt2);
2652     addl(cnt1, int_cnt2);
2653     movl(cnt2, int_cnt2); // Now restore cnt2
2654 
2655     decrementl(cnt1);     // Shift to next element
2656     cmpl(cnt1, cnt2);
2657     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2658 
2659     addptr(result, (1<<scale1));
2660 
2661   } // (int_cnt2 > 8)
2662 
2663   // Scan string for start of substr in 16-byte vectors
2664   bind(SCAN_TO_SUBSTR);
2665   pcmpestri(vec, Address(result, 0), mode);
2666   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
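       // pcmpestri in 'equal ordered' mode searches the 16-byte window at [result] for the start
       // of the substring: CF==1 means some offset in the window begins a (possibly partial)
       // match, OF==1 means the match begins at offset 0, and tmp(rcx) gets the lowest such offset.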
2667   subl(cnt1, stride);
2668   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2669   cmpl(cnt1, cnt2);
2670   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2671   addptr(result, 16);
2672   jmpb(SCAN_TO_SUBSTR);
2673 
2674   // Found a potential substr
2675   bind(FOUND_CANDIDATE);
2676   // Matched whole vector if first element matched (tmp(rcx) == 0).
2677   if (int_cnt2 == stride) {
2678     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2679   } else { // int_cnt2 > 8
2680     jccb(Assembler::overflow, FOUND_SUBSTR);
2681   }
2682   // After pcmpestri tmp(rcx) contains matched element index
2683   // Compute start addr of substr
2684   lea(result, Address(result, tmp, scale1));
2685 
2686   // Make sure string is still long enough
2687   subl(cnt1, tmp);
2688   cmpl(cnt1, cnt2);
2689   if (int_cnt2 == stride) {
2690     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2691   } else { // int_cnt2 > 8
2692     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2693   }
2694   // Left less than substring.
2695 
2696   bind(RET_NOT_FOUND);
2697   movl(result, -1);
2698   jmp(EXIT);
2699 
2700   if (int_cnt2 > stride) {
2701     // This code is optimized for the case when whole substring
2702     // is matched if its head is matched.
2703     bind(MATCH_SUBSTR_HEAD);
2704     pcmpestri(vec, Address(result, 0), mode);
2705     // Reload only the string if it does not match
2706     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2707 
2708     Label CONT_SCAN_SUBSTR;
2709     // Compare the rest of substring (> 8 chars).
2710     bind(FOUND_SUBSTR);
2711     // First 8 chars are already matched.
2712     negptr(cnt2);
2713     addptr(cnt2, stride);
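         // cnt2 is now negative: |cnt2| substring elements remain to be compared, and the next
         // 16-byte chunk starts |cnt2| elements before the end of the substring (str2 + int_cnt2 + cnt2).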
2714 
2715     bind(SCAN_SUBSTR);
2716     subl(cnt1, stride);
2717     cmpl(cnt2, -stride); // Do not read beyond substring
2718     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2719     // Back-up strings to avoid reading beyond substring:
2720     // cnt1 = cnt1 - cnt2 + 8
2721     addl(cnt1, cnt2); // cnt2 is negative
2722     addl(cnt1, stride);
2723     movl(cnt2, stride); negptr(cnt2);
2724     bind(CONT_SCAN_SUBSTR);
2725     if (int_cnt2 < (int)G) {
2726       int tail_off1 = int_cnt2<<scale1;
2727       int tail_off2 = int_cnt2<<scale2;
2728       if (ae == StrIntrinsicNode::UL) {
2729         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2730       } else {
2731         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2732       }
2733       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2734     } else {
2735       // calculate index in register to avoid integer overflow (int_cnt2*2)
2736       movl(tmp, int_cnt2);
2737       addptr(tmp, cnt2);
2738       if (ae == StrIntrinsicNode::UL) {
2739         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2740       } else {
2741         movdqu(vec, Address(str2, tmp, scale2, 0));
2742       }
2743       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2744     }
2745     // Need to reload the string pointers if we did not match the whole vector
2746     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2747     addptr(cnt2, stride);
2748     jcc(Assembler::negative, SCAN_SUBSTR);
2749     // Fall through if found full substring
2750 
2751   } // (int_cnt2 > 8)
2752 
2753   bind(RET_FOUND);
2754   // Found result if we matched full small substring.
2755   // Compute substr offset
2756   subptr(result, str1);
2757   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2758     shrl(result, 1); // index
2759   }
2760   bind(EXIT);
2761 
2762 } // string_indexofC8
2763 
2764 // Small strings are loaded through the stack if they cross a page boundary.
2765 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2766                                        Register cnt1, Register cnt2,
2767                                        int int_cnt2,  Register result,
2768                                        XMMRegister vec, Register tmp,
2769                                        int ae) {
2770   ShortBranchVerifier sbv(this);
2771   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2772   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2773 
2774   //
2775   // int_cnt2 is length of small (< 8 chars) constant substring
2776   // or (-1) for non constant substring in which case its length
2777   // is in cnt2 register.
2778   //
2779   // Note, inline_string_indexOf() generates checks:
2780   // if (substr.count > string.count) return -1;
2781   // if (substr.count == 0) return 0;
2782   //
2783   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2784   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2785   // This method uses the pcmpestri instruction with bound registers
2786   //   inputs:
2787   //     xmm - substring
2788   //     rax - substring length (elements count)
2789   //     mem - scanned string
2790   //     rdx - string length (elements count)
2791   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2792   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2793   //   outputs:
2794   //     rcx - matched index in string
2795   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2796   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2797   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2798   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2799 
2800   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2801         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2802         FOUND_CANDIDATE;
2803 
2804   { //========================================================
2805     // We don't know where these strings are located
2806     // and we can't read beyond them. Load them through the stack.
2807     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2808 
2809     movptr(tmp, rsp); // save old SP
2810 
2811     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2812       if (int_cnt2 == (1>>scale2)) { // One byte
2813         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2814         load_unsigned_byte(result, Address(str2, 0));
2815         movdl(vec, result); // move 32 bits
2816       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2817         // Not enough header space in 32-bit VM: 12+3 = 15.
2818         movl(result, Address(str2, -1));
2819         shrl(result, 8);
2820         movdl(vec, result); // move 32 bits
2821       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2822         load_unsigned_short(result, Address(str2, 0));
2823         movdl(vec, result); // move 32 bits
2824       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2825         movdl(vec, Address(str2, 0)); // move 32 bits
2826       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2827         movq(vec, Address(str2, 0));  // move 64 bits
2828       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
2829         // Array header size is 12 bytes in 32-bit VM
2830         // + 6 bytes for 3 chars == 18 bytes,
2831         // enough space to load vec and shift.
2832         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2833         if (ae == StrIntrinsicNode::UL) {
2834           int tail_off = int_cnt2-8;
2835           pmovzxbw(vec, Address(str2, tail_off));
2836           psrldq(vec, -2*tail_off);
2837         }
2838         else {
2839           int tail_off = int_cnt2*(1<<scale2);
2840           movdqu(vec, Address(str2, tail_off-16));
2841           psrldq(vec, 16-tail_off);
2842         }
2843       }
2844     } else { // not constant substring
2845       cmpl(cnt2, stride);
2846       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2847 
2848       // We can read beyond the string if str+16 does not cross a page boundary,
2849       // since heaps are aligned and mapped by pages.
2850       assert(os::vm_page_size() < (int)G, "default page should be small");
2851       movl(result, str2); // We need only low 32 bits
2852       andl(result, (os::vm_page_size()-1));
2853       cmpl(result, (os::vm_page_size()-16));
2854       jccb(Assembler::belowEqual, CHECK_STR);
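           // If the offset of str2 within its page is at most page_size-16, a 16-byte load
           // from str2 cannot cross into the next (possibly unmapped) page.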
2855 
2856       // Move small strings to the stack to allow loading 16 bytes into vec.
2857       subptr(rsp, 16);
2858       int stk_offset = wordSize-(1<<scale2);
2859       push(cnt2);
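           // Copy the substring backwards into the 16-byte stack buffer: element cnt2-1 is loaded
           // (hence the -1/-2 displacement) and stored at rsp + wordSize + (cnt2-1)*element_size,
           // where the extra wordSize skips the cnt2 value pushed just above.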
2860 
2861       bind(COPY_SUBSTR);
2862       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2863         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2864         movb(Address(rsp, cnt2, scale2, stk_offset), result);
2865       } else if (ae == StrIntrinsicNode::UU) {
2866         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2867         movw(Address(rsp, cnt2, scale2, stk_offset), result);
2868       }
2869       decrement(cnt2);
2870       jccb(Assembler::notZero, COPY_SUBSTR);
2871 
2872       pop(cnt2);
2873       movptr(str2, rsp);  // New substring address
2874     } // non constant
2875 
2876     bind(CHECK_STR);
2877     cmpl(cnt1, stride);
2878     jccb(Assembler::aboveEqual, BIG_STRINGS);
2879 
2880     // Check cross page boundary.
2881     movl(result, str1); // We need only low 32 bits
2882     andl(result, (os::vm_page_size()-1));
2883     cmpl(result, (os::vm_page_size()-16));
2884     jccb(Assembler::belowEqual, BIG_STRINGS);
2885 
2886     subptr(rsp, 16);
2887     int stk_offset = -(1<<scale1);
2888     if (int_cnt2 < 0) { // not constant
2889       push(cnt2);
2890       stk_offset += wordSize;
2891     }
2892     movl(cnt2, cnt1);
2893 
2894     bind(COPY_STR);
2895     if (ae == StrIntrinsicNode::LL) {
2896       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
2897       movb(Address(rsp, cnt2, scale1, stk_offset), result);
2898     } else {
2899       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
2900       movw(Address(rsp, cnt2, scale1, stk_offset), result);
2901     }
2902     decrement(cnt2);
2903     jccb(Assembler::notZero, COPY_STR);
2904 
2905     if (int_cnt2 < 0) { // not constant
2906       pop(cnt2);
2907     }
2908     movptr(str1, rsp);  // New string address
2909 
2910     bind(BIG_STRINGS);
2911     // Load substring.
2912     if (int_cnt2 < 0) { // -1
2913       if (ae == StrIntrinsicNode::UL) {
2914         pmovzxbw(vec, Address(str2, 0));
2915       } else {
2916         movdqu(vec, Address(str2, 0));
2917       }
2918       push(cnt2);       // substr count
2919       push(str2);       // substr addr
2920       push(str1);       // string addr
2921     } else {
2922       // Small (< 8 chars) constant substrings are loaded already.
2923       movl(cnt2, int_cnt2);
2924     }
2925     push(tmp);  // original SP
2926 
2927   } // Finished loading
2928 
2929   //========================================================
2930   // Start search
2931   //
2932 
2933   movptr(result, str1); // string addr
2934 
2935   if (int_cnt2  < 0) {  // Only for non constant substring
2936     jmpb(SCAN_TO_SUBSTR);
2937 
2938     // SP saved at sp+0
2939     // String saved at sp+1*wordSize
2940     // Substr saved at sp+2*wordSize
2941     // Substr count saved at sp+3*wordSize
2942 
2943     // Reload substr for rescan; this code
2944     // is executed only for large substrings (> 8 chars).
2945     bind(RELOAD_SUBSTR);
2946     movptr(str2, Address(rsp, 2*wordSize));
2947     movl(cnt2, Address(rsp, 3*wordSize));
2948     if (ae == StrIntrinsicNode::UL) {
2949       pmovzxbw(vec, Address(str2, 0));
2950     } else {
2951       movdqu(vec, Address(str2, 0));
2952     }
2953     // We came here after the beginning of the substring was
2954     // matched but the rest of it was not, so we need to search
2955     // again. Start from the next element after the previous match.
2956     subptr(str1, result); // Restore counter
2957     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2958       shrl(str1, 1);
2959     }
2960     addl(cnt1, str1);
2961     decrementl(cnt1);   // Shift to next element
2962     cmpl(cnt1, cnt2);
2963     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2964 
2965     addptr(result, (1<<scale1));
2966   } // non constant
2967 
2968   // Scan string for start of substr in 16-byte vectors
2969   bind(SCAN_TO_SUBSTR);
2970   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2971   pcmpestri(vec, Address(result, 0), mode);
2972   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2973   subl(cnt1, stride);
2974   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2975   cmpl(cnt1, cnt2);
2976   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2977   addptr(result, 16);
2978 
2979   bind(ADJUST_STR);
2980   cmpl(cnt1, stride); // Do not read beyond string
2981   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2982   // Back-up string to avoid reading beyond string.
2983   lea(result, Address(result, cnt1, scale1, -16));
2984   movl(cnt1, stride);
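       // result now points at the start of the last 16-byte chunk of the string (the chunk ends
       // exactly at the string end), and cnt1 is reset to a full stride so the re-scan stays in bounds.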
2985   jmpb(SCAN_TO_SUBSTR);
2986 
2987   // Found a potential substr
2988   bind(FOUND_CANDIDATE);
2989   // After pcmpestri tmp(rcx) contains matched element index
2990 
2991   // Make sure string is still long enough
2992   subl(cnt1, tmp);
2993   cmpl(cnt1, cnt2);
2994   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
2995   // Left less than substring.
2996 
2997   bind(RET_NOT_FOUND);
2998   movl(result, -1);
2999   jmp(CLEANUP);
3000 
3001   bind(FOUND_SUBSTR);
3002   // Compute start addr of substr
3003   lea(result, Address(result, tmp, scale1));
3004   if (int_cnt2 > 0) { // Constant substring
3005     // Repeat search for small substring (< 8 chars)
3006     // from new point without reloading substring.
3007     // Have to check that we don't read beyond string.
3008     cmpl(tmp, stride-int_cnt2);
3009     jccb(Assembler::greater, ADJUST_STR);
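         // tmp is the match offset within the 16-byte window. If the whole constant substring
         // starting there fits inside the window, pcmpestri has already verified it in full;
         // otherwise only a prefix was checked and the scan is repeated from an adjusted position.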
3010     // Fall through if matched whole substring.
3011   } else { // non constant
3012     assert(int_cnt2 == -1, "should be != 0");
3013 
3014     addl(tmp, cnt2);
3015     // Found result if we matched whole substring.
3016     cmpl(tmp, stride);
3017     jcc(Assembler::lessEqual, RET_FOUND);
3018 
3019     // Repeat search for small substring (<= 8 chars)
3020     // from new point 'str1' without reloading substring.
3021     cmpl(cnt2, stride);
3022     // Have to check that we don't read beyond string.
3023     jccb(Assembler::lessEqual, ADJUST_STR);
3024 
3025     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3026     // Compare the rest of substring (> 8 chars).
3027     movptr(str1, result);
3028 
3029     cmpl(tmp, cnt2);
3030     // First 8 chars are already matched.
3031     jccb(Assembler::equal, CHECK_NEXT);
3032 
3033     bind(SCAN_SUBSTR);
3034     pcmpestri(vec, Address(str1, 0), mode);
3035     // Need to reload the string pointers if we did not match the whole vector
3036     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3037 
3038     bind(CHECK_NEXT);
3039     subl(cnt2, stride);
3040     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3041     addptr(str1, 16);
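         // For UL the substring (str2) is Latin-1, so a stride of 8 chars is only 8 bytes there,
         // while the UTF-16 string (str1) advances a full 16 bytes.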
3042     if (ae == StrIntrinsicNode::UL) {
3043       addptr(str2, 8);
3044     } else {
3045       addptr(str2, 16);
3046     }
3047     subl(cnt1, stride);
3048     cmpl(cnt2, stride); // Do not read beyond substring
3049     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3050     // Back-up strings to avoid reading beyond substring.
3051 
3052     if (ae == StrIntrinsicNode::UL) {
3053       lea(str2, Address(str2, cnt2, scale2, -8));
3054       lea(str1, Address(str1, cnt2, scale1, -16));
3055     } else {
3056       lea(str2, Address(str2, cnt2, scale2, -16));
3057       lea(str1, Address(str1, cnt2, scale1, -16));
3058     }
3059     subl(cnt1, cnt2);
3060     movl(cnt2, stride);
3061     addl(cnt1, stride);
3062     bind(CONT_SCAN_SUBSTR);
3063     if (ae == StrIntrinsicNode::UL) {
3064       pmovzxbw(vec, Address(str2, 0));
3065     } else {
3066       movdqu(vec, Address(str2, 0));
3067     }
3068     jmp(SCAN_SUBSTR);
3069 
3070     bind(RET_FOUND_LONG);
3071     movptr(str1, Address(rsp, wordSize));
3072   } // non constant
3073 
3074   bind(RET_FOUND);
3075   // Compute substr offset
3076   subptr(result, str1);
3077   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3078     shrl(result, 1); // index
3079   }
3080   bind(CLEANUP);
3081   pop(rsp); // restore SP
3082 
3083 } // string_indexof
3084 
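     // Find the index of a char in a UTF-16 string (elements are 16-bit chars).
     // Roughly equivalent to the following sketch (illustrative only, not the library source):
     //   for (int i = 0; i < cnt1; i++) {
     //     if (str1[i] == ch) return i;
     //   }
     //   return -1;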
3085 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3086                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3087   ShortBranchVerifier sbv(this);
3088   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3089 
3090   int stride = 8;
3091 
3092   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3093         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3094         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3095         FOUND_SEQ_CHAR, DONE_LABEL;
3096 
3097   movptr(result, str1);
3098   if (UseAVX >= 2) {
3099     cmpl(cnt1, stride);
3100     jcc(Assembler::less, SCAN_TO_CHAR);
3101     cmpl(cnt1, 2*stride);
3102     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3103     movdl(vec1, ch);
3104     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3105     vpxor(vec2, vec2);
3106     movl(tmp, cnt1);
3107     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
3108     andl(cnt1,0x0000000F);  //tail count (in chars)
3109 
3110     bind(SCAN_TO_16_CHAR_LOOP);
3111     vmovdqu(vec3, Address(result, 0));
3112     vpcmpeqw(vec3, vec3, vec1, 1);
3113     vptest(vec2, vec3);
3114     jcc(Assembler::carryClear, FOUND_CHAR);
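         // vec2 is all zeros, so vptest sets CF iff vec3 (the comparison result) is all zero;
         // carryClear therefore means at least one 16-bit lane matched ch.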
3115     addptr(result, 32);
3116     subl(tmp, 2*stride);
3117     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3118     jmp(SCAN_TO_8_CHAR);
3119     bind(SCAN_TO_8_CHAR_INIT);
3120     movdl(vec1, ch);
3121     pshuflw(vec1, vec1, 0x00);
3122     pshufd(vec1, vec1, 0);
3123     pxor(vec2, vec2);
3124   }
3125   bind(SCAN_TO_8_CHAR);
3126   cmpl(cnt1, stride);
3127   jcc(Assembler::less, SCAN_TO_CHAR);
3128   if (UseAVX < 2) {
3129     movdl(vec1, ch);
3130     pshuflw(vec1, vec1, 0x00);
3131     pshufd(vec1, vec1, 0);
3132     pxor(vec2, vec2);
3133   }
3134   movl(tmp, cnt1);
3135   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
3136   andl(cnt1,0x00000007);  //tail count (in chars)
3137 
3138   bind(SCAN_TO_8_CHAR_LOOP);
3139   movdqu(vec3, Address(result, 0));
3140   pcmpeqw(vec3, vec1);
3141   ptest(vec2, vec3);
3142   jcc(Assembler::carryClear, FOUND_CHAR);
3143   addptr(result, 16);
3144   subl(tmp, stride);
3145   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3146   bind(SCAN_TO_CHAR);
3147   testl(cnt1, cnt1);
3148   jcc(Assembler::zero, RET_NOT_FOUND);
3149   bind(SCAN_TO_CHAR_LOOP);
3150   load_unsigned_short(tmp, Address(result, 0));
3151   cmpl(ch, tmp);
3152   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3153   addptr(result, 2);
3154   subl(cnt1, 1);
3155   jccb(Assembler::zero, RET_NOT_FOUND);
3156   jmp(SCAN_TO_CHAR_LOOP);
3157 
3158   bind(RET_NOT_FOUND);
3159   movl(result, -1);
3160   jmpb(DONE_LABEL);
3161 
3162   bind(FOUND_CHAR);
3163   if (UseAVX >= 2) {
3164     vpmovmskb(tmp, vec3);
3165   } else {
3166     pmovmskb(tmp, vec3);
3167   }
3168   bsfl(ch, tmp);
3169   addptr(result, ch);
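       // vpmovmskb/pmovmskb collect one mask bit per byte of the compare result; bsfl finds the
       // lowest set bit, i.e. the byte offset of the first matching char, which is added to result.
       // The subtraction and shift below convert that back to a char index.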
3170 
3171   bind(FOUND_SEQ_CHAR);
3172   subptr(result, str1);
3173   shrl(result, 1);
3174 
3175   bind(DONE_LABEL);
3176 } // string_indexof_char
3177 
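     // Latin-1 variant: find the index of a byte value in a byte array.
     // Roughly equivalent to the following sketch (illustrative only, not the library source):
     //   for (int i = 0; i < cnt1; i++) {
     //     if (str1[i] == ch) return i;
     //   }
     //   return -1;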
3178 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3179                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3180   ShortBranchVerifier sbv(this);
3181   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3182 
3183   int stride = 16;
3184 
3185   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3186         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3187         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3188         FOUND_SEQ_CHAR, DONE_LABEL;
3189 
3190   movptr(result, str1);
3191   if (UseAVX >= 2) {
3192     cmpl(cnt1, stride);
3193     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3194     cmpl(cnt1, stride*2);
3195     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3196     movdl(vec1, ch);
3197     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3198     vpxor(vec2, vec2);
3199     movl(tmp, cnt1);
3200     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3201     andl(cnt1,0x0000001F);  //tail count (in chars)
3202 
3203     bind(SCAN_TO_32_CHAR_LOOP);
3204     vmovdqu(vec3, Address(result, 0));
3205     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3206     vptest(vec2, vec3);
3207     jcc(Assembler::carryClear, FOUND_CHAR);
3208     addptr(result, 32);
3209     subl(tmp, stride*2);
3210     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3211     jmp(SCAN_TO_16_CHAR);
3212 
3213     bind(SCAN_TO_16_CHAR_INIT);
3214     movdl(vec1, ch);
3215     pxor(vec2, vec2);
3216     pshufb(vec1, vec2);
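         // pshufb with an all-zero shuffle control (vec2) broadcasts the low byte of vec1
         // into every byte lane of the vector.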
3217   }
3218 
3219   bind(SCAN_TO_16_CHAR);
3220   cmpl(cnt1, stride);
3221   jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left
3222   if (UseAVX < 2) {
3223     movdl(vec1, ch);
3224     pxor(vec2, vec2);
3225     pshufb(vec1, vec2);
3226   }
3227   movl(tmp, cnt1);
3228   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3229   andl(cnt1,0x0000000F);  //tail count (in bytes)
3230 
3231   bind(SCAN_TO_16_CHAR_LOOP);
3232   movdqu(vec3, Address(result, 0));
3233   pcmpeqb(vec3, vec1);
3234   ptest(vec2, vec3);
3235   jcc(Assembler::carryClear, FOUND_CHAR);
3236   addptr(result, 16);
3237   subl(tmp, stride);
3238   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...
3239 
3240   bind(SCAN_TO_CHAR_INIT);
3241   testl(cnt1, cnt1);
3242   jcc(Assembler::zero, RET_NOT_FOUND);
3243   bind(SCAN_TO_CHAR_LOOP);
3244   load_unsigned_byte(tmp, Address(result, 0));
3245   cmpl(ch, tmp);
3246   jccb(Assembler::equal, FOUND_SEQ_CHAR);
3247   addptr(result, 1);
3248   subl(cnt1, 1);
3249   jccb(Assembler::zero, RET_NOT_FOUND);
3250   jmp(SCAN_TO_CHAR_LOOP);
3251 
3252   bind(RET_NOT_FOUND);
3253   movl(result, -1);
3254   jmpb(DONE_LABEL);
3255 
3256   bind(FOUND_CHAR);
3257   if (UseAVX >= 2) {
3258     vpmovmskb(tmp, vec3);
3259   } else {
3260     pmovmskb(tmp, vec3);
3261   }
3262   bsfl(ch, tmp);
3263   addptr(result, ch);
3264 
3265   bind(FOUND_SEQ_CHAR);
3266   subptr(result, str1);
3267 
3268   bind(DONE_LABEL);
3269 } // stringL_indexof_char
3270 
3271 // helper function for string_compare
3272 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3273                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
3274                                            Address::ScaleFactor scale2, Register index, int ae) {
3275   if (ae == StrIntrinsicNode::LL) {
3276     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3277     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3278   } else if (ae == StrIntrinsicNode::UU) {
3279     load_unsigned_short(elem1, Address(str1, index, scale, 0));
3280     load_unsigned_short(elem2, Address(str2, index, scale, 0));
3281   } else {
3282     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3283     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3284   }
3285 }
3286 
3287 // Compare strings, used for char[] and byte[].
3288 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3289                                        Register cnt1, Register cnt2, Register result,
3290                                        XMMRegister vec1, int ae, KRegister mask) {
3291   ShortBranchVerifier sbv(this);
3292   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3293   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
3294   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3295   int stride2x2 = 0x40;
3296   Address::ScaleFactor scale = Address::no_scale;
3297   Address::ScaleFactor scale1 = Address::no_scale;
3298   Address::ScaleFactor scale2 = Address::no_scale;
3299 
3300   if (ae != StrIntrinsicNode::LL) {
3301     stride2x2 = 0x20;
3302   }
3303 
3304   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3305     shrl(cnt2, 1);
3306   }
3307   // Compute the minimum of the string lengths, and push the
3308   // difference of the string lengths onto the stack.
3309   // The conditional move below computes cnt2 = min(cnt1, cnt2).
3310   movl(result, cnt1);
3311   subl(cnt1, cnt2);
3312   push(cnt1);
3313   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
3314 
3315   // Is the minimum length zero?
3316   testl(cnt2, cnt2);
3317   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3318   if (ae == StrIntrinsicNode::LL) {
3319     // Load first bytes
3320     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
3321     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
3322   } else if (ae == StrIntrinsicNode::UU) {
3323     // Load first characters
3324     load_unsigned_short(result, Address(str1, 0));
3325     load_unsigned_short(cnt1, Address(str2, 0));
3326   } else {
3327     load_unsigned_byte(result, Address(str1, 0));
3328     load_unsigned_short(cnt1, Address(str2, 0));
3329   }
3330   subl(result, cnt1);
3331   jcc(Assembler::notZero,  POP_LABEL);
3332 
3333   if (ae == StrIntrinsicNode::UU) {
3334     // Divide length by 2 to get number of chars
3335     shrl(cnt2, 1);
3336   }
3337   cmpl(cnt2, 1);
3338   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3339 
3340   // Check if the strings start at the same location and setup scale and stride
3341   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3342     cmpptr(str1, str2);
3343     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3344     if (ae == StrIntrinsicNode::LL) {
3345       scale = Address::times_1;
3346       stride = 16;
3347     } else {
3348       scale = Address::times_2;
3349       stride = 8;
3350     }
3351   } else {
3352     scale1 = Address::times_1;
3353     scale2 = Address::times_2;
3354     // scale not used
3355     stride = 8;
3356   }
3357 
3358   if (UseAVX >= 2 && UseSSE42Intrinsics) {
3359     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3360     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3361     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3362     Label COMPARE_TAIL_LONG;
3363     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3364 
3365     int pcmpmask = 0x19;
3366     if (ae == StrIntrinsicNode::LL) {
3367       pcmpmask &= ~0x01;
3368     }
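         // imm8 0x19 selects 'equal each' aggregation with negative polarity, so rcx receives the
         // index of the first element at which the operands differ; clearing bit 0 (for LL) switches
         // the element size from unsigned words to unsigned bytes.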
3369 
3370     // Set up to compare 16-char (32-byte) vectors,
3371     // starting from the first character again because it has an aligned address.
3372     if (ae == StrIntrinsicNode::LL) {
3373       stride2 = 32;
3374     } else {
3375       stride2 = 16;
3376     }
3377     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3378       adr_stride = stride << scale;
3379     } else {
3380       adr_stride1 = 8;  //stride << scale1;
3381       adr_stride2 = 16; //stride << scale2;
3382     }
3383 
3384     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3385     // rax and rdx are used by pcmpestri as elements counters
3386     movl(result, cnt2);
3387     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3388     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3389 
3390     // Fast path: compare the first two 8-char vectors.
3391     bind(COMPARE_16_CHARS);
3392     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3393       movdqu(vec1, Address(str1, 0));
3394     } else {
3395       pmovzxbw(vec1, Address(str1, 0));
3396     }
3397     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3398     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3399 
3400     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3401       movdqu(vec1, Address(str1, adr_stride));
3402       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3403     } else {
3404       pmovzxbw(vec1, Address(str1, adr_stride1));
3405       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3406     }
3407     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3408     addl(cnt1, stride);
3409 
3410     // Compare the characters at index in cnt1
3411     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3412     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3413     subl(result, cnt2);
3414     jmp(POP_LABEL);
3415 
3416     // Setup the registers to start vector comparison loop
3417     bind(COMPARE_WIDE_VECTORS);
3418     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3419       lea(str1, Address(str1, result, scale));
3420       lea(str2, Address(str2, result, scale));
3421     } else {
3422       lea(str1, Address(str1, result, scale1));
3423       lea(str2, Address(str2, result, scale2));
3424     }
3425     subl(result, stride2);
3426     subl(cnt2, stride2);
3427     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3428     negptr(result);
3429 
3430     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
3431     bind(COMPARE_WIDE_VECTORS_LOOP);
3432 
3433 #ifdef _LP64
3434     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3435       cmpl(cnt2, stride2x2);
3436       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3437       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3438       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3439 
3440       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3441       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3442         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3443         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3444       } else {
3445         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3446         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3447       }
3448       kortestql(mask, mask);
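           // kortestql sets CF only when the OR of the mask with itself is all ones, i.e. when all
           // 64 bytes compared equal; aboveEqual (CF==0) therefore means a mismatch in this chunk.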
3449       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3450       addptr(result, stride2x2);  // update since we already compared at this addr
3451       subl(cnt2, stride2x2);      // and sub the size too
3452       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3453 
3454       vpxor(vec1, vec1);
3455       jmpb(COMPARE_WIDE_TAIL);
3456     }//if (VM_Version::supports_avx512vlbw())
3457 #endif // _LP64
3458 
3459 
3460     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3461     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3462       vmovdqu(vec1, Address(str1, result, scale));
3463       vpxor(vec1, Address(str2, result, scale));
3464     } else {
3465       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3466       vpxor(vec1, Address(str2, result, scale2));
3467     }
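         // The XOR of the two chunks is zero iff they are identical; vptest(vec1, vec1) sets ZF
         // in that case, so notZero below signals a mismatch.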
3468     vptest(vec1, vec1);
3469     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3470     addptr(result, stride2);
3471     subl(cnt2, stride2);
3472     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3473     // clean upper bits of YMM registers
3474     vpxor(vec1, vec1);
3475 
3476     // compare wide vectors tail
3477     bind(COMPARE_WIDE_TAIL);
3478     testptr(result, result);
3479     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3480 
3481     movl(result, stride2);
3482     movl(cnt2, result);
3483     negptr(result);
3484     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3485 
3486     // Identify the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3487     bind(VECTOR_NOT_EQUAL);
3488     // clean upper bits of YMM registers
3489     vpxor(vec1, vec1);
3490     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3491       lea(str1, Address(str1, result, scale));
3492       lea(str2, Address(str2, result, scale));
3493     } else {
3494       lea(str1, Address(str1, result, scale1));
3495       lea(str2, Address(str2, result, scale2));
3496     }
3497     jmp(COMPARE_16_CHARS);
3498 
3499     // Compare tail chars, length between 1 and 15 chars
3500     bind(COMPARE_TAIL_LONG);
3501     movl(cnt2, result);
3502     cmpl(cnt2, stride);
3503     jcc(Assembler::less, COMPARE_SMALL_STR);
3504 
3505     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3506       movdqu(vec1, Address(str1, 0));
3507     } else {
3508       pmovzxbw(vec1, Address(str1, 0));
3509     }
3510     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3511     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3512     subptr(cnt2, stride);
3513     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3514     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3515       lea(str1, Address(str1, result, scale));
3516       lea(str2, Address(str2, result, scale));
3517     } else {
3518       lea(str1, Address(str1, result, scale1));
3519       lea(str2, Address(str2, result, scale2));
3520     }
3521     negptr(cnt2);
3522     jmpb(WHILE_HEAD_LABEL);
3523 
3524     bind(COMPARE_SMALL_STR);
3525   } else if (UseSSE42Intrinsics) {
3526     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3527     int pcmpmask = 0x19;
3528     // Set up to compare 8-char (16-byte) vectors,
3529     // starting from the first character again because it has an aligned address.
3530     movl(result, cnt2);
3531     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3532     if (ae == StrIntrinsicNode::LL) {
3533       pcmpmask &= ~0x01;
3534     }
3535     jcc(Assembler::zero, COMPARE_TAIL);
3536     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3537       lea(str1, Address(str1, result, scale));
3538       lea(str2, Address(str2, result, scale));
3539     } else {
3540       lea(str1, Address(str1, result, scale1));
3541       lea(str2, Address(str2, result, scale2));
3542     }
3543     negptr(result);
3544 
3545     // pcmpestri
3546     //   inputs:
3547     //     vec1 - substring
3548     //     rax - negative string length (elements count)
3549     //     mem - scanned string
3550     //     rdx - string length (elements count)
3551     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3552     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3553     //   outputs:
3554     //     rcx - first mismatched element index
3555     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3556 
3557     bind(COMPARE_WIDE_VECTORS);
3558     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3559       movdqu(vec1, Address(str1, result, scale));
3560       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3561     } else {
3562       pmovzxbw(vec1, Address(str1, result, scale1));
3563       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3564     }
3565     // After pcmpestri cnt1(rcx) contains mismatched element index
3566 
3567     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3568     addptr(result, stride);
3569     subptr(cnt2, stride);
3570     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3571 
3572     // compare wide vectors tail
3573     testptr(result, result);
3574     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3575 
3576     movl(cnt2, stride);
3577     movl(result, stride);
3578     negptr(result);
3579     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3580       movdqu(vec1, Address(str1, result, scale));
3581       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3582     } else {
3583       pmovzxbw(vec1, Address(str1, result, scale1));
3584       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3585     }
3586     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3587 
3588     // Mismatched characters in the vectors
3589     bind(VECTOR_NOT_EQUAL);
3590     addptr(cnt1, result);
3591     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3592     subl(result, cnt2);
3593     jmpb(POP_LABEL);
3594 
3595     bind(COMPARE_TAIL); // limit is zero
3596     movl(cnt2, result);
3597     // Fallthru to tail compare
3598   }
3599   // Shift str2 and str1 to the end of the arrays, negate min
3600   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3601     lea(str1, Address(str1, cnt2, scale));
3602     lea(str2, Address(str2, cnt2, scale));
3603   } else {
3604     lea(str1, Address(str1, cnt2, scale1));
3605     lea(str2, Address(str2, cnt2, scale2));
3606   }
3607   decrementl(cnt2);  // first character was compared already
3608   negptr(cnt2);
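       // cnt2 is now the negative count of elements still to compare; the loop below indexes both
       // strings from their (adjusted) ends with this counter and runs it up to zero.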
3609 
3610   // Compare the rest of the elements
3611   bind(WHILE_HEAD_LABEL);
3612   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3613   subl(result, cnt1);
3614   jccb(Assembler::notZero, POP_LABEL);
3615   increment(cnt2);
3616   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3617 
3618   // Strings are equal up to min length.  Return the length difference.
3619   bind(LENGTH_DIFF_LABEL);
3620   pop(result);
3621   if (ae == StrIntrinsicNode::UU) {
3622     // Divide diff by 2 to get number of chars
3623     sarl(result, 1);
3624   }
3625   jmpb(DONE_LABEL);
3626 
3627 #ifdef _LP64
3628   if (VM_Version::supports_avx512vlbw()) {
3629 
3630     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3631 
3632     kmovql(cnt1, mask);
3633     notq(cnt1);
3634     bsfq(cnt2, cnt1);
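         // mask has a 0 bit at each mismatching byte; after notq the first mismatch becomes the
         // lowest set bit, which bsfq extracts as the byte offset of the first difference.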
3635     if (ae != StrIntrinsicNode::LL) {
3636       // Divide diff by 2 to get number of chars
3637       sarl(cnt2, 1);
3638     }
3639     addq(result, cnt2);
3640     if (ae == StrIntrinsicNode::LL) {
3641       load_unsigned_byte(cnt1, Address(str2, result));
3642       load_unsigned_byte(result, Address(str1, result));
3643     } else if (ae == StrIntrinsicNode::UU) {
3644       load_unsigned_short(cnt1, Address(str2, result, scale));
3645       load_unsigned_short(result, Address(str1, result, scale));
3646     } else {
3647       load_unsigned_short(cnt1, Address(str2, result, scale2));
3648       load_unsigned_byte(result, Address(str1, result, scale1));
3649     }
3650     subl(result, cnt1);
3651     jmpb(POP_LABEL);
3652   }//if (VM_Version::supports_avx512vlbw())
3653 #endif // _LP64
3654 
3655   // Discard the stored length difference
3656   bind(POP_LABEL);
3657   pop(cnt1);
3658 
3659   // That's it
3660   bind(DONE_LABEL);
3661   if(ae == StrIntrinsicNode::UL) {
3662     negl(result);
3663   }
3664 
3665 }
3666 
3667 // Search for a non-ASCII character (negative byte value) in a byte array;
3668 // return the index of the first such character, otherwise the length
3669 // of the array segment searched.
3670 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3671 //   @IntrinsicCandidate
3672 //   public static int countPositives(byte[] ba, int off, int len) {
3673 //     for (int i = off; i < off + len; i++) {
3674 //       if (ba[i] < 0) {
3675 //         return i - off;
3676 //       }
3677 //     }
3678 //     return len;
3679 //   }
3680 void C2_MacroAssembler::count_positives(Register ary1, Register len,
3681   Register result, Register tmp1,
3682   XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
3683   // rsi: byte array
3684   // rcx: len
3685   // rax: result
3686   ShortBranchVerifier sbv(this);
3687   assert_different_registers(ary1, len, result, tmp1);
3688   assert_different_registers(vec1, vec2);
3689   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
3690 
3691   movl(result, len); // copy
3692   // len == 0
3693   testl(len, len);
3694   jcc(Assembler::zero, DONE);
3695 
3696   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
3697     VM_Version::supports_avx512vlbw() &&
3698     VM_Version::supports_bmi2()) {
3699 
3700     Label test_64_loop, test_tail, BREAK_LOOP;
3701     Register tmp3_aliased = len;
3702 
3703     movl(tmp1, len);
3704     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
3705 
3706     andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
3707     andl(len, ~(64 - 1));    // vector count (in chars)
3708     jccb(Assembler::zero, test_tail);
3709 
3710     lea(ary1, Address(ary1, len, Address::times_1));
3711     negptr(len);
3712 
3713     bind(test_64_loop);
3714     // Check whether these 64 bytes contain any negative values
3715     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
3716     kortestql(mask1, mask1);
3717     jcc(Assembler::notZero, BREAK_LOOP);
3718 
3719     addptr(len, 64);
3720     jccb(Assembler::notZero, test_64_loop);
3721 
3722     bind(test_tail);
3723     // bail out when there is nothing to be done
3724     testl(tmp1, -1);
3725     jcc(Assembler::zero, DONE);
3726 
3727     // ~(~0 << len) applied up to two times (for 32-bit scenario)
3728 #ifdef _LP64
3729     mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
3730     shlxq(tmp3_aliased, tmp3_aliased, tmp1);
3731     notq(tmp3_aliased);
3732     kmovql(mask2, tmp3_aliased);
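         // shlxq/notq leave exactly tmp1 low bits set in the mask, so the masked compare below
         // only examines the remaining tail bytes.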
3733 #else
3734     Label k_init;
3735     jmp(k_init);
3736 
3737     // We cannot read 64 bits from a general purpose register, thus we move the
3738     // data required to compose 64 1's into the instruction stream.
3739     // We emit a 64-byte-wide series of elements from 0..63 which later on is
3740     // used as a compare target with the tail count contained in the tmp1 register.
3741     // The result is a k register having tmp1 consecutive 1's set,
3742     // counting from the least significant bit.
3743     address tmp = pc();
3744     emit_int64(0x0706050403020100);
3745     emit_int64(0x0F0E0D0C0B0A0908);
3746     emit_int64(0x1716151413121110);
3747     emit_int64(0x1F1E1D1C1B1A1918);
3748     emit_int64(0x2726252423222120);
3749     emit_int64(0x2F2E2D2C2B2A2928);
3750     emit_int64(0x3736353433323130);
3751     emit_int64(0x3F3E3D3C3B3A3938);
3752 
3753     bind(k_init);
3754     lea(len, InternalAddress(tmp));
3755     // create mask to test for negative byte inside a vector
3756     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
3757     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
3758 
3759 #endif
3760     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
3761     ktestq(mask1, mask2);
3762     jcc(Assembler::zero, DONE);
3763 
3764     bind(BREAK_LOOP);
3765     // At least one byte in the last 64 bytes is negative.
3766     // Set up to look at the last 64 bytes as if they were a tail
3767     lea(ary1, Address(ary1, len, Address::times_1));
3768     addptr(result, len);
3769     // Ignore the very last byte: if all others are positive,
3770     // it must be negative, so we can skip right to the 2+1 byte
3771     // end comparison at this point
3772     orl(result, 63);
3773     movl(len, 63);
3774     // Fallthru to tail compare
3775   } else {
3776 
3777     if (UseAVX >= 2 && UseSSE >= 2) {
3778       // With AVX2, use 32-byte vector compare
3779       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
3780 
3781       // Compare 32-byte vectors
3782       testl(len, 0xffffffe0);   // vector count (in bytes)
3783       jccb(Assembler::zero, TAIL_START);
3784 
3785       andl(len, 0xffffffe0);
3786       lea(ary1, Address(ary1, len, Address::times_1));
3787       negptr(len);
3788 
3789       movl(tmp1, 0x80808080);   // create mask to test for negative bytes in vector
3790       movdl(vec2, tmp1);
3791       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
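           // Every byte of vec2 is now 0x80, so vptest below clears ZF exactly when some byte of
           // the loaded data has its sign bit set, i.e. is negative.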
3792 
3793       bind(COMPARE_WIDE_VECTORS);
3794       vmovdqu(vec1, Address(ary1, len, Address::times_1));
3795       vptest(vec1, vec2);
3796       jccb(Assembler::notZero, BREAK_LOOP);
3797       addptr(len, 32);
3798       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3799 
3800       testl(result, 0x0000001f);   // any bytes remaining?
3801       jcc(Assembler::zero, DONE);
3802 
3803       // Quick test using the already prepared vector mask
3804       movl(len, result);
3805       andl(len, 0x0000001f);
3806       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
3807       vptest(vec1, vec2);
3808       jcc(Assembler::zero, DONE);
3809       // There are zeros, jump to the tail to determine exactly where
3810       jmpb(TAIL_START);
3811 
3812       bind(BREAK_LOOP);
3813       // At least one byte in the last 32-byte vector is negative.
3814       // Set up to look at the last 32 bytes as if they were a tail
3815       lea(ary1, Address(ary1, len, Address::times_1));
3816       addptr(result, len);
3817       // Ignore the very last byte: if all others are positive,
3818       // it must be negative, so we can skip right to the 2+1 byte
3819       // end comparison at this point
3820       orl(result, 31);
3821       movl(len, 31);
3822       // Fallthru to tail compare
3823     } else if (UseSSE42Intrinsics) {
3824       // With SSE4.2, use double quad vector compare
3825       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
3826 
3827       // Compare 16-byte vectors
3828       testl(len, 0xfffffff0);   // vector count (in bytes)
3829       jcc(Assembler::zero, TAIL_START);
3830 
3831       andl(len, 0xfffffff0);
3832       lea(ary1, Address(ary1, len, Address::times_1));
3833       negptr(len);
3834 
3835       movl(tmp1, 0x80808080);
3836       movdl(vec2, tmp1);
3837       pshufd(vec2, vec2, 0);
3838 
3839       bind(COMPARE_WIDE_VECTORS);
3840       movdqu(vec1, Address(ary1, len, Address::times_1));
3841       ptest(vec1, vec2);
3842       jccb(Assembler::notZero, BREAK_LOOP);
3843       addptr(len, 16);
3844       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3845 
3846       testl(result, 0x0000000f); // len is zero, any bytes remaining?
3847       jcc(Assembler::zero, DONE);
3848 
3849       // Quick test using the already prepared vector mask
3850       movl(len, result);
3851       andl(len, 0x0000000f);   // tail count (in bytes)
3852       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
3853       ptest(vec1, vec2);
3854       jcc(Assembler::zero, DONE);
3855       jmpb(TAIL_START);
3856 
3857       bind(BREAK_LOOP);
3858       // At least one byte in the last 16-byte vector is negative.
3859       // Set up to look at the last 16 bytes as if they were a tail
3860       lea(ary1, Address(ary1, len, Address::times_1));
3861       addptr(result, len);
3862       // Ignore the very last byte: if all others are positive,
3863       // it must be negative, so we can skip right to the 2+1 byte
3864       // end comparison at this point
3865       orl(result, 15);
3866       movl(len, 15);
3867       // Fallthru to tail compare
3868     }
3869   }
3870 
3871   bind(TAIL_START);
3872   // Compare 4-byte vectors
3873   andl(len, 0xfffffffc); // vector count (in bytes)
3874   jccb(Assembler::zero, COMPARE_CHAR);
3875 
3876   lea(ary1, Address(ary1, len, Address::times_1));
3877   negptr(len);
3878 
3879   bind(COMPARE_VECTORS);
3880   movl(tmp1, Address(ary1, len, Address::times_1));
3881   andl(tmp1, 0x80808080);
3882   jccb(Assembler::notZero, TAIL_ADJUST);
3883   addptr(len, 4);
3884   jccb(Assembler::notZero, COMPARE_VECTORS);
3885 
3886   // Compare trailing char (final 2-3 bytes), if any
3887   bind(COMPARE_CHAR);
3888 
3889   testl(result, 0x2);   // tail  char
3890   jccb(Assembler::zero, COMPARE_BYTE);
3891   load_unsigned_short(tmp1, Address(ary1, 0));
3892   andl(tmp1, 0x00008080);
3893   jccb(Assembler::notZero, CHAR_ADJUST);
3894   lea(ary1, Address(ary1, 2));
3895 
3896   bind(COMPARE_BYTE);
3897   testl(result, 0x1);   // tail  byte
3898   jccb(Assembler::zero, DONE);
3899   load_unsigned_byte(tmp1, Address(ary1, 0));
3900   testl(tmp1, 0x00000080);
3901   jccb(Assembler::zero, DONE);
3902   subptr(result, 1);
3903   jmpb(DONE);
3904 
3905   bind(TAIL_ADJUST);
3906   // There are negative bytes in the last 4-byte block.
3907   // Adjust result and check the next three bytes.
3908   addptr(result, len);
3909   orl(result, 3);
3910   lea(ary1, Address(ary1, len, Address::times_1));
3911   jmpb(COMPARE_CHAR);
3912 
3913   bind(CHAR_ADJUST);
3914   // We are looking at a char + optional byte tail, and found that one
3915   // of the bytes in the char is negative. Adjust the result, check the
3916   // first byte and readjust if needed.
3917   andl(result, 0xfffffffc);
3918   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
3919   jccb(Assembler::notZero, DONE);
3920   addptr(result, 1);
3921 
3922   // That's it
3923   bind(DONE);
3924   if (UseAVX >= 2 && UseSSE >= 2) {
3925     // clean upper bits of YMM registers
3926     vpxor(vec1, vec1);
3927     vpxor(vec2, vec2);
3928   }
3929 }
3930 
3931 // Compare char[] or byte[] arrays or substrings, aligned to 4 bytes.
3932 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
3933                                       Register limit, Register result, Register chr,
3934                                       XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
3935   ShortBranchVerifier sbv(this);
3936   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
3937 
3938   int length_offset  = arrayOopDesc::length_offset_in_bytes();
3939   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
3940 
3941   if (is_array_equ) {
3942     // Check the input args
3943     cmpoop(ary1, ary2);
3944     jcc(Assembler::equal, TRUE_LABEL);
3945 
3946     // Need additional checks for arrays_equals.
3947     testptr(ary1, ary1);
3948     jcc(Assembler::zero, FALSE_LABEL);
3949     testptr(ary2, ary2);
3950     jcc(Assembler::zero, FALSE_LABEL);
3951 
3952     // Check the lengths
3953     movl(limit, Address(ary1, length_offset));
3954     cmpl(limit, Address(ary2, length_offset));
3955     jcc(Assembler::notEqual, FALSE_LABEL);
3956   }
3957 
3958   // count == 0
3959   testl(limit, limit);
3960   jcc(Assembler::zero, TRUE_LABEL);
3961 
3962   if (is_array_equ) {
3963     // Load array address
3964     lea(ary1, Address(ary1, base_offset));
3965     lea(ary2, Address(ary2, base_offset));
3966   }
3967 
3968   if (is_array_equ && is_char) {
3969     // arrays_equals when used for char[].
3970     shll(limit, 1);      // convert char count to byte count (still != 0)
3971   }
3972   movl(result, limit); // copy
3973 
3974   if (UseAVX >= 2) {
3975     // With AVX2, use 32-byte vector compare
3976     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3977 
3978     // Compare 32-byte vectors
3979     andl(result, 0x0000001f);  //   tail count (in bytes)
3980     andl(limit, 0xffffffe0);   // vector count (in bytes)
3981     jcc(Assembler::zero, COMPARE_TAIL);
3982 
3983     lea(ary1, Address(ary1, limit, Address::times_1));
3984     lea(ary2, Address(ary2, limit, Address::times_1));
3985     negptr(limit);
3986 
3987 #ifdef _LP64
3988     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3989       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
3990 
3991       cmpl(limit, -64);
3992       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3993 
3994       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3995 
3996       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
3997       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
3998       kortestql(mask, mask);
3999       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4000       addptr(limit, 64);  // update since we already compared at this addr
4001       cmpl(limit, -64);
4002       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4003 
4004       // At this point we may still need to compare -limit+result bytes.
4005       // We could execute the next two instructions and just continue via the non-wide path:
4006       //  cmpl(limit, 0);
4007       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
4008       // But since we stopped at the points ary{1,2}+limit which are
4009       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4010       // (|limit| <= 32 and result < 32),
4011       // we may just compare the last 64 bytes.
4012       //
4013       addptr(result, -64);   // it is safe because we just came from this area
4014       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4015       evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4016       kortestql(mask, mask);
4017       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4018 
4019       jmp(TRUE_LABEL);
4020 
4021       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4022 
4023     }//if (VM_Version::supports_avx512vlbw())
4024 #endif //_LP64
4025     bind(COMPARE_WIDE_VECTORS);
4026     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
4027     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4028     vpxor(vec1, vec2);
4029 
4030     vptest(vec1, vec1);
4031     jcc(Assembler::notZero, FALSE_LABEL);
4032     addptr(limit, 32);
4033     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4034 
4035     testl(result, result);
4036     jcc(Assembler::zero, TRUE_LABEL);
4037 
4038     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
4039     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4040     vpxor(vec1, vec2);
4041 
4042     vptest(vec1, vec1);
4043     jccb(Assembler::notZero, FALSE_LABEL);
4044     jmpb(TRUE_LABEL);
4045 
4046     bind(COMPARE_TAIL); // limit is zero
4047     movl(limit, result);
4048     // Fallthru to tail compare
4049   } else if (UseSSE42Intrinsics) {
4050     // With SSE4.2, use double quad vector compare
4051     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4052 
4053     // Compare 16-byte vectors
4054     andl(result, 0x0000000f);  //   tail count (in bytes)
4055     andl(limit, 0xfffffff0);   // vector count (in bytes)
4056     jcc(Assembler::zero, COMPARE_TAIL);
4057 
4058     lea(ary1, Address(ary1, limit, Address::times_1));
4059     lea(ary2, Address(ary2, limit, Address::times_1));
4060     negptr(limit);
4061 
4062     bind(COMPARE_WIDE_VECTORS);
4063     movdqu(vec1, Address(ary1, limit, Address::times_1));
4064     movdqu(vec2, Address(ary2, limit, Address::times_1));
4065     pxor(vec1, vec2);
4066 
4067     ptest(vec1, vec1);
4068     jcc(Assembler::notZero, FALSE_LABEL);
4069     addptr(limit, 16);
4070     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4071 
4072     testl(result, result);
4073     jcc(Assembler::zero, TRUE_LABEL);
4074 
4075     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4076     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4077     pxor(vec1, vec2);
4078 
4079     ptest(vec1, vec1);
4080     jccb(Assembler::notZero, FALSE_LABEL);
4081     jmpb(TRUE_LABEL);
4082 
4083     bind(COMPARE_TAIL); // limit is zero
4084     movl(limit, result);
4085     // Fallthru to tail compare
4086   }
4087 
4088   // Compare 4-byte vectors
4089   andl(limit, 0xfffffffc); // vector count (in bytes)
4090   jccb(Assembler::zero, COMPARE_CHAR);
4091 
4092   lea(ary1, Address(ary1, limit, Address::times_1));
4093   lea(ary2, Address(ary2, limit, Address::times_1));
4094   negptr(limit);
4095 
4096   bind(COMPARE_VECTORS);
4097   movl(chr, Address(ary1, limit, Address::times_1));
4098   cmpl(chr, Address(ary2, limit, Address::times_1));
4099   jccb(Assembler::notEqual, FALSE_LABEL);
4100   addptr(limit, 4);
4101   jcc(Assembler::notZero, COMPARE_VECTORS);
4102 
4103   // Compare trailing char (final 2 bytes), if any
4104   bind(COMPARE_CHAR);
4105   testl(result, 0x2);   // tail  char
4106   jccb(Assembler::zero, COMPARE_BYTE);
4107   load_unsigned_short(chr, Address(ary1, 0));
4108   load_unsigned_short(limit, Address(ary2, 0));
4109   cmpl(chr, limit);
4110   jccb(Assembler::notEqual, FALSE_LABEL);
4111 
4112   if (is_array_equ && is_char) {
4113     bind(COMPARE_BYTE);
4114   } else {
4115     lea(ary1, Address(ary1, 2));
4116     lea(ary2, Address(ary2, 2));
4117 
4118     bind(COMPARE_BYTE);
4119     testl(result, 0x1);   // tail  byte
4120     jccb(Assembler::zero, TRUE_LABEL);
4121     load_unsigned_byte(chr, Address(ary1, 0));
4122     load_unsigned_byte(limit, Address(ary2, 0));
4123     cmpl(chr, limit);
4124     jccb(Assembler::notEqual, FALSE_LABEL);
4125   }
4126   bind(TRUE_LABEL);
4127   movl(result, 1);   // return true
4128   jmpb(DONE);
4129 
4130   bind(FALSE_LABEL);
4131   xorl(result, result); // return false
4132 
4133   // That's it
4134   bind(DONE);
4135   if (UseAVX >= 2) {
4136     // clean upper bits of YMM registers
4137     vpxor(vec1, vec1);
4138     vpxor(vec2, vec2);
4139   }
4140 }
4141 
4142 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4143                                     XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4144   switch(ideal_opc) {
4145     case Op_LShiftVS:
4146       Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4147     case Op_LShiftVI:
4148       Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4149     case Op_LShiftVL:
4150       Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4151     case Op_RShiftVS:
4152       Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4153     case Op_RShiftVI:
4154       Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4155     case Op_RShiftVL:
4156       Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4157     case Op_URShiftVS:
4158       Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4159     case Op_URShiftVI:
4160       Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4161     case Op_URShiftVL:
4162       Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4163     case Op_RotateRightV:
4164       evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4165     case Op_RotateLeftV:
4166       evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4167     default:
4168       fatal("Unsupported masked operation"); break;
4169   }
4170 }
4171 
4172 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4173                                     XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4174                                     bool is_varshift) {
4175   switch (ideal_opc) {
4176     case Op_AddVB:
4177       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4178     case Op_AddVS:
4179       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4180     case Op_AddVI:
4181       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4182     case Op_AddVL:
4183       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4184     case Op_AddVF:
4185       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4186     case Op_AddVD:
4187       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4188     case Op_SubVB:
4189       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4190     case Op_SubVS:
4191       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4192     case Op_SubVI:
4193       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4194     case Op_SubVL:
4195       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4196     case Op_SubVF:
4197       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4198     case Op_SubVD:
4199       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4200     case Op_MulVS:
4201       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4202     case Op_MulVI:
4203       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4204     case Op_MulVL:
4205       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4206     case Op_MulVF:
4207       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4208     case Op_MulVD:
4209       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4210     case Op_DivVF:
4211       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4212     case Op_DivVD:
4213       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4214     case Op_SqrtVF:
4215       evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4216     case Op_SqrtVD:
4217       evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4218     case Op_AbsVB:
4219       evpabsb(dst, mask, src2, merge, vlen_enc); break;
4220     case Op_AbsVS:
4221       evpabsw(dst, mask, src2, merge, vlen_enc); break;
4222     case Op_AbsVI:
4223       evpabsd(dst, mask, src2, merge, vlen_enc); break;
4224     case Op_AbsVL:
4225       evpabsq(dst, mask, src2, merge, vlen_enc); break;
4226     case Op_FmaVF:
4227       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4228     case Op_FmaVD:
4229       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4230     case Op_VectorRearrange:
4231       evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4232     case Op_LShiftVS:
4233       evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4234     case Op_LShiftVI:
4235       evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4236     case Op_LShiftVL:
4237       evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4238     case Op_RShiftVS:
4239       evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4240     case Op_RShiftVI:
4241       evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4242     case Op_RShiftVL:
4243       evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4244     case Op_URShiftVS:
4245       evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4246     case Op_URShiftVI:
4247       evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4248     case Op_URShiftVL:
4249       evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4250     case Op_RotateLeftV:
4251       evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4252     case Op_RotateRightV:
4253       evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4254     case Op_MaxV:
4255       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4256     case Op_MinV:
4257       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4258     case Op_XorV:
4259       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4260     case Op_OrV:
4261       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4262     case Op_AndV:
4263       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4264     default:
4265       fatal("Unsupported masked operation"); break;
4266   }
4267 }
4268 
4269 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4270                                     XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4271   switch (ideal_opc) {
4272     case Op_AddVB:
4273       evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4274     case Op_AddVS:
4275       evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4276     case Op_AddVI:
4277       evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4278     case Op_AddVL:
4279       evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4280     case Op_AddVF:
4281       evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4282     case Op_AddVD:
4283       evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4284     case Op_SubVB:
4285       evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4286     case Op_SubVS:
4287       evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4288     case Op_SubVI:
4289       evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4290     case Op_SubVL:
4291       evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4292     case Op_SubVF:
4293       evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4294     case Op_SubVD:
4295       evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4296     case Op_MulVS:
4297       evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4298     case Op_MulVI:
4299       evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4300     case Op_MulVL:
4301       evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4302     case Op_MulVF:
4303       evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4304     case Op_MulVD:
4305       evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4306     case Op_DivVF:
4307       evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4308     case Op_DivVD:
4309       evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4310     case Op_FmaVF:
4311       evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4312     case Op_FmaVD:
4313       evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4314     case Op_MaxV:
4315       evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4316     case Op_MinV:
4317       evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4318     case Op_XorV:
4319       evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4320     case Op_OrV:
4321       evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4322     case Op_AndV:
4323       evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4324     default:
4325       fatal("Unsupported masked operation"); break;
4326   }
4327 }
4328 
4329 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4330                                   KRegister src1, KRegister src2) {
4331   BasicType etype = T_ILLEGAL;
4332   switch(mask_len) {
4333     case 2:
4334     case 4:
4335     case 8:  etype = T_BYTE; break;
4336     case 16: etype = T_SHORT; break;
4337     case 32: etype = T_INT; break;
4338     case 64: etype = T_LONG; break;
4339     default: fatal("Unsupported type"); break;
4340   }
4341   assert(etype != T_ILLEGAL, "");
4342   switch(ideal_opc) {
4343     case Op_AndVMask:
4344       kand(etype, dst, src1, src2); break;
4345     case Op_OrVMask:
4346       kor(etype, dst, src1, src2); break;
4347     case Op_XorVMask:
4348       kxor(etype, dst, src1, src2); break;
4349     default:
4350       fatal("Unsupported masked operation"); break;
4351   }
4352 }
4353 
4354 /*
 * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4356  * If src is NaN, the result is 0.
4357  * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4358  * the result is equal to the value of Integer.MIN_VALUE.
4359  * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4360  * the result is equal to the value of Integer.MAX_VALUE.
4361  */
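// A minimal scalar sketch of the rules above (illustration only, assuming the usual
// two's complement INT32_MIN/INT32_MAX limits from <cstdint>):
//
//   int32_t f2i(float f) {
//     if (f != f)                return 0;          // NaN
//     if (f <= (float)INT32_MIN) return INT32_MIN;  // -Inf or <= Integer.MIN_VALUE
//     if (f >= (float)INT32_MAX) return INT32_MAX;  // +Inf or >= Integer.MAX_VALUE
//     return (int32_t)f;
//   }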
4362 void C2_MacroAssembler::vector_cast_float_special_cases_avx(XMMRegister dst, XMMRegister src, AddressLiteral float_sign_flip, int vec_enc,
4363                                                             XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4364                                                             Register rscratch) {
4365   Label done;
4366   vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4367   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4368   vptest(xtmp2, xtmp2, vec_enc);
4369   jccb(Assembler::equal, done);
4370 
4371   vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4372   vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4373 
4374   vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4375   vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4376   vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4377 
4378   // Recompute the mask for remaining special value.
4379   vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4380   // Extract SRC values corresponding to TRUE mask lanes.
4381   vpand(xtmp4, xtmp2, src, vec_enc);
  // Flip the mask bits so that the MSB of the MASK lanes corresponding to +ve special
  // values is set.
4384   vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4385 
4386   vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4387   bind(done);
4388 }
4389 
4390 void C2_MacroAssembler::vector_cast_float_special_cases_evex(XMMRegister dst, XMMRegister src, AddressLiteral float_sign_flip, int vec_enc,
4391                                                              XMMRegister xtmp1, XMMRegister xtmp2,
4392                                                              KRegister ktmp1, KRegister ktmp2,
4393                                                              Register rscratch) {
4394   Label done;
4395   evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4396   Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4397   kortestwl(ktmp1, ktmp1);
4398   jccb(Assembler::equal, done);
4399 
4400   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4401   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4402   evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4403 
4404   kxorwl(ktmp1, ktmp1, ktmp2);
4405   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4406   vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4407   evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4408   bind(done);
4409 }
4410 
4411 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src,
4412                                                                      AddressLiteral double_sign_flip, int vec_enc,
4413                                                                      XMMRegister xtmp1, XMMRegister xtmp2,
4414                                                                      KRegister ktmp1, KRegister ktmp2,
4415                                                                      Register rscratch) {
4416   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4417 
4418   Label done;
4419   evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4420   Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
4421   kortestwl(ktmp1, ktmp1);
4422   jccb(Assembler::equal, done);
4423 
4424   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4425   evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4426   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4427 
4428   kxorwl(ktmp1, ktmp1, ktmp2);
4429   evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4430   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4431   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4432   bind(done);
4433 }
4434 
4435 /*
 * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4437  * If src is NaN, the result is 0.
4438  * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
4439  * the result is equal to the value of Long.MIN_VALUE.
4440  * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
4441  * the result is equal to the value of Long.MAX_VALUE.
4442  */
4443 void C2_MacroAssembler::vector_cast_double_special_cases_evex(XMMRegister dst, XMMRegister src,
4444                                                               AddressLiteral double_sign_flip, int vec_enc,
4445                                                               XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4446                                                               Register rscratch) {
4447   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4448 
4449   Label done;
4450   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4451   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
4452   kortestwl(ktmp1, ktmp1);
4453   jccb(Assembler::equal, done);
4454 
4455   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4456   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4457   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4458 
4459   kxorwl(ktmp1, ktmp1, ktmp2);
4460   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4461   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4462   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4463   bind(done);
4464 }
4465 
4466 /*
4467  * Algorithm for vector D2L and F2I conversions:-
4468  * a) Perform vector D2L/F2I cast.
 * b) Take the fast path if none of the result vector lanes contains the value 0x80000000;
 *    a 0x80000000 lane signifies that its source value could be one of the special floating
 *    point values (NaN, -Inf, Inf, Max, Min).
 * c) Set the destination lane to zero if the source lane is NaN.
 * d) Replace 0x80000000 with MaxInt if the source lane contains a +ve value.
4474  */
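
// A scalar sketch of the fixup (illustration only; cvt_raw() is a stand-in for the
// raw hardware conversion, which produces 0x80000000 for NaN and out-of-range inputs):
//
//   int32_t cast_f2i(float f) {
//     int32_t r = cvt_raw(f);          // step a: raw result, 0x80000000 on special inputs
//     if (r != INT32_MIN) return r;    // step b: fast path, no special value
//     if (f != f)   return 0;          // step c: NaN -> 0
//     if (f > 0.0f) return INT32_MAX;  // step d: +ve special -> MaxInt
//     return r;                        // -ve special already holds MinInt
//   }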
4475 
4476 void C2_MacroAssembler::vector_castD2L_evex(XMMRegister dst, XMMRegister src, AddressLiteral double_sign_flip, int vec_enc,
4477                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, Register rscratch) {
4478   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4479 
4480   evcvttpd2qq(dst, src, vec_enc);
4481   vector_cast_double_special_cases_evex(dst, src, double_sign_flip, vec_enc,
4482                                         xtmp1, xtmp2, ktmp1, ktmp2, rscratch);
4483 }
4484 
4485 void C2_MacroAssembler::vector_castF2I_avx(XMMRegister dst, XMMRegister src, AddressLiteral float_sign_flip, int vec_enc,
4486                                            XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, Register rscratch) {
4487   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4488 
4489   vcvttps2dq(dst, src, vec_enc);
4490   vector_cast_float_special_cases_avx(dst, src, float_sign_flip, vec_enc,
4491                                       xtmp1, xtmp2, xtmp3, xtmp4, rscratch);
4492 }
4493 
4494 void C2_MacroAssembler::vector_castF2I_evex(XMMRegister dst, XMMRegister src, AddressLiteral float_sign_flip, int vec_enc,
4495                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, Register rscratch) {
4496   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4497 
4498   vcvttps2dq(dst, src, vec_enc);
4499   vector_cast_float_special_cases_evex(dst, src, float_sign_flip, vec_enc,
4500                                        xtmp1, xtmp2, ktmp1, ktmp2, rscratch);
4501 }
4502 
4503 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, AddressLiteral float_sign_flip, int vec_enc,
4504                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, Register rscratch) {
4505   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4506 
4507   evcvttps2qq(dst, src, vec_enc);
4508   vector_cast_float_to_long_special_cases_evex(dst, src, float_sign_flip, vec_enc,
4509                                                xtmp1, xtmp2, ktmp1, ktmp2, rscratch);
4510 }
4511 
4512 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, AddressLiteral double_sign_flip, int vec_enc,
4513                                             XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, Register rscratch) {
4514   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4515 
4516   vector_castD2L_evex(dst, src, double_sign_flip, vec_enc,
4517                       xtmp1, xtmp2, ktmp1, ktmp2, rscratch);
4518   if (to_elem_bt != T_LONG) {
4519     switch(to_elem_bt) {
4520       case T_INT:
4521         evpmovsqd(dst, dst, vec_enc);
4522         break;
4523       case T_SHORT:
4524         evpmovsqd(dst, dst, vec_enc);
4525         evpmovdw(dst, dst, vec_enc);
4526         break;
4527       case T_BYTE:
4528         evpmovsqd(dst, dst, vec_enc);
4529         evpmovdb(dst, dst, vec_enc);
4530         break;
4531       default: assert(false, "%s", type2name(to_elem_bt));
4532     }
4533   }
4534 }
4535 
4536 #ifdef _LP64
4537 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
4538                                                  AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4539                                                  Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf,
  // and restore the original MXCSR.RC mode afterwards.
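  // e.g. val = 2.5 -> floor(3.0) = 3, val = -2.5 -> floor(-2.0) = -2 (ties round towards +inf).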
4542   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4543 
4544   mov64(tmp, julong_cast(0.5L));
4545   evpbroadcastq(xtmp1, tmp, vec_enc);
4546   vaddpd(xtmp1, src , xtmp1, vec_enc);
4547   evcvtpd2qq(dst, xtmp1, vec_enc);
4548   vector_cast_double_special_cases_evex(dst, src, double_sign_flip, vec_enc,
4549                                         xtmp1, xtmp2, ktmp1, ktmp2, tmp);
4550 
4551   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4552 }
4553 
4554 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
4555                                                 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4556                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf,
  // and restore the original MXCSR.RC mode afterwards.
4559   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4560 
4561   movl(tmp, jint_cast(0.5));
4562   movq(xtmp1, tmp);
4563   vbroadcastss(xtmp1, xtmp1, vec_enc);
4564   vaddps(xtmp1, src , xtmp1, vec_enc);
4565   vcvtps2dq(dst, xtmp1, vec_enc);
4566   vector_cast_float_special_cases_evex(dst, src, float_sign_flip, vec_enc,
4567                                        xtmp1, xtmp2, ktmp1, ktmp2, tmp);
4568 
4569   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4570 }
4571 
4572 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
4573                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
4574                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
  // Perform the floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf,
  // and restore the original MXCSR.RC mode afterwards.
4577   ldmxcsr(new_mxcsr, tmp /*rscratch*/);
4578 
4579   movl(tmp, jint_cast(0.5));
4580   movq(xtmp1, tmp);
4581   vbroadcastss(xtmp1, xtmp1, vec_enc);
4582   vaddps(xtmp1, src , xtmp1, vec_enc);
4583   vcvtps2dq(dst, xtmp1, vec_enc);
4584   vector_cast_float_special_cases_avx(dst, src, float_sign_flip, vec_enc,
4585                                       xtmp1, xtmp2, xtmp3, xtmp4, tmp);
4586 
4587   ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
4588 }
4589 #endif // _LP64
4590 
4591 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
4592                                              BasicType from_elem_bt, BasicType to_elem_bt) {
4593   switch (from_elem_bt) {
4594     case T_BYTE:
4595       switch (to_elem_bt) {
4596         case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
4597         case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
4598         case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
4599         default: ShouldNotReachHere();
4600       }
4601       break;
4602     case T_SHORT:
4603       switch (to_elem_bt) {
4604         case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
4605         case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
4606         default: ShouldNotReachHere();
4607       }
4608       break;
4609     case T_INT:
4610       assert(to_elem_bt == T_LONG, "");
4611       vpmovzxdq(dst, src, vlen_enc);
4612       break;
4613     default:
4614       ShouldNotReachHere();
4615   }
4616 }
4617 
4618 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
4619                                    bool merge, BasicType bt, int vlen_enc) {
4620   if (bt == T_INT) {
4621     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
4622   } else {
4623     assert(bt == T_LONG, "");
4624     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
4625   }
4626 }
4627 
4628 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
4629                                    bool merge, BasicType bt, int vlen_enc) {
4630   if (bt == T_INT) {
4631     evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
4632   } else {
4633     assert(bt == T_LONG, "");
4634     evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
4635   }
4636 }
4637 
4638 #ifdef _LP64
4639 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
4640                                                Register rtmp2, XMMRegister xtmp, int mask_len,
4641                                                int vec_enc) {
4642   int index = 0;
4643   int vindex = 0;
4644   mov64(rtmp1, 0x0101010101010101L);
4645   pdepq(rtmp1, src, rtmp1);
4646   if (mask_len > 8) {
4647     movq(rtmp2, src);
4648     vpxor(xtmp, xtmp, xtmp, vec_enc);
4649     movq(xtmp, rtmp1);
4650   }
4651   movq(dst, rtmp1);
4652 
4653   mask_len -= 8;
4654   while (mask_len > 0) {
4655     assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
4656     index++;
4657     if ((index % 2) == 0) {
4658       pxor(xtmp, xtmp);
4659     }
4660     mov64(rtmp1, 0x0101010101010101L);
4661     shrq(rtmp2, 8);
4662     pdepq(rtmp1, rtmp2, rtmp1);
4663     pinsrq(xtmp, rtmp1, index % 2);
4664     vindex = index / 2;
4665     if (vindex) {
      // Write the entire 16 byte vector only when both 64 bit
      // lanes are updated, to save redundant instructions.
4668       if (index % 2) {
4669         vinsertf128(dst, dst, xtmp, vindex);
4670       }
4671     } else {
4672       vmovdqu(dst, xtmp);
4673     }
4674     mask_len -= 8;
4675   }
4676 }
4677 
4678 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
4679   switch(opc) {
4680     case Op_VectorMaskTrueCount:
4681       popcntq(dst, tmp);
4682       break;
4683     case Op_VectorMaskLastTrue:
4684       if (VM_Version::supports_lzcnt()) {
4685         lzcntq(tmp, tmp);
4686         movl(dst, 63);
4687         subl(dst, tmp);
4688       } else {
4689         movl(dst, -1);
4690         bsrq(tmp, tmp);
4691         cmov32(Assembler::notZero, dst, tmp);
4692       }
4693       break;
4694     case Op_VectorMaskFirstTrue:
4695       if (VM_Version::supports_bmi1()) {
4696         if (masklen < 32) {
4697           orl(tmp, 1 << masklen);
4698           tzcntl(dst, tmp);
4699         } else if (masklen == 32) {
4700           tzcntl(dst, tmp);
4701         } else {
4702           assert(masklen == 64, "");
4703           tzcntq(dst, tmp);
4704         }
4705       } else {
4706         if (masklen < 32) {
4707           orl(tmp, 1 << masklen);
4708           bsfl(dst, tmp);
4709         } else {
4710           assert(masklen == 32 || masklen == 64, "");
4711           movl(dst, masklen);
4712           if (masklen == 32)  {
4713             bsfl(tmp, tmp);
4714           } else {
4715             bsfq(tmp, tmp);
4716           }
4717           cmov32(Assembler::notZero, dst, tmp);
4718         }
4719       }
4720       break;
4721     case Op_VectorMaskToLong:
4722       assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
4723       break;
4724     default: assert(false, "Unhandled mask operation");
4725   }
4726 }
4727 
4728 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
4729                                               int masklen, int masksize, int vec_enc) {
4730   assert(VM_Version::supports_popcnt(), "");
4731 
4732   if(VM_Version::supports_avx512bw()) {
4733     kmovql(tmp, mask);
4734   } else {
4735     assert(masklen <= 16, "");
4736     kmovwl(tmp, mask);
4737   }
4738 
  // A mask generated by partial vector comparison/replicate/mask manipulation
  // operations needs to be clipped.
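  // e.g. for masklen == 8 only the low 8 bits of tmp are meaningful, so they are kept with tmp &= 0xFF.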
4741   if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
4742     andq(tmp, (1 << masklen) - 1);
4743   }
4744 
4745   vector_mask_operation_helper(opc, dst, tmp, masklen);
4746 }
4747 
4748 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
4749                                               Register tmp, int masklen, BasicType bt, int vec_enc) {
4750   assert(vec_enc == AVX_128bit && VM_Version::supports_avx() ||
4751          vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4), "");
4752   assert(VM_Version::supports_popcnt(), "");
4753 
4754   bool need_clip = false;
4755   switch(bt) {
4756     case T_BOOLEAN:
      // While masks of other types contain lane values of 0 or -1, boolean masks contain lane values of 0 or 1
4758       vpxor(xtmp, xtmp, xtmp, vec_enc);
4759       vpsubb(xtmp, xtmp, mask, vec_enc);
4760       vpmovmskb(tmp, xtmp, vec_enc);
4761       need_clip = masklen < 16;
4762       break;
4763     case T_BYTE:
4764       vpmovmskb(tmp, mask, vec_enc);
4765       need_clip = masklen < 16;
4766       break;
4767     case T_SHORT:
4768       vpacksswb(xtmp, mask, mask, vec_enc);
4769       if (masklen >= 16) {
4770         vpermpd(xtmp, xtmp, 8, vec_enc);
4771       }
4772       vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
4773       need_clip = masklen < 16;
4774       break;
4775     case T_INT:
4776     case T_FLOAT:
4777       vmovmskps(tmp, mask, vec_enc);
4778       need_clip = masklen < 4;
4779       break;
4780     case T_LONG:
4781     case T_DOUBLE:
4782       vmovmskpd(tmp, mask, vec_enc);
4783       need_clip = masklen < 2;
4784       break;
4785     default: assert(false, "Unhandled type, %s", type2name(bt));
4786   }
4787 
  // A mask generated by partial vector comparison/replicate/mask manipulation
  // operations needs to be clipped.
4790   if (need_clip && opc != Op_VectorMaskFirstTrue) {
4791     // need_clip implies masklen < 32
4792     andq(tmp, (1 << masklen) - 1);
4793   }
4794 
4795   vector_mask_operation_helper(opc, dst, tmp, masklen);
4796 }
4797 
4798 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
4799                                              Register rtmp2, int mask_len) {
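  // pextq below extracts the bits of an all-ones value at the positions selected by the
  // (length-clipped) source mask and packs them into the low bits, so the resulting mask
  // has the same number of set bits as src, moved to the least significant positions.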
4800   kmov(rtmp1, src);
4801   andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
4802   mov64(rtmp2, -1L);
4803   pextq(rtmp2, rtmp2, rtmp1);
4804   kmov(dst, rtmp2);
4805 }
4806 
4807 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
4808                                                bool merge, BasicType bt, int vec_enc) {
4809   if (opcode == Op_CompressV) {
4810     switch(bt) {
4811     case T_BYTE:
4812       evpcompressb(dst, mask, src, merge, vec_enc);
4813       break;
4814     case T_CHAR:
4815     case T_SHORT:
4816       evpcompressw(dst, mask, src, merge, vec_enc);
4817       break;
4818     case T_INT:
4819       evpcompressd(dst, mask, src, merge, vec_enc);
4820       break;
4821     case T_FLOAT:
4822       evcompressps(dst, mask, src, merge, vec_enc);
4823       break;
4824     case T_LONG:
4825       evpcompressq(dst, mask, src, merge, vec_enc);
4826       break;
4827     case T_DOUBLE:
4828       evcompresspd(dst, mask, src, merge, vec_enc);
4829       break;
4830     default:
4831       fatal("Unsupported type %s", type2name(bt));
4832       break;
4833     }
4834   } else {
4835     assert(opcode == Op_ExpandV, "");
4836     switch(bt) {
4837     case T_BYTE:
4838       evpexpandb(dst, mask, src, merge, vec_enc);
4839       break;
4840     case T_CHAR:
4841     case T_SHORT:
4842       evpexpandw(dst, mask, src, merge, vec_enc);
4843       break;
4844     case T_INT:
4845       evpexpandd(dst, mask, src, merge, vec_enc);
4846       break;
4847     case T_FLOAT:
4848       evexpandps(dst, mask, src, merge, vec_enc);
4849       break;
4850     case T_LONG:
4851       evpexpandq(dst, mask, src, merge, vec_enc);
4852       break;
4853     case T_DOUBLE:
4854       evexpandpd(dst, mask, src, merge, vec_enc);
4855       break;
4856     default:
4857       fatal("Unsupported type %s", type2name(bt));
4858       break;
4859     }
4860   }
4861 }
4862 #endif
4863 
4864 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
4865                                            KRegister ktmp1, int vec_enc) {
4866   if (opcode == Op_SignumVD) {
4867     vsubpd(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
4869     evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
4870     evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
4871     // if src == NaN, -0.0 or 0.0 return src.
4872     evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
4873     evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
4874   } else {
4875     assert(opcode == Op_SignumVF, "");
4876     vsubps(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
4878     evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
4879     evblendmps(dst, ktmp1, one, dst, true, vec_enc);
4880     // if src == NaN, -0.0 or 0.0 return src.
4881     evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
4882     evblendmps(dst, ktmp1, dst, src, true, vec_enc);
4883   }
4884 }
4885 
4886 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
4887                                           XMMRegister xtmp1, int vec_enc) {
4888   if (opcode == Op_SignumVD) {
4889     vsubpd(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
4891     vblendvpd(dst, one, dst, src, vec_enc);
4892     // if src == NaN, -0.0 or 0.0 return src.
4893     vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
4894     vblendvpd(dst, dst, src, xtmp1, vec_enc);
4895   } else {
4896     assert(opcode == Op_SignumVF, "");
4897     vsubps(dst, zero, one, vec_enc);
    // dst = (src < 0) ? -1 : 1
4899     vblendvps(dst, one, dst, src, vec_enc);
4900     // if src == NaN, -0.0 or 0.0 return src.
4901     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
4902     vblendvps(dst, dst, src, xtmp1, vec_enc);
4903   }
4904 }
4905 
4906 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
4907   if (VM_Version::supports_avx512bw()) {
4908     if (mask_len > 32) {
4909       kmovql(dst, src);
4910     } else {
4911       kmovdl(dst, src);
4912       if (mask_len != 32) {
4913         kshiftrdl(dst, dst, 32 - mask_len);
4914       }
4915     }
4916   } else {
4917     assert(mask_len <= 16, "");
4918     kmovwl(dst, src);
4919     if (mask_len != 16) {
4920       kshiftrwl(dst, dst, 16 - mask_len);
4921     }
4922   }
4923 }
4924 
4925 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
4926   int lane_size = type2aelembytes(bt);
4927   bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
4928   if ((is_LP64 || lane_size < 8) &&
4929       ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
4930        (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
4931     movptr(rtmp, imm32);
4932     switch(lane_size) {
4933       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
4934       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
4935       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
4936       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size); break;
4939     }
4940   } else {
4941     movptr(rtmp, imm32);
4942     LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
4943     switch(lane_size) {
4944       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
4945       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
4946       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
4947       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size); break;
4950     }
4951   }
4952 }
4953 
4954 //
4955 // Following is lookup table based popcount computation algorithm:-
4956 //       Index   Bit set count
4957 //     [ 0000 ->   0,
4958 //       0001 ->   1,
4959 //       0010 ->   1,
4960 //       0011 ->   2,
4961 //       0100 ->   1,
4962 //       0101 ->   2,
4963 //       0110 ->   2,
4964 //       0111 ->   3,
4965 //       1000 ->   1,
4966 //       1001 ->   2,
//       1010 ->   2,
4968 //       1011 ->   3,
4969 //       1100 ->   2,
4970 //       1101 ->   3,
//       1110 ->   3,
//       1111 ->   4 ]
4972 //  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
4973 //     shuffle indices for lookup table access.
4974 //  b. Right shift each byte of vector lane by 4 positions.
//  c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
4976 //     shuffle indices for lookup table access.
4977 //  d. Add the bitset count of upper and lower 4 bits of each byte.
4978 //  e. Unpack double words to quad words and compute sum of absolute difference of bitset
4979 //     count of all the bytes of a quadword.
4980 //  f. Perform step e. for upper 128bit vector lane.
4981 //  g. Pack the bitset count of quadwords back to double word.
4982 //  h. Unpacking and packing operations are not needed for 64bit vector lane.
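//
//  A scalar sketch of steps a. to d. for a single byte (illustration only, assuming
//  <cstdint>; the vector code below performs the same two table lookups with vpshufb):
//
//    static const uint8_t POPC_LUT[16] = {0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4};
//    uint8_t popcount_byte(uint8_t b) {
//      return POPC_LUT[b & 0x0F] + POPC_LUT[b >> 4];
//    }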
4983 
4984 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4985                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
4986   assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
4987   vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
4988   vpsrlw(dst, src, 4, vec_enc);
4989   vpand(dst, dst, xtmp1, vec_enc);
4990   vpand(xtmp1, src, xtmp1, vec_enc);
4991   vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
4992   vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
4993   vpshufb(dst, xtmp2, dst, vec_enc);
4994   vpaddb(dst, dst, xtmp1, vec_enc);
4995 }
4996 
4997 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4998                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
4999   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // The following code implements steps e, f, g and h of the above algorithm.
5001   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5002   vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5003   vpsadbw(dst, dst, xtmp2, vec_enc);
5004   vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5005   vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5006   vpackuswb(dst, xtmp1, dst, vec_enc);
5007 }
5008 
5009 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5010                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5011   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5012   // Add the popcount of upper and lower bytes of word.
5013   vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5014   vpsrlw(dst, xtmp1, 8, vec_enc);
5015   vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5016   vpaddw(dst, dst, xtmp1, vec_enc);
5017 }
5018 
5019 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5020                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
5021   vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5022   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5023   vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5024 }
5025 
5026 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5027                                                  XMMRegister xtmp2, Register rtmp, int vec_enc) {
5028   switch(bt) {
5029     case T_LONG:
5030       vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5031       break;
5032     case T_INT:
5033       vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5034       break;
5035     case T_CHAR:
5036     case T_SHORT:
5037       vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5038       break;
5039     case T_BYTE:
5040     case T_BOOLEAN:
5041       vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5042       break;
5043     default:
5044       fatal("Unsupported type %s", type2name(bt));
5045       break;
5046   }
5047 }
5048 
5049 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5050                                                       KRegister mask, bool merge, int vec_enc) {
5051   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5052   switch(bt) {
5053     case T_LONG:
5054       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5055       evpopcntq(dst, mask, src, merge, vec_enc);
5056       break;
5057     case T_INT:
5058       assert(VM_Version::supports_avx512_vpopcntdq(), "");
5059       evpopcntd(dst, mask, src, merge, vec_enc);
5060       break;
5061     case T_CHAR:
5062     case T_SHORT:
5063       assert(VM_Version::supports_avx512_bitalg(), "");
5064       evpopcntw(dst, mask, src, merge, vec_enc);
5065       break;
5066     case T_BYTE:
5067     case T_BOOLEAN:
5068       assert(VM_Version::supports_avx512_bitalg(), "");
5069       evpopcntb(dst, mask, src, merge, vec_enc);
5070       break;
5071     default:
5072       fatal("Unsupported type %s", type2name(bt));
5073       break;
5074   }
5075 }
5076 
5077 #ifndef _LP64
5078 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
5079   assert(VM_Version::supports_avx512bw(), "");
5080   kmovdl(tmp, src);
5081   kunpckdql(dst, tmp, tmp);
5082 }
5083 #endif
5084 
// The bit reversal algorithm first reverses the bits of each byte, followed by
// a byte level reversal for multi-byte primitive types (short/int/long).
// The algorithm performs a lookup table access to get the reverse bit sequence
// corresponding to a 4 bit value. Thus the reverse bit sequence for a byte
// is obtained by swapping the reverse bit sequences of the upper and lower
// nibbles of the byte.
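//
// A scalar sketch of the per-byte step (illustration only; the vector code performs
// the same lookups with vpshufb against a 16-entry reverse-nibble table):
//
//   static const uint8_t REV_NIBBLE[16] =
//       {0x0,0x8,0x4,0xC, 0x2,0xA,0x6,0xE, 0x1,0x9,0x5,0xD, 0x3,0xB,0x7,0xF};
//   uint8_t reverse_bits_byte(uint8_t b) {
//     return (uint8_t)((REV_NIBBLE[b & 0x0F] << 4) | REV_NIBBLE[b >> 4]);
//   }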
5091 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5092                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
5093   if (VM_Version::supports_avx512vlbw()) {
5094 
5095     // Get the reverse bit sequence of lower nibble of each byte.
5096     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5097     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5098     vpandq(dst, xtmp2, src, vec_enc);
5099     vpshufb(dst, xtmp1, dst, vec_enc);
5100     vpsllq(dst, dst, 4, vec_enc);
5101 
5102     // Get the reverse bit sequence of upper nibble of each byte.
5103     vpandn(xtmp2, xtmp2, src, vec_enc);
5104     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5105     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5106 
5107     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5108     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5109     vporq(xtmp2, dst, xtmp2, vec_enc);
5110     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5111 
5112   } else if(vec_enc == Assembler::AVX_512bit) {
5113     // Shift based bit reversal.
5114     assert(bt == T_LONG || bt == T_INT, "");
5115 
5116     // Swap lower and upper nibble of each byte.
5117     vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
5118 
5119     // Swap two least and most significant bits of each nibble.
5120     vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
5121 
5122     // Swap adjacent pair of bits.
5123     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5124     vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
5125 
5126     evmovdqul(xtmp1, k0, dst, true, vec_enc);
5127     vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
5128   } else {
5129     vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
5130     vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5131 
5132     // Get the reverse bit sequence of lower nibble of each byte.
5133     vpand(dst, xtmp2, src, vec_enc);
5134     vpshufb(dst, xtmp1, dst, vec_enc);
5135     vpsllq(dst, dst, 4, vec_enc);
5136 
5137     // Get the reverse bit sequence of upper nibble of each byte.
5138     vpandn(xtmp2, xtmp2, src, vec_enc);
5139     vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5140     vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5141 
5142     // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5143     // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5144     vpor(xtmp2, dst, xtmp2, vec_enc);
5145     vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5146   }
5147 }
5148 
5149 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
5150                                                 XMMRegister xtmp, Register rscratch) {
5151   assert(VM_Version::supports_gfni(), "");
5152   assert(rscratch != noreg || always_reachable(mask), "missing");
5153 
5154   // Galois field instruction based bit reversal based on following algorithm.
5155   // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5156   vpbroadcastq(xtmp, mask, vec_enc, rscratch);
5157   vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
5158   vector_reverse_byte(bt, dst, xtmp, vec_enc);
5159 }
5160 
5161 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
5162                                           XMMRegister xtmp1, Register rtmp, int vec_enc) {
5163   vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
5164   vpandq(dst, xtmp1, src, vec_enc);
5165   vpsllq(dst, dst, nbits, vec_enc);
5166   vpandn(xtmp1, xtmp1, src, vec_enc);
5167   vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
5168   vporq(dst, dst, xtmp1, vec_enc);
5169 }
5170 
5171 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5172                                               XMMRegister xtmp2, Register rtmp, int vec_enc) {
5173   // Shift based bit reversal.
5174   assert(VM_Version::supports_evex(), "");
5175   switch(bt) {
5176     case T_LONG:
5177       // Swap upper and lower double word of each quad word.
5178       evprorq(xtmp1, k0, src, 32, true, vec_enc);
5179       evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
5180       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5181       break;
5182     case T_INT:
5183       // Swap upper and lower word of each double word.
5184       evprord(xtmp1, k0, src, 16, true, vec_enc);
5185       vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5186       break;
5187     case T_CHAR:
5188     case T_SHORT:
5189       // Swap upper and lower byte of each word.
5190       vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
5191       break;
5192     case T_BYTE:
5193       evmovdquq(dst, k0, src, true, vec_enc);
5194       break;
5195     default:
5196       fatal("Unsupported type %s", type2name(bt));
5197       break;
5198   }
5199 }
5200 
5201 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5202   if (bt == T_BYTE) {
5203     if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
5204       evmovdquq(dst, k0, src, true, vec_enc);
5205     } else {
5206       vmovdqu(dst, src);
5207     }
5208     return;
5209   }
5210   // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
5211   // pre-computed shuffle indices.
5212   switch(bt) {
5213     case T_LONG:
5214       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
5215       break;
5216     case T_INT:
5217       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
5218       break;
5219     case T_CHAR:
5220     case T_SHORT:
5221       vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
5222       break;
5223     default:
5224       fatal("Unsupported type %s", type2name(bt));
5225       break;
5226   }
5227   vpshufb(dst, src, dst, vec_enc);
5228 }
5229 
5230 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5231                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5232                                                         KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
5233   assert(is_integral_type(bt), "");
5234   assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5235   assert(VM_Version::supports_avx512cd(), "");
5236   switch(bt) {
5237     case T_LONG:
5238       evplzcntq(dst, ktmp, src, merge, vec_enc);
5239       break;
5240     case T_INT:
5241       evplzcntd(dst, ktmp, src, merge, vec_enc);
5242       break;
5243     case T_SHORT:
5244       vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
5245       vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
5246       evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
5247       vpunpckhwd(dst, xtmp1, src, vec_enc);
5248       evplzcntd(dst, ktmp, dst, merge, vec_enc);
5249       vpackusdw(dst, xtmp2, dst, vec_enc);
5250       break;
5251     case T_BYTE:
5252       // T1 = Compute leading zero counts of 4 LSB bits of each byte by
5253       // accessing the lookup table.
5254       // T2 = Compute leading zero counts of 4 MSB bits of each byte by
5255       // accessing the lookup table.
5256       // Add T1 to T2 if 4 MSB bits of byte are all zeros.
5257       assert(VM_Version::supports_avx512bw(), "");
5258       evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
5259       vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
5260       vpand(xtmp2, dst, src, vec_enc);
5261       vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5262       vpsrlw(xtmp3, src, 4, vec_enc);
5263       vpand(xtmp3, dst, xtmp3, vec_enc);
5264       vpshufb(dst, xtmp1, xtmp3, vec_enc);
5265       vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
5266       evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
5267       evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
5268       break;
5269     default:
5270       fatal("Unsupported type %s", type2name(bt));
5271       break;
5272   }
5273 }
5274 
5275 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5276                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5277   vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
5278   vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5279   // T1 = Compute leading zero counts of 4 LSB bits of each byte by
5280   // accessing the lookup table.
5281   vpand(dst, xtmp2, src, vec_enc);
5282   vpshufb(dst, xtmp1, dst, vec_enc);
5283   // T2 = Compute leading zero counts of 4 MSB bits of each byte by
5284   // accessing the lookup table.
5285   vpsrlw(xtmp3, src, 4, vec_enc);
5286   vpand(xtmp3, xtmp2, xtmp3, vec_enc);
5287   vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
5288   // Add T1 to T2 if 4 MSB bits of byte are all zeros.
5289   vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
5290   vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
5291   vpaddb(dst, dst, xtmp2, vec_enc);
5292   vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
5293 }
5294 
5295 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5296                                                              XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5297   vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5298   // Add zero counts of lower byte and upper byte of a word if
5299   // upper byte holds a zero value.
5300   vpsrlw(xtmp3, src, 8, vec_enc);
5301   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
5302   vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
5303   vpsllw(xtmp2, dst, 8, vec_enc);
5304   vpaddw(xtmp2, xtmp2, dst, vec_enc);
5305   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5306   vpsrlw(dst, dst, 8, vec_enc);
5307 }
5308 
5309 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5310                                                            XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // Since the IEEE 754 floating point format represents the mantissa in normalized 1.x form,
  // the biased exponent can be used to compute the leading zero count as per the
  // following formula:-
  //   LZCNT = 32 - ((biased_exp - 127) + 1)
  // Special handling has been introduced for Zero, Max_Int and -ve source values.
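  // e.g. for x = 4, (float)x has biased_exp = 129, so LZCNT = 32 - ((129 - 127) + 1) = 29,
  // which matches lzcnt(4) for a 32 bit int.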
5316 
5317   // Broadcast 0xFF
5318   vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
5319   vpsrld(xtmp1, xtmp1, 24, vec_enc);
5320 
5321   // Extract biased exponent.
5322   vcvtdq2ps(dst, src, vec_enc);
5323   vpsrld(dst, dst, 23, vec_enc);
5324   vpand(dst, dst, xtmp1, vec_enc);
5325 
5326   // Broadcast 127.
5327   vpsrld(xtmp1, xtmp1, 1, vec_enc);
5328   // Exponent = biased_exp - 127
5329   vpsubd(dst, dst, xtmp1, vec_enc);
5330 
5331   // Exponent = Exponent  + 1
5332   vpsrld(xtmp3, xtmp1, 6, vec_enc);
5333   vpaddd(dst, dst, xtmp3, vec_enc);
5334 
  // Replace a -ve exponent with zero; the exponent is -ve when the src
  // lane contains a zero value.
5337   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5338   vblendvps(dst, dst, xtmp2, dst, vec_enc);
5339 
5340   // Rematerialize broadcast 32.
5341   vpslld(xtmp1, xtmp3, 5, vec_enc);
5342   // Exponent is 32 if corresponding source lane contains max_int value.
5343   vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
5344   // LZCNT = 32 - exponent
5345   vpsubd(dst, xtmp1, dst, vec_enc);
5346 
5347   // Replace LZCNT with a value 1 if corresponding source lane
5348   // contains max_int value.
5349   vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
5350 
  // Replace the computed count with 0 if the source lane value is less than zero,
  // since lzcnt of a negative value is 0.
5352   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5353   vblendvps(dst, dst, xtmp2, src, vec_enc);
5354 }
5355 
5356 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5357                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5358   vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5359   // Add zero counts of lower word and upper word of a double word if
5360   // upper word holds a zero value.
5361   vpsrld(xtmp3, src, 16, vec_enc);
5362   // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
5363   vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
5364   vpslld(xtmp2, dst, 16, vec_enc);
5365   vpaddd(xtmp2, xtmp2, dst, vec_enc);
5366   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5367   vpsrld(dst, dst, 16, vec_enc);
5368   // Add zero counts of lower doubleword and upper doubleword of a
5369   // quadword if upper doubleword holds a zero value.
5370   vpsrlq(xtmp3, src, 32, vec_enc);
5371   vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
5372   vpsllq(xtmp2, dst, 32, vec_enc);
5373   vpaddq(xtmp2, xtmp2, dst, vec_enc);
5374   vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
5375   vpsrlq(dst, dst, 32, vec_enc);
5376 }
5377 
5378 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
5379                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5380                                                        Register rtmp, int vec_enc) {
5381   assert(is_integral_type(bt), "unexpected type");
5382   assert(vec_enc < Assembler::AVX_512bit, "");
5383   switch(bt) {
5384     case T_LONG:
5385       vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5386       break;
5387     case T_INT:
5388       vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
5389       break;
5390     case T_SHORT:
5391       vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5392       break;
5393     case T_BYTE:
5394       vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
5395       break;
5396     default:
5397       fatal("Unsupported type %s", type2name(bt));
5398       break;
5399   }
5400 }
5401 
5402 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
5403   switch(bt) {
5404     case T_BYTE:
5405       vpsubb(dst, src1, src2, vec_enc);
5406       break;
5407     case T_SHORT:
5408       vpsubw(dst, src1, src2, vec_enc);
5409       break;
5410     case T_INT:
5411       vpsubd(dst, src1, src2, vec_enc);
5412       break;
5413     case T_LONG:
5414       vpsubq(dst, src1, src2, vec_enc);
5415       break;
5416     default:
5417       fatal("Unsupported type %s", type2name(bt));
5418       break;
5419   }
5420 }
5421 
5422 // Trailing zero count computation is based on the leading zero count operation,
5423 // as per the following identity. All AVX3 targets support the AVX512CD feature,
5424 // which offers a direct vector instruction to compute the leading zero count.
5425 //      CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
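     // Illustrative scalar example (not generated code), for a 32 bit lane x = 8 (0b1000):
     //   (x - 1) & ~x = 0b0111, CLZ = 29, CTZ = 32 - 29 = 3.
     // For x == 0 the masked value is all ones, so CLZ = 0 and CTZ = 32, as required.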
5426 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5427                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
5428                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
5429   assert(is_integral_type(bt), "");
5430   // xtmp = -1
5431   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
5432   // xtmp = xtmp + src = src - 1
5433   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
5434   // xtmp = xtmp & ~src
5435   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
5436   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
5437   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
5438   vpsub(bt, dst, xtmp4, dst, vec_enc);
5439 }
5440 
5441 // Trailing zero count computation for AVX2 targets is based on the popcount operation, as per the following identity:
5442 //      CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
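     // Illustrative scalar example (not generated code), for a 32 bit lane x = 12 (0b1100):
     //   x | -x = 0xFFFFFFFC, POPC = 30, CTZ = 32 - 30 = 2.
     // For x == 0, x | -x == 0, so POPC = 0 and CTZ = 32, as required.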
5443 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5444                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
5445   assert(is_integral_type(bt), "");
5446   // xtmp = 0
5447   vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
5448   // xtmp = 0 - src
5449   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
5450   // xtmp = xtmp | src
5451   vpor(xtmp3, xtmp3, src, vec_enc);
5452   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
5453   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
5454   vpsub(bt, dst, xtmp1, dst, vec_enc);
5455 }
5456 
5457 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
5458   Label done;
5459   Label neg_divisor_fastpath;
5460   cmpl(divisor, 0);
5461   jccb(Assembler::less, neg_divisor_fastpath);
5462   xorl(rdx, rdx);
5463   divl(divisor);
5464   jmpb(done);
5465   bind(neg_divisor_fastpath);
5466   // Fastpath for divisor < 0:
5467   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
5468   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
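       // Because a negative divisor, viewed as unsigned, is >= 2^31, the unsigned
       // quotient can only be 0 or 1; it is 1 exactly when the dividend compares
       // unsigned greater than or equal to the divisor, which is what the
       // expression above computes.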
5469   movl(rdx, rax);
5470   subl(rdx, divisor);
5471   if (VM_Version::supports_bmi1()) {
5472     andnl(rax, rdx, rax);
5473   } else {
5474     notl(rdx);
5475     andl(rax, rdx);
5476   }
5477   shrl(rax, 31);
5478   bind(done);
5479 }
5480 
5481 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
5482   Label done;
5483   Label neg_divisor_fastpath;
5484   cmpl(divisor, 0);
5485   jccb(Assembler::less, neg_divisor_fastpath);
5486   xorl(rdx, rdx);
5487   divl(divisor);
5488   jmpb(done);
5489   bind(neg_divisor_fastpath);
5490   // Fastpath when divisor < 0:
5491   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
5492   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
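       // As in udivI the quotient is 0 or 1; sarl below turns that quotient bit
       // into a 0 or all-ones mask so that "andl divisor" yields quotient * divisor,
       // which is then subtracted from the dividend to form the remainder.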
5493   movl(rdx, rax);
5494   subl(rax, divisor);
5495   if (VM_Version::supports_bmi1()) {
5496     andnl(rax, rax, rdx);
5497   } else {
5498     notl(rax);
5499     andl(rax, rdx);
5500   }
5501   sarl(rax, 31);
5502   andl(rax, divisor);
5503   subl(rdx, rax);
5504   bind(done);
5505 }
5506 
5507 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
5508   Label done;
5509   Label neg_divisor_fastpath;
5510 
5511   cmpl(divisor, 0);
5512   jccb(Assembler::less, neg_divisor_fastpath);
5513   xorl(rdx, rdx);
5514   divl(divisor);
5515   jmpb(done);
5516   bind(neg_divisor_fastpath);
5517   // Fastpath for divisor < 0:
5518   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
5519   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
5520   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
5521   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
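       // Combined form of the udivI/umodI fastpaths: the sign bit computed below
       // is shifted down for the quotient (rax) and sign-extended in tmp to mask
       // the divisor for the remainder (rdx).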
5522   movl(rdx, rax);
5523   subl(rax, divisor);
5524   if (VM_Version::supports_bmi1()) {
5525     andnl(rax, rax, rdx);
5526   } else {
5527     notl(rax);
5528     andl(rax, rdx);
5529   }
5530   movl(tmp, rax);
5531   shrl(rax, 31); // quotient
5532   sarl(tmp, 31);
5533   andl(tmp, divisor);
5534   subl(rdx, tmp); // remainder
5535   bind(done);
5536 }
5537 
5538 #ifdef _LP64
5539 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
5540                                  XMMRegister xtmp2, Register rtmp) {
5541   if (VM_Version::supports_gfni()) {
5542     // Bit reversal using the Galois field affine instruction, based on the following algorithm:
5543     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
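         // GF2P8AFFINEQB with the 0x8040201008040201 matrix reverses the bit order
         // within each byte; the bswapl at the end then reverses the byte order,
         // completing the full 32 bit reversal.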
5544     mov64(rtmp, 0x8040201008040201L);
5545     movq(xtmp1, src);
5546     movq(xtmp2, rtmp);
5547     gf2p8affineqb(xtmp1, xtmp2, 0);
5548     movq(dst, xtmp1);
5549   } else {
5550     // Swap even and odd numbered bits.
5551     movl(rtmp, src);
5552     andl(rtmp, 0x55555555);
5553     shll(rtmp, 1);
5554     movl(dst, src);
5555     andl(dst, 0xAAAAAAAA);
5556     shrl(dst, 1);
5557     orl(dst, rtmp);
5558 
5559     // Swap the lower and upper 2 bits of each nibble.
5560     movl(rtmp, dst);
5561     andl(rtmp, 0x33333333);
5562     shll(rtmp, 2);
5563     andl(dst, 0xCCCCCCCC);
5564     shrl(dst, 2);
5565     orl(dst, rtmp);
5566 
5567     // Swap the lower and upper 4 bits (nibbles) of each byte.
5568     movl(rtmp, dst);
5569     andl(rtmp, 0x0F0F0F0F);
5570     shll(rtmp, 4);
5571     andl(dst, 0xF0F0F0F0);
5572     shrl(dst, 4);
5573     orl(dst, rtmp);
5574   }
5575   bswapl(dst);
5576 }
5577 
5578 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
5579                                  XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
5580   if (VM_Version::supports_gfni()) {
5581     // Bit reversal using the Galois field affine instruction, based on the following algorithm:
5582     // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5583     mov64(rtmp1, 0x8040201008040201L);
5584     movq(xtmp1, src);
5585     movq(xtmp2, rtmp1);
5586     gf2p8affineqb(xtmp1, xtmp2, 0);
5587     movq(dst, xtmp1);
5588   } else {
5589     // Swap even and odd numbered bits.
5590     movq(rtmp1, src);
5591     mov64(rtmp2, 0x5555555555555555L);
5592     andq(rtmp1, rtmp2);
5593     shlq(rtmp1, 1);
5594     movq(dst, src);
5595     notq(rtmp2);
5596     andq(dst, rtmp2);
5597     shrq(dst, 1);
5598     orq(dst, rtmp1);
5599 
5600     // Swap the lower and upper 2 bits of each nibble.
5601     movq(rtmp1, dst);
5602     mov64(rtmp2, 0x3333333333333333L);
5603     andq(rtmp1, rtmp2);
5604     shlq(rtmp1, 2);
5605     notq(rtmp2);
5606     andq(dst, rtmp2);
5607     shrq(dst, 2);
5608     orq(dst, rtmp1);
5609 
5610     // Swap the lower and upper 4 bits (nibbles) of each byte.
5611     movq(rtmp1, dst);
5612     mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
5613     andq(rtmp1, rtmp2);
5614     shlq(rtmp1, 4);
5615     notq(rtmp2);
5616     andq(dst, rtmp2);
5617     shrq(dst, 4);
5618     orq(dst, rtmp1);
5619   }
5620   bswapq(dst);
5621 }
5622 
5623 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
5624   Label done;
5625   Label neg_divisor_fastpath;
5626   cmpq(divisor, 0);
5627   jccb(Assembler::less, neg_divisor_fastpath);
5628   xorl(rdx, rdx);
5629   divq(divisor);
5630   jmpb(done);
5631   bind(neg_divisor_fastpath);
5632   // Fastpath for divisor < 0:
5633   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
5634   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
5635   movq(rdx, rax);
5636   subq(rdx, divisor);
5637   if (VM_Version::supports_bmi1()) {
5638     andnq(rax, rdx, rax);
5639   } else {
5640     notq(rdx);
5641     andq(rax, rdx);
5642   }
5643   shrq(rax, 63);
5644   bind(done);
5645 }
5646 
5647 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
5648   Label done;
5649   Label neg_divisor_fastpath;
5650   cmpq(divisor, 0);
5651   jccb(Assembler::less, neg_divisor_fastpath);
5652   xorq(rdx, rdx);
5653   divq(divisor);
5654   jmp(done);
5655   bind(neg_divisor_fastpath);
5656   // Fastpath when divisor < 0:
5657   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
5658   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
5659   movq(rdx, rax);
5660   subq(rax, divisor);
5661   if (VM_Version::supports_bmi1()) {
5662     andnq(rax, rax, rdx);
5663   } else {
5664     notq(rax);
5665     andq(rax, rdx);
5666   }
5667   sarq(rax, 63);
5668   andq(rax, divisor);
5669   subq(rdx, rax);
5670   bind(done);
5671 }
5672 
5673 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
5674   Label done;
5675   Label neg_divisor_fastpath;
5676   cmpq(divisor, 0);
5677   jccb(Assembler::less, neg_divisor_fastpath);
5678   xorq(rdx, rdx);
5679   divq(divisor);
5680   jmp(done);
5681   bind(neg_divisor_fastpath);
5682   // Fastpath for divisor < 0:
5683   // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
5684   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
5685   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
5686   // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
5687   movq(rdx, rax);
5688   subq(rax, divisor);
5689   if (VM_Version::supports_bmi1()) {
5690     andnq(rax, rax, rdx);
5691   } else {
5692     notq(rax);
5693     andq(rax, rdx);
5694   }
5695   movq(tmp, rax);
5696   shrq(rax, 63); // quotient
5697   sarq(tmp, 63);
5698   andq(tmp, divisor);
5699   subq(rdx, tmp); // remainder
5700   bind(done);
5701 }
5702 #endif
5703 
5704 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
5705                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
5706                                         int vlen_enc) {
5707   assert(VM_Version::supports_avx512bw(), "");
5708   // Byte shuffles are in-lane operations and indices are determined using the
5709   // lower 4 bits of each shuffle byte, thus all shuffle indices are effectively
5710   // normalized to the index range 0-15. As a result, indices that differ by a
5711   // multiple of 16 map to the same relative position within a 128 bit lane,
5712   // i.e. shuffle indices 0, 16, 32 and 48 all select the first element of
5713   // their respective 128 bit lanes.
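       // For example, a shuffle index of 37 (0x25) selects byte 5 (37 & 0xF) of
       // the third 128 bit source lane, since 32 <= 37 < 48.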
5714   movl(rtmp, 16);
5715   evpbroadcastb(xtmp1, rtmp, vlen_enc);
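       // xtmp1 now holds 16 in every byte lane.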
5716 
5717   // Compute a mask for the shuffle vector by comparing the indices against
5718   // INDEX < 16, broadcast the first 128 bit source lane across the entire
5719   // vector, shuffle it using the original shuffle indices, and move the
5720   // shuffled bytes whose mask bit is set into the destination vector.
5721   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
5722   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
5723   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
5724 
5725   // Perform the above steps with the lane comparison expression INDEX >= 16 && INDEX < 32,
5726   // broadcasting the second 128 bit lane.
5727   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
5728   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
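       // xtmp2 = 32 in every byte (16 << 1, no carry across byte boundaries).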
5729   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
5730   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
5731   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
5732 
5733   // Perform the above steps with the lane comparison expression INDEX >= 32 && INDEX < 48,
5734   // broadcasting the third 128 bit lane.
5735   evpcmpb(ktmp, k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
5736   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
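       // xtmp1 = 48 in every byte (16 + 32).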
5737   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
5738   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
5739   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
5740 
5741   // Perform the above steps with the lane comparison expression INDEX >= 48 && INDEX < 64,
5742   // broadcasting the fourth 128 bit lane.
5743   evpcmpb(ktmp, k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
5744   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
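       // xtmp2 = 64 in every byte (32 << 1).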
5745   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
5746   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
5747   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
5748 }
5749