1 /*
   2  * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "asm/assembler.hpp"
  26 #include "asm/assembler.inline.hpp"
  27 #include "opto/c2_MacroAssembler.hpp"
  28 #include "opto/compile.hpp"
  29 #include "opto/intrinsicnode.hpp"
  30 #include "opto/matcher.hpp"
  31 #include "opto/output.hpp"
  32 #include "opto/subnode.hpp"
  33 #include "runtime/stubRoutines.hpp"
  34 #include "utilities/globalDefinitions.hpp"
  35 #include "utilities/powerOfTwo.hpp"
  36 
  37 #ifdef PRODUCT
  38 #define BLOCK_COMMENT(str) /* nothing */
  39 #define STOP(error) stop(error)
  40 #else
  41 #define BLOCK_COMMENT(str) block_comment(str)
  42 #define STOP(error) block_comment(error); stop(error)
  43 #endif
  44 
  45 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  46 
  47 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
  48 
  49 void C2_MacroAssembler::entry_barrier() {
  50   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  51   // Dummy labels for just measuring the code size
  52   Label dummy_slow_path;
  53   Label dummy_continuation;
  54   Label dummy_guard;
  55   Label* slow_path = &dummy_slow_path;
  56   Label* continuation = &dummy_continuation;
  57   Label* guard = &dummy_guard;
  58   if (!Compile::current()->output()->in_scratch_emit_size()) {
  59     // Use real labels from the actual stub when not emitting code just to measure its size
  60     C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
  61     Compile::current()->output()->add_stub(stub);
  62     slow_path = &stub->entry();
  63     continuation = &stub->continuation();
  64     guard = &stub->guard();
  65   }
  66   // In the C2 code, we move the non-hot part of nmethod entry barriers out-of-line to a stub.
  67   bs->nmethod_entry_barrier(this, slow_path, continuation, guard);
  68 }
  69 
  70 // jdk.internal.util.ArraysSupport.vectorizedHashCode
  71 address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
  72                                            FloatRegister vdata0, FloatRegister vdata1,
  73                                            FloatRegister vdata2, FloatRegister vdata3,
  74                                            FloatRegister vmul0, FloatRegister vmul1,
  75                                            FloatRegister vmul2, FloatRegister vmul3,
  76                                            FloatRegister vpow, FloatRegister vpowm,
  77                                            BasicType eltype) {
  78   ARRAYS_HASHCODE_REGISTERS;
  79 
  80   Register tmp1 = rscratch1, tmp2 = rscratch2;
  81 
  82   Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE;
  83 
  84   // Vectorization factor. Number of array elements loaded into one SIMD&FP register by the stubs. We
  85   // use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible to
  86   // use 4H for chars and shorts instead, but using 8H gives better performance.
  87   const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8
  88                     : eltype == T_CHAR || eltype == T_SHORT ? 8
  89                     : eltype == T_INT                       ? 4
  90                                                             : 0;
  91   guarantee(vf, "unsupported eltype");
  92 
  93   // Unroll factor for the scalar loop below. The value is chosen based on performance analysis.
  94   const size_t unroll_factor = 4;
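  // For reference, the value computed here has to match the scalar polynomial hash
  // that jdk.internal.util.ArraysSupport.vectorizedHashCode callers expect, roughly
  // (a sketch, eliding the per-type widening of the elements):
  //
  //   int result = <initial value passed in "result">;
  //   for (int i = 0; i < cnt; i++) {
  //     result = 31 * result + ary[i];
  //   }
  //
  // Both the vector stub and the scalar tail below preserve this evaluation order.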
  95 
  96   switch (eltype) {
  97   case T_BOOLEAN:
  98     BLOCK_COMMENT("arrays_hashcode(unsigned byte) {");
  99     break;
 100   case T_CHAR:
 101     BLOCK_COMMENT("arrays_hashcode(char) {");
 102     break;
 103   case T_BYTE:
 104     BLOCK_COMMENT("arrays_hashcode(byte) {");
 105     break;
 106   case T_SHORT:
 107     BLOCK_COMMENT("arrays_hashcode(short) {");
 108     break;
 109   case T_INT:
 110     BLOCK_COMMENT("arrays_hashcode(int) {");
 111     break;
 112   default:
 113     ShouldNotReachHere();
 114   }
 115 
 116   // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop
 117   // implemented by the stub executes just once. Call the stub only if at least two iterations will
 118   // be executed.
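  // For example, with T_INT (vf == 4) the stub is only taken when cnt >= 8, so the
  // Neon loop runs at least twice; for the other element types the threshold is one
  // full vector (8 elements).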
 119   const size_t large_threshold = eltype == T_INT ? vf * 2 : vf;
 120   cmpw(cnt, large_threshold);
 121   br(Assembler::HS, LARGE);
 122 
 123   bind(TAIL);
 124 
 125   // The andr computes cnt % uf, where uf = unroll_factor. The subtract, shifted by 3, offsets the
 126   // branch target past uf - (cnt % uf) pairs of load + madd insns, i.e. only cnt % uf load + madd
 127   // pairs are executed. The loop then eats up the remainder, uf elements at a time.
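  // For example (assuming unroll_factor == 4 and no extra Cortex-A53 nops): if
  // cnt % 4 == 3, then tmp2 == 3 and tmp1 == BR_BASE - 3 * 8, so the br below lands
  // three load + madd pairs before BR_BASE and exactly the three leftover elements
  // are consumed before the subsw/br at BR_BASE switches to full 4-element steps.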
 128   assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC");
 129   andr(tmp2, cnt, unroll_factor - 1);
 130   adr(tmp1, BR_BASE);
 131   // For Cortex-A53 the shift is 4 because 2 nops are generated per pair, i.e. each pair is 16 bytes.
 132   sub(tmp1, tmp1, tmp2, ext::sxtw, VM_Version::supports_a53mac() ? 4 : 3);
 133   movw(tmp2, 0x1f);
 134   br(tmp1);
 135 
 136   bind(LOOP);
 137   for (size_t i = 0; i < unroll_factor; ++i) {
 138     load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype);
 139     maddw(result, result, tmp2, tmp1);
 140     // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
 141     // Generate 2nd nop to have 4 instructions per iteration.
 142     if (VM_Version::supports_a53mac()) {
 143       nop();
 144     }
 145   }
 146   bind(BR_BASE);
 147   subsw(cnt, cnt, unroll_factor);
 148   br(Assembler::HS, LOOP);
 149 
 150   b(DONE);
 151 
 152   bind(LARGE);
 153 
 154   RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype));
 155   assert(stub.target() != nullptr, "array_hashcode stub has not been generated");
 156   address tpc = trampoline_call(stub);
 157   if (tpc == nullptr) {
 158     DEBUG_ONLY(reset_labels(TAIL, BR_BASE));
 159     postcond(pc() == badAddress);
 160     return nullptr;
 161   }
 162 
 163   bind(DONE);
 164 
 165   BLOCK_COMMENT("} // arrays_hashcode");
 166 
 167   postcond(pc() != badAddress);
 168   return pc();
 169 }
 170 
 171 void C2_MacroAssembler::fast_lock(Register obj, Register box, Register t1,
 172                                   Register t2, Register t3) {
 173   assert_different_registers(obj, box, t1, t2, t3, rscratch2);
 174 
 175   // Handle inflated monitor.
 176   Label inflated;
 177   // Finish fast lock successfully. MUST be branched to with flag == EQ
 178   Label locked;
 179   // Finish fast lock unsuccessfully. MUST be branched to with flag == NE
 180   Label slow_path;
 181 
 182   if (UseObjectMonitorTable) {
 183     // Clear cache in case fast locking succeeds or we need to take the slow-path.
 184     str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 185   }
 186 
 187   if (DiagnoseSyncOnValueBasedClasses != 0) {
 188     load_klass(t1, obj);
 189     ldrb(t1, Address(t1, Klass::misc_flags_offset()));
 190     tst(t1, KlassFlags::_misc_is_value_based_class);
 191     br(Assembler::NE, slow_path);
 192   }
 193 
 194   const Register t1_mark = t1;
 195   const Register t3_t = t3;
 196 
 197   { // Fast locking
 198 
 199     // Push lock to the lock stack and finish successfully. MUST be branched to with flag == EQ
 200     Label push;
 201 
 202     const Register t2_top = t2;
 203 
 204     // Check if lock-stack is full.
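    // (The top value is the offset of the first free lock-stack slot, so anything
    // greater than end_offset() - 1 means the stack is full.)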
 205     ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 206     cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
 207     br(Assembler::GT, slow_path);
 208 
 209     // Check if recursive.
 210     subw(t3_t, t2_top, oopSize);
 211     ldr(t3_t, Address(rthread, t3_t));
 212     cmp(obj, t3_t);
 213     br(Assembler::EQ, push);
 214 
 215     // Relaxed normal load to check for monitor. Optimization for monitor case.
 216     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 217     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 218 
 219     // Not inflated
 220     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");
 221 
 222     // Try to lock. Transition lock-bits 0b01 => 0b00
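    // For example: an unlocked mark ending in ...01 passes through the orr unchanged
    // as the expected value and becomes ...00 in t3_t, so the cmpxchg swaps unlocked
    // for fast-locked. If the mark is already fast-locked (...00), the expected value
    // built by the orr cannot match memory, the cmpxchg fails and we take slow_path.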
 223     orr(t1_mark, t1_mark, markWord::unlocked_value);
 224     eor(t3_t, t1_mark, markWord::unlocked_value);
 225     cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
 226             /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
 227     br(Assembler::NE, slow_path);
 228 
 229     bind(push);
 230     // After successful lock, push object on lock-stack.
 231     str(obj, Address(rthread, t2_top));
 232     addw(t2_top, t2_top, oopSize);
 233     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 234     b(locked);
 235   }
 236 
 237   { // Handle inflated monitor.
 238     bind(inflated);
 239 
 240     const Register t1_monitor = t1;
 241 
 242     if (!UseObjectMonitorTable) {
 243       assert(t1_monitor == t1_mark, "should be the same here");
 244     } else {
 245       Label monitor_found;
 246 
 247       // Load cache address
 248       lea(t3_t, Address(rthread, JavaThread::om_cache_oops_offset()));
 249 
 250       const int num_unrolled = 2;
 251       for (int i = 0; i < num_unrolled; i++) {
 252         ldr(t1, Address(t3_t));
 253         cmp(obj, t1);
 254         br(Assembler::EQ, monitor_found);
 255         increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
 256       }
 257 
 258       Label loop;
 259 
 260       // Search for obj in cache.
 261       bind(loop);
 262 
 263       // Check for match.
 264       ldr(t1, Address(t3_t));
 265       cmp(obj, t1);
 266       br(Assembler::EQ, monitor_found);
 267 
 268       // Search until null encountered, guaranteed _null_sentinel at end.
 269       increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
 270       cbnz(t1, loop);
 271       // Cache miss. NE was set by the cmp above; cbnz does not set flags
 272       b(slow_path);
 273 
 274       bind(monitor_found);
 275       ldr(t1_monitor, Address(t3_t, OMCache::oop_to_monitor_difference()));
 276     }
 277 
 278     const Register t2_owner_addr = t2;
 279     const Register t3_owner = t3;
 280     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 281     const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
 282     const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);
 283 
 284     Label monitor_locked;
 285 
 286     // Compute owner address.
 287     lea(t2_owner_addr, owner_address);
 288 
 289     // Try to CAS owner (no owner => current thread's _monitor_owner_id).
 290     ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
 291     cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, /*acquire*/ true,
 292             /*release*/ false, /*weak*/ false, t3_owner);
 293     br(Assembler::EQ, monitor_locked);
 294 
 295     // Check if recursive.
 296     cmp(t3_owner, rscratch2);
 297     br(Assembler::NE, slow_path);
 298 
 299     // Recursive.
 300     increment(recursions_address, 1);
 301 
 302     bind(monitor_locked);
 303     if (UseObjectMonitorTable) {
 304       str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 305     }
 306   }
 307 
 308   bind(locked);
 309 
 310 #ifdef ASSERT
 311   // Check that locked label is reached with Flags == EQ.
 312   Label flag_correct;
 313   br(Assembler::EQ, flag_correct);
 314   stop("Fast Lock Flag != EQ");
 315 #endif
 316 
 317   bind(slow_path);
 318 #ifdef ASSERT
 319   // Check that slow_path label is reached with Flags == NE.
 320   br(Assembler::NE, flag_correct);
 321   stop("Fast Lock Flag != NE");
 322   bind(flag_correct);
 323 #endif
 324   // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
 325 }
 326 
 327 void C2_MacroAssembler::fast_unlock(Register obj, Register box, Register t1,
 328                                     Register t2, Register t3) {
 329   assert_different_registers(obj, box, t1, t2, t3);
 330 
 331   // Handle inflated monitor.
 332   Label inflated, inflated_load_mark;
 333   // Finish fast unlock successfully. MUST be branched to with flag == EQ
 334   Label unlocked;
 335   // Finish fast unlock unsuccessfully. MUST be branched to with flag == NE
 336   Label slow_path;
 337 
 338   const Register t1_mark = t1;
 339   const Register t2_top = t2;
 340   const Register t3_t = t3;
 341 
 342   { // Fast unlock
 343 
 344     Label push_and_slow_path;
 345 
 346     // Check if obj is top of lock-stack.
 347     ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 348     subw(t2_top, t2_top, oopSize);
 349     ldr(t3_t, Address(rthread, t2_top));
 350     cmp(obj, t3_t);
 351     // Top of lock stack was not obj. Must be monitor.
 352     br(Assembler::NE, inflated_load_mark);
 353 
 354     // Pop lock-stack.
 355     DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
 356     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 357 
 358     // Check if recursive.
 359     subw(t3_t, t2_top, oopSize);
 360     ldr(t3_t, Address(rthread, t3_t));
 361     cmp(obj, t3_t);
 362     br(Assembler::EQ, unlocked);
 363 
 364     // Not recursive.
 365     // Load Mark.
 366     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 367 
 368     // Check header for monitor (0b10).
 369     // Because we got here by popping (meaning we pushed it when locking),
 370     // there will be no monitor in the box, so we need to push the obj back
 371     // so that the runtime can fix any potential anonymous owner.
 372     tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);
 373 
 374     // Try to unlock. Transition lock bits 0b00 => 0b01
 375     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
 376     orr(t3_t, t1_mark, markWord::unlocked_value);
 377     cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
 378             /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
 379     br(Assembler::EQ, unlocked);
 380 
 381     bind(push_and_slow_path);
 382     // Compare and exchange failed.
 383     // Restore lock-stack and handle the unlock in runtime.
 384     DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
 385     addw(t2_top, t2_top, oopSize);
 386     str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 387     b(slow_path);
 388   }
 389 
 390 
 391   { // Handle inflated monitor.
 392     bind(inflated_load_mark);
 393     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 394 #ifdef ASSERT
 395     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 396     stop("Fast Unlock not monitor");
 397 #endif
 398 
 399     bind(inflated);
 400 
 401 #ifdef ASSERT
 402     Label check_done;
 403     subw(t2_top, t2_top, oopSize);
 404     cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
 405     br(Assembler::LT, check_done);
 406     ldr(t3_t, Address(rthread, t2_top));
 407     cmp(obj, t3_t);
 408     br(Assembler::NE, inflated);
 409     stop("Fast Unlock lock on stack");
 410     bind(check_done);
 411 #endif
 412 
 413     const Register t1_monitor = t1;
 414 
 415     if (!UseObjectMonitorTable) {
 416       assert(t1_monitor == t1_mark, "should be the same here");
 417 
 418       // Untag the monitor.
 419       add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
 420     } else {
 421       ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 422       // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
 423       cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
 424       br(Assembler::LO, slow_path);
 425     }
 426 
 427     const Register t2_recursions = t2;
 428     Label not_recursive;
 429 
 430     // Check if recursive.
 431     ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
 432     cbz(t2_recursions, not_recursive);
 433 
 434     // Recursive unlock.
 435     sub(t2_recursions, t2_recursions, 1u);
 436     str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
 437     // Set flag == EQ
 438     cmp(t2_recursions, t2_recursions);
 439     b(unlocked);
 440 
 441     bind(not_recursive);
 442 
 443     const Register t2_owner_addr = t2;
 444 
 445     // Compute owner address.
 446     lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));
 447 
 448     // Set owner to null.
 449     // Release to satisfy the JMM
 450     stlr(zr, t2_owner_addr);
 451     // We need a full fence after clearing owner to avoid stranding.
 452     // StoreLoad achieves this.
 453     membar(StoreLoad);
 454 
 455     // Check if the entry_list is empty.
 456     ldr(rscratch1, Address(t1_monitor, ObjectMonitor::entry_list_offset()));
 457     cmp(rscratch1, zr);
 458     br(Assembler::EQ, unlocked);  // If so we are done.
 459 
 460     // Check if there is a successor.
 461     ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
 462     cmp(rscratch1, zr);
 463     br(Assembler::NE, unlocked);  // If so we are done.
 464 
 465     // Save the monitor pointer in the current thread, so we can try to
 466     // reacquire the lock in SharedRuntime::monitor_exit_helper().
 467     str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));
 468 
 469     cmp(zr, rthread); // Set Flag to NE => slow path
 470     b(slow_path);
 471   }
 472 
 473   bind(unlocked);
 474   cmp(zr, zr); // Set Flags to EQ => fast path
 475 
 476 #ifdef ASSERT
 477   // Check that unlocked label is reached with Flags == EQ.
 478   Label flag_correct;
 479   br(Assembler::EQ, flag_correct);
 480   stop("Fast Unlock Flag != EQ");
 481 #endif
 482 
 483   bind(slow_path);
 484 #ifdef ASSERT
 485   // Check that slow_path label is reached with Flags == NE.
 486   br(Assembler::NE, flag_correct);
 487   stop("Fast Unlock Flag != NE");
 488   bind(flag_correct);
 489 #endif
 490   // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
 491 }
 492 
 493 // Search for str1 in str2 and return index or -1
 494 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
 495 void C2_MacroAssembler::string_indexof(Register str2, Register str1,
 496                                        Register cnt2, Register cnt1,
 497                                        Register tmp1, Register tmp2,
 498                                        Register tmp3, Register tmp4,
 499                                        Register tmp5, Register tmp6,
 500                                        int icnt1, Register result, int ae) {
 501   // NOTE: tmp5, tmp6 can be zr depending on specific method version
 502   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
 503 
 504   Register ch1 = rscratch1;
 505   Register ch2 = rscratch2;
 506   Register cnt1tmp = tmp1;
 507   Register cnt2tmp = tmp2;
 508   Register cnt1_neg = cnt1;
 509   Register cnt2_neg = cnt2;
 510   Register result_tmp = tmp4;
 511 
 512   bool isL = ae == StrIntrinsicNode::LL;
 513 
 514   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
 515   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
 516   int str1_chr_shift = str1_isL ? 0:1;
 517   int str2_chr_shift = str2_isL ? 0:1;
 518   int str1_chr_size = str1_isL ? 1:2;
 519   int str2_chr_size = str2_isL ? 1:2;
 520   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
 521                                       (chr_insn)&MacroAssembler::ldrh;
 522   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
 523                                       (chr_insn)&MacroAssembler::ldrh;
 524   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
 525   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
 526 
 527   // Note, inline_string_indexOf() generates checks:
 528   // if (substr.count > string.count) return -1;
 529   // if (substr.count == 0) return 0;
 530 
 531   // We have two strings, a source string in str2, cnt2 and a pattern string
 532   // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
 533 
 534   // For larger pattern and source we use a simplified Boyer Moore algorithm.
 535   // With a small pattern and source we use linear scan.
 536 
 537   if (icnt1 == -1) {
 538     sub(result_tmp, cnt2, cnt1);
 539     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
 540     br(LT, LINEARSEARCH);
 541     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
 542     subs(zr, cnt1, 256);
 543     lsr(tmp1, cnt2, 2);
 544     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
 545     br(GE, LINEARSTUB);
 546   }
 547 
 548 // The Boyer Moore algorithm is based on the description here:-
 549 //
 550 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
 551 //
 552 // This describes an algorithm with 2 shift rules: the 'Bad Character' rule
 553 // and the 'Good Suffix' rule.
 554 //
 555 // These rules are essentially heuristics for how far we can shift the
 556 // pattern along the search string.
 557 //
 558 // The implementation here uses the 'Bad Character' rule only because of the
 559 // complexity of initialisation for the 'Good Suffix' rule.
 560 //
 561 // This is also known as the Boyer-Moore-Horspool algorithm:-
 562 //
 563 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
 564 //
 565 // This particular implementation has a few Java-specific optimizations.
 566 //
 567 // #define ASIZE 256
 568 //
 569 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
 570 //       int i, j;
 571 //       unsigned c;
 572 //       unsigned char bc[ASIZE];
 573 //
 574 //       /* Preprocessing */
 575 //       for (i = 0; i < ASIZE; ++i)
 576 //          bc[i] = m;
 577 //       for (i = 0; i < m - 1; ) {
 578 //          c = x[i];
 579 //          ++i;
 580 //          // c < 256 for Latin1 string, so, no need for branch
 581 //          #ifdef PATTERN_STRING_IS_LATIN1
 582 //          bc[c] = m - i;
 583 //          #else
 584 //          if (c < ASIZE) bc[c] = m - i;
 585 //          #endif
 586 //       }
 587 //
 588 //       /* Searching */
 589 //       j = 0;
 590 //       while (j <= n - m) {
 591 //          c = y[j+m-1];
 592 //          if (x[m-1] == c)
 593 //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
 594 //          if (i < 0) return j;
 595 //          // c < 256 for Latin1 string, so, no need for branch
 596 //          #ifdef SOURCE_STRING_IS_LATIN1
 597 //          // LL case: (c < 256) is always true. Remove branch
 598 //          j += bc[y[j+m-1]];
 599 //          #endif
 600 //          #ifndef PATTERN_STRING_IS_UTF
 601 //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
 602 //          if (c < ASIZE)
 603 //            j += bc[y[j+m-1]];
 604 //          else
 605 //            j += 1;
 606 //          #endif
 607 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
 608 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
 609 //          if (c < ASIZE)
 610 //            j += bc[y[j+m-1]];
 611 //          else
 612 //            j += m;
 613 //          #endif
 614 //       }
 615 //    }
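//
// For example, with pattern x = "abcab" (m == 5) the preprocessing above produces
// bc['a'] = 1, bc['b'] = 3, bc['c'] = 2 and bc[c] = 5 for every other character, so
// a window whose last source character does not occur in the pattern skips ahead by
// the whole pattern length.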
 616 
 617   if (icnt1 == -1) {
 618     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
 619         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
 620     Register cnt1end = tmp2;
 621     Register str2end = cnt2;
 622     Register skipch = tmp2;
 623 
 624     // str1 length is >= 8, so we can read at least 1 register for cases when
 625     // UTF->Latin1 conversion is not needed (8 LL or 4 UU symbols) and half a
 626     // register for the UL case. We'll re-read the last character in the inner
 627     // pre-loop code to have a single outer pre-loop load.
 628     const int firstStep = isL ? 7 : 3;
 629 
 630     const int ASIZE = 256;
 631     const int STORED_BYTES = 32; // number of bytes stored per instruction
 632     sub(sp, sp, ASIZE);
 633     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
 634     mov(ch1, sp);
 635     BIND(BM_INIT_LOOP);
 636       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
 637       subs(tmp5, tmp5, 1);
 638       br(GT, BM_INIT_LOOP);
 639 
 640       sub(cnt1tmp, cnt1, 1);
 641       mov(tmp5, str2);
 642       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
 643       sub(ch2, cnt1, 1);
 644       mov(tmp3, str1);
 645     BIND(BCLOOP);
 646       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
 647       if (!str1_isL) {
 648         subs(zr, ch1, ASIZE);
 649         br(HS, BCSKIP);
 650       }
 651       strb(ch2, Address(sp, ch1));
 652     BIND(BCSKIP);
 653       subs(ch2, ch2, 1);
 654       br(GT, BCLOOP);
 655 
 656       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
 657       if (str1_isL == str2_isL) {
 658         // load last 8 bytes (8LL/4UU symbols)
 659         ldr(tmp6, Address(tmp6, -wordSize));
 660       } else {
 661         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
 662         // convert Latin1 to UTF. We'll have to wait until the load completes, but
 663         // it's still faster than per-character loads + checks
 664         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
 665         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
 666         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
 667         andr(tmp6, tmp6, 0xFF); // str1[N-4]
 668         orr(ch2, ch1, ch2, LSL, 16);
 669         orr(tmp6, tmp6, tmp3, LSL, 48);
 670         orr(tmp6, tmp6, ch2, LSL, 16);
 671       }
 672     BIND(BMLOOPSTR2);
 673       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 674       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
 675       if (str1_isL == str2_isL) {
 676         // re-init tmp3. It's free because it executes in parallel with the load
 677         // above. An alternative is to initialize it before the loop, but that would
 678         // affect performance on in-order systems with 2 or more ld/st pipelines
 679         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
 680       }
 681       if (!isL) { // UU/UL case
 682         lsl(ch2, cnt1tmp, 1); // offset in bytes
 683       }
 684       cmp(tmp3, skipch);
 685       br(NE, BMSKIP);
 686       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
 687       mov(ch1, tmp6);
 688       if (isL) {
 689         b(BMLOOPSTR1_AFTER_LOAD);
 690       } else {
 691         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
 692         b(BMLOOPSTR1_CMP);
 693       }
 694     BIND(BMLOOPSTR1);
 695       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
 696       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 697     BIND(BMLOOPSTR1_AFTER_LOAD);
 698       subs(cnt1tmp, cnt1tmp, 1);
 699       br(LT, BMLOOPSTR1_LASTCMP);
 700     BIND(BMLOOPSTR1_CMP);
 701       cmp(ch1, ch2);
 702       br(EQ, BMLOOPSTR1);
 703     BIND(BMSKIP);
 704       if (!isL) {
 705         // if we've met a UTF symbol while searching a Latin1 pattern, then we can
 706         // skip cnt1 symbols
 707         if (str1_isL != str2_isL) {
 708           mov(result_tmp, cnt1);
 709         } else {
 710           mov(result_tmp, 1);
 711         }
 712         subs(zr, skipch, ASIZE);
 713         br(HS, BMADV);
 714       }
 715       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
 716     BIND(BMADV);
 717       sub(cnt1tmp, cnt1, 1);
 718       add(str2, str2, result_tmp, LSL, str2_chr_shift);
 719       cmp(str2, str2end);
 720       br(LE, BMLOOPSTR2);
 721       add(sp, sp, ASIZE);
 722       b(NOMATCH);
 723     BIND(BMLOOPSTR1_LASTCMP);
 724       cmp(ch1, ch2);
 725       br(NE, BMSKIP);
 726     BIND(BMMATCH);
 727       sub(result, str2, tmp5);
 728       if (!str2_isL) lsr(result, result, 1);
 729       add(sp, sp, ASIZE);
 730       b(DONE);
 731 
 732     BIND(LINEARSTUB);
 733     cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
 734     br(LT, LINEAR_MEDIUM);
 735     mov(result, zr);
 736     RuntimeAddress stub = nullptr;
 737     if (isL) {
 738       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
 739       assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
 740     } else if (str1_isL) {
 741       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
 742       assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
 743     } else {
 744       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
 745       assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
 746     }
 747     address call = trampoline_call(stub);
 748     if (call == nullptr) {
 749       DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
 750       ciEnv::current()->record_failure("CodeCache is full");
 751       return;
 752     }
 753     b(DONE);
 754   }
 755 
 756   BIND(LINEARSEARCH);
 757   {
 758     Label DO1, DO2, DO3;
 759 
 760     Register str2tmp = tmp2;
 761     Register first = tmp3;
 762 
 763     if (icnt1 == -1)
 764     {
 765         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
 766 
 767         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
 768         br(LT, DOSHORT);
 769       BIND(LINEAR_MEDIUM);
 770         (this->*str1_load_1chr)(first, Address(str1));
 771         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
 772         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
 773         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 774         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 775 
 776       BIND(FIRST_LOOP);
 777         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 778         cmp(first, ch2);
 779         br(EQ, STR1_LOOP);
 780       BIND(STR2_NEXT);
 781         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 782         br(LE, FIRST_LOOP);
 783         b(NOMATCH);
 784 
 785       BIND(STR1_LOOP);
 786         adds(cnt1tmp, cnt1_neg, str1_chr_size);
 787         add(cnt2tmp, cnt2_neg, str2_chr_size);
 788         br(GE, MATCH);
 789 
 790       BIND(STR1_NEXT);
 791         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
 792         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 793         cmp(ch1, ch2);
 794         br(NE, STR2_NEXT);
 795         adds(cnt1tmp, cnt1tmp, str1_chr_size);
 796         add(cnt2tmp, cnt2tmp, str2_chr_size);
 797         br(LT, STR1_NEXT);
 798         b(MATCH);
 799 
 800       BIND(DOSHORT);
 801       if (str1_isL == str2_isL) {
 802         cmp(cnt1, (u1)2);
 803         br(LT, DO1);
 804         br(GT, DO3);
 805       }
 806     }
 807 
 808     if (icnt1 == 4) {
 809       Label CH1_LOOP;
 810 
 811         (this->*load_4chr)(ch1, str1);
 812         sub(result_tmp, cnt2, 4);
 813         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 814         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 815 
 816       BIND(CH1_LOOP);
 817         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
 818         cmp(ch1, ch2);
 819         br(EQ, MATCH);
 820         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 821         br(LE, CH1_LOOP);
 822         b(NOMATCH);
 823     }
 824 
 825     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
 826       Label CH1_LOOP;
 827 
 828       BIND(DO2);
 829         (this->*load_2chr)(ch1, str1);
 830         if (icnt1 == 2) {
 831           sub(result_tmp, cnt2, 2);
 832         }
 833         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 834         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 835       BIND(CH1_LOOP);
 836         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 837         cmp(ch1, ch2);
 838         br(EQ, MATCH);
 839         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 840         br(LE, CH1_LOOP);
 841         b(NOMATCH);
 842     }
 843 
 844     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
 845       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
 846 
 847       BIND(DO3);
 848         (this->*load_2chr)(first, str1);
 849         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
 850         if (icnt1 == 3) {
 851           sub(result_tmp, cnt2, 3);
 852         }
 853         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 854         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 855       BIND(FIRST_LOOP);
 856         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 857         cmpw(first, ch2);
 858         br(EQ, STR1_LOOP);
 859       BIND(STR2_NEXT);
 860         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 861         br(LE, FIRST_LOOP);
 862         b(NOMATCH);
 863 
 864       BIND(STR1_LOOP);
 865         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
 866         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 867         cmp(ch1, ch2);
 868         br(NE, STR2_NEXT);
 869         b(MATCH);
 870     }
 871 
 872     if (icnt1 == -1 || icnt1 == 1) {
 873       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
 874 
 875       BIND(DO1);
 876         (this->*str1_load_1chr)(ch1, str1);
 877         cmp(cnt2, (u1)8);
 878         br(LT, DO1_SHORT);
 879 
 880         sub(result_tmp, cnt2, 8/str2_chr_size);
 881         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 882         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 883         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 884 
 885         if (str2_isL) {
 886           orr(ch1, ch1, ch1, LSL, 8);
 887         }
 888         orr(ch1, ch1, ch1, LSL, 16);
 889         orr(ch1, ch1, ch1, LSL, 32);
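        // The loop below uses the classic SWAR zero-lane test: after xor-ing a loaded
        // word with the replicated pattern character, a matching lane becomes all
        // zeroes, and (v - 0x01..01) & ~v & 0x80..80 is non-zero exactly when some
        // lane is zero. The ~v & 0x80..80 term is what the orr + bics pair computes.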
 890       BIND(CH1_LOOP);
 891         ldr(ch2, Address(str2, cnt2_neg));
 892         eor(ch2, ch1, ch2);
 893         sub(tmp1, ch2, tmp3);
 894         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 895         bics(tmp1, tmp1, tmp2);
 896         br(NE, HAS_ZERO);
 897         adds(cnt2_neg, cnt2_neg, 8);
 898         br(LT, CH1_LOOP);
 899 
 900         cmp(cnt2_neg, (u1)8);
 901         mov(cnt2_neg, 0);
 902         br(LT, CH1_LOOP);
 903         b(NOMATCH);
 904 
 905       BIND(HAS_ZERO);
 906         rev(tmp1, tmp1);
 907         clz(tmp1, tmp1);
 908         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
 909         b(MATCH);
 910 
 911       BIND(DO1_SHORT);
 912         mov(result_tmp, cnt2);
 913         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
 914         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
 915       BIND(DO1_LOOP);
 916         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 917         cmpw(ch1, ch2);
 918         br(EQ, MATCH);
 919         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 920         br(LT, DO1_LOOP);
 921     }
 922   }
 923   BIND(NOMATCH);
 924     mov(result, -1);
 925     b(DONE);
 926   BIND(MATCH);
 927     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
 928   BIND(DONE);
 929 }
 930 
 931 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
 932 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
 933 
 934 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
 935                                             Register ch, Register result,
 936                                             Register tmp1, Register tmp2, Register tmp3)
 937 {
 938   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
 939   Register cnt1_neg = cnt1;
 940   Register ch1 = rscratch1;
 941   Register result_tmp = rscratch2;
 942 
 943   cbz(cnt1, NOMATCH);
 944 
 945   cmp(cnt1, (u1)4);
 946   br(LT, DO1_SHORT);
 947 
 948   orr(ch, ch, ch, LSL, 16);
 949   orr(ch, ch, ch, LSL, 32);
 950 
 951   sub(cnt1, cnt1, 4);
 952   mov(result_tmp, cnt1);
 953   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
 954   sub(cnt1_neg, zr, cnt1, LSL, 1);
 955 
 956   mov(tmp3, 0x0001000100010001);
 957 
 958   BIND(CH1_LOOP);
 959     ldr(ch1, Address(str1, cnt1_neg));
 960     eor(ch1, ch, ch1);
 961     sub(tmp1, ch1, tmp3);
 962     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
 963     bics(tmp1, tmp1, tmp2);
 964     br(NE, HAS_ZERO);
 965     adds(cnt1_neg, cnt1_neg, 8);
 966     br(LT, CH1_LOOP);
 967 
 968     cmp(cnt1_neg, (u1)8);
 969     mov(cnt1_neg, 0);
 970     br(LT, CH1_LOOP);
 971     b(NOMATCH);
 972 
 973   BIND(HAS_ZERO);
 974     rev(tmp1, tmp1);
 975     clz(tmp1, tmp1);
 976     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
 977     b(MATCH);
 978 
 979   BIND(DO1_SHORT);
 980     mov(result_tmp, cnt1);
 981     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
 982     sub(cnt1_neg, zr, cnt1, LSL, 1);
 983   BIND(DO1_LOOP);
 984     ldrh(ch1, Address(str1, cnt1_neg));
 985     cmpw(ch, ch1);
 986     br(EQ, MATCH);
 987     adds(cnt1_neg, cnt1_neg, 2);
 988     br(LT, DO1_LOOP);
 989   BIND(NOMATCH);
 990     mov(result, -1);
 991     b(DONE);
 992   BIND(MATCH);
 993     add(result, result_tmp, cnt1_neg, ASR, 1);
 994   BIND(DONE);
 995 }
 996 
 997 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
 998                                                 Register ch, Register result,
 999                                                 FloatRegister ztmp1,
1000                                                 FloatRegister ztmp2,
1001                                                 PRegister tmp_pg,
1002                                                 PRegister tmp_pdn, bool isL)
1003 {
1004   // Note that `tmp_pdn` should *NOT* be used as a governing predicate register.
1005   assert(tmp_pg->is_governing(),
1006          "this register has to be a governing predicate register");
1007 
1008   Label LOOP, MATCH, DONE, NOMATCH;
1009   Register vec_len = rscratch1;
1010   Register idx = rscratch2;
1011 
1012   SIMD_RegVariant T = isL ? B : H;
1013 
1014   cbz(cnt1, NOMATCH);
1015 
1016   // Assign the particular char throughout the vector.
1017   sve_dup(ztmp2, T, ch);
1018   if (isL) {
1019     sve_cntb(vec_len);
1020   } else {
1021     sve_cnth(vec_len);
1022   }
1023   mov(idx, 0);
1024 
1025   // Generate a predicate to control the reading of input string.
1026   sve_whilelt(tmp_pg, T, idx, cnt1);
1027 
1028   BIND(LOOP);
1029     // Read a vector of 8- or 16-bit data depending on the string type. Note
1030     // that inactive elements indicated by the predicate register won't cause
1031     // a data read from memory to the destination vector.
1032     if (isL) {
1033       sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
1034     } else {
1035       sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
1036     }
1037     add(idx, idx, vec_len);
1038 
1039     // Perform the comparison. An element of the destination predicate is set
1040     // to active if the particular char is matched.
1041     sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);
1042 
1043     // Branch if the particular char is found.
1044     br(NE, MATCH);
1045 
1046     sve_whilelt(tmp_pg, T, idx, cnt1);
1047 
1048     // Loop back if the particular char is not found.
1049     br(MI, LOOP);
1050 
1051   BIND(NOMATCH);
1052     mov(result, -1);
1053     b(DONE);
1054 
1055   BIND(MATCH);
1056     // Undo the index increment.
1057     sub(idx, idx, vec_len);
1058 
1059     // Crop the predicate at the first match to find its location.
1060     sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
1061     add(result, idx, -1);
1062     sve_incp(result, T, tmp_pdn);
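    // For example (a sketch assuming 16-byte vectors and a Latin1 string): a match at
    // string index 21 is found in the second iteration with idx == 32; the sub brings
    // idx back to 16, brka leaves the first 6 lanes active (up to and including the
    // matching lane), and result = 16 - 1 + 6 == 21.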
1063   BIND(DONE);
1064 }
1065 
1066 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
1067                                             Register ch, Register result,
1068                                             Register tmp1, Register tmp2, Register tmp3)
1069 {
1070   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
1071   Register cnt1_neg = cnt1;
1072   Register ch1 = rscratch1;
1073   Register result_tmp = rscratch2;
1074 
1075   cbz(cnt1, NOMATCH);
1076 
1077   cmp(cnt1, (u1)8);
1078   br(LT, DO1_SHORT);
1079 
1080   orr(ch, ch, ch, LSL, 8);
1081   orr(ch, ch, ch, LSL, 16);
1082   orr(ch, ch, ch, LSL, 32);
1083 
1084   sub(cnt1, cnt1, 8);
1085   mov(result_tmp, cnt1);
1086   lea(str1, Address(str1, cnt1));
1087   sub(cnt1_neg, zr, cnt1);
1088 
1089   mov(tmp3, 0x0101010101010101);
1090 
1091   BIND(CH1_LOOP);
1092     ldr(ch1, Address(str1, cnt1_neg));
1093     eor(ch1, ch, ch1);
1094     sub(tmp1, ch1, tmp3);
1095     orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
1096     bics(tmp1, tmp1, tmp2);
1097     br(NE, HAS_ZERO);
1098     adds(cnt1_neg, cnt1_neg, 8);
1099     br(LT, CH1_LOOP);
1100 
1101     cmp(cnt1_neg, (u1)8);
1102     mov(cnt1_neg, 0);
1103     br(LT, CH1_LOOP);
1104     b(NOMATCH);
1105 
1106   BIND(HAS_ZERO);
1107     rev(tmp1, tmp1);
1108     clz(tmp1, tmp1);
1109     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
1110     b(MATCH);
1111 
1112   BIND(DO1_SHORT);
1113     mov(result_tmp, cnt1);
1114     lea(str1, Address(str1, cnt1));
1115     sub(cnt1_neg, zr, cnt1);
1116   BIND(DO1_LOOP);
1117     ldrb(ch1, Address(str1, cnt1_neg));
1118     cmp(ch, ch1);
1119     br(EQ, MATCH);
1120     adds(cnt1_neg, cnt1_neg, 1);
1121     br(LT, DO1_LOOP);
1122   BIND(NOMATCH);
1123     mov(result, -1);
1124     b(DONE);
1125   BIND(MATCH);
1126     add(result, result_tmp, cnt1_neg);
1127   BIND(DONE);
1128 }
1129 
1130 // Compare strings.
1131 void C2_MacroAssembler::string_compare(Register str1, Register str2,
1132     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
1133     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
1134     PRegister pgtmp1, PRegister pgtmp2, int ae) {
1135   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
1136       DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
1137       SHORT_LOOP_START, TAIL_CHECK;
1138 
1139   bool isLL = ae == StrIntrinsicNode::LL;
1140   bool isLU = ae == StrIntrinsicNode::LU;
1141   bool isUL = ae == StrIntrinsicNode::UL;
1142 
1143   // The stub threshold for LL strings is 72 (64 + 8) chars
1144   // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
1145   // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
1146   const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);
1147 
1148   bool str1_isL = isLL || isLU;
1149   bool str2_isL = isLL || isUL;
1150 
1151   int str1_chr_shift = str1_isL ? 0 : 1;
1152   int str2_chr_shift = str2_isL ? 0 : 1;
1153   int str1_chr_size = str1_isL ? 1 : 2;
1154   int str2_chr_size = str2_isL ? 1 : 2;
1155   int minCharsInWord = isLL ? wordSize : wordSize/2;
1156 
1157   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
1158   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
1159                                       (chr_insn)&MacroAssembler::ldrh;
1160   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
1161                                       (chr_insn)&MacroAssembler::ldrh;
1162   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
1163                             (uxt_insn)&MacroAssembler::uxthw;
1164 
1165   BLOCK_COMMENT("string_compare {");
1166 
1167   // Bizarrely, the counts are passed in bytes, regardless of whether they
1168   // are L or U strings, however the result is always in characters.
1169   if (!str1_isL) asrw(cnt1, cnt1, 1);
1170   if (!str2_isL) asrw(cnt2, cnt2, 1);
1171 
1172   // Compute the minimum of the string lengths and save the difference.
1173   subsw(result, cnt1, cnt2);
1174   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
1175 
1176   // A very short string
1177   cmpw(cnt2, minCharsInWord);
1178   br(Assembler::LE, SHORT_STRING);
1179 
1180   // Compare longwords
1181   // load first parts of strings and finish initialization while loading
1182   {
1183     if (str1_isL == str2_isL) { // LL or UU
1184       ldr(tmp1, Address(str1));
1185       cmp(str1, str2);
1186       br(Assembler::EQ, DONE);
1187       ldr(tmp2, Address(str2));
1188       cmp(cnt2, stub_threshold);
1189       br(GE, STUB);
1190       subsw(cnt2, cnt2, minCharsInWord);
1191       br(EQ, TAIL_CHECK);
1192       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1193       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1194       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1195     } else if (isLU) {
1196       ldrs(vtmp, Address(str1));
1197       ldr(tmp2, Address(str2));
1198       cmp(cnt2, stub_threshold);
1199       br(GE, STUB);
1200       subw(cnt2, cnt2, 4);
1201       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1202       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1203       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1204       zip1(vtmp, T8B, vtmp, vtmpZ);
1205       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1206       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1207       add(cnt1, cnt1, 4);
1208       fmovd(tmp1, vtmp);
1209     } else { // UL case
1210       ldr(tmp1, Address(str1));
1211       ldrs(vtmp, Address(str2));
1212       cmp(cnt2, stub_threshold);
1213       br(GE, STUB);
1214       subw(cnt2, cnt2, 4);
1215       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1216       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1217       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1218       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1219       zip1(vtmp, T8B, vtmp, vtmpZ);
1220       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1221       add(cnt1, cnt1, 8);
1222       fmovd(tmp2, vtmp);
1223     }
1224     adds(cnt2, cnt2, isUL ? 4 : 8);
1225     br(GE, TAIL);
1226     eor(rscratch2, tmp1, tmp2);
1227     cbnz(rscratch2, DIFF);
1228     // main loop
1229     bind(NEXT_WORD);
1230     if (str1_isL == str2_isL) {
1231       ldr(tmp1, Address(str1, cnt2));
1232       ldr(tmp2, Address(str2, cnt2));
1233       adds(cnt2, cnt2, 8);
1234     } else if (isLU) {
1235       ldrs(vtmp, Address(str1, cnt1));
1236       ldr(tmp2, Address(str2, cnt2));
1237       add(cnt1, cnt1, 4);
1238       zip1(vtmp, T8B, vtmp, vtmpZ);
1239       fmovd(tmp1, vtmp);
1240       adds(cnt2, cnt2, 8);
1241     } else { // UL
1242       ldrs(vtmp, Address(str2, cnt2));
1243       ldr(tmp1, Address(str1, cnt1));
1244       zip1(vtmp, T8B, vtmp, vtmpZ);
1245       add(cnt1, cnt1, 8);
1246       fmovd(tmp2, vtmp);
1247       adds(cnt2, cnt2, 4);
1248     }
1249     br(GE, TAIL);
1250 
1251     eor(rscratch2, tmp1, tmp2);
1252     cbz(rscratch2, NEXT_WORD);
1253     b(DIFF);
1254     bind(TAIL);
1255     eor(rscratch2, tmp1, tmp2);
1256     cbnz(rscratch2, DIFF);
1257     // Last longword.  In the case where length == 4 we compare the
1258     // same longword twice, but that's still faster than another
1259     // conditional branch.
1260     if (str1_isL == str2_isL) {
1261       ldr(tmp1, Address(str1));
1262       ldr(tmp2, Address(str2));
1263     } else if (isLU) {
1264       ldrs(vtmp, Address(str1));
1265       ldr(tmp2, Address(str2));
1266       zip1(vtmp, T8B, vtmp, vtmpZ);
1267       fmovd(tmp1, vtmp);
1268     } else { // UL
1269       ldrs(vtmp, Address(str2));
1270       ldr(tmp1, Address(str1));
1271       zip1(vtmp, T8B, vtmp, vtmpZ);
1272       fmovd(tmp2, vtmp);
1273     }
1274     bind(TAIL_CHECK);
1275     eor(rscratch2, tmp1, tmp2);
1276     cbz(rscratch2, DONE);
1277 
1278     // Find the first different characters in the longwords and
1279     // compute their difference.
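    // For example, in the LL case: if the first difference is in byte 3 of the loaded
    // longwords, rev + clz yields 24 plus the leading zero bits inside that byte, the
    // andr with -8 rounds that down to 24, and the lsrv shifts move the differing
    // characters into the low bits before they are zero-extended and subtracted.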
1280     bind(DIFF);
1281     rev(rscratch2, rscratch2);
1282     clz(rscratch2, rscratch2);
1283     andr(rscratch2, rscratch2, isLL ? -8 : -16);
1284     lsrv(tmp1, tmp1, rscratch2);
1285     (this->*ext_chr)(tmp1, tmp1);
1286     lsrv(tmp2, tmp2, rscratch2);
1287     (this->*ext_chr)(tmp2, tmp2);
1288     subw(result, tmp1, tmp2);
1289     b(DONE);
1290   }
1291 
1292   bind(STUB);
1293     RuntimeAddress stub = nullptr;
1294     switch(ae) {
1295       case StrIntrinsicNode::LL:
1296         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
1297         break;
1298       case StrIntrinsicNode::UU:
1299         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
1300         break;
1301       case StrIntrinsicNode::LU:
1302         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
1303         break;
1304       case StrIntrinsicNode::UL:
1305         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
1306         break;
1307       default:
1308         ShouldNotReachHere();
1309     }
1310     assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
1311     address call = trampoline_call(stub);
1312     if (call == nullptr) {
1313       DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
1314       ciEnv::current()->record_failure("CodeCache is full");
1315       return;
1316     }
1317     b(DONE);
1318 
1319   bind(SHORT_STRING);
1320   // Is the minimum length zero?
1321   cbz(cnt2, DONE);
1322   // Arrange the code to do most branches while loading, and to load the next
1323   // characters while comparing the previous ones
1324   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1325   subs(cnt2, cnt2, 1);
1326   br(EQ, SHORT_LAST_INIT);
1327   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1328   b(SHORT_LOOP_START);
1329   bind(SHORT_LOOP);
1330   subs(cnt2, cnt2, 1);
1331   br(EQ, SHORT_LAST);
1332   bind(SHORT_LOOP_START);
1333   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
1334   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
1335   cmp(tmp1, cnt1);
1336   br(NE, SHORT_LOOP_TAIL);
1337   subs(cnt2, cnt2, 1);
1338   br(EQ, SHORT_LAST2);
1339   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1340   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1341   cmp(tmp2, rscratch1);
1342   br(EQ, SHORT_LOOP);
1343   sub(result, tmp2, rscratch1);
1344   b(DONE);
1345   bind(SHORT_LOOP_TAIL);
1346   sub(result, tmp1, cnt1);
1347   b(DONE);
1348   bind(SHORT_LAST2);
1349   cmp(tmp2, rscratch1);
1350   br(EQ, DONE);
1351   sub(result, tmp2, rscratch1);
1352 
1353   b(DONE);
1354   bind(SHORT_LAST_INIT);
1355   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1356   bind(SHORT_LAST);
1357   cmp(tmp1, cnt1);
1358   br(EQ, DONE);
1359   sub(result, tmp1, cnt1);
1360 
1361   bind(DONE);
1362 
1363   BLOCK_COMMENT("} string_compare");
1364 }
1365 
1366 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
1367                                      FloatRegister src2, Condition cond, bool isQ) {
1368   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1369   FloatRegister zn = src1, zm = src2;
1370   bool needs_negation = false;
1371   switch (cond) {
1372     case LT: cond = GT; zn = src2; zm = src1; break;
1373     case LE: cond = GE; zn = src2; zm = src1; break;
1374     case LO: cond = HI; zn = src2; zm = src1; break;
1375     case LS: cond = HS; zn = src2; zm = src1; break;
1376     case NE: cond = EQ; needs_negation = true; break;
1377     default:
1378       break;
1379   }
1380 
1381   if (is_floating_point_type(bt)) {
1382     fcm(cond, dst, size, zn, zm);
1383   } else {
1384     cm(cond, dst, size, zn, zm);
1385   }
1386 
1387   if (needs_negation) {
1388     notr(dst, isQ ? T16B : T8B, dst);
1389   }
1390 }
1391 
1392 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1393                                           Condition cond, bool isQ) {
1394   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1395   if (bt == T_FLOAT || bt == T_DOUBLE) {
1396     if (cond == Assembler::NE) {
1397       fcm(Assembler::EQ, dst, size, src);
1398       notr(dst, isQ ? T16B : T8B, dst);
1399     } else {
1400       fcm(cond, dst, size, src);
1401     }
1402   } else {
1403     if (cond == Assembler::NE) {
1404       cm(Assembler::EQ, dst, size, src);
1405       notr(dst, isQ ? T16B : T8B, dst);
1406     } else {
1407       cm(cond, dst, size, src);
1408     }
1409   }
1410 }
1411 
1412 // Compress the least significant bit of each byte into the lowest byte of the
1413 // register and clear the higher garbage bits.
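//
// Roughly the scalar reduction below (a sketch, assuming every input byte is 0x00
// or 0x01):
//
//   x |= x >> 7;    // fold byte pairs into 2-bit groups
//   x |= x >> 14;   // fold those into 4-bit groups
//   x |= x >> 28;   // fold the two halves into the low byte
//   return x & 0xff;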
1414 void C2_MacroAssembler::bytemask_compress(Register dst) {
1415   // Example input, dst = 0x01 00 00 00 01 01 00 01
1416   // The "??" bytes are garbage.
1417   orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
1418   orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
1419   orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
1420   andr(dst, dst, 0xff);                   // dst = 0x8D
1421 }
1422 
1423 // Pack the value of each mask element in "src" into a long value in "dst", at most
1424 // the first 64 lane elements. The input "src" is a vector of boolean represented as
1425 // bytes with 0x00/0x01 as element values. Each lane value from "src" is packed into
1426 // one bit in "dst".
1427 //
1428 // Example:   src = 0x0001010000010001 0100000001010001, lane_cnt = 16
1429 // Expected:  dst = 0x658D
1430 //
1431 // Clobbers: rscratch1
1432 void C2_MacroAssembler::sve_vmask_tolong(Register dst, FloatRegister src,
1433                                          FloatRegister vtmp, int lane_cnt) {
1434   assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1435   assert_different_registers(dst, rscratch1);
1436   assert_different_registers(src, vtmp);
1437   assert(UseSVE > 0, "must be");
1438 
1439   // Compress the lowest 8 bytes.
1440   fmovd(dst, src);
1441   bytemask_compress(dst);
1442   if (lane_cnt <= 8) return;
1443 
1444   // Repeat on higher bytes and join the results.
1445   // Compress 8 bytes in each iteration.
1446   for (int idx = 1; idx < (lane_cnt / 8); idx++) {
1447     sve_extract_integral(rscratch1, T_LONG, src, idx, vtmp);
1448     bytemask_compress(rscratch1);
1449     orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
1450   }
1451 }
1452 
1453 // This function is the same as "sve_vmask_tolong" above, but it uses SVE2's BEXT
1454 // instruction, which requires the FEAT_BITPERM feature.
1455 void C2_MacroAssembler::sve2_vmask_tolong(Register dst, FloatRegister src,
1456                                           FloatRegister vtmp1, FloatRegister vtmp2,
1457                                           int lane_cnt) {
1458   assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1459   assert_different_registers(src, vtmp1, vtmp2);
1460   assert(UseSVE > 1 && VM_Version::supports_svebitperm(), "must be");
1461 
  // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
  // is to compress the significant bit of each byte in a cross-lane way. Due
  // to the lack of a cross-lane bit-compress instruction, we use BEXT
  // (bit-compress in each lane) with the largest lane size (T = D) and then
  // concatenate the results.
1467 
1468   // The second source input of BEXT, initialized with 0x01 in each byte.
1469   // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1470   sve_dup(vtmp2, B, 1);
1471 
1472   // BEXT vtmp1.D, src.D, vtmp2.D
1473   // src   = 0x0001010000010001 | 0x0100000001010001
1474   // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1475   //         ---------------------------------------
1476   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1477   sve_bext(vtmp1, D, src, vtmp2);
1478 
  // Concatenate the least significant 8 bits of each 8-byte lane, and extract the
  // result to dst.
1481   // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1482   // dst   = 0x658D
1483   if (lane_cnt <= 8) {
1484     // No need to concatenate.
1485     umov(dst, vtmp1, B, 0);
1486   } else if (lane_cnt <= 16) {
1487     ins(vtmp1, B, vtmp1, 1, 8);
1488     umov(dst, vtmp1, H, 0);
1489   } else {
1490     // As the lane count is 64 at most, the final expected value must be in
1491     // the lowest 64 bits after narrowing vtmp1 from D to B.
1492     sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
1493     umov(dst, vtmp1, D, 0);
1494   }
1495 }
1496 
// Unpack the mask, a long value in "src", into a vector of booleans in "dst",
// represented as bytes with 0x00/0x01 as element values. Each bit in "src" is unpacked
// into one byte lane in "dst". Note that "dst" can support at most 64 lanes.
1501 //
// The example below gives the expected "dst" vector register for a valid "src" (0x658D)
// on a machine with a 128-bit vector size.
1504 // dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1505 void C2_MacroAssembler::sve_vmask_fromlong(FloatRegister dst, Register src,
1506                                            FloatRegister vtmp, int lane_cnt) {
1507   assert_different_registers(dst, vtmp);
1508   assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
1509          lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
1510 
1511   // Example:   src = 0x658D, lane_cnt = 16
1512   // Expected:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1513 
1514   // Put long value from general purpose register into the first lane of vector.
1515   // vtmp = 0x0000000000000000 | 0x000000000000658D
1516   sve_dup(vtmp, B, 0);
1517   mov(vtmp, D, 0, src);
1518 
  // Transform the value in the first lane from a bit mask into a byte mask, which
  // can be done with SVE2's BDEP instruction.
1521 
  // The first source input of the BDEP instruction. Spread each significant byte of the
  // mask into its own 8-byte lane.
1523   // vtmp = 0x0000000000000065 | 0x000000000000008D
1524   if (lane_cnt <= 8) {
    // Nothing to do, as only one byte exists.
1526   } else if (lane_cnt <= 16) {
1527     ins(vtmp, B, vtmp, 8, 1);
1528   } else {
1529     sve_vector_extend(vtmp, D, vtmp, B);
1530   }
1531 
1532   // The second source input of BDEP instruction, initialized with 0x01 for each byte.
1533   // dst = 0x01010101 0x01010101 0x01010101 0x01010101
1534   sve_dup(dst, B, 1);
1535 
1536   // BDEP dst.D, vtmp.D, dst.D
1537   // vtmp = 0x0000000000000065 | 0x000000000000008D
1538   // dst  = 0x0101010101010101 | 0x0101010101010101
1539   //        ---------------------------------------
1540   // dst  = 0x0001010000010001 | 0x0100000001010001
1541   sve_bdep(dst, D, vtmp, dst);
1542 }
1543 
1544 // Clobbers: rflags
1545 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1546                                     FloatRegister zn, FloatRegister zm, Condition cond) {
1547   assert(pg->is_governing(), "This register has to be a governing predicate register");
1548   FloatRegister z1 = zn, z2 = zm;
1549   switch (cond) {
1550     case LE: z1 = zm; z2 = zn; cond = GE; break;
1551     case LT: z1 = zm; z2 = zn; cond = GT; break;
1552     case LO: z1 = zm; z2 = zn; cond = HI; break;
1553     case LS: z1 = zm; z2 = zn; cond = HS; break;
1554     default:
1555       break;
1556   }
1557 
1558   SIMD_RegVariant size = elemType_to_regVariant(bt);
1559   if (is_floating_point_type(bt)) {
1560     sve_fcm(cond, pd, size, pg, z1, z2);
1561   } else {
1562     assert(is_integral_type(bt), "unsupported element type");
1563     sve_cmp(cond, pd, size, pg, z1, z2);
1564   }
1565 }
1566 
1567 // Get index of the last mask lane that is set
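// Illustrative example (assuming MaxVectorSize = 16 and bt = T_INT, i.e. 4 lanes,
// shown high <-- low):
//   src  = 0 1 0 0                 (the last set lane has index 2)
//   sve_rev  => ptmp = 0 0 1 0
//   sve_brkb => ptmp = 0 0 0 1     (lanes before the first set lane of the reversed mask)
//   sve_cntp => 1, so dst = (4 - 1) - 1 = 2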
1568 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
1569   SIMD_RegVariant size = elemType_to_regVariant(bt);
1570   sve_rev(ptmp, size, src);
1571   sve_brkb(ptmp, ptrue, ptmp, false);
1572   sve_cntp(dst, size, ptrue, ptmp);
1573   movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
1574   subw(dst, rscratch1, dst);
1575 }
1576 
1577 // Extend integer vector src to dst with the same lane count
1578 // but larger element size, e.g. 4B -> 4I
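// For instance (illustrative), a signed 4B -> 4I extension is emitted as two widening
// shifts by zero:
//   sshll dst.8h, src.8b, #0   // 4B -> 4S (only the low 4 lanes are meaningful)
//   sshll dst.4s, dst.4h, #0   // 4S -> 4I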
1579 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1580                                            FloatRegister src, BasicType src_bt, bool is_unsigned) {
1581   if (src_bt == T_BYTE) {
1582     // 4B to 4S/4I, 8B to 8S
1583     assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1584     assert(dst_bt == T_SHORT || dst_bt == T_INT, "unsupported");
1585     _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1586     if (dst_bt == T_INT) {
1587       _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1588     }
1589   } else if (src_bt == T_SHORT) {
1590     // 2S to 2I/2L, 4S to 4I
1591     assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1592     assert(dst_bt == T_INT || dst_bt == T_LONG, "unsupported");
1593     _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1594     if (dst_bt == T_LONG) {
1595       _xshll(is_unsigned, dst, T2D, dst, T2S, 0);
1596     }
1597   } else if (src_bt == T_INT) {
1598     // 2I to 2L
1599     assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1600     _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1601   } else {
1602     ShouldNotReachHere();
1603   }
1604 }
1605 
1606 // Narrow integer vector src down to dst with the same lane count
1607 // but smaller element size, e.g. 4I -> 4B
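// For instance (illustrative, shown high <-- low), narrowing 4I -> 4B takes two steps:
//   src = [0x00000004 0x00000003 0x00000002 0x00000001]
//   xtn dst.4h, src.4s  => dst = [0x0004 0x0003 0x0002 0x0001] in the low 64 bits
//   xtn dst.8b, dst.8h  => the low 4 bytes of dst hold [04 03 02 01]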
1608 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1609                                            FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1610   if (src_bt == T_SHORT) {
1611     // 4S/8S to 4B/8B
1612     assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1613     assert(dst_bt == T_BYTE, "unsupported");
1614     xtn(dst, T8B, src, T8H);
1615   } else if (src_bt == T_INT) {
1616     // 2I to 2S, 4I to 4B/4S
1617     assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1618     assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1619     xtn(dst, T4H, src, T4S);
1620     if (dst_bt == T_BYTE) {
1621       xtn(dst, T8B, dst, T8H);
1622     }
1623   } else if (src_bt == T_LONG) {
1624     // 2L to 2S/2I
1625     assert(src_vlen_in_bytes == 16, "unsupported");
1626     assert(dst_bt == T_INT || dst_bt == T_SHORT, "unsupported");
1627     xtn(dst, T2S, src, T2D);
1628     if (dst_bt == T_SHORT) {
1629       xtn(dst, T4H, dst, T4S);
1630     }
1631   } else {
1632     ShouldNotReachHere();
1633   }
1634 }
1635 
1636 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1637                                           FloatRegister src, SIMD_RegVariant src_size,
1638                                           bool is_unsigned) {
1639   assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
1640 
1641   if (src_size == B) {
1642     switch (dst_size) {
1643     case H:
1644       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1645       break;
1646     case S:
1647       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1648       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1649       break;
1650     case D:
1651       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1652       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1653       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1654       break;
1655     default:
1656       ShouldNotReachHere();
1657     }
1658   } else if (src_size == H) {
1659     if (dst_size == S) {
1660       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1661     } else { // D
1662       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1663       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1664     }
1665   } else if (src_size == S) {
1666     _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
1667   }
1668 }
1669 
1670 // Vector narrow from src to dst with specified element sizes.
1671 // High part of dst vector will be filled with zero.
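// Illustrative example (high <-- low), narrowing D -> B on a 128-bit vector, with tmp
// zeroed first:
//   src = [0x0000000000000022 0x0000000000000011]
//   uzp1 at S: dst = [0 0 0x00000022 0x00000011]
//   uzp1 at H: dst = [0 0 0 0 0 0 0x0022 0x0011]
//   uzp1 at B: dst = [0 ... 0 0x22 0x11]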
1672 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1673                                           FloatRegister src, SIMD_RegVariant src_size,
1674                                           FloatRegister tmp) {
1675   assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1676   assert_different_registers(src, tmp);
1677   sve_dup(tmp, src_size, 0);
1678   if (src_size == D) {
1679     switch (dst_size) {
1680     case S:
1681       sve_uzp1(dst, S, src, tmp);
1682       break;
1683     case H:
1684       assert_different_registers(dst, tmp);
1685       sve_uzp1(dst, S, src, tmp);
1686       sve_uzp1(dst, H, dst, tmp);
1687       break;
1688     case B:
1689       assert_different_registers(dst, tmp);
1690       sve_uzp1(dst, S, src, tmp);
1691       sve_uzp1(dst, H, dst, tmp);
1692       sve_uzp1(dst, B, dst, tmp);
1693       break;
1694     default:
1695       ShouldNotReachHere();
1696     }
1697   } else if (src_size == S) {
1698     if (dst_size == H) {
1699       sve_uzp1(dst, H, src, tmp);
1700     } else { // B
1701       assert_different_registers(dst, tmp);
1702       sve_uzp1(dst, H, src, tmp);
1703       sve_uzp1(dst, B, dst, tmp);
1704     }
1705   } else if (src_size == H) {
1706     sve_uzp1(dst, B, src, tmp);
1707   }
1708 }
1709 
// Extend the src predicate to the dst predicate with the same lane count but a larger
// element size, e.g. 64Byte -> 512Long (8 byte lanes widened to 8 long lanes)
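// An illustrative widening example, in the same spirit as the narrowing comment further
// below (assuming a 128-bit machine):
// 64Int -> 128Long, i.e. 2I -> 2L
// Mask (for 2 Ints) : TF
// Predicate register for the above mask (lowest 8 bits) : 0001 0000
// After widening (punpklo dst, src) : 00000001 00000000
// Which translates to a mask for 2 Longs : TF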
1712 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1713                                              uint dst_element_length_in_bytes,
1714                                              uint src_element_length_in_bytes) {
1715   if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1716     sve_punpklo(dst, src);
1717   } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1718     sve_punpklo(dst, src);
1719     sve_punpklo(dst, dst);
1720   } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1721     sve_punpklo(dst, src);
1722     sve_punpklo(dst, dst);
1723     sve_punpklo(dst, dst);
1724   } else {
1725     assert(false, "unsupported");
1726     ShouldNotReachHere();
1727   }
1728 }
1729 
// Narrow the src predicate to the dst predicate with the same lane count but a
// smaller element size, e.g. 512Long -> 64Byte (8 long lanes narrowed to 8 byte lanes)
1732 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1733                                              uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
1734   // The insignificant bits in src predicate are expected to be zero.
1735   // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
1736   // passed as the second argument. An example narrowing operation with a given mask would be -
  // 128Long -> 64Int on a 128-bit machine, i.e. 2L -> 2I
1738   // Mask (for 2 Longs) : TF
1739   // Predicate register for the above mask (16 bits) : 00000001 00000000
1740   // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
1741   // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0)
1742   assert_different_registers(src, ptmp);
1743   assert_different_registers(dst, ptmp);
1744   sve_pfalse(ptmp);
1745   if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1746     sve_uzp1(dst, B, src, ptmp);
1747   } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1748     sve_uzp1(dst, H, src, ptmp);
1749     sve_uzp1(dst, B, dst, ptmp);
1750   } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1751     sve_uzp1(dst, S, src, ptmp);
1752     sve_uzp1(dst, H, dst, ptmp);
1753     sve_uzp1(dst, B, dst, ptmp);
1754   } else {
1755     assert(false, "unsupported");
1756     ShouldNotReachHere();
1757   }
1758 }
1759 
1760 // Vector reduction add for integral type with ASIMD instructions.
1761 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1762                                                  Register isrc, FloatRegister vsrc,
1763                                                  unsigned vector_length_in_bytes,
1764                                                  FloatRegister vtmp) {
1765   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1766   assert_different_registers(dst, isrc);
1767   bool isQ = vector_length_in_bytes == 16;
1768 
1769   BLOCK_COMMENT("neon_reduce_add_integral {");
1770     switch(bt) {
1771       case T_BYTE:
1772         addv(vtmp, isQ ? T16B : T8B, vsrc);
1773         smov(dst, vtmp, B, 0);
1774         addw(dst, dst, isrc, ext::sxtb);
1775         break;
1776       case T_SHORT:
1777         addv(vtmp, isQ ? T8H : T4H, vsrc);
1778         smov(dst, vtmp, H, 0);
1779         addw(dst, dst, isrc, ext::sxth);
1780         break;
1781       case T_INT:
1782         isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1783         umov(dst, vtmp, S, 0);
1784         addw(dst, dst, isrc);
1785         break;
1786       case T_LONG:
1787         assert(isQ, "unsupported");
1788         addpd(vtmp, vsrc);
1789         umov(dst, vtmp, D, 0);
1790         add(dst, dst, isrc);
1791         break;
1792       default:
1793         assert(false, "unsupported");
1794         ShouldNotReachHere();
1795     }
1796   BLOCK_COMMENT("} neon_reduce_add_integral");
1797 }
1798 
1799 // Vector reduction multiply for integral type with ASIMD instructions.
1800 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
1801 // Clobbers: rscratch1
1802 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
1803                                                  Register isrc, FloatRegister vsrc,
1804                                                  unsigned vector_length_in_bytes,
1805                                                  FloatRegister vtmp1, FloatRegister vtmp2) {
1806   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1807   bool isQ = vector_length_in_bytes == 16;
1808 
1809   BLOCK_COMMENT("neon_reduce_mul_integral {");
1810     switch(bt) {
1811       case T_BYTE:
1812         if (isQ) {
1813           // Multiply the lower half and higher half of vector iteratively.
1814           // vtmp1 = vsrc[8:15]
1815           ins(vtmp1, D, vsrc, 0, 1);
1816           // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
1817           mulv(vtmp1, T8B, vtmp1, vsrc);
1818           // vtmp2 = vtmp1[4:7]
1819           ins(vtmp2, S, vtmp1, 0, 1);
1820           // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
1821           mulv(vtmp1, T8B, vtmp2, vtmp1);
1822         } else {
1823           ins(vtmp1, S, vsrc, 0, 1);
1824           mulv(vtmp1, T8B, vtmp1, vsrc);
1825         }
1826         // vtmp2 = vtmp1[2:3]
1827         ins(vtmp2, H, vtmp1, 0, 1);
1828         // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
1829         mulv(vtmp2, T8B, vtmp2, vtmp1);
1830         // dst = vtmp2[0] * isrc * vtmp2[1]
1831         umov(rscratch1, vtmp2, B, 0);
1832         mulw(dst, rscratch1, isrc);
1833         sxtb(dst, dst);
1834         umov(rscratch1, vtmp2, B, 1);
1835         mulw(dst, rscratch1, dst);
1836         sxtb(dst, dst);
1837         break;
1838       case T_SHORT:
1839         if (isQ) {
1840           ins(vtmp2, D, vsrc, 0, 1);
1841           mulv(vtmp2, T4H, vtmp2, vsrc);
1842           ins(vtmp1, S, vtmp2, 0, 1);
1843           mulv(vtmp1, T4H, vtmp1, vtmp2);
1844         } else {
1845           ins(vtmp1, S, vsrc, 0, 1);
1846           mulv(vtmp1, T4H, vtmp1, vsrc);
1847         }
1848         umov(rscratch1, vtmp1, H, 0);
1849         mulw(dst, rscratch1, isrc);
1850         sxth(dst, dst);
1851         umov(rscratch1, vtmp1, H, 1);
1852         mulw(dst, rscratch1, dst);
1853         sxth(dst, dst);
1854         break;
1855       case T_INT:
1856         if (isQ) {
1857           ins(vtmp1, D, vsrc, 0, 1);
1858           mulv(vtmp1, T2S, vtmp1, vsrc);
1859         } else {
1860           vtmp1 = vsrc;
1861         }
1862         umov(rscratch1, vtmp1, S, 0);
1863         mul(dst, rscratch1, isrc);
1864         umov(rscratch1, vtmp1, S, 1);
1865         mul(dst, rscratch1, dst);
1866         break;
1867       case T_LONG:
1868         umov(rscratch1, vsrc, D, 0);
1869         mul(dst, isrc, rscratch1);
1870         umov(rscratch1, vsrc, D, 1);
1871         mul(dst, dst, rscratch1);
1872         break;
1873       default:
1874         assert(false, "unsupported");
1875         ShouldNotReachHere();
1876     }
1877   BLOCK_COMMENT("} neon_reduce_mul_integral");
1878 }
1879 
1880 // Vector reduction multiply for floating-point type with ASIMD instructions.
1881 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
1882                                            FloatRegister fsrc, FloatRegister vsrc,
1883                                            unsigned vector_length_in_bytes,
1884                                            FloatRegister vtmp) {
1885   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1886   bool isQ = vector_length_in_bytes == 16;
1887 
1888   BLOCK_COMMENT("neon_reduce_mul_fp {");
1889     switch(bt) {
1890       case T_FLOAT:
1891         fmuls(dst, fsrc, vsrc);
1892         ins(vtmp, S, vsrc, 0, 1);
1893         fmuls(dst, dst, vtmp);
1894         if (isQ) {
1895           ins(vtmp, S, vsrc, 0, 2);
1896           fmuls(dst, dst, vtmp);
1897           ins(vtmp, S, vsrc, 0, 3);
1898           fmuls(dst, dst, vtmp);
1899          }
1900         break;
1901       case T_DOUBLE:
1902         assert(isQ, "unsupported");
1903         fmuld(dst, fsrc, vsrc);
1904         ins(vtmp, D, vsrc, 0, 1);
1905         fmuld(dst, dst, vtmp);
1906         break;
1907       default:
1908         assert(false, "unsupported");
1909         ShouldNotReachHere();
1910     }
1911   BLOCK_COMMENT("} neon_reduce_mul_fp");
1912 }
1913 
1914 // Helper to select logical instruction
1915 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
1916                                                    Register Rn, Register Rm,
1917                                                    enum shift_kind kind, unsigned shift) {
1918   switch(opc) {
1919     case Op_AndReductionV:
1920       is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
1921       break;
1922     case Op_OrReductionV:
1923       is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
1924       break;
1925     case Op_XorReductionV:
1926       is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
1927       break;
1928     default:
1929       assert(false, "unsupported");
1930       ShouldNotReachHere();
1931   }
1932 }
1933 
1934 // Vector reduction logical operations And, Or, Xor
1935 // Clobbers: rscratch1
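// The source vector is first split into halves that are moved into general purpose
// registers and combined there; the combined value is then folded down by shifting.
// Illustrative case for a 128-bit vector of ints [d3 d2 d1 d0] (high <-- low):
//   rscratch1 = d1:d0, dst = d3:d2
//   dst = dst OP rscratch1        => {d3 OP d1, d2 OP d0}
//   dst = dst OP (dst LSR 32)     => low 32 bits hold d0 OP d1 OP d2 OP d3
//   dst = dst OP isrc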
1936 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
1937                                             Register isrc, FloatRegister vsrc,
1938                                             unsigned vector_length_in_bytes) {
1939   assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
1940          "unsupported");
1941   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1942   assert_different_registers(dst, isrc);
1943   bool isQ = vector_length_in_bytes == 16;
1944 
1945   BLOCK_COMMENT("neon_reduce_logical {");
1946     umov(rscratch1, vsrc, isQ ? D : S, 0);
1947     umov(dst, vsrc, isQ ? D : S, 1);
1948     neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
1949     switch(bt) {
1950       case T_BYTE:
1951         if (isQ) {
1952           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1953         }
1954         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1955         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
1956         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1957         sxtb(dst, dst);
1958         break;
1959       case T_SHORT:
1960         if (isQ) {
1961           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1962         }
1963         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1964         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1965         sxth(dst, dst);
1966         break;
1967       case T_INT:
1968         if (isQ) {
1969           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1970         }
1971         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1972         break;
1973       case T_LONG:
1974         assert(isQ, "unsupported");
1975         neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
1976         break;
1977       default:
1978         assert(false, "unsupported");
1979         ShouldNotReachHere();
1980     }
1981   BLOCK_COMMENT("} neon_reduce_logical");
1982 }
1983 
1984 // Vector reduction min/max for integral type with ASIMD instructions.
1985 // Note: vtmp is not used and expected to be fnoreg for T_LONG case.
1986 // Clobbers: rscratch1, rflags
1987 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
1988                                                     Register isrc, FloatRegister vsrc,
1989                                                     unsigned vector_length_in_bytes,
1990                                                     FloatRegister vtmp) {
1991   assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
1992   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1993   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
1994   assert_different_registers(dst, isrc);
1995   bool isQ = vector_length_in_bytes == 16;
1996   bool is_min = opc == Op_MinReductionV;
1997 
1998   BLOCK_COMMENT("neon_reduce_minmax_integral {");
1999     if (bt == T_LONG) {
2000       assert(vtmp == fnoreg, "should be");
2001       assert(isQ, "should be");
2002       umov(rscratch1, vsrc, D, 0);
2003       cmp(isrc, rscratch1);
2004       csel(dst, isrc, rscratch1, is_min ? LT : GT);
2005       umov(rscratch1, vsrc, D, 1);
2006       cmp(dst, rscratch1);
2007       csel(dst, dst, rscratch1, is_min ? LT : GT);
2008     } else {
2009       SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
2010       if (size == T2S) {
2011         is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
2012       } else {
2013         is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
2014       }
2015       if (bt == T_INT) {
2016         umov(dst, vtmp, S, 0);
2017       } else {
2018         smov(dst, vtmp, elemType_to_regVariant(bt), 0);
2019       }
2020       cmpw(dst, isrc);
2021       cselw(dst, dst, isrc, is_min ? LT : GT);
2022     }
2023   BLOCK_COMMENT("} neon_reduce_minmax_integral");
2024 }
2025 
2026 // Vector reduction for integral type with SVE instruction.
2027 // Supported operations are Add, And, Or, Xor, Max, Min.
2028 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
2029 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
2030                                             FloatRegister src2, PRegister pg, FloatRegister tmp) {
2031   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2032   assert(pg->is_governing(), "This register has to be a governing predicate register");
2033   assert_different_registers(src1, dst);
2034   // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
2035   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2036   switch (opc) {
2037     case Op_AddReductionVI: {
2038       sve_uaddv(tmp, size, pg, src2);
2039       if (bt == T_BYTE) {
2040         smov(dst, tmp, size, 0);
2041         addw(dst, src1, dst, ext::sxtb);
2042       } else if (bt == T_SHORT) {
2043         smov(dst, tmp, size, 0);
2044         addw(dst, src1, dst, ext::sxth);
2045       } else {
2046         umov(dst, tmp, size, 0);
2047         addw(dst, dst, src1);
2048       }
2049       break;
2050     }
2051     case Op_AddReductionVL: {
2052       sve_uaddv(tmp, size, pg, src2);
2053       umov(dst, tmp, size, 0);
2054       add(dst, dst, src1);
2055       break;
2056     }
2057     case Op_AndReductionV: {
2058       sve_andv(tmp, size, pg, src2);
2059       if (bt == T_INT || bt == T_LONG) {
2060         umov(dst, tmp, size, 0);
2061       } else {
2062         smov(dst, tmp, size, 0);
2063       }
2064       if (bt == T_LONG) {
2065         andr(dst, dst, src1);
2066       } else {
2067         andw(dst, dst, src1);
2068       }
2069       break;
2070     }
2071     case Op_OrReductionV: {
2072       sve_orv(tmp, size, pg, src2);
2073       if (bt == T_INT || bt == T_LONG) {
2074         umov(dst, tmp, size, 0);
2075       } else {
2076         smov(dst, tmp, size, 0);
2077       }
2078       if (bt == T_LONG) {
2079         orr(dst, dst, src1);
2080       } else {
2081         orrw(dst, dst, src1);
2082       }
2083       break;
2084     }
2085     case Op_XorReductionV: {
2086       sve_eorv(tmp, size, pg, src2);
2087       if (bt == T_INT || bt == T_LONG) {
2088         umov(dst, tmp, size, 0);
2089       } else {
2090         smov(dst, tmp, size, 0);
2091       }
2092       if (bt == T_LONG) {
2093         eor(dst, dst, src1);
2094       } else {
2095         eorw(dst, dst, src1);
2096       }
2097       break;
2098     }
2099     case Op_MaxReductionV: {
2100       sve_smaxv(tmp, size, pg, src2);
2101       if (bt == T_INT || bt == T_LONG) {
2102         umov(dst, tmp, size, 0);
2103       } else {
2104         smov(dst, tmp, size, 0);
2105       }
2106       if (bt == T_LONG) {
2107         cmp(dst, src1);
2108         csel(dst, dst, src1, Assembler::GT);
2109       } else {
2110         cmpw(dst, src1);
2111         cselw(dst, dst, src1, Assembler::GT);
2112       }
2113       break;
2114     }
2115     case Op_MinReductionV: {
2116       sve_sminv(tmp, size, pg, src2);
2117       if (bt == T_INT || bt == T_LONG) {
2118         umov(dst, tmp, size, 0);
2119       } else {
2120         smov(dst, tmp, size, 0);
2121       }
2122       if (bt == T_LONG) {
2123         cmp(dst, src1);
2124         csel(dst, dst, src1, Assembler::LT);
2125       } else {
2126         cmpw(dst, src1);
2127         cselw(dst, dst, src1, Assembler::LT);
2128       }
2129       break;
2130     }
2131     default:
2132       assert(false, "unsupported");
2133       ShouldNotReachHere();
2134   }
2135 
2136   if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
2137     if (bt == T_BYTE) {
2138       sxtb(dst, dst);
2139     } else if (bt == T_SHORT) {
2140       sxth(dst, dst);
2141     }
2142   }
2143 }
2144 
// Set the elements of the dst predicate to true for lanes in the range [0, lane_cnt),
// and to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
2147 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
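// Illustrative cases (assuming bt = T_INT and a max vector length of 16 lanes):
//   lane_cnt = 5  => sve_ptrue with the fixed VL5 pattern
//   lane_cnt = 15 => sve_ptrue with the MUL3 pattern, since 15 == 16 - 16 % 3
//   lane_cnt = 12 => no pattern matches, so mov(rscratch1, 12) + sve_whileltw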
2148 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
2149   uint32_t max_vector_length = Matcher::max_vector_size(bt);
2150   assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
2151 
2152   // Set all elements to false if the input "lane_cnt" is zero.
2153   if (lane_cnt == 0) {
2154     sve_pfalse(dst);
2155     return;
2156   }
2157 
2158   SIMD_RegVariant size = elemType_to_regVariant(bt);
2159   assert(size != Q, "invalid size");
2160 
  // Set all true if "lane_cnt" equals the max lane count.
2162   if (lane_cnt == max_vector_length) {
2163     sve_ptrue(dst, size, /* ALL */ 0b11111);
2164     return;
2165   }
2166 
2167   // Fixed numbers for "ptrue".
2168   switch(lane_cnt) {
2169   case 1: /* VL1 */
2170   case 2: /* VL2 */
2171   case 3: /* VL3 */
2172   case 4: /* VL4 */
2173   case 5: /* VL5 */
2174   case 6: /* VL6 */
2175   case 7: /* VL7 */
2176   case 8: /* VL8 */
2177     sve_ptrue(dst, size, lane_cnt);
2178     return;
2179   case 16:
2180     sve_ptrue(dst, size, /* VL16 */ 0b01001);
2181     return;
2182   case 32:
2183     sve_ptrue(dst, size, /* VL32 */ 0b01010);
2184     return;
2185   case 64:
2186     sve_ptrue(dst, size, /* VL64 */ 0b01011);
2187     return;
2188   case 128:
2189     sve_ptrue(dst, size, /* VL128 */ 0b01100);
2190     return;
2191   case 256:
2192     sve_ptrue(dst, size, /* VL256 */ 0b01101);
2193     return;
2194   default:
2195     break;
2196   }
2197 
2198   // Special patterns for "ptrue".
2199   if (lane_cnt == round_down_power_of_2(max_vector_length)) {
2200     sve_ptrue(dst, size, /* POW2 */ 0b00000);
2201   } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
2202     sve_ptrue(dst, size, /* MUL4 */ 0b11101);
2203   } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
2204     sve_ptrue(dst, size, /* MUL3 */ 0b11110);
2205   } else {
2206     // Encode to "whileltw" for the remaining cases.
2207     mov(rscratch1, lane_cnt);
2208     sve_whileltw(dst, size, zr, rscratch1);
2209   }
2210 }
2211 
2212 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
2213 // Any remaining elements of dst will be filled with zero.
2214 // Clobbers: rscratch1
2215 // Preserves: mask, vzr
2216 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
2217                                            FloatRegister vzr, FloatRegister vtmp,
2218                                            PRegister pgtmp, unsigned vector_length_in_bytes) {
2219   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2220   // When called by sve_compress_byte, src and vtmp may be the same register.
2221   assert_different_registers(dst, src, vzr);
2222   assert_different_registers(dst, vtmp, vzr);
2223   assert_different_registers(mask, pgtmp);
2224   // high <-- low
2225   // Example input:   src   = hh gg ff ee dd cc bb aa, one character is 8 bits.
2226   //                  mask  = 01 00 00 01 01 00 01 01, one character is 1 bit.
2227   // Expected result: dst   = 00 00 00 hh ee dd bb aa
2228 
2229   // Extend lowest half to type INT.
2230   // dst   =  00dd  00cc  00bb  00aa
2231   sve_uunpklo(dst, S, src);
2232   // pgtmp =  0001  0000  0001  0001
2233   sve_punpklo(pgtmp, mask);
  // Pack the active INT-sized elements to the right,
  // and fill the remaining elements with zero.
2236   // dst   =  0000  00dd  00bb  00aa
2237   sve_compact(dst, S, dst, pgtmp);
2238   // Narrow the result back to type SHORT.
2239   // dst   = 00 00 00 00 00 dd bb aa
2240   sve_uzp1(dst, H, dst, vzr);
2241 
2242   // Return if the vector length is no more than MaxVectorSize/2, since the
2243   // highest half is invalid.
2244   if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
2245     return;
2246   }
2247 
2248   // Count the active elements of lowest half.
2249   // rscratch1 = 3
2250   sve_cntp(rscratch1, S, ptrue, pgtmp);
2251 
2252   // Repeat to the highest half.
2253   // pgtmp =  0001  0000  0000  0001
2254   sve_punpkhi(pgtmp, mask);
2255   // vtmp  =  00hh  00gg  00ff  00ee
2256   sve_uunpkhi(vtmp, S, src);
2257   // vtmp  =  0000  0000  00hh  00ee
2258   sve_compact(vtmp, S, vtmp, pgtmp);
2259   // vtmp  = 00 00 00 00 00 00 hh ee
2260   sve_uzp1(vtmp, H, vtmp, vzr);
2261 
2262   // pgtmp = 00 00 00 00 00 01 01 01
2263   sve_whilelt(pgtmp, H, zr, rscratch1);
2264   // Compressed low:  dst  = 00 00 00 00 00 dd bb aa
2265   // Compressed high: vtmp = 00 00 00 00 00 00 hh ee
2266   // Combine the compressed low with the compressed high:
2267   //                  dst  = 00 00 00 hh ee dd bb aa
2268   sve_splice(dst, H, pgtmp, vtmp);
2269 }
2270 
2271 // Clobbers: rscratch1, rscratch2
2272 // Preserves: src, mask
2273 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
2274                                           FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
2275                                           PRegister ptmp, PRegister pgtmp, unsigned vector_length_in_bytes) {
2276   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2277   assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3);
2278   assert_different_registers(mask, ptmp, pgtmp);
2279   // high <-- low
2280   // Example input:   src   = q p n m l k j i h g f e d c b a, one character is 8 bits.
2281   //                  mask  = 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 1, one character is 1 bit.
2282   // Expected result: dst   = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
2283   FloatRegister vzr = vtmp3;
2284   sve_dup(vzr, B, 0);
2285 
2286   // Extend lowest half to type SHORT.
2287   // vtmp1 =  0h  0g  0f  0e  0d  0c  0b  0a
2288   sve_uunpklo(vtmp1, H, src);
2289   // ptmp  =  00  01  00  00  00  01  00  01
2290   sve_punpklo(ptmp, mask);
  // Pack the active SHORT-sized elements to the right,
  // and fill the remaining elements with zero.
2293   // dst   =  00  00  00  00  00  0g  0c  0a
2294   unsigned extended_size = vector_length_in_bytes << 1;
2295   sve_compress_short(dst, vtmp1, ptmp, vzr, vtmp2, pgtmp, extended_size > MaxVectorSize ? MaxVectorSize : extended_size);
2296   // Narrow the result back to type BYTE.
2297   // dst   = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
2298   sve_uzp1(dst, B, dst, vzr);
2299 
2300   // Return if the vector length is no more than MaxVectorSize/2, since the
2301   // highest half is invalid.
2302   if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
2303     return;
2304   }
2305   // Count the active elements of lowest half.
2306   // rscratch2 = 3
2307   sve_cntp(rscratch2, H, ptrue, ptmp);
2308 
2309   // Repeat to the highest half.
2310   // ptmp  =  00  01  00  00  00  00  00  01
2311   sve_punpkhi(ptmp, mask);
2312   // vtmp2 =  0q  0p  0n  0m  0l  0k  0j  0i
2313   sve_uunpkhi(vtmp2, H, src);
2314   // vtmp1 =  00  00  00  00  00  00  0p  0i
2315   sve_compress_short(vtmp1, vtmp2, ptmp, vzr, vtmp2, pgtmp, extended_size - MaxVectorSize);
2316   // vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
2317   sve_uzp1(vtmp1, B, vtmp1, vzr);
2318 
2319   // ptmp  = 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
2320   sve_whilelt(ptmp, B, zr, rscratch2);
2321   // Compressed low:  dst   = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
2322   // Compressed high: vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
2323   // Combine the compressed low with the compressed high:
2324   //                  dst   = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
2325   sve_splice(dst, B, ptmp, vtmp1);
2326 }
2327 
2328 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
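  // Reverse the bits within each element. For elements wider than a byte this is done by
  // first reversing the bytes of each element and then reversing the bits within each byte.
  // An illustrative T_INT lane: 0x12345678 --rev32--> 0x78563412 --rbit--> 0x1E6A2C48.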
2329   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2330   SIMD_Arrangement size = isQ ? T16B : T8B;
2331   if (bt == T_BYTE) {
2332     rbit(dst, size, src);
2333   } else {
2334     neon_reverse_bytes(dst, src, bt, isQ);
2335     rbit(dst, size, dst);
2336   }
2337 }
2338 
2339 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2340   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2341   SIMD_Arrangement size = isQ ? T16B : T8B;
2342   switch (bt) {
2343     case T_BYTE:
2344       if (dst != src) {
2345         orr(dst, size, src, src);
2346       }
2347       break;
2348     case T_SHORT:
2349       rev16(dst, size, src);
2350       break;
2351     case T_INT:
2352       rev32(dst, size, src);
2353       break;
2354     case T_LONG:
2355       rev64(dst, size, src);
2356       break;
2357     default:
2358       assert(false, "unsupported");
2359       ShouldNotReachHere();
2360   }
2361 }
2362 
2363 // VectorRearrange implementation for short/int/float/long/double types with NEON
// instructions. For VectorRearrange short/int/float, we use the NEON tbl instruction.
// But since it supports byte tables only, we need to look up 2/4 bytes as a group.
2366 // For VectorRearrange long/double, we compare the shuffle input with iota indices,
2367 // and use bsl to implement the operation.
2368 void C2_MacroAssembler::neon_rearrange_hsd(FloatRegister dst, FloatRegister src,
2369                                            FloatRegister shuffle, FloatRegister tmp,
2370                                            BasicType bt, bool isQ) {
2371   assert_different_registers(dst, src, shuffle, tmp);
2372   SIMD_Arrangement size1 = isQ ? T16B : T8B;
2373   SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2374 
2375   // Here is an example that rearranges a NEON vector with 4 ints:
2376   // Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1]
2377   //   1. We assume the shuffle input is Vi int[2, 3, 0, 1].
2378   //   2. Multiply Vi int[2, 3, 0, 1] with constant int vector
2379   //      [0x04040404, 0x04040404, 0x04040404, 0x04040404], and get
2380   //      tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404].
2381   //   3. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100],
2382   //      and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504]
2383   //   4. Use Vm as index register, and use V1 as table register.
2384   //      Then get V2 as the result by tbl NEON instructions.
2385   switch (bt) {
2386     case T_SHORT:
2387       mov(tmp, size1, 0x02);
2388       mulv(dst, size2, shuffle, tmp);
2389       mov(tmp, size2, 0x0100);
2390       addv(dst, size1, dst, tmp);
2391       tbl(dst, size1, src, 1, dst);
2392       break;
2393     case T_INT:
2394     case T_FLOAT:
2395       mov(tmp, size1, 0x04);
2396       mulv(dst, size2, shuffle, tmp);
2397       mov(tmp, size2, 0x03020100);
2398       addv(dst, size1, dst, tmp);
2399       tbl(dst, size1, src, 1, dst);
2400       break;
2401     case T_LONG:
2402     case T_DOUBLE:
2403       // Load the iota indices for Long type. The indices are ordered by
2404       // type B/S/I/L/F/D, and the offset between two types is 16; Hence
2405       // the offset for L is 48.
2406       lea(rscratch1,
2407           ExternalAddress(StubRoutines::aarch64::vector_iota_indices() + 48));
2408       ldrq(tmp, rscratch1);
      // Check whether the input "shuffle" is the same as the iota indices.
2410       // Return "src" if true, otherwise swap the two elements of "src".
2411       cm(EQ, dst, size2, shuffle, tmp);
2412       ext(tmp, size1, src, src, 8);
2413       bsl(dst, size1, src, tmp);
2414       break;
2415     default:
2416       assert(false, "unsupported element type");
2417       ShouldNotReachHere();
2418   }
2419 }
2420 
2421 // Extract a scalar element from an sve vector at position 'idx'.
2422 // The input elements in src are expected to be of integral type.
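// An illustrative case (assuming bt = T_INT and idx = 5 on a 256-bit vector): since
// 5 * 32 >= 128 the lane is out of reach of the NEON umov/smov forms, so the vector is
// copied and shifted down by idx * 4 = 20 bytes with sve_ext, after which lane 0 of
// vtmp holds the requested element.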
2423 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2424                                              int idx, FloatRegister vtmp) {
2425   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2426   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2427   if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
2428     if (bt == T_INT || bt == T_LONG) {
2429       umov(dst, src, size, idx);
2430     } else {
2431       smov(dst, src, size, idx);
2432     }
2433   } else {
2434     sve_orr(vtmp, src, src);
2435     sve_ext(vtmp, vtmp, idx << size);
2436     if (bt == T_INT || bt == T_LONG) {
2437       umov(dst, vtmp, size, 0);
2438     } else {
2439       smov(dst, vtmp, size, 0);
2440     }
2441   }
2442 }
2443 
2444 // java.lang.Math::round intrinsics
2445 
2446 // Clobbers: rscratch1, rflags
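// An illustrative case for why two candidate results are computed below: for a lane
// holding -2.5f, fcvtas yields -3 (ties away from zero), while Math.round(-2.5f) is
// floor(-2.5f + 0.5f) = -2. The unsigned comparison of -src against 2^23 (2^52 for
// doubles) routes small-magnitude negative lanes to the floor-based result and the
// remaining lanes (positive values, NaN, large magnitudes) to the fcvtas result.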
2447 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2448                                           FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
2449   assert_different_registers(tmp1, tmp2, tmp3, src, dst);
2450   switch (T) {
2451     case T2S:
2452     case T4S:
2453       fmovs(tmp1, T, 0.5f);
2454       mov(rscratch1, jint_cast(0x1.0p23f));
2455       break;
2456     case T2D:
2457       fmovd(tmp1, T, 0.5);
2458       mov(rscratch1, julong_cast(0x1.0p52));
2459       break;
2460     default:
2461       assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2462   }
2463   fadd(tmp1, T, tmp1, src);
2464   fcvtms(tmp1, T, tmp1);
2465   // tmp1 = floor(src + 0.5, ties to even)
2466 
2467   fcvtas(dst, T, src);
2468   // dst = round(src), ties to away
2469 
2470   fneg(tmp3, T, src);
2471   dup(tmp2, T, rscratch1);
2472   cm(HS, tmp3, T, tmp3, tmp2);
2473   // tmp3 is now a set of flags
2474 
2475   bif(dst, T16B, tmp1, tmp3);
2476   // result in dst
2477 }
2478 
2479 // Clobbers: rscratch1, rflags
2480 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2481                                          FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
2482   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2483   assert_different_registers(tmp1, tmp2, src, dst);
2484 
2485   switch (T) {
2486     case S:
2487       mov(rscratch1, jint_cast(0x1.0p23f));
2488       break;
2489     case D:
2490       mov(rscratch1, julong_cast(0x1.0p52));
2491       break;
2492     default:
2493       assert(T == S || T == D, "invalid register variant");
2494   }
2495 
2496   sve_frinta(dst, T, ptrue, src);
2497   // dst = round(src), ties to away
2498 
2499   Label none;
2500 
2501   sve_fneg(tmp1, T, ptrue, src);
2502   sve_dup(tmp2, T, rscratch1);
2503   sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
2504   br(EQ, none);
2505   {
2506     sve_cpy(tmp1, T, pgtmp, 0.5);
2507     sve_fadd(tmp1, T, pgtmp, src);
2508     sve_frintm(dst, T, pgtmp, tmp1);
2509     // dst = floor(src + 0.5, ties to even)
2510   }
2511   bind(none);
2512 
2513   sve_fcvtzs(dst, T, ptrue, dst, T);
2514   // result in dst
2515 }
2516 
2517 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2518                                            FloatRegister one, SIMD_Arrangement T) {
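  // Vector signum for floating-point lanes: the input itself for +/-0.0 and NaN, and
  // +/-1.0 according to the sign for every other input. An illustrative sketch of the
  // bit trick used below:
  //   facgt     sets a lane to all-ones when |src| > 0, and to zero for +/-0.0 and NaN
  //   ushr #1   turns all-ones into 0x7FF..F, a mask of every bit except the sign bit
  //   bsl       where the mask is set, takes the magnitude bits of 1.0 and the sign bit
  //             of src (giving +/-1.0); where it is zero, leaves src (0.0 or NaN) as is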
2519   assert_different_registers(dst, src, zero, one);
2520   assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2521 
2522   facgt(dst, T, src, zero);
2523   ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
2524   bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2525 }
2526 
2527 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2528                                           FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
2529     assert_different_registers(dst, src, zero, one, vtmp);
2530     assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2531 
2532     sve_orr(vtmp, src, src);
    sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise
2534     switch (T) {
2535     case S:
2536       sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
2537       sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
2538                                         // on the sign of the float value
2539       break;
2540     case D:
2541       sve_and(vtmp, T, min_jlong);
2542       sve_orr(vtmp, T, jlong_cast(1.0));
2543       break;
2544     default:
2545       assert(false, "unsupported");
2546       ShouldNotReachHere();
2547     }
2548     sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
2549                                        // Result in dst
2550 }
2551 
2552 bool C2_MacroAssembler::in_scratch_emit_size() {
2553   if (ciEnv::current()->task() != nullptr) {
2554     PhaseOutput* phase_output = Compile::current()->output();
2555     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2556       return true;
2557     }
2558   }
2559   return MacroAssembler::in_scratch_emit_size();
2560 }
2561 
2562 static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
2563   fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
2564 }
2565 
2566 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register rval, Register rtmp) {
2567   assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
2568   if (t == TypeInt::INT) {
2569     return;
2570   }
2571   BLOCK_COMMENT("verify_int_in_range {");
2572   Label L_success, L_failure;
2573 
2574   jint lo = t->_lo;
2575   jint hi = t->_hi;
2576 
2577   if (lo != min_jint && hi != max_jint) {
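    // Two-sided range, e.g. [0, 10]: fail fast if rval < lo, branch to success if
    // rval <= hi, otherwise fall through to the failure path.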
2578     subsw(rtmp, rval, lo);
2579     br(Assembler::LT, L_failure);
2580     subsw(rtmp, rval, hi);
2581     br(Assembler::LE, L_success);
2582   } else if (lo != min_jint) {
2583     subsw(rtmp, rval, lo);
2584     br(Assembler::GE, L_success);
2585   } else if (hi != max_jint) {
2586     subsw(rtmp, rval, hi);
2587     br(Assembler::LE, L_success);
2588   } else {
2589     ShouldNotReachHere();
2590   }
2591 
2592   bind(L_failure);
2593   movw(c_rarg0, idx);
2594   mov(c_rarg1, rval);
2595   movw(c_rarg2, lo);
2596   movw(c_rarg3, hi);
2597   reconstruct_frame_pointer(rtmp);
2598   rt_call(CAST_FROM_FN_PTR(address, abort_verify_int_in_range), rtmp);
2599   hlt(0);
2600 
2601   bind(L_success);
2602   BLOCK_COMMENT("} verify_int_in_range");
2603 }
2604 
2605 static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
2606   fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
2607 }
2608 
2609 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register rval, Register rtmp) {
2610   assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
2611   if (t == TypeLong::LONG) {
2612     return;
2613   }
2614   BLOCK_COMMENT("verify_long_in_range {");
2615   Label L_success, L_failure;
2616 
2617   jlong lo = t->_lo;
2618   jlong hi = t->_hi;
2619 
2620   if (lo != min_jlong && hi != max_jlong) {
2621     subs(rtmp, rval, lo);
2622     br(Assembler::LT, L_failure);
2623     subs(rtmp, rval, hi);
2624     br(Assembler::LE, L_success);
2625   } else if (lo != min_jlong) {
2626     subs(rtmp, rval, lo);
2627     br(Assembler::GE, L_success);
2628   } else if (hi != max_jlong) {
2629     subs(rtmp, rval, hi);
2630     br(Assembler::LE, L_success);
2631   } else {
2632     ShouldNotReachHere();
2633   }
2634 
2635   bind(L_failure);
2636   movw(c_rarg0, idx);
2637   mov(c_rarg1, rval);
2638   mov(c_rarg2, lo);
2639   mov(c_rarg3, hi);
2640   reconstruct_frame_pointer(rtmp);
2641   rt_call(CAST_FROM_FN_PTR(address, abort_verify_long_in_range), rtmp);
2642   hlt(0);
2643 
2644   bind(L_success);
2645   BLOCK_COMMENT("} verify_long_in_range");
2646 }
2647 
2648 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
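  // Rebuild a valid frame pointer in rfp. In the C2 frame layout the saved rfp/lr pair is
  // expected to sit at the top of the fixed frame, so the frame pointer is sp + framesize -
  // 2 * wordSize (this is also what the assert below checks when PreserveFramePointer is set).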
2649   const int framesize = Compile::current()->output()->frame_size_in_bytes();
2650   if (PreserveFramePointer) {
2651     // frame pointer is valid
2652 #ifdef ASSERT
2653     // Verify frame pointer value in rfp.
2654     add(rtmp, sp, framesize - 2 * wordSize);
2655     Label L_success;
2656     cmp(rfp, rtmp);
2657     br(Assembler::EQ, L_success);
2658     stop("frame pointer mismatch");
2659     bind(L_success);
2660 #endif // ASSERT
2661   } else {
2662     add(rfp, sp, framesize - 2 * wordSize);
2663   }
2664 }
2665 
// Selects elements from two source vectors (src1, src2) based on the index values in the
// index register, using Neon instructions, and places each selected element in the corresponding
// destination vector element. Each index in the index register must be in the range [0, 2 * NUM_ELEM),
2669 // where NUM_ELEM is the number of BasicType elements per vector.
2670 // If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
// Otherwise, selects src2[idx - NUM_ELEM]
2672 void C2_MacroAssembler::select_from_two_vectors_neon(FloatRegister dst, FloatRegister src1,
2673                                                      FloatRegister src2, FloatRegister index,
2674                                                      FloatRegister tmp, unsigned vector_length_in_bytes) {
2675   assert_different_registers(dst, src1, src2, tmp);
2676   SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
2677 
2678   if (vector_length_in_bytes == 16) {
2679     assert(UseSVE <= 1, "sve must be <= 1");
2680     assert(src1->successor() == src2, "Source registers must be ordered");
    // If the vector length is 16B, then use the Neon "tbl" instruction with a two-vector table
2682     tbl(dst, size, src1, 2, index);
2683   } else { // vector length == 8
2684     assert(UseSVE == 0, "must be Neon only");
2685     // We need to fit both the source vectors (src1, src2) in a 128-bit register because the
2686     // Neon "tbl" instruction supports only looking up 16B vectors. We then use the Neon "tbl"
2687     // instruction with one vector lookup
2688     ins(tmp, D, src1, 0, 0);
2689     ins(tmp, D, src2, 1, 0);
2690     tbl(dst, size, tmp, 1, index);
2691   }
2692 }
2693 
// Selects elements from two source vectors (src1, src2) based on the index values in the
// index register, using SVE/SVE2 instructions, and places each selected element in the corresponding
// destination vector element. Each index in the index register must be in the range [0, 2 * NUM_ELEM),
2697 // where NUM_ELEM is the number of BasicType elements per vector.
2698 // If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
// Otherwise, selects src2[idx - NUM_ELEM]
2700 void C2_MacroAssembler::select_from_two_vectors_sve(FloatRegister dst, FloatRegister src1,
2701                                                     FloatRegister src2, FloatRegister index,
2702                                                     FloatRegister tmp, SIMD_RegVariant T,
2703                                                     unsigned vector_length_in_bytes) {
2704   assert_different_registers(dst, src1, src2, index, tmp);
2705 
2706   if (vector_length_in_bytes == 8) {
2707     // We need to fit both the source vectors (src1, src2) in a single vector register because the
2708     // SVE "tbl" instruction is unpredicated and works on the entire vector which can lead to
2709     // incorrect results if each source vector is only partially filled. We then use the SVE "tbl"
2710     // instruction with one vector lookup
2711     assert(UseSVE >= 1, "sve must be >= 1");
2712     ins(tmp, D, src1, 0, 0);
2713     ins(tmp, D, src2, 1, 0);
2714     sve_tbl(dst, T, tmp, index);
2715   } else {  // UseSVE == 2 and vector_length_in_bytes > 8
    // If the vector length is > 8, then use the SVE2 "tbl" instruction with a two-vector table.
    // The assertion vector_length_in_bytes == MaxVectorSize ensures that this operation
    // is not executed on machines where vector_length_in_bytes < MaxVectorSize,
    // with the only exception of an 8B vector length.
2720     assert(UseSVE == 2 && vector_length_in_bytes == MaxVectorSize, "must be");
2721     assert(src1->successor() == src2, "Source registers must be ordered");
2722     sve_tbl(dst, T, src1, src2, index);
2723   }
2724 }
2725 
2726 void C2_MacroAssembler::select_from_two_vectors(FloatRegister dst, FloatRegister src1,
2727                                                 FloatRegister src2, FloatRegister index,
2728                                                 FloatRegister tmp, BasicType bt,
2729                                                 unsigned vector_length_in_bytes) {
2730 
2731   assert_different_registers(dst, src1, src2, index, tmp);
2732 
2733   // The cases that can reach this method are -
2734   // - UseSVE = 0/1, vector_length_in_bytes = 8 or 16, excluding double and long types
2735   // - UseSVE = 2, vector_length_in_bytes >= 8, for all types
2736   //
2737   // SVE/SVE2 tbl instructions are generated when UseSVE = 1 with vector_length_in_bytes = 8
2738   // and UseSVE = 2 with vector_length_in_bytes >= 8
2739   //
2740   // Neon instructions are generated when UseSVE = 0 with vector_length_in_bytes = 8 or 16 and
2741   // UseSVE = 1 with vector_length_in_bytes = 16
2742 
2743   if ((UseSVE == 1 && vector_length_in_bytes == 8) || UseSVE == 2) {
2744     SIMD_RegVariant T = elemType_to_regVariant(bt);
2745     select_from_two_vectors_sve(dst, src1, src2, index, tmp, T, vector_length_in_bytes);
2746     return;
2747   }
2748 
2749   // The only BasicTypes that can reach here are T_SHORT, T_BYTE, T_INT and T_FLOAT
2750   assert(bt != T_DOUBLE && bt != T_LONG, "unsupported basic type");
2751   assert(vector_length_in_bytes <= 16, "length_in_bytes must be <= 16");
2752 
2753   bool isQ = vector_length_in_bytes == 16;
2754 
2755   SIMD_Arrangement size1 = isQ ? T16B : T8B;
2756   SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2757 
2758   // Neon "tbl" instruction only supports byte tables, so we need to look at chunks of
2759   // 2B for selecting shorts or chunks of 4B for selecting ints/floats from the table.
2760   // The index values in "index" register are in the range of [0, 2 * NUM_ELEM) where NUM_ELEM
2761   // is the number of elements that can fit in a vector. For ex. for T_SHORT with 64-bit vector length,
2762   // the indices can range from [0, 8).
2763   // As an example with 64-bit vector length and T_SHORT type - let index = [2, 5, 1, 0]
2764   // Move a constant 0x02 in every byte of tmp - tmp = [0x0202, 0x0202, 0x0202, 0x0202]
2765   // Multiply index vector with tmp to yield - dst = [0x0404, 0x0a0a, 0x0202, 0x0000]
2766   // Move a constant 0x0100 in every 2B of tmp - tmp = [0x0100, 0x0100, 0x0100, 0x0100]
2767   // Add the multiplied result to the vector in tmp to obtain the byte level
2768   // offsets - dst = [0x0504, 0x0b0a, 0x0302, 0x0100]
2769   // Use these offsets in the "tbl" instruction to select chunks of 2B.
2770 
2771   if (bt == T_BYTE) {
2772     select_from_two_vectors_neon(dst, src1, src2, index, tmp, vector_length_in_bytes);
2773   } else {
2774     int elem_size = (bt == T_SHORT) ? 2 : 4;
2775     uint64_t tbl_offset = (bt == T_SHORT) ? 0x0100u : 0x03020100u;
2776 
2777     mov(tmp, size1, elem_size);
2778     mulv(dst, size2, index, tmp);
2779     mov(tmp, size2, tbl_offset);
2780     addv(dst, size1, dst, tmp); // "dst" now contains the processed index elements
2781                                 // to select a set of 2B/4B
2782     select_from_two_vectors_neon(dst, src1, src2, dst, tmp, vector_length_in_bytes);
2783   }
2784 }
2785 
2786 // Vector expand implementation. Elements from the src vector are expanded into
2787 // the dst vector under the control of the vector mask.
2788 // Since there are no native instructions directly corresponding to expand before
// SVE2p2, the following implementations mainly leverage the TBL instruction to
2790 // implement expand. To compute the index input for TBL, the prefix sum algorithm
2791 // (https://en.wikipedia.org/wiki/Prefix_sum) is used. The same algorithm is used
2792 // for NEON and SVE, but with different instructions where appropriate.
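//
// The prefix sum of the mask (taken as 0/1 values) gives, for each lane, the count of
// active lanes up to and including that lane. For an active lane, this count minus one
// is exactly the index of the source element that should land there, which is the index
// input that TBL expects. Inactive lanes are mapped to an out-of-range index so that
// TBL zeroes them.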

// Vector expand implementation for NEON.
//
// An example of 128-bit Byte vector:
//   Data direction: high <== low
//   Input:
//         src   = g  f  e  d  c  b  a  9  8  7  6  5  4  3  2  1
//         mask  = 0  0 -1 -1  0  0 -1 -1  0  0 -1 -1  0  0 -1 -1
//   Expected result:
//         dst   = 0  0  8  7  0  0  6  5  0  0  4  3  0  0  2  1
void C2_MacroAssembler::vector_expand_neon(FloatRegister dst, FloatRegister src, FloatRegister mask,
                                           FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
                                           int vector_length_in_bytes) {
  assert(vector_length_in_bytes <= 16, "the vector length in bytes for NEON must be <= 16");
  assert_different_registers(dst, src, mask, tmp1, tmp2);
  // Since the TBL instruction only supports byte tables, we need to compute
  // the indices at byte granularity for all element types.
  SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
  // tmp1 =  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  dup(tmp1, size, zr);
  // dst  =  0  0  1  1  0  0  1  1  0  0  1  1  0  0  1  1
  negr(dst, size, mask);
  // Calculate vector index for TBL with prefix sum algorithm.
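  // Each iteration adds in the partial sum held i bytes below, doubling the width of the
  // summed window, so after log2(vector_length_in_bytes) iterations each byte holds the
  // inclusive prefix sum, i.e. the number of active bytes at or below its position.
  // For example, after the first iteration (i = 1):
  // dst  =  0  1  2  1  0  1  2  1  0  1  2  1  0  1  2  1
  // and after the final iteration: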
  // dst  =  8  8  8  7  6  6  6  5  4  4  4  3  2  2  2  1
  for (int i = 1; i < vector_length_in_bytes; i <<= 1) {
    ext(tmp2, size, tmp1, dst, vector_length_in_bytes - i);
    addv(dst, size, tmp2, dst);
  }
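  // Copy the mask into tmp2 ("orr" of a register with itself acts as a vector move).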
  // tmp2 =  0  0 -1 -1  0  0 -1 -1  0  0 -1 -1  0  0 -1 -1
  orr(tmp2, size, mask, mask);
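  // "bsl" uses the bits already in tmp2 (the mask) as the selector: where the mask is set,
  // the prefix sums in dst are kept; elsewhere the zeros in tmp1 are taken.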
  // tmp2 =  0  0  8  7  0  0  6  5  0  0  4  3  0  0  2  1
  bsl(tmp2, size, dst, tmp1);
  // tmp1 =  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  movi(tmp1, size, 1);
  // dst  = -1 -1  7  6 -1 -1  5  4 -1 -1  3  2 -1 -1  1  0
  subv(dst, size, tmp2, tmp1);
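  // The inactive lanes now hold -1 (0xff), which is an out-of-range index for "tbl" and
  // therefore selects zero, clearing those lanes in the result.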
  // dst  =  0  0  8  7  0  0  6  5  0  0  4  3  0  0  2  1
  tbl(dst, size, src, 1, dst);
}

// Vector expand implementation for SVE.
//
// An example of 128-bit Short vector:
//   Data direction: high <== low
//   Input:
//         src   = gf ed cb a9 87 65 43 21
//         pg    = 00 01 00 01 00 01 00 01
//   Expected result:
//         dst   = 00 87 00 65 00 43 00 21
void C2_MacroAssembler::vector_expand_sve(FloatRegister dst, FloatRegister src, PRegister pg,
                                          FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
                                          int vector_length_in_bytes) {
  assert(UseSVE > 0, "expand implementation only for SVE");
  assert_different_registers(dst, src, tmp1, tmp2);
  SIMD_RegVariant size = elemType_to_regVariant(bt);

  // tmp1 = 00 00 00 00 00 00 00 00
  sve_dup(tmp1, size, 0);
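  // Zero tmp2 (movprfx copies the zeroed tmp1), then set its active lanes to 1 while the
  // inactive lanes stay 0.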
  sve_movprfx(tmp2, tmp1);
  // tmp2 = 00 01 00 01 00 01 00 01
  sve_cpy(tmp2, size, pg, 1, true);
  // Calculate vector index for TBL with prefix sum algorithm.
  // tmp2 = 04 04 03 03 02 02 01 01
  for (int i = type2aelembytes(bt); i < vector_length_in_bytes; i <<= 1) {
    sve_movprfx(dst, tmp1);
    // The EXT instruction operates on the full-width SVE register, so the extract position
    // must be computed relative to MaxVectorSize rather than vector_length_in_bytes:
    // vector_length_in_bytes - i + (MaxVectorSize - vector_length_in_bytes) =>
    // MaxVectorSize - i.
    sve_ext(dst, tmp2, MaxVectorSize - i);
    sve_add(tmp2, size, dst, tmp2);
  }
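  // At this point each lane of tmp2 holds the number of active lanes at or below it
  // (the inclusive prefix sum), computed with the same log-step scheme as the NEON
  // variant but at element rather than byte granularity.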
  // dst  = 00 04 00 03 00 02 00 01
  sve_sel(dst, size, pg, tmp2, tmp1);
  // dst  = -1 03 -1 02 -1 01 -1 00
  sve_sub(dst, size, 1);
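  // As in the NEON variant, the inactive lanes now hold -1, which is out of range for
  // "tbl" and therefore selects zero.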
  // dst  = 00 87 00 65 00 43 00 21
  sve_tbl(dst, size, src, dst);
}