1 /*
   2  * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "asm/assembler.hpp"
  26 #include "asm/assembler.inline.hpp"
  27 #include "opto/c2_MacroAssembler.hpp"
  28 #include "opto/compile.hpp"
  29 #include "opto/intrinsicnode.hpp"
  30 #include "opto/matcher.hpp"
  31 #include "opto/output.hpp"
  32 #include "opto/subnode.hpp"
  33 #include "runtime/stubRoutines.hpp"
  34 #include "utilities/globalDefinitions.hpp"
  35 #include "utilities/powerOfTwo.hpp"
  36 
  37 #ifdef PRODUCT
  38 #define BLOCK_COMMENT(str) /* nothing */
  39 #define STOP(error) stop(error)
  40 #else
  41 #define BLOCK_COMMENT(str) block_comment(str)
  42 #define STOP(error) block_comment(error); stop(error)
  43 #endif
  44 
  45 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  46 
  47 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
  48 
  49 void C2_MacroAssembler::entry_barrier() {
  50   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  51   // Dummy labels for just measuring the code size
  52   Label dummy_slow_path;
  53   Label dummy_continuation;
  54   Label dummy_guard;
  55   Label* slow_path = &dummy_slow_path;
  56   Label* continuation = &dummy_continuation;
  57   Label* guard = &dummy_guard;
  58   if (!Compile::current()->output()->in_scratch_emit_size()) {
  59     // Use real labels from the actual stub when not emitting code just to measure its size
  60     C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
  61     Compile::current()->output()->add_stub(stub);
  62     slow_path = &stub->entry();
  63     continuation = &stub->continuation();
  64     guard = &stub->guard();
  65   }
  66   // In the C2 code, we move the non-hot part of nmethod entry barriers out-of-line to a stub.
  67   bs->nmethod_entry_barrier(this, slow_path, continuation, guard);
  68 }
  69 
  70 // jdk.internal.util.ArraysSupport.vectorizedHashCode
  71 address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
  72                                            FloatRegister vdata0, FloatRegister vdata1,
  73                                            FloatRegister vdata2, FloatRegister vdata3,
  74                                            FloatRegister vmul0, FloatRegister vmul1,
  75                                            FloatRegister vmul2, FloatRegister vmul3,
  76                                            FloatRegister vpow, FloatRegister vpowm,
  77                                            BasicType eltype) {
  78   ARRAYS_HASHCODE_REGISTERS;
  79 
  80   Register tmp1 = rscratch1, tmp2 = rscratch2;
  81 
  82   Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE;
  83 
  84   // Vectorization factor. Number of array elements loaded into one SIMD&FP register by the stubs. We
  85   // use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible to
  86   // use 4H for chars and shorts instead, but using 8H gives better performance.
  87   const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8
  88                     : eltype == T_CHAR || eltype == T_SHORT ? 8
  89                     : eltype == T_INT                       ? 4
  90                                                             : 0;
  91   guarantee(vf, "unsupported eltype");
  92 
  93   // Unroll factor for the scalar loop below. The value is chosen based on performance analysis.
  94   const size_t unroll_factor = 4;
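  // For reference, the value computed here has to match the scalar polynomial hash
  // that jdk.internal.util.ArraysSupport.vectorizedHashCode callers expect, roughly
  // (a sketch, eliding the per-type widening of the elements):
  //
  //   int result = <initial value passed in "result">;
  //   for (int i = 0; i < cnt; i++) {
  //     result = 31 * result + ary[i];
  //   }
  //
  // Both the vector stub and the scalar tail below preserve this evaluation order.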
  95 
  96   switch (eltype) {
  97   case T_BOOLEAN:
  98     BLOCK_COMMENT("arrays_hashcode(unsigned byte) {");
  99     break;
 100   case T_CHAR:
 101     BLOCK_COMMENT("arrays_hashcode(char) {");
 102     break;
 103   case T_BYTE:
 104     BLOCK_COMMENT("arrays_hashcode(byte) {");
 105     break;
 106   case T_SHORT:
 107     BLOCK_COMMENT("arrays_hashcode(short) {");
 108     break;
 109   case T_INT:
 110     BLOCK_COMMENT("arrays_hashcode(int) {");
 111     break;
 112   default:
 113     ShouldNotReachHere();
 114   }
 115 
 116   // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop
 117   // implemented by the stub executes just once. Call the stub only if at least two iterations will
 118   // be executed.
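  // For example, with T_INT (vf == 4) the stub is only taken when cnt >= 8, so the
  // Neon loop runs at least twice; for the other element types the threshold is one
  // full vector (8 elements).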
 119   const size_t large_threshold = eltype == T_INT ? vf * 2 : vf;
 120   cmpw(cnt, large_threshold);
 121   br(Assembler::HS, LARGE);
 122 
 123   bind(TAIL);
 124 
 125   // The andr computes cnt % uf, where uf = unroll_factor. The subtract, shifted by 3, offsets the
 126   // branch target past uf - (cnt % uf) pairs of load + madd insns, i.e. only cnt % uf load + madd
 127   // pairs are executed. The loop then eats up the remainder, uf elements at a time.
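  // For example (assuming unroll_factor == 4 and no extra Cortex-A53 nops): if
  // cnt % 4 == 3, then tmp2 == 3 and tmp1 == BR_BASE - 3 * 8, so the br below lands
  // three load + madd pairs before BR_BASE and exactly the three leftover elements
  // are consumed before the subsw/br at BR_BASE switches to full 4-element steps.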
 128   assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC");
 129   andr(tmp2, cnt, unroll_factor - 1);
 130   adr(tmp1, BR_BASE);
 131   // For Cortex-A53 the shift is 4 because 2 nops are generated per pair, i.e. each pair is 16 bytes.
 132   sub(tmp1, tmp1, tmp2, ext::sxtw, VM_Version::supports_a53mac() ? 4 : 3);
 133   movw(tmp2, 0x1f);
 134   br(tmp1);
 135 
 136   bind(LOOP);
 137   for (size_t i = 0; i < unroll_factor; ++i) {
 138     load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype);
 139     maddw(result, result, tmp2, tmp1);
 140     // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
 141     // Generate 2nd nop to have 4 instructions per iteration.
 142     if (VM_Version::supports_a53mac()) {
 143       nop();
 144     }
 145   }
 146   bind(BR_BASE);
 147   subsw(cnt, cnt, unroll_factor);
 148   br(Assembler::HS, LOOP);
 149 
 150   b(DONE);
 151 
 152   bind(LARGE);
 153 
 154   RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype));
 155   assert(stub.target() != nullptr, "array_hashcode stub has not been generated");
 156   address tpc = trampoline_call(stub);
 157   if (tpc == nullptr) {
 158     DEBUG_ONLY(reset_labels(TAIL, BR_BASE));
 159     postcond(pc() == badAddress);
 160     return nullptr;
 161   }
 162 
 163   bind(DONE);
 164 
 165   BLOCK_COMMENT("} // arrays_hashcode");
 166 
 167   postcond(pc() != badAddress);
 168   return pc();
 169 }
 170 
 171 void C2_MacroAssembler::fast_lock(Register obj, Register box, Register t1,
 172                                   Register t2, Register t3) {
 173   assert_different_registers(obj, box, t1, t2, t3, rscratch2);
 174 
 175   // Handle inflated monitor.
 176   Label inflated;
 177   // Finish fast lock successfully. MUST be branched to with flag == EQ
 178   Label locked;
 179   // Finish fast lock unsuccessfully. MUST be branched to with flag == NE
 180   Label slow_path;
 181 
 182   if (UseObjectMonitorTable) {
 183     // Clear cache in case fast locking succeeds or we need to take the slow-path.
 184     str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 185   }
 186 
 187   if (DiagnoseSyncOnValueBasedClasses != 0) {
 188     load_klass(t1, obj);
 189     ldrb(t1, Address(t1, Klass::misc_flags_offset()));
 190     tst(t1, KlassFlags::_misc_is_value_based_class);
 191     br(Assembler::NE, slow_path);
 192   }
 193 
 194   const Register t1_mark = t1;
 195   const Register t3_t = t3;
 196 
 197   { // Fast locking
 198 
 199     // Push lock to the lock stack and finish successfully. MUST be branched to with flag == EQ
 200     Label push;
 201 
 202     const Register t2_top = t2;
 203 
 204     // Check if lock-stack is full.
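    // (The top value is the offset of the first free lock-stack slot, so anything
    // greater than end_offset() - 1 means the stack is full.)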
 205     ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 206     cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
 207     br(Assembler::GT, slow_path);
 208 
 209     // Check if recursive.
 210     subw(t3_t, t2_top, oopSize);
 211     ldr(t3_t, Address(rthread, t3_t));
 212     cmp(obj, t3_t);
 213     br(Assembler::EQ, push);
 214 
 215     // Relaxed normal load to check for monitor. Optimization for monitor case.
 216     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 217     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 218 
 219     // Not inflated
 220     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");
 221 
 222     // Try to lock. Transition lock-bits 0b01 => 0b00
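    // For example: an unlocked mark ending in ...01 passes through the orr unchanged
    // as the expected value and becomes ...00 in t3_t, so the cmpxchg swaps unlocked
    // for fast-locked. If the mark is already fast-locked (...00), the expected value
    // built by the orr cannot match memory, the cmpxchg fails and we take slow_path.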
 223     orr(t1_mark, t1_mark, markWord::unlocked_value);
 224     eor(t3_t, t1_mark, markWord::unlocked_value);
 225     cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
 226             /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
 227     br(Assembler::NE, slow_path);
 228 
 229     bind(push);
 230     // After successful lock, push object on lock-stack.
 231     str(obj, Address(rthread, t2_top));
 232     addw(t2_top, t2_top, oopSize);
 233     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 234     b(locked);
 235   }
 236 
 237   { // Handle inflated monitor.
 238     bind(inflated);
 239 
 240     const Register t1_monitor = t1;
 241 
 242     if (!UseObjectMonitorTable) {
 243       assert(t1_monitor == t1_mark, "should be the same here");
 244     } else {
 245       Label monitor_found;
 246 
 247       // Load cache address
 248       lea(t3_t, Address(rthread, JavaThread::om_cache_oops_offset()));
 249 
 250       const int num_unrolled = 2;
 251       for (int i = 0; i < num_unrolled; i++) {
 252         ldr(t1, Address(t3_t));
 253         cmp(obj, t1);
 254         br(Assembler::EQ, monitor_found);
 255         increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
 256       }
 257 
 258       Label loop;
 259 
 260       // Search for obj in cache.
 261       bind(loop);
 262 
 263       // Check for match.
 264       ldr(t1, Address(t3_t));
 265       cmp(obj, t1);
 266       br(Assembler::EQ, monitor_found);
 267 
 268       // Search until null encountered, guaranteed _null_sentinel at end.
 269       increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
 270       cbnz(t1, loop);
 271       // Cache miss. NE was set by the cmp above; cbnz does not set flags
 272       b(slow_path);
 273 
 274       bind(monitor_found);
 275       ldr(t1_monitor, Address(t3_t, OMCache::oop_to_monitor_difference()));
 276     }
 277 
 278     const Register t2_owner_addr = t2;
 279     const Register t3_owner = t3;
 280     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 281     const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
 282     const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);
 283 
 284     Label monitor_locked;
 285 
 286     // Compute owner address.
 287     lea(t2_owner_addr, owner_address);
 288 
 289     // Try to CAS owner (no owner => current thread's _monitor_owner_id).
 290     ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
 291     cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, /*acquire*/ true,
 292             /*release*/ false, /*weak*/ false, t3_owner);
 293     br(Assembler::EQ, monitor_locked);
 294 
 295     // Check if recursive.
 296     cmp(t3_owner, rscratch2);
 297     br(Assembler::NE, slow_path);
 298 
 299     // Recursive.
 300     increment(recursions_address, 1);
 301 
 302     bind(monitor_locked);
 303     if (UseObjectMonitorTable) {
 304       str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 305     }
 306   }
 307 
 308   bind(locked);
 309 
 310 #ifdef ASSERT
 311   // Check that locked label is reached with Flags == EQ.
 312   Label flag_correct;
 313   br(Assembler::EQ, flag_correct);
 314   stop("Fast Lock Flag != EQ");
 315 #endif
 316 
 317   bind(slow_path);
 318 #ifdef ASSERT
 319   // Check that slow_path label is reached with Flags == NE.
 320   br(Assembler::NE, flag_correct);
 321   stop("Fast Lock Flag != NE");
 322   bind(flag_correct);
 323 #endif
 324   // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
 325 }
 326 
 327 void C2_MacroAssembler::fast_unlock(Register obj, Register box, Register t1,
 328                                     Register t2, Register t3) {
 329   assert_different_registers(obj, box, t1, t2, t3);
 330 
 331   // Handle inflated monitor.
 332   Label inflated, inflated_load_mark;
 333   // Finish fast unlock successfully. MUST be branched to with flag == EQ
 334   Label unlocked;
 335   // Finish fast unlock unsuccessfully. MUST be branched to with flag == NE
 336   Label slow_path;
 337 
 338   const Register t1_mark = t1;
 339   const Register t2_top = t2;
 340   const Register t3_t = t3;
 341 
 342   { // Fast unlock
 343 
 344     Label push_and_slow_path;
 345 
 346     // Check if obj is top of lock-stack.
 347     ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 348     subw(t2_top, t2_top, oopSize);
 349     ldr(t3_t, Address(rthread, t2_top));
 350     cmp(obj, t3_t);
 351     // Top of lock stack was not obj. Must be monitor.
 352     br(Assembler::NE, inflated_load_mark);
 353 
 354     // Pop lock-stack.
 355     DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
 356     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 357 
 358     // Check if recursive.
 359     subw(t3_t, t2_top, oopSize);
 360     ldr(t3_t, Address(rthread, t3_t));
 361     cmp(obj, t3_t);
 362     br(Assembler::EQ, unlocked);
 363 
 364     // Not recursive.
 365     // Load Mark.
 366     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 367 
 368     // Check header for monitor (0b10).
 369     // Because we got here by popping (meaning we pushed it when locking),
 370     // there will be no monitor in the box, so we need to push the obj back
 371     // so that the runtime can fix any potential anonymous owner.
 372     tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);
 373 
 374     // Try to unlock. Transition lock bits 0b00 => 0b01
 375     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
 376     orr(t3_t, t1_mark, markWord::unlocked_value);
 377     cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
 378             /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
 379     br(Assembler::EQ, unlocked);
 380 
 381     bind(push_and_slow_path);
 382     // Compare and exchange failed.
 383     // Restore lock-stack and handle the unlock in runtime.
 384     DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
 385     addw(t2_top, t2_top, oopSize);
 386     str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 387     b(slow_path);
 388   }
 389 
 390 
 391   { // Handle inflated monitor.
 392     bind(inflated_load_mark);
 393     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 394 #ifdef ASSERT
 395     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 396     stop("Fast Unlock not monitor");
 397 #endif
 398 
 399     bind(inflated);
 400 
 401 #ifdef ASSERT
 402     Label check_done;
 403     subw(t2_top, t2_top, oopSize);
 404     cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
 405     br(Assembler::LT, check_done);
 406     ldr(t3_t, Address(rthread, t2_top));
 407     cmp(obj, t3_t);
 408     br(Assembler::NE, inflated);
 409     stop("Fast Unlock lock on stack");
 410     bind(check_done);
 411 #endif
 412 
 413     const Register t1_monitor = t1;
 414 
 415     if (!UseObjectMonitorTable) {
 416       assert(t1_monitor == t1_mark, "should be the same here");
 417 
 418       // Untag the monitor.
 419       add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
 420     } else {
 421       ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 422       // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
 423       cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
 424       br(Assembler::LO, slow_path);
 425     }
 426 
 427     const Register t2_recursions = t2;
 428     Label not_recursive;
 429 
 430     // Check if recursive.
 431     ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
 432     cbz(t2_recursions, not_recursive);
 433 
 434     // Recursive unlock.
 435     sub(t2_recursions, t2_recursions, 1u);
 436     str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
 437     // Set flag == EQ
 438     cmp(t2_recursions, t2_recursions);
 439     b(unlocked);
 440 
 441     bind(not_recursive);
 442 
 443     const Register t2_owner_addr = t2;
 444 
 445     // Compute owner address.
 446     lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));
 447 
 448     // Set owner to null.
 449     // Release to satisfy the JMM
 450     stlr(zr, t2_owner_addr);
 451     // We need a full fence after clearing owner to avoid stranding.
 452     // StoreLoad achieves this.
 453     membar(StoreLoad);
 454 
 455     // Check if the entry_list is empty.
 456     ldr(rscratch1, Address(t1_monitor, ObjectMonitor::entry_list_offset()));
 457     cmp(rscratch1, zr);
 458     br(Assembler::EQ, unlocked);  // If so we are done.
 459 
 460     // Check if there is a successor.
 461     ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
 462     cmp(rscratch1, zr);
 463     br(Assembler::NE, unlocked);  // If so we are done.
 464 
 465     // Save the monitor pointer in the current thread, so we can try to
 466     // reacquire the lock in SharedRuntime::monitor_exit_helper().
 467     str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));
 468 
 469     cmp(zr, rthread); // Set Flag to NE => slow path
 470     b(slow_path);
 471   }
 472 
 473   bind(unlocked);
 474   cmp(zr, zr); // Set Flags to EQ => fast path
 475 
 476 #ifdef ASSERT
 477   // Check that unlocked label is reached with Flags == EQ.
 478   Label flag_correct;
 479   br(Assembler::EQ, flag_correct);
 480   stop("Fast Unlock Flag != EQ");
 481 #endif
 482 
 483   bind(slow_path);
 484 #ifdef ASSERT
 485   // Check that slow_path label is reached with Flags == NE.
 486   br(Assembler::NE, flag_correct);
 487   stop("Fast Unlock Flag != NE");
 488   bind(flag_correct);
 489 #endif
 490   // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
 491 }
 492 
 493 // Search for str1 in str2 and return index or -1
 494 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
 495 void C2_MacroAssembler::string_indexof(Register str2, Register str1,
 496                                        Register cnt2, Register cnt1,
 497                                        Register tmp1, Register tmp2,
 498                                        Register tmp3, Register tmp4,
 499                                        Register tmp5, Register tmp6,
 500                                        int icnt1, Register result, int ae) {
 501   // NOTE: tmp5, tmp6 can be zr depending on specific method version
 502   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
 503 
 504   Register ch1 = rscratch1;
 505   Register ch2 = rscratch2;
 506   Register cnt1tmp = tmp1;
 507   Register cnt2tmp = tmp2;
 508   Register cnt1_neg = cnt1;
 509   Register cnt2_neg = cnt2;
 510   Register result_tmp = tmp4;
 511 
 512   bool isL = ae == StrIntrinsicNode::LL;
 513 
 514   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
 515   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
 516   int str1_chr_shift = str1_isL ? 0:1;
 517   int str2_chr_shift = str2_isL ? 0:1;
 518   int str1_chr_size = str1_isL ? 1:2;
 519   int str2_chr_size = str2_isL ? 1:2;
 520   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
 521                                       (chr_insn)&MacroAssembler::ldrh;
 522   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
 523                                       (chr_insn)&MacroAssembler::ldrh;
 524   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
 525   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
 526 
 527   // Note, inline_string_indexOf() generates checks:
 528   // if (substr.count > string.count) return -1;
 529   // if (substr.count == 0) return 0;
 530 
 531   // We have two strings, a source string in str2, cnt2 and a pattern string
 532   // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
 533 
 534   // For larger pattern and source we use a simplified Boyer Moore algorithm.
 535   // With a small pattern and source we use linear scan.
 536 
 537   if (icnt1 == -1) {
 538     sub(result_tmp, cnt2, cnt1);
 539     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
 540     br(LT, LINEARSEARCH);
 541     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
 542     subs(zr, cnt1, 256);
 543     lsr(tmp1, cnt2, 2);
 544     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
 545     br(GE, LINEARSTUB);
 546   }
 547 
 548 // The Boyer Moore algorithm is based on the description here:-
 549 //
 550 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
 551 //
 552 // This describes an algorithm with 2 shift rules: the 'Bad Character' rule
 553 // and the 'Good Suffix' rule.
 554 //
 555 // These rules are essentially heuristics for how far we can shift the
 556 // pattern along the search string.
 557 //
 558 // The implementation here uses the 'Bad Character' rule only because of the
 559 // complexity of initialisation for the 'Good Suffix' rule.
 560 //
 561 // This is also known as the Boyer-Moore-Horspool algorithm:-
 562 //
 563 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
 564 //
 565 // This particular implementation has a few Java-specific optimizations.
 566 //
 567 // #define ASIZE 256
 568 //
 569 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
 570 //       int i, j;
 571 //       unsigned c;
 572 //       unsigned char bc[ASIZE];
 573 //
 574 //       /* Preprocessing */
 575 //       for (i = 0; i < ASIZE; ++i)
 576 //          bc[i] = m;
 577 //       for (i = 0; i < m - 1; ) {
 578 //          c = x[i];
 579 //          ++i;
 580 //          // c < 256 for Latin1 string, so, no need for branch
 581 //          #ifdef PATTERN_STRING_IS_LATIN1
 582 //          bc[c] = m - i;
 583 //          #else
 584 //          if (c < ASIZE) bc[c] = m - i;
 585 //          #endif
 586 //       }
 587 //
 588 //       /* Searching */
 589 //       j = 0;
 590 //       while (j <= n - m) {
 591 //          c = y[j+m-1];
 592 //          if (x[m-1] == c)
 593 //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
 594 //          if (i < 0) return j;
 595 //          // c < 256 for Latin1 string, so, no need for branch
 596 //          #ifdef SOURCE_STRING_IS_LATIN1
 597 //          // LL case: (c < 256) is always true. Remove branch
 598 //          j += bc[y[j+m-1]];
 599 //          #endif
 600 //          #ifndef PATTERN_STRING_IS_UTF
 601 //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
 602 //          if (c < ASIZE)
 603 //            j += bc[y[j+m-1]];
 604 //          else
 605 //            j += 1;
 606 //          #endif
 607 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
 608 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
 609 //          if (c < ASIZE)
 610 //            j += bc[y[j+m-1]];
 611 //          else
 612 //            j += m;
 613 //          #endif
 614 //       }
 615 //    }
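//
// For example, with pattern x = "abcab" (m == 5) the preprocessing above produces
// bc['a'] = 1, bc['b'] = 3, bc['c'] = 2 and bc[c] = 5 for every other character, so
// a window whose last source character does not occur in the pattern skips ahead by
// the whole pattern length.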
 616 
 617   if (icnt1 == -1) {
 618     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
 619         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
 620     Register cnt1end = tmp2;
 621     Register str2end = cnt2;
 622     Register skipch = tmp2;
 623 
 624     // str1 length is >= 8, so we can read at least 1 register for cases when
 625     // UTF->Latin1 conversion is not needed (8 LL or 4 UU symbols) and half a
 626     // register for the UL case. We'll re-read the last character in the inner
 627     // pre-loop code to have a single outer pre-loop load.
 628     const int firstStep = isL ? 7 : 3;
 629 
 630     const int ASIZE = 256;
 631     const int STORED_BYTES = 32; // number of bytes stored per instruction
 632     sub(sp, sp, ASIZE);
 633     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
 634     mov(ch1, sp);
 635     BIND(BM_INIT_LOOP);
 636       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
 637       subs(tmp5, tmp5, 1);
 638       br(GT, BM_INIT_LOOP);
 639 
 640       sub(cnt1tmp, cnt1, 1);
 641       mov(tmp5, str2);
 642       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
 643       sub(ch2, cnt1, 1);
 644       mov(tmp3, str1);
 645     BIND(BCLOOP);
 646       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
 647       if (!str1_isL) {
 648         subs(zr, ch1, ASIZE);
 649         br(HS, BCSKIP);
 650       }
 651       strb(ch2, Address(sp, ch1));
 652     BIND(BCSKIP);
 653       subs(ch2, ch2, 1);
 654       br(GT, BCLOOP);
 655 
 656       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
 657       if (str1_isL == str2_isL) {
 658         // load last 8 bytes (8LL/4UU symbols)
 659         ldr(tmp6, Address(tmp6, -wordSize));
 660       } else {
 661         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
 662         // convert Latin1 to UTF. We'll have to wait until the load completes, but
 663         // it's still faster than per-character loads + checks
 664         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
 665         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
 666         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
 667         andr(tmp6, tmp6, 0xFF); // str1[N-4]
 668         orr(ch2, ch1, ch2, LSL, 16);
 669         orr(tmp6, tmp6, tmp3, LSL, 48);
 670         orr(tmp6, tmp6, ch2, LSL, 16);
 671       }
 672     BIND(BMLOOPSTR2);
 673       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 674       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
 675       if (str1_isL == str2_isL) {
 676         // re-init tmp3. It's free because it executes in parallel with the load
 677         // above. An alternative is to initialize it before the loop, but that would
 678         // affect performance on in-order systems with 2 or more ld/st pipelines
 679         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
 680       }
 681       if (!isL) { // UU/UL case
 682         lsl(ch2, cnt1tmp, 1); // offset in bytes
 683       }
 684       cmp(tmp3, skipch);
 685       br(NE, BMSKIP);
 686       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
 687       mov(ch1, tmp6);
 688       if (isL) {
 689         b(BMLOOPSTR1_AFTER_LOAD);
 690       } else {
 691         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
 692         b(BMLOOPSTR1_CMP);
 693       }
 694     BIND(BMLOOPSTR1);
 695       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
 696       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 697     BIND(BMLOOPSTR1_AFTER_LOAD);
 698       subs(cnt1tmp, cnt1tmp, 1);
 699       br(LT, BMLOOPSTR1_LASTCMP);
 700     BIND(BMLOOPSTR1_CMP);
 701       cmp(ch1, ch2);
 702       br(EQ, BMLOOPSTR1);
 703     BIND(BMSKIP);
 704       if (!isL) {
 705         // if we've met a UTF symbol while searching a Latin1 pattern, then we can
 706         // skip cnt1 symbols
 707         if (str1_isL != str2_isL) {
 708           mov(result_tmp, cnt1);
 709         } else {
 710           mov(result_tmp, 1);
 711         }
 712         subs(zr, skipch, ASIZE);
 713         br(HS, BMADV);
 714       }
 715       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
 716     BIND(BMADV);
 717       sub(cnt1tmp, cnt1, 1);
 718       add(str2, str2, result_tmp, LSL, str2_chr_shift);
 719       cmp(str2, str2end);
 720       br(LE, BMLOOPSTR2);
 721       add(sp, sp, ASIZE);
 722       b(NOMATCH);
 723     BIND(BMLOOPSTR1_LASTCMP);
 724       cmp(ch1, ch2);
 725       br(NE, BMSKIP);
 726     BIND(BMMATCH);
 727       sub(result, str2, tmp5);
 728       if (!str2_isL) lsr(result, result, 1);
 729       add(sp, sp, ASIZE);
 730       b(DONE);
 731 
 732     BIND(LINEARSTUB);
 733     cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
 734     br(LT, LINEAR_MEDIUM);
 735     mov(result, zr);
 736     RuntimeAddress stub = nullptr;
 737     if (isL) {
 738       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
 739       assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
 740     } else if (str1_isL) {
 741       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
 742       assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
 743     } else {
 744       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
 745       assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
 746     }
 747     address call = trampoline_call(stub);
 748     if (call == nullptr) {
 749       DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
 750       ciEnv::current()->record_failure("CodeCache is full");
 751       return;
 752     }
 753     b(DONE);
 754   }
 755 
 756   BIND(LINEARSEARCH);
 757   {
 758     Label DO1, DO2, DO3;
 759 
 760     Register str2tmp = tmp2;
 761     Register first = tmp3;
 762 
 763     if (icnt1 == -1)
 764     {
 765         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
 766 
 767         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
 768         br(LT, DOSHORT);
 769       BIND(LINEAR_MEDIUM);
 770         (this->*str1_load_1chr)(first, Address(str1));
 771         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
 772         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
 773         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 774         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 775 
 776       BIND(FIRST_LOOP);
 777         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 778         cmp(first, ch2);
 779         br(EQ, STR1_LOOP);
 780       BIND(STR2_NEXT);
 781         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 782         br(LE, FIRST_LOOP);
 783         b(NOMATCH);
 784 
 785       BIND(STR1_LOOP);
 786         adds(cnt1tmp, cnt1_neg, str1_chr_size);
 787         add(cnt2tmp, cnt2_neg, str2_chr_size);
 788         br(GE, MATCH);
 789 
 790       BIND(STR1_NEXT);
 791         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
 792         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 793         cmp(ch1, ch2);
 794         br(NE, STR2_NEXT);
 795         adds(cnt1tmp, cnt1tmp, str1_chr_size);
 796         add(cnt2tmp, cnt2tmp, str2_chr_size);
 797         br(LT, STR1_NEXT);
 798         b(MATCH);
 799 
 800       BIND(DOSHORT);
 801       if (str1_isL == str2_isL) {
 802         cmp(cnt1, (u1)2);
 803         br(LT, DO1);
 804         br(GT, DO3);
 805       }
 806     }
 807 
 808     if (icnt1 == 4) {
 809       Label CH1_LOOP;
 810 
 811         (this->*load_4chr)(ch1, str1);
 812         sub(result_tmp, cnt2, 4);
 813         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 814         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 815 
 816       BIND(CH1_LOOP);
 817         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
 818         cmp(ch1, ch2);
 819         br(EQ, MATCH);
 820         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 821         br(LE, CH1_LOOP);
 822         b(NOMATCH);
 823     }
 824 
 825     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
 826       Label CH1_LOOP;
 827 
 828       BIND(DO2);
 829         (this->*load_2chr)(ch1, str1);
 830         if (icnt1 == 2) {
 831           sub(result_tmp, cnt2, 2);
 832         }
 833         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 834         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 835       BIND(CH1_LOOP);
 836         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 837         cmp(ch1, ch2);
 838         br(EQ, MATCH);
 839         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 840         br(LE, CH1_LOOP);
 841         b(NOMATCH);
 842     }
 843 
 844     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
 845       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
 846 
 847       BIND(DO3);
 848         (this->*load_2chr)(first, str1);
 849         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
 850         if (icnt1 == 3) {
 851           sub(result_tmp, cnt2, 3);
 852         }
 853         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 854         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 855       BIND(FIRST_LOOP);
 856         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 857         cmpw(first, ch2);
 858         br(EQ, STR1_LOOP);
 859       BIND(STR2_NEXT);
 860         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 861         br(LE, FIRST_LOOP);
 862         b(NOMATCH);
 863 
 864       BIND(STR1_LOOP);
 865         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
 866         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 867         cmp(ch1, ch2);
 868         br(NE, STR2_NEXT);
 869         b(MATCH);
 870     }
 871 
 872     if (icnt1 == -1 || icnt1 == 1) {
 873       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
 874 
 875       BIND(DO1);
 876         (this->*str1_load_1chr)(ch1, str1);
 877         cmp(cnt2, (u1)8);
 878         br(LT, DO1_SHORT);
 879 
 880         sub(result_tmp, cnt2, 8/str2_chr_size);
 881         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 882         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 883         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 884 
 885         if (str2_isL) {
 886           orr(ch1, ch1, ch1, LSL, 8);
 887         }
 888         orr(ch1, ch1, ch1, LSL, 16);
 889         orr(ch1, ch1, ch1, LSL, 32);
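        // The loop below uses the classic SWAR zero-lane test: after xor-ing a loaded
        // word with the replicated pattern character, a matching lane becomes all
        // zeroes, and (v - 0x01..01) & ~v & 0x80..80 is non-zero exactly when some
        // lane is zero. The ~v & 0x80..80 term is what the orr + bics pair computes.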
 890       BIND(CH1_LOOP);
 891         ldr(ch2, Address(str2, cnt2_neg));
 892         eor(ch2, ch1, ch2);
 893         sub(tmp1, ch2, tmp3);
 894         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 895         bics(tmp1, tmp1, tmp2);
 896         br(NE, HAS_ZERO);
 897         adds(cnt2_neg, cnt2_neg, 8);
 898         br(LT, CH1_LOOP);
 899 
 900         cmp(cnt2_neg, (u1)8);
 901         mov(cnt2_neg, 0);
 902         br(LT, CH1_LOOP);
 903         b(NOMATCH);
 904 
 905       BIND(HAS_ZERO);
 906         rev(tmp1, tmp1);
 907         clz(tmp1, tmp1);
 908         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
 909         b(MATCH);
 910 
 911       BIND(DO1_SHORT);
 912         mov(result_tmp, cnt2);
 913         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
 914         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
 915       BIND(DO1_LOOP);
 916         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 917         cmpw(ch1, ch2);
 918         br(EQ, MATCH);
 919         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 920         br(LT, DO1_LOOP);
 921     }
 922   }
 923   BIND(NOMATCH);
 924     mov(result, -1);
 925     b(DONE);
 926   BIND(MATCH);
 927     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
 928   BIND(DONE);
 929 }
 930 
 931 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
 932 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
 933 
 934 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
 935                                             Register ch, Register result,
 936                                             Register tmp1, Register tmp2, Register tmp3)
 937 {
 938   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
 939   Register cnt1_neg = cnt1;
 940   Register ch1 = rscratch1;
 941   Register result_tmp = rscratch2;
 942 
 943   cbz(cnt1, NOMATCH);
 944 
 945   cmp(cnt1, (u1)4);
 946   br(LT, DO1_SHORT);
 947 
 948   orr(ch, ch, ch, LSL, 16);
 949   orr(ch, ch, ch, LSL, 32);
 950 
 951   sub(cnt1, cnt1, 4);
 952   mov(result_tmp, cnt1);
 953   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
 954   sub(cnt1_neg, zr, cnt1, LSL, 1);
 955 
 956   mov(tmp3, 0x0001000100010001);
 957 
 958   BIND(CH1_LOOP);
 959     ldr(ch1, Address(str1, cnt1_neg));
 960     eor(ch1, ch, ch1);
 961     sub(tmp1, ch1, tmp3);
 962     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
 963     bics(tmp1, tmp1, tmp2);
 964     br(NE, HAS_ZERO);
 965     adds(cnt1_neg, cnt1_neg, 8);
 966     br(LT, CH1_LOOP);
 967 
 968     cmp(cnt1_neg, (u1)8);
 969     mov(cnt1_neg, 0);
 970     br(LT, CH1_LOOP);
 971     b(NOMATCH);
 972 
 973   BIND(HAS_ZERO);
 974     rev(tmp1, tmp1);
 975     clz(tmp1, tmp1);
 976     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
 977     b(MATCH);
 978 
 979   BIND(DO1_SHORT);
 980     mov(result_tmp, cnt1);
 981     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
 982     sub(cnt1_neg, zr, cnt1, LSL, 1);
 983   BIND(DO1_LOOP);
 984     ldrh(ch1, Address(str1, cnt1_neg));
 985     cmpw(ch, ch1);
 986     br(EQ, MATCH);
 987     adds(cnt1_neg, cnt1_neg, 2);
 988     br(LT, DO1_LOOP);
 989   BIND(NOMATCH);
 990     mov(result, -1);
 991     b(DONE);
 992   BIND(MATCH);
 993     add(result, result_tmp, cnt1_neg, ASR, 1);
 994   BIND(DONE);
 995 }
 996 
 997 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
 998                                                 Register ch, Register result,
 999                                                 FloatRegister ztmp1,
1000                                                 FloatRegister ztmp2,
1001                                                 PRegister tmp_pg,
1002                                                 PRegister tmp_pdn, bool isL)
1003 {
1004   // Note that `tmp_pdn` should *NOT* be used as a governing predicate register.
1005   assert(tmp_pg->is_governing(),
1006          "this register has to be a governing predicate register");
1007 
1008   Label LOOP, MATCH, DONE, NOMATCH;
1009   Register vec_len = rscratch1;
1010   Register idx = rscratch2;
1011 
1012   SIMD_RegVariant T = isL ? B : H;
1013 
1014   cbz(cnt1, NOMATCH);
1015 
1016   // Assign the particular char throughout the vector.
1017   sve_dup(ztmp2, T, ch);
1018   if (isL) {
1019     sve_cntb(vec_len);
1020   } else {
1021     sve_cnth(vec_len);
1022   }
1023   mov(idx, 0);
1024 
1025   // Generate a predicate to control the reading of input string.
1026   sve_whilelt(tmp_pg, T, idx, cnt1);
1027 
1028   BIND(LOOP);
1029     // Read a vector of 8- or 16-bit data depending on the string type. Note
1030     // that inactive elements indicated by the predicate register won't cause
1031     // a data read from memory to the destination vector.
1032     if (isL) {
1033       sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
1034     } else {
1035       sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
1036     }
1037     add(idx, idx, vec_len);
1038 
1039     // Perform the comparison. An element of the destination predicate is set
1040     // to active if the particular char is matched.
1041     sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);
1042 
1043     // Branch if the particular char is found.
1044     br(NE, MATCH);
1045 
1046     sve_whilelt(tmp_pg, T, idx, cnt1);
1047 
1048     // Loop back if the particular char is not found.
1049     br(MI, LOOP);
1050 
1051   BIND(NOMATCH);
1052     mov(result, -1);
1053     b(DONE);
1054 
1055   BIND(MATCH);
1056     // Undo the index increment.
1057     sub(idx, idx, vec_len);
1058 
1059     // Crop the predicate at the first match to find its location.
1060     sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
1061     add(result, idx, -1);
1062     sve_incp(result, T, tmp_pdn);
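    // For example (a sketch assuming 16-byte vectors and a Latin1 string): a match at
    // string index 21 is found in the second iteration with idx == 32; the sub brings
    // idx back to 16, brka leaves the first 6 lanes active (up to and including the
    // matching lane), and result = 16 - 1 + 6 == 21.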
1063   BIND(DONE);
1064 }
1065 
1066 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
1067                                             Register ch, Register result,
1068                                             Register tmp1, Register tmp2, Register tmp3)
1069 {
1070   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
1071   Register cnt1_neg = cnt1;
1072   Register ch1 = rscratch1;
1073   Register result_tmp = rscratch2;
1074 
1075   cbz(cnt1, NOMATCH);
1076 
1077   cmp(cnt1, (u1)8);
1078   br(LT, DO1_SHORT);
1079 
1080   orr(ch, ch, ch, LSL, 8);
1081   orr(ch, ch, ch, LSL, 16);
1082   orr(ch, ch, ch, LSL, 32);
1083 
1084   sub(cnt1, cnt1, 8);
1085   mov(result_tmp, cnt1);
1086   lea(str1, Address(str1, cnt1));
1087   sub(cnt1_neg, zr, cnt1);
1088 
1089   mov(tmp3, 0x0101010101010101);
1090 
1091   BIND(CH1_LOOP);
1092     ldr(ch1, Address(str1, cnt1_neg));
1093     eor(ch1, ch, ch1);
1094     sub(tmp1, ch1, tmp3);
1095     orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
1096     bics(tmp1, tmp1, tmp2);
1097     br(NE, HAS_ZERO);
1098     adds(cnt1_neg, cnt1_neg, 8);
1099     br(LT, CH1_LOOP);
1100 
1101     cmp(cnt1_neg, (u1)8);
1102     mov(cnt1_neg, 0);
1103     br(LT, CH1_LOOP);
1104     b(NOMATCH);
1105 
1106   BIND(HAS_ZERO);
1107     rev(tmp1, tmp1);
1108     clz(tmp1, tmp1);
1109     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
1110     b(MATCH);
1111 
1112   BIND(DO1_SHORT);
1113     mov(result_tmp, cnt1);
1114     lea(str1, Address(str1, cnt1));
1115     sub(cnt1_neg, zr, cnt1);
1116   BIND(DO1_LOOP);
1117     ldrb(ch1, Address(str1, cnt1_neg));
1118     cmp(ch, ch1);
1119     br(EQ, MATCH);
1120     adds(cnt1_neg, cnt1_neg, 1);
1121     br(LT, DO1_LOOP);
1122   BIND(NOMATCH);
1123     mov(result, -1);
1124     b(DONE);
1125   BIND(MATCH);
1126     add(result, result_tmp, cnt1_neg);
1127   BIND(DONE);
1128 }
1129 
1130 // Compare strings.
1131 void C2_MacroAssembler::string_compare(Register str1, Register str2,
1132     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
1133     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
1134     PRegister pgtmp1, PRegister pgtmp2, int ae) {
1135   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
1136       DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
1137       SHORT_LOOP_START, TAIL_CHECK;
1138 
1139   bool isLL = ae == StrIntrinsicNode::LL;
1140   bool isLU = ae == StrIntrinsicNode::LU;
1141   bool isUL = ae == StrIntrinsicNode::UL;
1142 
1143   // The stub threshold for LL strings is 72 (64 + 8) chars
1144   // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
1145   // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
1146   const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);
1147 
1148   bool str1_isL = isLL || isLU;
1149   bool str2_isL = isLL || isUL;
1150 
1151   int str1_chr_shift = str1_isL ? 0 : 1;
1152   int str2_chr_shift = str2_isL ? 0 : 1;
1153   int str1_chr_size = str1_isL ? 1 : 2;
1154   int str2_chr_size = str2_isL ? 1 : 2;
1155   int minCharsInWord = isLL ? wordSize : wordSize/2;
1156 
1157   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
1158   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
1159                                       (chr_insn)&MacroAssembler::ldrh;
1160   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
1161                                       (chr_insn)&MacroAssembler::ldrh;
1162   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
1163                             (uxt_insn)&MacroAssembler::uxthw;
1164 
1165   BLOCK_COMMENT("string_compare {");
1166 
1167   // Bizarrely, the counts are passed in bytes, regardless of whether they
1168   // are L or U strings, however the result is always in characters.
1169   if (!str1_isL) asrw(cnt1, cnt1, 1);
1170   if (!str2_isL) asrw(cnt2, cnt2, 1);
1171 
1172   // Compute the minimum of the string lengths and save the difference.
1173   subsw(result, cnt1, cnt2);
1174   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
1175 
1176   // A very short string
1177   cmpw(cnt2, minCharsInWord);
1178   br(Assembler::LE, SHORT_STRING);
1179 
1180   // Compare longwords
1181   // load first parts of strings and finish initialization while loading
1182   {
1183     if (str1_isL == str2_isL) { // LL or UU
1184       ldr(tmp1, Address(str1));
1185       cmp(str1, str2);
1186       br(Assembler::EQ, DONE);
1187       ldr(tmp2, Address(str2));
1188       cmp(cnt2, stub_threshold);
1189       br(GE, STUB);
1190       subsw(cnt2, cnt2, minCharsInWord);
1191       br(EQ, TAIL_CHECK);
1192       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1193       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1194       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1195     } else if (isLU) {
1196       ldrs(vtmp, Address(str1));
1197       ldr(tmp2, Address(str2));
1198       cmp(cnt2, stub_threshold);
1199       br(GE, STUB);
1200       subw(cnt2, cnt2, 4);
1201       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1202       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1203       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1204       zip1(vtmp, T8B, vtmp, vtmpZ);
1205       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1206       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1207       add(cnt1, cnt1, 4);
1208       fmovd(tmp1, vtmp);
1209     } else { // UL case
1210       ldr(tmp1, Address(str1));
1211       ldrs(vtmp, Address(str2));
1212       cmp(cnt2, stub_threshold);
1213       br(GE, STUB);
1214       subw(cnt2, cnt2, 4);
1215       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1216       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1217       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1218       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1219       zip1(vtmp, T8B, vtmp, vtmpZ);
1220       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1221       add(cnt1, cnt1, 8);
1222       fmovd(tmp2, vtmp);
1223     }
1224     adds(cnt2, cnt2, isUL ? 4 : 8);
1225     br(GE, TAIL);
1226     eor(rscratch2, tmp1, tmp2);
1227     cbnz(rscratch2, DIFF);
1228     // main loop
1229     bind(NEXT_WORD);
1230     if (str1_isL == str2_isL) {
1231       ldr(tmp1, Address(str1, cnt2));
1232       ldr(tmp2, Address(str2, cnt2));
1233       adds(cnt2, cnt2, 8);
1234     } else if (isLU) {
1235       ldrs(vtmp, Address(str1, cnt1));
1236       ldr(tmp2, Address(str2, cnt2));
1237       add(cnt1, cnt1, 4);
1238       zip1(vtmp, T8B, vtmp, vtmpZ);
1239       fmovd(tmp1, vtmp);
1240       adds(cnt2, cnt2, 8);
1241     } else { // UL
1242       ldrs(vtmp, Address(str2, cnt2));
1243       ldr(tmp1, Address(str1, cnt1));
1244       zip1(vtmp, T8B, vtmp, vtmpZ);
1245       add(cnt1, cnt1, 8);
1246       fmovd(tmp2, vtmp);
1247       adds(cnt2, cnt2, 4);
1248     }
1249     br(GE, TAIL);
1250 
1251     eor(rscratch2, tmp1, tmp2);
1252     cbz(rscratch2, NEXT_WORD);
1253     b(DIFF);
1254     bind(TAIL);
1255     eor(rscratch2, tmp1, tmp2);
1256     cbnz(rscratch2, DIFF);
1257     // Last longword.  In the case where length == 4 we compare the
1258     // same longword twice, but that's still faster than another
1259     // conditional branch.
1260     if (str1_isL == str2_isL) {
1261       ldr(tmp1, Address(str1));
1262       ldr(tmp2, Address(str2));
1263     } else if (isLU) {
1264       ldrs(vtmp, Address(str1));
1265       ldr(tmp2, Address(str2));
1266       zip1(vtmp, T8B, vtmp, vtmpZ);
1267       fmovd(tmp1, vtmp);
1268     } else { // UL
1269       ldrs(vtmp, Address(str2));
1270       ldr(tmp1, Address(str1));
1271       zip1(vtmp, T8B, vtmp, vtmpZ);
1272       fmovd(tmp2, vtmp);
1273     }
1274     bind(TAIL_CHECK);
1275     eor(rscratch2, tmp1, tmp2);
1276     cbz(rscratch2, DONE);
1277 
1278     // Find the first different characters in the longwords and
1279     // compute their difference.
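    // For example, in the LL case: if the first difference is in byte 3 of the loaded
    // longwords, rev + clz yields 24 plus the leading zero bits inside that byte, the
    // andr with -8 rounds that down to 24, and the lsrv shifts move the differing
    // characters into the low bits before they are zero-extended and subtracted.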
1280     bind(DIFF);
1281     rev(rscratch2, rscratch2);
1282     clz(rscratch2, rscratch2);
1283     andr(rscratch2, rscratch2, isLL ? -8 : -16);
1284     lsrv(tmp1, tmp1, rscratch2);
1285     (this->*ext_chr)(tmp1, tmp1);
1286     lsrv(tmp2, tmp2, rscratch2);
1287     (this->*ext_chr)(tmp2, tmp2);
1288     subw(result, tmp1, tmp2);
1289     b(DONE);
1290   }
1291 
1292   bind(STUB);
1293     RuntimeAddress stub = nullptr;
1294     switch(ae) {
1295       case StrIntrinsicNode::LL:
1296         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
1297         break;
1298       case StrIntrinsicNode::UU:
1299         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
1300         break;
1301       case StrIntrinsicNode::LU:
1302         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
1303         break;
1304       case StrIntrinsicNode::UL:
1305         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
1306         break;
1307       default:
1308         ShouldNotReachHere();
1309     }
1310     assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
1311     address call = trampoline_call(stub);
1312     if (call == nullptr) {
1313       DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
1314       ciEnv::current()->record_failure("CodeCache is full");
1315       return;
1316     }
1317     b(DONE);
1318 
1319   bind(SHORT_STRING);
1320   // Is the minimum length zero?
1321   cbz(cnt2, DONE);
1322   // Arrange the code to do most branches while loading, and to load the next
1323   // characters while comparing the previous ones
1324   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1325   subs(cnt2, cnt2, 1);
1326   br(EQ, SHORT_LAST_INIT);
1327   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1328   b(SHORT_LOOP_START);
1329   bind(SHORT_LOOP);
1330   subs(cnt2, cnt2, 1);
1331   br(EQ, SHORT_LAST);
1332   bind(SHORT_LOOP_START);
1333   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
1334   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
1335   cmp(tmp1, cnt1);
1336   br(NE, SHORT_LOOP_TAIL);
1337   subs(cnt2, cnt2, 1);
1338   br(EQ, SHORT_LAST2);
1339   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1340   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1341   cmp(tmp2, rscratch1);
1342   br(EQ, SHORT_LOOP);
1343   sub(result, tmp2, rscratch1);
1344   b(DONE);
1345   bind(SHORT_LOOP_TAIL);
1346   sub(result, tmp1, cnt1);
1347   b(DONE);
1348   bind(SHORT_LAST2);
1349   cmp(tmp2, rscratch1);
1350   br(EQ, DONE);
1351   sub(result, tmp2, rscratch1);
1352 
1353   b(DONE);
1354   bind(SHORT_LAST_INIT);
1355   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1356   bind(SHORT_LAST);
1357   cmp(tmp1, cnt1);
1358   br(EQ, DONE);
1359   sub(result, tmp1, cnt1);
1360 
1361   bind(DONE);
1362 
1363   BLOCK_COMMENT("} string_compare");
1364 }
1365 
1366 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
1367                                      FloatRegister src2, Condition cond, bool isQ) {
1368   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1369   FloatRegister zn = src1, zm = src2;
1370   bool needs_negation = false;
1371   switch (cond) {
1372     case LT: cond = GT; zn = src2; zm = src1; break;
1373     case LE: cond = GE; zn = src2; zm = src1; break;
1374     case LO: cond = HI; zn = src2; zm = src1; break;
1375     case LS: cond = HS; zn = src2; zm = src1; break;
1376     case NE: cond = EQ; needs_negation = true; break;
1377     default:
1378       break;
1379   }
1380 
1381   if (is_floating_point_type(bt)) {
1382     fcm(cond, dst, size, zn, zm);
1383   } else {
1384     cm(cond, dst, size, zn, zm);
1385   }
1386 
1387   if (needs_negation) {
1388     notr(dst, isQ ? T16B : T8B, dst);
1389   }
1390 }
1391 
1392 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1393                                           Condition cond, bool isQ) {
1394   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1395   if (bt == T_FLOAT || bt == T_DOUBLE) {
1396     if (cond == Assembler::NE) {
1397       fcm(Assembler::EQ, dst, size, src);
1398       notr(dst, isQ ? T16B : T8B, dst);
1399     } else {
1400       fcm(cond, dst, size, src);
1401     }
1402   } else {
1403     if (cond == Assembler::NE) {
1404       cm(Assembler::EQ, dst, size, src);
1405       notr(dst, isQ ? T16B : T8B, dst);
1406     } else {
1407       cm(cond, dst, size, src);
1408     }
1409   }
1410 }
1411 
1412 // Compress the least significant bit of each byte into the lowest byte of the
1413 // register and clear the higher garbage bits.
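//
// Roughly the scalar reduction below (a sketch, assuming every input byte is 0x00
// or 0x01):
//
//   x |= x >> 7;    // fold byte pairs into 2-bit groups
//   x |= x >> 14;   // fold those into 4-bit groups
//   x |= x >> 28;   // fold the two halves into the low byte
//   return x & 0xff;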
1414 void C2_MacroAssembler::bytemask_compress(Register dst) {
1415   // Example input, dst = 0x01 00 00 00 01 01 00 01
1416   // The "??" bytes are garbage.
1417   orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
1418   orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
1419   orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
1420   andr(dst, dst, 0xff);                   // dst = 0x8D
1421 }
1422 
1423 // Pack the value of each mask element in "src" into a long value in "dst", at most
1424 // the first 64 lane elements. The input "src" is a vector of boolean represented as
1425 // bytes with 0x00/0x01 as element values. Each lane value from "src" is packed into
1426 // one bit in "dst".
1427 //
1428 // Example:   src = 0x0001010000010001 0100000001010001, lane_cnt = 16
1429 // Expected:  dst = 0x658D
1430 //
1431 // Clobbers: rscratch1
1432 void C2_MacroAssembler::sve_vmask_tolong(Register dst, FloatRegister src,
1433                                          FloatRegister vtmp, int lane_cnt) {
1434   assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1435   assert_different_registers(dst, rscratch1);
1436   assert_different_registers(src, vtmp);
1437   assert(UseSVE > 0, "must be");
1438 
1439   // Compress the lowest 8 bytes.
1440   fmovd(dst, src);
1441   bytemask_compress(dst);
1442   if (lane_cnt <= 8) return;
1443 
1444   // Repeat on higher bytes and join the results.
1445   // Compress 8 bytes in each iteration.
1446   for (int idx = 1; idx < (lane_cnt / 8); idx++) {
1447     sve_extract_integral(rscratch1, T_LONG, src, idx, vtmp);
1448     bytemask_compress(rscratch1);
1449     orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
1450   }
1451 }
1452 
1453 // This function is the same as "sve_vmask_tolong" above, but it uses SVE2's BEXT
1454 // instruction, which requires the FEAT_BITPERM feature.
1455 void C2_MacroAssembler::sve2_vmask_tolong(Register dst, FloatRegister src,
1456                                           FloatRegister vtmp1, FloatRegister vtmp2,
1457                                           int lane_cnt) {
1458   assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1459   assert_different_registers(src, vtmp1, vtmp2);
1460   assert(UseSVE > 1 && VM_Version::supports_svebitperm(), "must be");
1461 
  // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
  // is to compress the significant bit of each byte in a cross-lane way. Due
  // to the lack of a cross-lane bit-compress instruction, we use BEXT
  // (bit-compress in each lane) with the largest lane size (T = D) and then
  // concatenate the results.
1467 
1468   // The second source input of BEXT, initialized with 0x01 in each byte.
1469   // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1470   sve_dup(vtmp2, B, 1);
1471 
1472   // BEXT vtmp1.D, src.D, vtmp2.D
1473   // src   = 0x0001010000010001 | 0x0100000001010001
1474   // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1475   //         ---------------------------------------
1476   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1477   sve_bext(vtmp1, D, src, vtmp2);
1478 
  // Concatenate the least significant 8 bits of each 8-byte lane, and extract the
  // result to dst.
1481   // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1482   // dst   = 0x658D
1483   if (lane_cnt <= 8) {
1484     // No need to concatenate.
1485     umov(dst, vtmp1, B, 0);
1486   } else if (lane_cnt <= 16) {
1487     ins(vtmp1, B, vtmp1, 1, 8);
1488     umov(dst, vtmp1, H, 0);
1489   } else {
1490     // As the lane count is 64 at most, the final expected value must be in
1491     // the lowest 64 bits after narrowing vtmp1 from D to B.
1492     sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
1493     umov(dst, vtmp1, D, 0);
1494   }
1495 }
1496 
// Unpack the mask, a long value in "src", into a vector of booleans in "dst",
// represented as bytes with 0x00/0x01 as element values. Each bit in "src" is unpacked
// into one byte lane in "dst". Note that "dst" can support at most 64 lanes.
1501 //
// The example below gives the expected "dst" vector register for a valid "src" (0x658D)
// on a machine with a 128-bit vector size.
1504 // dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1505 void C2_MacroAssembler::sve_vmask_fromlong(FloatRegister dst, Register src,
1506                                            FloatRegister vtmp, int lane_cnt) {
1507   assert_different_registers(dst, vtmp);
1508   assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
1509          lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
1510 
1511   // Example:   src = 0x658D, lane_cnt = 16
1512   // Expected:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1513 
1514   // Put long value from general purpose register into the first lane of vector.
1515   // vtmp = 0x0000000000000000 | 0x000000000000658D
1516   sve_dup(vtmp, B, 0);
1517   mov(vtmp, D, 0, src);
1518 
  // Transform the value in the first lane from a bit mask into a byte mask, which
  // can be done with SVE2's BDEP instruction.
1521 
  // The first source input of the BDEP instruction. Spread each significant byte of the
  // mask into its own 8-byte lane.
1523   // vtmp = 0x0000000000000065 | 0x000000000000008D
1524   if (lane_cnt <= 8) {
    // Nothing to do, as only one byte exists.
1526   } else if (lane_cnt <= 16) {
1527     ins(vtmp, B, vtmp, 8, 1);
1528   } else {
1529     sve_vector_extend(vtmp, D, vtmp, B);
1530   }
1531 
1532   // The second source input of BDEP instruction, initialized with 0x01 for each byte.
1533   // dst = 0x01010101 0x01010101 0x01010101 0x01010101
1534   sve_dup(dst, B, 1);
1535 
1536   // BDEP dst.D, vtmp.D, dst.D
1537   // vtmp = 0x0000000000000065 | 0x000000000000008D
1538   // dst  = 0x0101010101010101 | 0x0101010101010101
1539   //        ---------------------------------------
1540   // dst  = 0x0001010000010001 | 0x0100000001010001
1541   sve_bdep(dst, D, vtmp, dst);
1542 }
1543 
1544 // Clobbers: rflags
1545 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1546                                     FloatRegister zn, FloatRegister zm, Condition cond) {
1547   assert(pg->is_governing(), "This register has to be a governing predicate register");
1548   FloatRegister z1 = zn, z2 = zm;
1549   switch (cond) {
1550     case LE: z1 = zm; z2 = zn; cond = GE; break;
1551     case LT: z1 = zm; z2 = zn; cond = GT; break;
1552     case LO: z1 = zm; z2 = zn; cond = HI; break;
1553     case LS: z1 = zm; z2 = zn; cond = HS; break;
1554     default:
1555       break;
1556   }
1557 
1558   SIMD_RegVariant size = elemType_to_regVariant(bt);
1559   if (is_floating_point_type(bt)) {
1560     sve_fcm(cond, pd, size, pg, z1, z2);
1561   } else {
1562     assert(is_integral_type(bt), "unsupported element type");
1563     sve_cmp(cond, pd, size, pg, z1, z2);
1564   }
1565 }
1566 
1567 // Get index of the last mask lane that is set
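// Illustrative example (assuming MaxVectorSize = 16 and bt = T_INT, i.e. 4 lanes,
// shown high <-- low):
//   src  = 0 1 0 0                 (the last set lane has index 2)
//   sve_rev  => ptmp = 0 0 1 0
//   sve_brkb => ptmp = 0 0 0 1     (lanes before the first set lane of the reversed mask)
//   sve_cntp => 1, so dst = (4 - 1) - 1 = 2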
1568 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
1569   SIMD_RegVariant size = elemType_to_regVariant(bt);
1570   sve_rev(ptmp, size, src);
1571   sve_brkb(ptmp, ptrue, ptmp, false);
1572   sve_cntp(dst, size, ptrue, ptmp);
1573   movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
1574   subw(dst, rscratch1, dst);
1575 }
1576 
1577 // Extend integer vector src to dst with the same lane count
1578 // but larger element size, e.g. 4B -> 4I
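// For instance (illustrative), a signed 4B -> 4I extension is emitted as two widening
// shifts by zero:
//   sshll dst.8h, src.8b, #0   // 4B -> 4S (only the low 4 lanes are meaningful)
//   sshll dst.4s, dst.4h, #0   // 4S -> 4I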
1579 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1580                                            FloatRegister src, BasicType src_bt, bool is_unsigned) {
1581   if (src_bt == T_BYTE) {
1582     // 4B to 4S/4I, 8B to 8S
1583     assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1584     assert(dst_bt == T_SHORT || dst_bt == T_INT, "unsupported");
1585     _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1586     if (dst_bt == T_INT) {
1587       _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1588     }
1589   } else if (src_bt == T_SHORT) {
1590     // 2S to 2I/2L, 4S to 4I
1591     assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1592     assert(dst_bt == T_INT || dst_bt == T_LONG, "unsupported");
1593     _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1594     if (dst_bt == T_LONG) {
1595       _xshll(is_unsigned, dst, T2D, dst, T2S, 0);
1596     }
1597   } else if (src_bt == T_INT) {
1598     // 2I to 2L
1599     assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1600     _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1601   } else {
1602     ShouldNotReachHere();
1603   }
1604 }
1605 
1606 // Narrow integer vector src down to dst with the same lane count
1607 // but smaller element size, e.g. 4I -> 4B
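// For instance (illustrative, shown high <-- low), narrowing 4I -> 4B takes two steps:
//   src = [0x00000004 0x00000003 0x00000002 0x00000001]
//   xtn dst.4h, src.4s  => dst = [0x0004 0x0003 0x0002 0x0001] in the low 64 bits
//   xtn dst.8b, dst.8h  => the low 4 bytes of dst hold [04 03 02 01]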
1608 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1609                                            FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1610   if (src_bt == T_SHORT) {
1611     // 4S/8S to 4B/8B
1612     assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1613     assert(dst_bt == T_BYTE, "unsupported");
1614     xtn(dst, T8B, src, T8H);
1615   } else if (src_bt == T_INT) {
1616     // 2I to 2S, 4I to 4B/4S
1617     assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1618     assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1619     xtn(dst, T4H, src, T4S);
1620     if (dst_bt == T_BYTE) {
1621       xtn(dst, T8B, dst, T8H);
1622     }
1623   } else if (src_bt == T_LONG) {
1624     // 2L to 2S/2I
1625     assert(src_vlen_in_bytes == 16, "unsupported");
1626     assert(dst_bt == T_INT || dst_bt == T_SHORT, "unsupported");
1627     xtn(dst, T2S, src, T2D);
1628     if (dst_bt == T_SHORT) {
1629       xtn(dst, T4H, dst, T4S);
1630     }
1631   } else {
1632     ShouldNotReachHere();
1633   }
1634 }
1635 
1636 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1637                                           FloatRegister src, SIMD_RegVariant src_size,
1638                                           bool is_unsigned) {
1639   assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
1640 
1641   if (src_size == B) {
1642     switch (dst_size) {
1643     case H:
1644       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1645       break;
1646     case S:
1647       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1648       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1649       break;
1650     case D:
1651       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1652       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1653       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1654       break;
1655     default:
1656       ShouldNotReachHere();
1657     }
1658   } else if (src_size == H) {
1659     if (dst_size == S) {
1660       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1661     } else { // D
1662       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1663       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1664     }
1665   } else if (src_size == S) {
1666     _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
1667   }
1668 }
1669 
1670 // Vector narrow from src to dst with specified element sizes.
1671 // High part of dst vector will be filled with zero.
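// Illustrative example (high <-- low), narrowing D -> B on a 128-bit vector, with tmp
// zeroed first:
//   src = [0x0000000000000022 0x0000000000000011]
//   uzp1 at S: dst = [0 0 0x00000022 0x00000011]
//   uzp1 at H: dst = [0 0 0 0 0 0 0x0022 0x0011]
//   uzp1 at B: dst = [0 ... 0 0x22 0x11]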
1672 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1673                                           FloatRegister src, SIMD_RegVariant src_size,
1674                                           FloatRegister tmp) {
1675   assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1676   assert_different_registers(src, tmp);
1677   sve_dup(tmp, src_size, 0);
1678   if (src_size == D) {
1679     switch (dst_size) {
1680     case S:
1681       sve_uzp1(dst, S, src, tmp);
1682       break;
1683     case H:
1684       assert_different_registers(dst, tmp);
1685       sve_uzp1(dst, S, src, tmp);
1686       sve_uzp1(dst, H, dst, tmp);
1687       break;
1688     case B:
1689       assert_different_registers(dst, tmp);
1690       sve_uzp1(dst, S, src, tmp);
1691       sve_uzp1(dst, H, dst, tmp);
1692       sve_uzp1(dst, B, dst, tmp);
1693       break;
1694     default:
1695       ShouldNotReachHere();
1696     }
1697   } else if (src_size == S) {
1698     if (dst_size == H) {
1699       sve_uzp1(dst, H, src, tmp);
1700     } else { // B
1701       assert_different_registers(dst, tmp);
1702       sve_uzp1(dst, H, src, tmp);
1703       sve_uzp1(dst, B, dst, tmp);
1704     }
1705   } else if (src_size == H) {
1706     sve_uzp1(dst, B, src, tmp);
1707   }
1708 }
1709 
// Extend the src predicate to the dst predicate with the same lane count but a larger
// element size, e.g. 64Byte -> 512Long (8 byte lanes widened to 8 long lanes)
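// An illustrative widening example, in the same spirit as the narrowing comment further
// below (assuming a 128-bit machine):
// 64Int -> 128Long, i.e. 2I -> 2L
// Mask (for 2 Ints) : TF
// Predicate register for the above mask (lowest 8 bits) : 0001 0000
// After widening (punpklo dst, src) : 00000001 00000000
// Which translates to a mask for 2 Longs : TF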
1712 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1713                                              uint dst_element_length_in_bytes,
1714                                              uint src_element_length_in_bytes) {
1715   if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1716     sve_punpklo(dst, src);
1717   } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1718     sve_punpklo(dst, src);
1719     sve_punpklo(dst, dst);
1720   } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1721     sve_punpklo(dst, src);
1722     sve_punpklo(dst, dst);
1723     sve_punpklo(dst, dst);
1724   } else {
1725     assert(false, "unsupported");
1726     ShouldNotReachHere();
1727   }
1728 }
1729 
// Narrow the src predicate to the dst predicate with the same lane count but a
// smaller element size, e.g. 512Long -> 64Byte (8 long lanes narrowed to 8 byte lanes)
1732 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1733                                              uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
1734   // The insignificant bits in src predicate are expected to be zero.
1735   // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
1736   // passed as the second argument. An example narrowing operation with a given mask would be -
  // 128Long -> 64Int on a 128-bit machine, i.e. 2L -> 2I
1738   // Mask (for 2 Longs) : TF
1739   // Predicate register for the above mask (16 bits) : 00000001 00000000
1740   // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
1741   // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0)
1742   assert_different_registers(src, ptmp);
1743   assert_different_registers(dst, ptmp);
1744   sve_pfalse(ptmp);
1745   if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1746     sve_uzp1(dst, B, src, ptmp);
1747   } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1748     sve_uzp1(dst, H, src, ptmp);
1749     sve_uzp1(dst, B, dst, ptmp);
1750   } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1751     sve_uzp1(dst, S, src, ptmp);
1752     sve_uzp1(dst, H, dst, ptmp);
1753     sve_uzp1(dst, B, dst, ptmp);
1754   } else {
1755     assert(false, "unsupported");
1756     ShouldNotReachHere();
1757   }
1758 }
1759 
1760 // Vector reduction add for integral type with ASIMD instructions.
1761 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1762                                                  Register isrc, FloatRegister vsrc,
1763                                                  unsigned vector_length_in_bytes,
1764                                                  FloatRegister vtmp) {
1765   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1766   assert_different_registers(dst, isrc);
1767   bool isQ = vector_length_in_bytes == 16;
1768 
1769   BLOCK_COMMENT("neon_reduce_add_integral {");
1770     switch(bt) {
1771       case T_BYTE:
1772         addv(vtmp, isQ ? T16B : T8B, vsrc);
1773         smov(dst, vtmp, B, 0);
1774         addw(dst, dst, isrc, ext::sxtb);
1775         break;
1776       case T_SHORT:
1777         addv(vtmp, isQ ? T8H : T4H, vsrc);
1778         smov(dst, vtmp, H, 0);
1779         addw(dst, dst, isrc, ext::sxth);
1780         break;
1781       case T_INT:
1782         isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1783         umov(dst, vtmp, S, 0);
1784         addw(dst, dst, isrc);
1785         break;
1786       case T_LONG:
1787         assert(isQ, "unsupported");
1788         addpd(vtmp, vsrc);
1789         umov(dst, vtmp, D, 0);
1790         add(dst, dst, isrc);
1791         break;
1792       default:
1793         assert(false, "unsupported");
1794         ShouldNotReachHere();
1795     }
1796   BLOCK_COMMENT("} neon_reduce_add_integral");
1797 }
1798 
1799 // Vector reduction multiply for integral type with ASIMD instructions.
1800 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
1801 // Clobbers: rscratch1
1802 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
1803                                                  Register isrc, FloatRegister vsrc,
1804                                                  unsigned vector_length_in_bytes,
1805                                                  FloatRegister vtmp1, FloatRegister vtmp2) {
1806   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1807   bool isQ = vector_length_in_bytes == 16;
1808 
1809   BLOCK_COMMENT("neon_reduce_mul_integral {");
1810     switch(bt) {
1811       case T_BYTE:
1812         if (isQ) {
1813           // Multiply the lower half and higher half of vector iteratively.
1814           // vtmp1 = vsrc[8:15]
1815           ins(vtmp1, D, vsrc, 0, 1);
1816           // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
1817           mulv(vtmp1, T8B, vtmp1, vsrc);
1818           // vtmp2 = vtmp1[4:7]
1819           ins(vtmp2, S, vtmp1, 0, 1);
1820           // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
1821           mulv(vtmp1, T8B, vtmp2, vtmp1);
1822         } else {
1823           ins(vtmp1, S, vsrc, 0, 1);
1824           mulv(vtmp1, T8B, vtmp1, vsrc);
1825         }
1826         // vtmp2 = vtmp1[2:3]
1827         ins(vtmp2, H, vtmp1, 0, 1);
1828         // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
1829         mulv(vtmp2, T8B, vtmp2, vtmp1);
1830         // dst = vtmp2[0] * isrc * vtmp2[1]
1831         umov(rscratch1, vtmp2, B, 0);
1832         mulw(dst, rscratch1, isrc);
1833         sxtb(dst, dst);
1834         umov(rscratch1, vtmp2, B, 1);
1835         mulw(dst, rscratch1, dst);
1836         sxtb(dst, dst);
1837         break;
1838       case T_SHORT:
1839         if (isQ) {
1840           ins(vtmp2, D, vsrc, 0, 1);
1841           mulv(vtmp2, T4H, vtmp2, vsrc);
1842           ins(vtmp1, S, vtmp2, 0, 1);
1843           mulv(vtmp1, T4H, vtmp1, vtmp2);
1844         } else {
1845           ins(vtmp1, S, vsrc, 0, 1);
1846           mulv(vtmp1, T4H, vtmp1, vsrc);
1847         }
1848         umov(rscratch1, vtmp1, H, 0);
1849         mulw(dst, rscratch1, isrc);
1850         sxth(dst, dst);
1851         umov(rscratch1, vtmp1, H, 1);
1852         mulw(dst, rscratch1, dst);
1853         sxth(dst, dst);
1854         break;
1855       case T_INT:
1856         if (isQ) {
1857           ins(vtmp1, D, vsrc, 0, 1);
1858           mulv(vtmp1, T2S, vtmp1, vsrc);
1859         } else {
1860           vtmp1 = vsrc;
1861         }
1862         umov(rscratch1, vtmp1, S, 0);
1863         mul(dst, rscratch1, isrc);
1864         umov(rscratch1, vtmp1, S, 1);
1865         mul(dst, rscratch1, dst);
1866         break;
1867       case T_LONG:
1868         umov(rscratch1, vsrc, D, 0);
1869         mul(dst, isrc, rscratch1);
1870         umov(rscratch1, vsrc, D, 1);
1871         mul(dst, dst, rscratch1);
1872         break;
1873       default:
1874         assert(false, "unsupported");
1875         ShouldNotReachHere();
1876     }
1877   BLOCK_COMMENT("} neon_reduce_mul_integral");
1878 }
1879 
1880 // Vector reduction multiply for floating-point type with ASIMD instructions.
1881 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
1882                                            FloatRegister fsrc, FloatRegister vsrc,
1883                                            unsigned vector_length_in_bytes,
1884                                            FloatRegister vtmp) {
1885   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1886   bool isQ = vector_length_in_bytes == 16;
1887 
1888   BLOCK_COMMENT("neon_reduce_mul_fp {");
1889     switch(bt) {
1890       case T_FLOAT:
1891         fmuls(dst, fsrc, vsrc);
1892         ins(vtmp, S, vsrc, 0, 1);
1893         fmuls(dst, dst, vtmp);
1894         if (isQ) {
1895           ins(vtmp, S, vsrc, 0, 2);
1896           fmuls(dst, dst, vtmp);
1897           ins(vtmp, S, vsrc, 0, 3);
1898           fmuls(dst, dst, vtmp);
1899          }
1900         break;
1901       case T_DOUBLE:
1902         assert(isQ, "unsupported");
1903         fmuld(dst, fsrc, vsrc);
1904         ins(vtmp, D, vsrc, 0, 1);
1905         fmuld(dst, dst, vtmp);
1906         break;
1907       default:
1908         assert(false, "unsupported");
1909         ShouldNotReachHere();
1910     }
1911   BLOCK_COMMENT("} neon_reduce_mul_fp");
1912 }
1913 
1914 // Helper to select logical instruction
1915 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
1916                                                    Register Rn, Register Rm,
1917                                                    enum shift_kind kind, unsigned shift) {
1918   switch(opc) {
1919     case Op_AndReductionV:
1920       is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
1921       break;
1922     case Op_OrReductionV:
1923       is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
1924       break;
1925     case Op_XorReductionV:
1926       is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
1927       break;
1928     default:
1929       assert(false, "unsupported");
1930       ShouldNotReachHere();
1931   }
1932 }
1933 
1934 // Vector reduction logical operations And, Or, Xor
1935 // Clobbers: rscratch1
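// The source vector is first split into halves that are moved into general purpose
// registers and combined there; the combined value is then folded down by shifting.
// Illustrative case for a 128-bit vector of ints [d3 d2 d1 d0] (high <-- low):
//   rscratch1 = d1:d0, dst = d3:d2
//   dst = dst OP rscratch1        => {d3 OP d1, d2 OP d0}
//   dst = dst OP (dst LSR 32)     => low 32 bits hold d0 OP d1 OP d2 OP d3
//   dst = dst OP isrc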
1936 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
1937                                             Register isrc, FloatRegister vsrc,
1938                                             unsigned vector_length_in_bytes) {
1939   assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
1940          "unsupported");
1941   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1942   assert_different_registers(dst, isrc);
1943   bool isQ = vector_length_in_bytes == 16;
1944 
1945   BLOCK_COMMENT("neon_reduce_logical {");
1946     umov(rscratch1, vsrc, isQ ? D : S, 0);
1947     umov(dst, vsrc, isQ ? D : S, 1);
1948     neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
1949     switch(bt) {
1950       case T_BYTE:
1951         if (isQ) {
1952           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1953         }
1954         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1955         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
1956         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1957         sxtb(dst, dst);
1958         break;
1959       case T_SHORT:
1960         if (isQ) {
1961           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1962         }
1963         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1964         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1965         sxth(dst, dst);
1966         break;
1967       case T_INT:
1968         if (isQ) {
1969           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1970         }
1971         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1972         break;
1973       case T_LONG:
1974         assert(isQ, "unsupported");
1975         neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
1976         break;
1977       default:
1978         assert(false, "unsupported");
1979         ShouldNotReachHere();
1980     }
1981   BLOCK_COMMENT("} neon_reduce_logical");
1982 }
1983 
1984 // Vector reduction min/max for integral type with ASIMD instructions.
1985 // Note: vtmp is not used and expected to be fnoreg for T_LONG case.
1986 // Clobbers: rscratch1, rflags
1987 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
1988                                                     Register isrc, FloatRegister vsrc,
1989                                                     unsigned vector_length_in_bytes,
1990                                                     FloatRegister vtmp) {
1991   assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
1992   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1993   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
1994   assert_different_registers(dst, isrc);
1995   bool isQ = vector_length_in_bytes == 16;
1996   bool is_min = opc == Op_MinReductionV;
1997 
1998   BLOCK_COMMENT("neon_reduce_minmax_integral {");
1999     if (bt == T_LONG) {
2000       assert(vtmp == fnoreg, "should be");
2001       assert(isQ, "should be");
2002       umov(rscratch1, vsrc, D, 0);
2003       cmp(isrc, rscratch1);
2004       csel(dst, isrc, rscratch1, is_min ? LT : GT);
2005       umov(rscratch1, vsrc, D, 1);
2006       cmp(dst, rscratch1);
2007       csel(dst, dst, rscratch1, is_min ? LT : GT);
2008     } else {
2009       SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
2010       if (size == T2S) {
2011         is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
2012       } else {
2013         is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
2014       }
2015       if (bt == T_INT) {
2016         umov(dst, vtmp, S, 0);
2017       } else {
2018         smov(dst, vtmp, elemType_to_regVariant(bt), 0);
2019       }
2020       cmpw(dst, isrc);
2021       cselw(dst, dst, isrc, is_min ? LT : GT);
2022     }
2023   BLOCK_COMMENT("} neon_reduce_minmax_integral");
2024 }
2025 
2026 // Vector reduction for integral type with SVE instruction.
2027 // Supported operations are Add, And, Or, Xor, Max, Min.
2028 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
2029 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
2030                                             FloatRegister src2, PRegister pg, FloatRegister tmp) {
2031   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2032   assert(pg->is_governing(), "This register has to be a governing predicate register");
2033   assert_different_registers(src1, dst);
2034   // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
2035   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2036   switch (opc) {
2037     case Op_AddReductionVI: {
2038       sve_uaddv(tmp, size, pg, src2);
2039       if (bt == T_BYTE) {
2040         smov(dst, tmp, size, 0);
2041         addw(dst, src1, dst, ext::sxtb);
2042       } else if (bt == T_SHORT) {
2043         smov(dst, tmp, size, 0);
2044         addw(dst, src1, dst, ext::sxth);
2045       } else {
2046         umov(dst, tmp, size, 0);
2047         addw(dst, dst, src1);
2048       }
2049       break;
2050     }
2051     case Op_AddReductionVL: {
2052       sve_uaddv(tmp, size, pg, src2);
2053       umov(dst, tmp, size, 0);
2054       add(dst, dst, src1);
2055       break;
2056     }
2057     case Op_AndReductionV: {
2058       sve_andv(tmp, size, pg, src2);
2059       if (bt == T_INT || bt == T_LONG) {
2060         umov(dst, tmp, size, 0);
2061       } else {
2062         smov(dst, tmp, size, 0);
2063       }
2064       if (bt == T_LONG) {
2065         andr(dst, dst, src1);
2066       } else {
2067         andw(dst, dst, src1);
2068       }
2069       break;
2070     }
2071     case Op_OrReductionV: {
2072       sve_orv(tmp, size, pg, src2);
2073       if (bt == T_INT || bt == T_LONG) {
2074         umov(dst, tmp, size, 0);
2075       } else {
2076         smov(dst, tmp, size, 0);
2077       }
2078       if (bt == T_LONG) {
2079         orr(dst, dst, src1);
2080       } else {
2081         orrw(dst, dst, src1);
2082       }
2083       break;
2084     }
2085     case Op_XorReductionV: {
2086       sve_eorv(tmp, size, pg, src2);
2087       if (bt == T_INT || bt == T_LONG) {
2088         umov(dst, tmp, size, 0);
2089       } else {
2090         smov(dst, tmp, size, 0);
2091       }
2092       if (bt == T_LONG) {
2093         eor(dst, dst, src1);
2094       } else {
2095         eorw(dst, dst, src1);
2096       }
2097       break;
2098     }
2099     case Op_MaxReductionV: {
2100       sve_smaxv(tmp, size, pg, src2);
2101       if (bt == T_INT || bt == T_LONG) {
2102         umov(dst, tmp, size, 0);
2103       } else {
2104         smov(dst, tmp, size, 0);
2105       }
2106       if (bt == T_LONG) {
2107         cmp(dst, src1);
2108         csel(dst, dst, src1, Assembler::GT);
2109       } else {
2110         cmpw(dst, src1);
2111         cselw(dst, dst, src1, Assembler::GT);
2112       }
2113       break;
2114     }
2115     case Op_MinReductionV: {
2116       sve_sminv(tmp, size, pg, src2);
2117       if (bt == T_INT || bt == T_LONG) {
2118         umov(dst, tmp, size, 0);
2119       } else {
2120         smov(dst, tmp, size, 0);
2121       }
2122       if (bt == T_LONG) {
2123         cmp(dst, src1);
2124         csel(dst, dst, src1, Assembler::LT);
2125       } else {
2126         cmpw(dst, src1);
2127         cselw(dst, dst, src1, Assembler::LT);
2128       }
2129       break;
2130     }
2131     default:
2132       assert(false, "unsupported");
2133       ShouldNotReachHere();
2134   }
2135 
2136   if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
2137     if (bt == T_BYTE) {
2138       sxtb(dst, dst);
2139     } else if (bt == T_SHORT) {
2140       sxth(dst, dst);
2141     }
2142   }
2143 }
2144 
// Set the elements of the dst predicate to true for lanes in the range [0, lane_cnt),
// and to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
2147 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
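// Illustrative cases (assuming bt = T_INT and a max vector length of 16 lanes):
//   lane_cnt = 5  => sve_ptrue with the fixed VL5 pattern
//   lane_cnt = 15 => sve_ptrue with the MUL3 pattern, since 15 == 16 - 16 % 3
//   lane_cnt = 12 => no pattern matches, so mov(rscratch1, 12) + sve_whileltw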
2148 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
2149   uint32_t max_vector_length = Matcher::max_vector_size(bt);
2150   assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
2151 
2152   // Set all elements to false if the input "lane_cnt" is zero.
2153   if (lane_cnt == 0) {
2154     sve_pfalse(dst);
2155     return;
2156   }
2157 
2158   SIMD_RegVariant size = elemType_to_regVariant(bt);
2159   assert(size != Q, "invalid size");
2160 
  // Set all true if "lane_cnt" equals the max lane count.
2162   if (lane_cnt == max_vector_length) {
2163     sve_ptrue(dst, size, /* ALL */ 0b11111);
2164     return;
2165   }
2166 
2167   // Fixed numbers for "ptrue".
2168   switch(lane_cnt) {
2169   case 1: /* VL1 */
2170   case 2: /* VL2 */
2171   case 3: /* VL3 */
2172   case 4: /* VL4 */
2173   case 5: /* VL5 */
2174   case 6: /* VL6 */
2175   case 7: /* VL7 */
2176   case 8: /* VL8 */
2177     sve_ptrue(dst, size, lane_cnt);
2178     return;
2179   case 16:
2180     sve_ptrue(dst, size, /* VL16 */ 0b01001);
2181     return;
2182   case 32:
2183     sve_ptrue(dst, size, /* VL32 */ 0b01010);
2184     return;
2185   case 64:
2186     sve_ptrue(dst, size, /* VL64 */ 0b01011);
2187     return;
2188   case 128:
2189     sve_ptrue(dst, size, /* VL128 */ 0b01100);
2190     return;
2191   case 256:
2192     sve_ptrue(dst, size, /* VL256 */ 0b01101);
2193     return;
2194   default:
2195     break;
2196   }
2197 
2198   // Special patterns for "ptrue".
2199   if (lane_cnt == round_down_power_of_2(max_vector_length)) {
2200     sve_ptrue(dst, size, /* POW2 */ 0b00000);
2201   } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
2202     sve_ptrue(dst, size, /* MUL4 */ 0b11101);
2203   } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
2204     sve_ptrue(dst, size, /* MUL3 */ 0b11110);
2205   } else {
2206     // Encode to "whileltw" for the remaining cases.
2207     mov(rscratch1, lane_cnt);
2208     sve_whileltw(dst, size, zr, rscratch1);
2209   }
2210 }
2211 
2212 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
2213 // Any remaining elements of dst will be filled with zero.
2214 // Clobbers: rscratch1
2215 // Preserves: mask, vzr
2216 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
2217                                            FloatRegister vzr, FloatRegister vtmp,
2218                                            PRegister pgtmp, unsigned vector_length_in_bytes) {
2219   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2220   // When called by sve_compress_byte, src and vtmp may be the same register.
2221   assert_different_registers(dst, src, vzr);
2222   assert_different_registers(dst, vtmp, vzr);
2223   assert_different_registers(mask, pgtmp);
2224   // high <-- low
2225   // Example input:   src   = hh gg ff ee dd cc bb aa, one character is 8 bits.
2226   //                  mask  = 01 00 00 01 01 00 01 01, one character is 1 bit.
2227   // Expected result: dst   = 00 00 00 hh ee dd bb aa
2228 
2229   // Extend lowest half to type INT.
2230   // dst   =  00dd  00cc  00bb  00aa
2231   sve_uunpklo(dst, S, src);
2232   // pgtmp =  0001  0000  0001  0001
2233   sve_punpklo(pgtmp, mask);
  // Pack the active INT-sized elements to the right,
  // and fill the remaining elements with zero.
2236   // dst   =  0000  00dd  00bb  00aa
2237   sve_compact(dst, S, dst, pgtmp);
2238   // Narrow the result back to type SHORT.
2239   // dst   = 00 00 00 00 00 dd bb aa
2240   sve_uzp1(dst, H, dst, vzr);
2241 
2242   // Return if the vector length is no more than MaxVectorSize/2, since the
2243   // highest half is invalid.
2244   if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
2245     return;
2246   }
2247 
2248   // Count the active elements of lowest half.
2249   // rscratch1 = 3
2250   sve_cntp(rscratch1, S, ptrue, pgtmp);
2251 
2252   // Repeat to the highest half.
2253   // pgtmp =  0001  0000  0000  0001
2254   sve_punpkhi(pgtmp, mask);
2255   // vtmp  =  00hh  00gg  00ff  00ee
2256   sve_uunpkhi(vtmp, S, src);
2257   // vtmp  =  0000  0000  00hh  00ee
2258   sve_compact(vtmp, S, vtmp, pgtmp);
2259   // vtmp  = 00 00 00 00 00 00 hh ee
2260   sve_uzp1(vtmp, H, vtmp, vzr);
2261 
2262   // pgtmp = 00 00 00 00 00 01 01 01
2263   sve_whilelt(pgtmp, H, zr, rscratch1);
2264   // Compressed low:  dst  = 00 00 00 00 00 dd bb aa
2265   // Compressed high: vtmp = 00 00 00 00 00 00 hh ee
2266   // Combine the compressed low with the compressed high:
2267   //                  dst  = 00 00 00 hh ee dd bb aa
2268   sve_splice(dst, H, pgtmp, vtmp);
2269 }
2270 
2271 // Clobbers: rscratch1, rscratch2
2272 // Preserves: src, mask
2273 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
2274                                           FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
2275                                           PRegister ptmp, PRegister pgtmp, unsigned vector_length_in_bytes) {
2276   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2277   assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3);
2278   assert_different_registers(mask, ptmp, pgtmp);
2279   // high <-- low
2280   // Example input:   src   = q p n m l k j i h g f e d c b a, one character is 8 bits.
2281   //                  mask  = 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 1, one character is 1 bit.
2282   // Expected result: dst   = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
2283   FloatRegister vzr = vtmp3;
2284   sve_dup(vzr, B, 0);
2285 
2286   // Extend lowest half to type SHORT.
2287   // vtmp1 =  0h  0g  0f  0e  0d  0c  0b  0a
2288   sve_uunpklo(vtmp1, H, src);
2289   // ptmp  =  00  01  00  00  00  01  00  01
2290   sve_punpklo(ptmp, mask);
  // Pack the active SHORT-sized elements to the right,
  // and fill the remaining elements with zero.
2293   // dst   =  00  00  00  00  00  0g  0c  0a
2294   unsigned extended_size = vector_length_in_bytes << 1;
2295   sve_compress_short(dst, vtmp1, ptmp, vzr, vtmp2, pgtmp, extended_size > MaxVectorSize ? MaxVectorSize : extended_size);
2296   // Narrow the result back to type BYTE.
2297   // dst   = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
2298   sve_uzp1(dst, B, dst, vzr);
2299 
2300   // Return if the vector length is no more than MaxVectorSize/2, since the
2301   // highest half is invalid.
2302   if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
2303     return;
2304   }
2305   // Count the active elements of lowest half.
2306   // rscratch2 = 3
2307   sve_cntp(rscratch2, H, ptrue, ptmp);
2308 
2309   // Repeat to the highest half.
2310   // ptmp  =  00  01  00  00  00  00  00  01
2311   sve_punpkhi(ptmp, mask);
2312   // vtmp2 =  0q  0p  0n  0m  0l  0k  0j  0i
2313   sve_uunpkhi(vtmp2, H, src);
2314   // vtmp1 =  00  00  00  00  00  00  0p  0i
2315   sve_compress_short(vtmp1, vtmp2, ptmp, vzr, vtmp2, pgtmp, extended_size - MaxVectorSize);
2316   // vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
2317   sve_uzp1(vtmp1, B, vtmp1, vzr);
2318 
2319   // ptmp  = 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
2320   sve_whilelt(ptmp, B, zr, rscratch2);
2321   // Compressed low:  dst   = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
2322   // Compressed high: vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
2323   // Combine the compressed low with the compressed high:
2324   //                  dst   = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
2325   sve_splice(dst, B, ptmp, vtmp1);
2326 }
2327 
2328 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
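  // Reverse the bits within each element. For elements wider than a byte this is done by
  // first reversing the bytes of each element and then reversing the bits within each byte.
  // An illustrative T_INT lane: 0x12345678 --rev32--> 0x78563412 --rbit--> 0x1E6A2C48.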
2329   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2330   SIMD_Arrangement size = isQ ? T16B : T8B;
2331   if (bt == T_BYTE) {
2332     rbit(dst, size, src);
2333   } else {
2334     neon_reverse_bytes(dst, src, bt, isQ);
2335     rbit(dst, size, dst);
2336   }
2337 }
2338 
2339 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2340   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2341   SIMD_Arrangement size = isQ ? T16B : T8B;
2342   switch (bt) {
2343     case T_BYTE:
2344       if (dst != src) {
2345         orr(dst, size, src, src);
2346       }
2347       break;
2348     case T_SHORT:
2349       rev16(dst, size, src);
2350       break;
2351     case T_INT:
2352       rev32(dst, size, src);
2353       break;
2354     case T_LONG:
2355       rev64(dst, size, src);
2356       break;
2357     default:
2358       assert(false, "unsupported");
2359       ShouldNotReachHere();
2360   }
2361 }
2362 
2363 // VectorRearrange implementation for short/int/float/long/double types with NEON
// instructions. For VectorRearrange short/int/float, we use the NEON tbl instruction.
// But since it supports byte tables only, we need to look up 2/4 bytes as a group.
2366 // For VectorRearrange long/double, we compare the shuffle input with iota indices,
2367 // and use bsl to implement the operation.
2368 void C2_MacroAssembler::neon_rearrange_hsd(FloatRegister dst, FloatRegister src,
2369                                            FloatRegister shuffle, FloatRegister tmp,
2370                                            BasicType bt, bool isQ) {
2371   assert_different_registers(dst, src, shuffle, tmp);
2372   SIMD_Arrangement size1 = isQ ? T16B : T8B;
2373   SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2374 
2375   // Here is an example that rearranges a NEON vector with 4 ints:
2376   // Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1]
2377   //   1. We assume the shuffle input is Vi int[2, 3, 0, 1].
2378   //   2. Multiply Vi int[2, 3, 0, 1] with constant int vector
2379   //      [0x04040404, 0x04040404, 0x04040404, 0x04040404], and get
2380   //      tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404].
2381   //   3. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100],
2382   //      and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504]
2383   //   4. Use Vm as index register, and use V1 as table register.
2384   //      Then get V2 as the result by tbl NEON instructions.
2385   switch (bt) {
2386     case T_SHORT:
2387       mov(tmp, size1, 0x02);
2388       mulv(dst, size2, shuffle, tmp);
2389       mov(tmp, size2, 0x0100);
2390       addv(dst, size1, dst, tmp);
2391       tbl(dst, size1, src, 1, dst);
2392       break;
2393     case T_INT:
2394     case T_FLOAT:
2395       mov(tmp, size1, 0x04);
2396       mulv(dst, size2, shuffle, tmp);
2397       mov(tmp, size2, 0x03020100);
2398       addv(dst, size1, dst, tmp);
2399       tbl(dst, size1, src, 1, dst);
2400       break;
2401     case T_LONG:
2402     case T_DOUBLE:
2403       // Load the iota indices for Long type. The indices are ordered by
2404       // type B/S/I/L/F/D, and the offset between two types is 16; Hence
2405       // the offset for L is 48.
2406       lea(rscratch1,
2407           ExternalAddress(StubRoutines::aarch64::vector_iota_indices() + 48));
2408       ldrq(tmp, rscratch1);
      // Check whether the input "shuffle" is the same as the iota indices.
2410       // Return "src" if true, otherwise swap the two elements of "src".
2411       cm(EQ, dst, size2, shuffle, tmp);
2412       ext(tmp, size1, src, src, 8);
2413       bsl(dst, size1, src, tmp);
2414       break;
2415     default:
2416       assert(false, "unsupported element type");
2417       ShouldNotReachHere();
2418   }
2419 }
2420 
2421 // Extract a scalar element from an sve vector at position 'idx'.
2422 // The input elements in src are expected to be of integral type.
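// An illustrative case (assuming bt = T_INT and idx = 5 on a 256-bit vector): since
// 5 * 32 >= 128 the lane is out of reach of the NEON umov/smov forms, so the vector is
// copied and shifted down by idx * 4 = 20 bytes with sve_ext, after which lane 0 of
// vtmp holds the requested element.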
2423 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2424                                              int idx, FloatRegister vtmp) {
2425   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2426   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2427   if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
2428     if (bt == T_INT || bt == T_LONG) {
2429       umov(dst, src, size, idx);
2430     } else {
2431       smov(dst, src, size, idx);
2432     }
2433   } else {
2434     sve_orr(vtmp, src, src);
2435     sve_ext(vtmp, vtmp, idx << size);
2436     if (bt == T_INT || bt == T_LONG) {
2437       umov(dst, vtmp, size, 0);
2438     } else {
2439       smov(dst, vtmp, size, 0);
2440     }
2441   }
2442 }
2443 
2444 // java.lang.Math::round intrinsics
2445 
2446 // Clobbers: rscratch1, rflags
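// An illustrative case for why two candidate results are computed below: for a lane
// holding -2.5f, fcvtas yields -3 (ties away from zero), while Math.round(-2.5f) is
// floor(-2.5f + 0.5f) = -2. The unsigned comparison of -src against 2^23 (2^52 for
// doubles) routes small-magnitude negative lanes to the floor-based result and the
// remaining lanes (positive values, NaN, large magnitudes) to the fcvtas result.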
2447 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2448                                           FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
2449   assert_different_registers(tmp1, tmp2, tmp3, src, dst);
2450   switch (T) {
2451     case T2S:
2452     case T4S:
2453       fmovs(tmp1, T, 0.5f);
2454       mov(rscratch1, jint_cast(0x1.0p23f));
2455       break;
2456     case T2D:
2457       fmovd(tmp1, T, 0.5);
2458       mov(rscratch1, julong_cast(0x1.0p52));
2459       break;
2460     default:
2461       assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2462   }
2463   fadd(tmp1, T, tmp1, src);
2464   fcvtms(tmp1, T, tmp1);
2465   // tmp1 = floor(src + 0.5, ties to even)
2466 
2467   fcvtas(dst, T, src);
2468   // dst = round(src), ties to away
2469 
2470   fneg(tmp3, T, src);
2471   dup(tmp2, T, rscratch1);
2472   cm(HS, tmp3, T, tmp3, tmp2);
2473   // tmp3 is now a set of flags
2474 
2475   bif(dst, T16B, tmp1, tmp3);
2476   // result in dst
2477 }
2478 
2479 // Clobbers: rscratch1, rflags
2480 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2481                                          FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
2482   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2483   assert_different_registers(tmp1, tmp2, src, dst);
2484 
2485   switch (T) {
2486     case S:
2487       mov(rscratch1, jint_cast(0x1.0p23f));
2488       break;
2489     case D:
2490       mov(rscratch1, julong_cast(0x1.0p52));
2491       break;
2492     default:
2493       assert(T == S || T == D, "invalid register variant");
2494   }
2495 
2496   sve_frinta(dst, T, ptrue, src);
2497   // dst = round(src), ties to away
2498 
2499   Label none;
2500 
2501   sve_fneg(tmp1, T, ptrue, src);
2502   sve_dup(tmp2, T, rscratch1);
2503   sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
2504   br(EQ, none);
2505   {
2506     sve_cpy(tmp1, T, pgtmp, 0.5);
2507     sve_fadd(tmp1, T, pgtmp, src);
2508     sve_frintm(dst, T, pgtmp, tmp1);
2509     // dst = floor(src + 0.5, ties to even)
2510   }
2511   bind(none);
2512 
2513   sve_fcvtzs(dst, T, ptrue, dst, T);
2514   // result in dst
2515 }
2516 
2517 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2518                                            FloatRegister one, SIMD_Arrangement T) {
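  // Vector signum for floating-point lanes: the input itself for +/-0.0 and NaN, and
  // +/-1.0 according to the sign for every other input. An illustrative sketch of the
  // bit trick used below:
  //   facgt     sets a lane to all-ones when |src| > 0, and to zero for +/-0.0 and NaN
  //   ushr #1   turns all-ones into 0x7FF..F, a mask of every bit except the sign bit
  //   bsl       where the mask is set, takes the magnitude bits of 1.0 and the sign bit
  //             of src (giving +/-1.0); where it is zero, leaves src (0.0 or NaN) as is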
2519   assert_different_registers(dst, src, zero, one);
2520   assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2521 
2522   facgt(dst, T, src, zero);
2523   ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
2524   bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2525 }
2526 
2527 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2528                                           FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
2529     assert_different_registers(dst, src, zero, one, vtmp);
2530     assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2531 
2532     sve_orr(vtmp, src, src);
    sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise
2534     switch (T) {
2535     case S:
2536       sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
2537       sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
2538                                         // on the sign of the float value
2539       break;
2540     case D:
2541       sve_and(vtmp, T, min_jlong);
2542       sve_orr(vtmp, T, jlong_cast(1.0));
2543       break;
2544     default:
2545       assert(false, "unsupported");
2546       ShouldNotReachHere();
2547     }
2548     sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
2549                                        // Result in dst
2550 }
2551 
2552 bool C2_MacroAssembler::in_scratch_emit_size() {
2553   if (ciEnv::current()->task() != nullptr) {
2554     PhaseOutput* phase_output = Compile::current()->output();
2555     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2556       return true;
2557     }
2558   }
2559   return MacroAssembler::in_scratch_emit_size();
2560 }
2561 
2562 static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
2563   fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
2564 }
2565 
2566 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register rval, Register rtmp) {
2567   assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
2568   if (t == TypeInt::INT) {
2569     return;
2570   }
2571   BLOCK_COMMENT("verify_int_in_range {");
2572   Label L_success, L_failure;
2573 
2574   jint lo = t->_lo;
2575   jint hi = t->_hi;
2576 
2577   if (lo != min_jint && hi != max_jint) {
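    // Two-sided range, e.g. [0, 10]: fail fast if rval < lo, branch to success if
    // rval <= hi, otherwise fall through to the failure path.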
2578     subsw(rtmp, rval, lo);
2579     br(Assembler::LT, L_failure);
2580     subsw(rtmp, rval, hi);
2581     br(Assembler::LE, L_success);
2582   } else if (lo != min_jint) {
2583     subsw(rtmp, rval, lo);
2584     br(Assembler::GE, L_success);
2585   } else if (hi != max_jint) {
2586     subsw(rtmp, rval, hi);
2587     br(Assembler::LE, L_success);
2588   } else {
2589     ShouldNotReachHere();
2590   }
2591 
2592   bind(L_failure);
2593   movw(c_rarg0, idx);
2594   mov(c_rarg1, rval);
2595   movw(c_rarg2, lo);
2596   movw(c_rarg3, hi);
2597   reconstruct_frame_pointer(rtmp);
2598   rt_call(CAST_FROM_FN_PTR(address, abort_verify_int_in_range), rtmp);
2599   hlt(0);
2600 
2601   bind(L_success);
2602   BLOCK_COMMENT("} verify_int_in_range");
2603 }
2604 
2605 static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
2606   fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
2607 }
2608 
2609 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register rval, Register rtmp) {
2610   assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
2611   if (t == TypeLong::LONG) {
2612     return;
2613   }
2614   BLOCK_COMMENT("verify_long_in_range {");
2615   Label L_success, L_failure;
2616 
2617   jlong lo = t->_lo;
2618   jlong hi = t->_hi;
2619 
2620   if (lo != min_jlong && hi != max_jlong) {
2621     subs(rtmp, rval, lo);
2622     br(Assembler::LT, L_failure);
2623     subs(rtmp, rval, hi);
2624     br(Assembler::LE, L_success);
2625   } else if (lo != min_jlong) {
2626     subs(rtmp, rval, lo);
2627     br(Assembler::GE, L_success);
2628   } else if (hi != max_jlong) {
2629     subs(rtmp, rval, hi);
2630     br(Assembler::LE, L_success);
2631   } else {
2632     ShouldNotReachHere();
2633   }
2634 
2635   bind(L_failure);
2636   movw(c_rarg0, idx);
2637   mov(c_rarg1, rval);
2638   mov(c_rarg2, lo);
2639   mov(c_rarg3, hi);
2640   reconstruct_frame_pointer(rtmp);
2641   rt_call(CAST_FROM_FN_PTR(address, abort_verify_long_in_range), rtmp);
2642   hlt(0);
2643 
2644   bind(L_success);
2645   BLOCK_COMMENT("} verify_long_in_range");
2646 }
2647 
2648 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
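  // Rebuild a valid frame pointer in rfp. In the C2 frame layout the saved rfp/lr pair is
  // expected to sit at the top of the fixed frame, so the frame pointer is sp + framesize -
  // 2 * wordSize (this is also what the assert below checks when PreserveFramePointer is set).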
2649   const int framesize = Compile::current()->output()->frame_size_in_bytes();
2650   if (PreserveFramePointer) {
2651     // frame pointer is valid
2652 #ifdef ASSERT
2653     // Verify frame pointer value in rfp.
2654     add(rtmp, sp, framesize - 2 * wordSize);
2655     Label L_success;
2656     cmp(rfp, rtmp);
2657     br(Assembler::EQ, L_success);
2658     stop("frame pointer mismatch");
2659     bind(L_success);
2660 #endif // ASSERT
2661   } else {
2662     add(rfp, sp, framesize - 2 * wordSize);
2663   }
2664 }
2665 
// Selects elements from two source vectors (src1, src2) based on the index values in the
// index register, using Neon instructions, and places each selected element in the corresponding
// destination vector element. Each index in the index register must be in the range [0, 2 * NUM_ELEM),
2669 // where NUM_ELEM is the number of BasicType elements per vector.
2670 // If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
// Otherwise, selects src2[idx - NUM_ELEM]
2672 void C2_MacroAssembler::select_from_two_vectors_neon(FloatRegister dst, FloatRegister src1,
2673                                                      FloatRegister src2, FloatRegister index,
2674                                                      FloatRegister tmp, unsigned vector_length_in_bytes) {
2675   assert_different_registers(dst, src1, src2, tmp);
2676   SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
2677 
2678   if (vector_length_in_bytes == 16) {
2679     assert(UseSVE <= 1, "sve must be <= 1");
2680     assert(src1->successor() == src2, "Source registers must be ordered");
    // If the vector length is 16B, then use the Neon "tbl" instruction with a two-vector table
2682     tbl(dst, size, src1, 2, index);
2683   } else { // vector length == 8
2684     assert(UseSVE == 0, "must be Neon only");
2685     // We need to fit both the source vectors (src1, src2) in a 128-bit register because the
2686     // Neon "tbl" instruction supports only looking up 16B vectors. We then use the Neon "tbl"
2687     // instruction with one vector lookup
2688     ins(tmp, D, src1, 0, 0);
2689     ins(tmp, D, src2, 1, 0);
2690     tbl(dst, size, tmp, 1, index);
2691   }
2692 }
2693 
// Selects elements from two source vectors (src1, src2) based on the index values in the
// index register, using SVE/SVE2 instructions, and places each selected element in the corresponding
// destination vector element. Each index in the index register must be in the range [0, 2 * NUM_ELEM),
2697 // where NUM_ELEM is the number of BasicType elements per vector.
2698 // If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
// Otherwise, selects src2[idx - NUM_ELEM]
2700 void C2_MacroAssembler::select_from_two_vectors_sve(FloatRegister dst, FloatRegister src1,
2701                                                     FloatRegister src2, FloatRegister index,
2702                                                     FloatRegister tmp, SIMD_RegVariant T,
2703                                                     unsigned vector_length_in_bytes) {
2704   assert_different_registers(dst, src1, src2, index, tmp);
2705 
2706   if (vector_length_in_bytes == 8) {
2707     // We need to fit both the source vectors (src1, src2) in a single vector register because the
2708     // SVE "tbl" instruction is unpredicated and works on the entire vector which can lead to
2709     // incorrect results if each source vector is only partially filled. We then use the SVE "tbl"
2710     // instruction with one vector lookup
2711     assert(UseSVE >= 1, "sve must be >= 1");
2712     ins(tmp, D, src1, 0, 0);
2713     ins(tmp, D, src2, 1, 0);
2714     sve_tbl(dst, T, tmp, index);
2715   } else {  // UseSVE == 2 and vector_length_in_bytes > 8
    // If the vector length is > 8, then use the SVE2 "tbl" instruction with a two-vector table.
    // The assertion vector_length_in_bytes == MaxVectorSize ensures that this operation
    // is not executed on machines where vector_length_in_bytes < MaxVectorSize,
    // with the only exception of an 8B vector length.
2720     assert(UseSVE == 2 && vector_length_in_bytes == MaxVectorSize, "must be");
2721     assert(src1->successor() == src2, "Source registers must be ordered");
2722     sve_tbl(dst, T, src1, src2, index);
2723   }
2724 }
2725 
2726 void C2_MacroAssembler::select_from_two_vectors(FloatRegister dst, FloatRegister src1,
2727                                                 FloatRegister src2, FloatRegister index,
2728                                                 FloatRegister tmp, BasicType bt,
2729                                                 unsigned vector_length_in_bytes) {
2730 
2731   assert_different_registers(dst, src1, src2, index, tmp);
2732 
2733   // The cases that can reach this method are -
2734   // - UseSVE = 0/1, vector_length_in_bytes = 8 or 16, excluding double and long types
2735   // - UseSVE = 2, vector_length_in_bytes >= 8, for all types
2736   //
2737   // SVE/SVE2 tbl instructions are generated when UseSVE = 1 with vector_length_in_bytes = 8
2738   // and UseSVE = 2 with vector_length_in_bytes >= 8
2739   //
2740   // Neon instructions are generated when UseSVE = 0 with vector_length_in_bytes = 8 or 16 and
2741   // UseSVE = 1 with vector_length_in_bytes = 16
2742 
2743   if ((UseSVE == 1 && vector_length_in_bytes == 8) || UseSVE == 2) {
2744     SIMD_RegVariant T = elemType_to_regVariant(bt);
2745     select_from_two_vectors_sve(dst, src1, src2, index, tmp, T, vector_length_in_bytes);
2746     return;
2747   }
2748 
2749   // The only BasicTypes that can reach here are T_SHORT, T_BYTE, T_INT and T_FLOAT
2750   assert(bt != T_DOUBLE && bt != T_LONG, "unsupported basic type");
2751   assert(vector_length_in_bytes <= 16, "length_in_bytes must be <= 16");
2752 
2753   bool isQ = vector_length_in_bytes == 16;
2754 
2755   SIMD_Arrangement size1 = isQ ? T16B : T8B;
2756   SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2757 
2758   // Neon "tbl" instruction only supports byte tables, so we need to look at chunks of
2759   // 2B for selecting shorts or chunks of 4B for selecting ints/floats from the table.
2760   // The index values in "index" register are in the range of [0, 2 * NUM_ELEM) where NUM_ELEM
2761   // is the number of elements that can fit in a vector. For ex. for T_SHORT with 64-bit vector length,
2762   // the indices can range from [0, 8).
2763   // As an example with 64-bit vector length and T_SHORT type - let index = [2, 5, 1, 0]
2764   // Move a constant 0x02 in every byte of tmp - tmp = [0x0202, 0x0202, 0x0202, 0x0202]
2765   // Multiply index vector with tmp to yield - dst = [0x0404, 0x0a0a, 0x0202, 0x0000]
2766   // Move a constant 0x0100 in every 2B of tmp - tmp = [0x0100, 0x0100, 0x0100, 0x0100]
2767   // Add the multiplied result to the vector in tmp to obtain the byte level
2768   // offsets - dst = [0x0504, 0x0b0a, 0x0302, 0x0100]
2769   // Use these offsets in the "tbl" instruction to select chunks of 2B.
2770 
2771   if (bt == T_BYTE) {
2772     select_from_two_vectors_neon(dst, src1, src2, index, tmp, vector_length_in_bytes);
2773   } else {
2774     int elem_size = (bt == T_SHORT) ? 2 : 4;
2775     uint64_t tbl_offset = (bt == T_SHORT) ? 0x0100u : 0x03020100u;
2776 
2777     mov(tmp, size1, elem_size);
2778     mulv(dst, size2, index, tmp);
2779     mov(tmp, size2, tbl_offset);
2780     addv(dst, size1, dst, tmp); // "dst" now contains the processed index elements
2781                                 // to select a set of 2B/4B
2782     select_from_two_vectors_neon(dst, src1, src2, dst, tmp, vector_length_in_bytes);
2783   }
2784 }
2785 
2786 // Vector expand implementation. Elements from the src vector are expanded into
2787 // the dst vector under the control of the vector mask.
2788 // Since there are no native instructions directly corresponding to expand before
// SVE2p2, the following implementations mainly leverage the TBL instruction to
2790 // implement expand. To compute the index input for TBL, the prefix sum algorithm
2791 // (https://en.wikipedia.org/wiki/Prefix_sum) is used. The same algorithm is used
2792 // for NEON and SVE, but with different instructions where appropriate.
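//
// The prefix sum of the mask (taken as 0/1 values) gives, for each lane, the count of
// active lanes up to and including that lane. For an active lane, this count minus one
// is exactly the index of the source element that should land there, which is the index
// input that TBL expects. Inactive lanes are mapped to an out-of-range index so that
// TBL zeroes them.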

// Vector expand implementation for NEON.
//
// An example of 128-bit Byte vector:
//   Data direction: high <== low
//   Input:
//         src   = g  f  e  d  c  b  a  9  8  7  6  5  4  3  2  1
//         mask  = 0  0 -1 -1  0  0 -1 -1  0  0 -1 -1  0  0 -1 -1
//   Expected result:
//         dst   = 0  0  8  7  0  0  6  5  0  0  4  3  0  0  2  1
void C2_MacroAssembler::vector_expand_neon(FloatRegister dst, FloatRegister src, FloatRegister mask,
                                           FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
                                           int vector_length_in_bytes) {
  assert(vector_length_in_bytes <= 16, "the vector length in bytes for NEON must be <= 16");
  assert_different_registers(dst, src, mask, tmp1, tmp2);
  // Since the TBL instruction only supports byte tables, we need to compute
  // the indices at byte granularity for all element types.
  SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
  // tmp1 =  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  dup(tmp1, size, zr);
  // dst  =  0  0  1  1  0  0  1  1  0  0  1  1  0  0  1  1
  negr(dst, size, mask);
  // Calculate vector index for TBL with prefix sum algorithm.
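  // Each iteration adds in the partial sum held i bytes below, doubling the width of the
  // summed window, so after log2(vector_length_in_bytes) iterations each byte holds the
  // inclusive prefix sum, i.e. the number of active bytes at or below its position.
  // For example, after the first iteration (i = 1):
  // dst  =  0  1  2  1  0  1  2  1  0  1  2  1  0  1  2  1
  // and after the final iteration: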
  // dst  =  8  8  8  7  6  6  6  5  4  4  4  3  2  2  2  1
  for (int i = 1; i < vector_length_in_bytes; i <<= 1) {
    ext(tmp2, size, tmp1, dst, vector_length_in_bytes - i);
    addv(dst, size, tmp2, dst);
  }
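  // Copy the mask into tmp2 ("orr" of a register with itself acts as a vector move).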
  // tmp2 =  0  0 -1 -1  0  0 -1 -1  0  0 -1 -1  0  0 -1 -1
  orr(tmp2, size, mask, mask);
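  // "bsl" uses the bits already in tmp2 (the mask) as the selector: where the mask is set,
  // the prefix sums in dst are kept; elsewhere the zeros in tmp1 are taken.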
  // tmp2 =  0  0  8  7  0  0  6  5  0  0  4  3  0  0  2  1
  bsl(tmp2, size, dst, tmp1);
  // tmp1 =  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  movi(tmp1, size, 1);
  // dst  = -1 -1  7  6 -1 -1  5  4 -1 -1  3  2 -1 -1  1  0
  subv(dst, size, tmp2, tmp1);
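  // The inactive lanes now hold -1 (0xff), which is an out-of-range index for "tbl" and
  // therefore selects zero, clearing those lanes in the result.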
  // dst  =  0  0  8  7  0  0  6  5  0  0  4  3  0  0  2  1
  tbl(dst, size, src, 1, dst);
}

// Vector expand implementation for SVE.
//
// An example of 128-bit Short vector:
//   Data direction: high <== low
//   Input:
//         src   = gf ed cb a9 87 65 43 21
//         pg    = 00 01 00 01 00 01 00 01
//   Expected result:
//         dst   = 00 87 00 65 00 43 00 21
void C2_MacroAssembler::vector_expand_sve(FloatRegister dst, FloatRegister src, PRegister pg,
                                          FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
                                          int vector_length_in_bytes) {
  assert(UseSVE > 0, "expand implementation only for SVE");
  assert_different_registers(dst, src, tmp1, tmp2);
  SIMD_RegVariant size = elemType_to_regVariant(bt);

  // tmp1 = 00 00 00 00 00 00 00 00
  sve_dup(tmp1, size, 0);
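  // Zero tmp2 (movprfx copies the zeroed tmp1), then set its active lanes to 1 while the
  // inactive lanes stay 0.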
  sve_movprfx(tmp2, tmp1);
  // tmp2 = 00 01 00 01 00 01 00 01
  sve_cpy(tmp2, size, pg, 1, true);
  // Calculate vector index for TBL with prefix sum algorithm.
  // tmp2 = 04 04 03 03 02 02 01 01
  for (int i = type2aelembytes(bt); i < vector_length_in_bytes; i <<= 1) {
    sve_movprfx(dst, tmp1);
    // The EXT instruction operates on the full-width SVE register, so the extract position
    // must be computed relative to MaxVectorSize rather than vector_length_in_bytes:
    // vector_length_in_bytes - i + (MaxVectorSize - vector_length_in_bytes) =>
    // MaxVectorSize - i.
    sve_ext(dst, tmp2, MaxVectorSize - i);
    sve_add(tmp2, size, dst, tmp2);
  }
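  // At this point each lane of tmp2 holds the number of active lanes at or below it
  // (the inclusive prefix sum), computed with the same log-step scheme as the NEON
  // variant but at element rather than byte granularity.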
  // dst  = 00 04 00 03 00 02 00 01
  sve_sel(dst, size, pg, tmp2, tmp1);
  // dst  = -1 03 -1 02 -1 01 -1 00
  sve_sub(dst, size, 1);
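  // As in the NEON variant, the inactive lanes now hold -1, which is out of range for
  // "tbl" and therefore selects zero.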
  // dst  = 00 87 00 65 00 43 00 21
  sve_tbl(dst, size, src, dst);
}