1 /*
   2  * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "asm/assembler.hpp"
  26 #include "asm/assembler.inline.hpp"
  27 #include "opto/c2_MacroAssembler.hpp"
  28 #include "opto/compile.hpp"
  29 #include "opto/intrinsicnode.hpp"
  30 #include "opto/matcher.hpp"
  31 #include "opto/output.hpp"
  32 #include "opto/subnode.hpp"
  33 #include "runtime/stubRoutines.hpp"
  34 #include "utilities/globalDefinitions.hpp"
  35 #include "utilities/powerOfTwo.hpp"
  36 
  37 #ifdef PRODUCT
  38 #define BLOCK_COMMENT(str) /* nothing */
  39 #define STOP(error) stop(error)
  40 #else
  41 #define BLOCK_COMMENT(str) block_comment(str)
  42 #define STOP(error) block_comment(error); stop(error)
  43 #endif
  44 
  45 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  46 
  47 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
  48 
  49 void C2_MacroAssembler::entry_barrier() {
  50   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  51   // Dummy labels for just measuring the code size
  52   Label dummy_slow_path;
  53   Label dummy_continuation;
  54   Label dummy_guard;
  55   Label* slow_path = &dummy_slow_path;
  56   Label* continuation = &dummy_continuation;
  57   Label* guard = &dummy_guard;
  58   if (!Compile::current()->output()->in_scratch_emit_size()) {
  59     // Use real labels from actual stub when not emitting code for the purpose of measuring its size
  60     C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
  61     Compile::current()->output()->add_stub(stub);
  62     slow_path = &stub->entry();
  63     continuation = &stub->continuation();
  64     guard = &stub->guard();
  65   }
  66   // In the C2 code, we move the non-hot part of nmethod entry barriers out-of-line to a stub.
  67   bs->nmethod_entry_barrier(this, slow_path, continuation, guard);
  68 }
  69 
  70 // jdk.internal.util.ArraysSupport.vectorizedHashCode
  71 address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
  72                                            FloatRegister vdata0, FloatRegister vdata1,
  73                                            FloatRegister vdata2, FloatRegister vdata3,
  74                                            FloatRegister vmul0, FloatRegister vmul1,
  75                                            FloatRegister vmul2, FloatRegister vmul3,
  76                                            FloatRegister vpow, FloatRegister vpowm,
  77                                            BasicType eltype) {
  78   ARRAYS_HASHCODE_REGISTERS;
  79 
  80   Register tmp1 = rscratch1, tmp2 = rscratch2;
  81 
  82   Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE;
  83 
  84   // Vectorization factor. Number of array elements loaded into one SIMD&FP register by the stubs. We
  85   // use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible to
  86   // use 4H for chars and shorts instead, but using 8H gives better performance.
  87   const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8
  88                     : eltype == T_CHAR || eltype == T_SHORT ? 8
  89                     : eltype == T_INT                       ? 4
  90                                                             : 0;
  91   guarantee(vf, "unsupported eltype");
  92 
  93   // Unroll factor for the scalar loop below. The value is chosen based on performance analysis.
  94   const size_t unroll_factor = 4;
  95 
  96   switch (eltype) {
  97   case T_BOOLEAN:
  98     BLOCK_COMMENT("arrays_hashcode(unsigned byte) {");
  99     break;
 100   case T_CHAR:
 101     BLOCK_COMMENT("arrays_hashcode(char) {");
 102     break;
 103   case T_BYTE:
 104     BLOCK_COMMENT("arrays_hashcode(byte) {");
 105     break;
 106   case T_SHORT:
 107     BLOCK_COMMENT("arrays_hashcode(short) {");
 108     break;
 109   case T_INT:
 110     BLOCK_COMMENT("arrays_hashcode(int) {");
 111     break;
 112   default:
 113     ShouldNotReachHere();
 114   }
 115 
 116   // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop
 117   // implemented by the stub executes just once. Call the stub only if at least two iterations will
 118   // be executed.
 119   const size_t large_threshold = eltype == T_INT ? vf * 2 : vf;
 120   cmpw(cnt, large_threshold);
 121   br(Assembler::HS, LARGE);
 122 
 123   bind(TAIL);
 124 
 125   // The andr computes cnt % uf, where uf = unroll_factor. The subtract, shifted left by 3, moves the
 126   // branch target past uf - (cnt % uf) pairs of load + madd insns, i.e. only cnt % uf load + madd pairs
 127   // are executed on the first pass. Each subsequent iteration eats up uf elements at a time.
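       // Illustrative example: with unroll_factor == 4 and cnt == 11, cnt % uf == 3, so the branch
       // lands 3 pairs (3 * 8 bytes with the default shift of 3) before BR_BASE, processing 3 elements
       // on the first pass and leaving 8 elements for two full passes of the unrolled loop.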
 128   assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC");
 129   andr(tmp2, cnt, unroll_factor - 1);
 130   adr(tmp1, BR_BASE);
 131   // For Cortex-A53 the shift is 4 because 2 nops are generated, making each pair 4 instructions (16 bytes).
 132   sub(tmp1, tmp1, tmp2, ext::sxtw, VM_Version::supports_a53mac() ? 4 : 3);
 133   movw(tmp2, 0x1f); // 31, the hash multiplier: result = result * 31 + element
 134   br(tmp1);
 135 
 136   bind(LOOP);
 137   for (size_t i = 0; i < unroll_factor; ++i) {
 138     load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype);
 139     maddw(result, result, tmp2, tmp1);
 140     // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
 141     // Generate 2nd nop to have 4 instructions per iteration.
 142     if (VM_Version::supports_a53mac()) {
 143       nop();
 144     }
 145   }
 146   bind(BR_BASE);
 147   subsw(cnt, cnt, unroll_factor);
 148   br(Assembler::HS, LOOP);
 149 
 150   b(DONE);
 151 
 152   bind(LARGE);
 153 
 154   RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype));
 155   assert(stub.target() != nullptr, "array_hashcode stub has not been generated");
 156   address tpc = trampoline_call(stub);
 157   if (tpc == nullptr) {
 158     DEBUG_ONLY(reset_labels(TAIL, BR_BASE));
 159     postcond(pc() == badAddress);
 160     return nullptr;
 161   }
 162 
 163   bind(DONE);
 164 
 165   BLOCK_COMMENT("} // arrays_hashcode");
 166 
 167   postcond(pc() != badAddress);
 168   return pc();
 169 }
 170 
 171 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register t1,
 172                                               Register t2, Register t3) {
 173   assert_different_registers(obj, box, t1, t2, t3, rscratch2);
 174 
 175   // Handle inflated monitor.
 176   Label inflated;
 177   // Finish fast lock successfully. MUST branch to this label with flags == EQ
 178   Label locked;
 179   // Finish fast lock unsuccessfully. MUST branch to this label with flags == NE
 180   Label slow_path;
 181 
 182   if (UseObjectMonitorTable) {
 183     // Clear cache in case fast locking succeeds or we need to take the slow-path.
 184     str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 185   }
 186 
 187   if (DiagnoseSyncOnValueBasedClasses != 0) {
 188     load_klass(t1, obj);
 189     ldrb(t1, Address(t1, Klass::misc_flags_offset()));
 190     tst(t1, KlassFlags::_misc_is_value_based_class);
 191     br(Assembler::NE, slow_path);
 192   }
 193 
 194   const Register t1_mark = t1;
 195   const Register t3_t = t3;
 196 
 197   { // Lightweight locking
 198 
 199     // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ
 200     Label push;
 201 
 202     const Register t2_top = t2;
 203 
 204     // Check if lock-stack is full.
 205     ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 206     cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
 207     br(Assembler::GT, slow_path);
 208 
 209     // Check if recursive.
 210     subw(t3_t, t2_top, oopSize);
 211     ldr(t3_t, Address(rthread, t3_t));
 212     cmp(obj, t3_t);
 213     br(Assembler::EQ, push);
 214 
 215     // Relaxed normal load to check for monitor. Optimization for monitor case.
 216     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 217     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 218 
 219     // Not inflated
 220     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");
 221 
 222     // Try to lock. Transition lock-bits 0b01 => 0b00
 223     orr(t1_mark, t1_mark, markWord::unlocked_value);
 224     eor(t3_t, t1_mark, markWord::unlocked_value);
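         // The orr above turned t1_mark into the expected (unlocked, lock bits 0b01) mark word;
         // the eor clears that bit so t3_t holds the new (locked, 0b00) value for the CAS below.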
 225     cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
 226             /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
 227     br(Assembler::NE, slow_path);
 228 
 229     bind(push);
 230     // After successful lock, push object on lock-stack.
 231     str(obj, Address(rthread, t2_top));
 232     addw(t2_top, t2_top, oopSize);
 233     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 234     b(locked);
 235   }
 236 
 237   { // Handle inflated monitor.
 238     bind(inflated);
 239 
 240     const Register t1_monitor = t1;
 241 
 242     if (!UseObjectMonitorTable) {
 243       assert(t1_monitor == t1_mark, "should be the same here");
 244     } else {
 245       Label monitor_found;
 246 
 247       // Load cache address
 248       lea(t3_t, Address(rthread, JavaThread::om_cache_oops_offset()));
 249 
 250       const int num_unrolled = 2;
 251       for (int i = 0; i < num_unrolled; i++) {
 252         ldr(t1, Address(t3_t));
 253         cmp(obj, t1);
 254         br(Assembler::EQ, monitor_found);
 255         increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
 256       }
 257 
 258       Label loop;
 259 
 260       // Search for obj in cache.
 261       bind(loop);
 262 
 263       // Check for match.
 264       ldr(t1, Address(t3_t));
 265       cmp(obj, t1);
 266       br(Assembler::EQ, monitor_found);
 267 
 268       // Search until null encountered, guaranteed _null_sentinel at end.
 269       increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
 270       cbnz(t1, loop);
 271       // Cache Miss, NE set from cmp above, cbnz does not set flags
 272       b(slow_path);
 273 
 274       bind(monitor_found);
 275       ldr(t1_monitor, Address(t3_t, OMCache::oop_to_monitor_difference()));
 276     }
 277 
 278     const Register t2_owner_addr = t2;
 279     const Register t3_owner = t3;
 280     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 281     const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
 282     const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);
 283 
 284     Label monitor_locked;
 285 
 286     // Compute owner address.
 287     lea(t2_owner_addr, owner_address);
 288 
 289     // Try to CAS owner (no owner => current thread's _monitor_owner_id).
 290     ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
 291     cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, /*acquire*/ true,
 292             /*release*/ false, /*weak*/ false, t3_owner);
 293     br(Assembler::EQ, monitor_locked);
 294 
 295     // Check if recursive.
 296     cmp(t3_owner, rscratch2);
 297     br(Assembler::NE, slow_path);
 298 
 299     // Recursive.
 300     increment(recursions_address, 1);
 301 
 302     bind(monitor_locked);
 303     if (UseObjectMonitorTable) {
 304       str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 305     }
 306   }
 307 
 308   bind(locked);
 309 
 310 #ifdef ASSERT
 311   // Check that locked label is reached with Flags == EQ.
 312   Label flag_correct;
 313   br(Assembler::EQ, flag_correct);
 314   stop("Fast Lock Flag != EQ");
 315 #endif
 316 
 317   bind(slow_path);
 318 #ifdef ASSERT
 319   // Check that slow_path label is reached with Flags == NE.
 320   br(Assembler::NE, flag_correct);
 321   stop("Fast Lock Flag != NE");
 322   bind(flag_correct);
 323 #endif
 324   // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
 325 }
 326 
 327 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register box, Register t1,
 328                                                 Register t2, Register t3) {
 329   assert_different_registers(obj, box, t1, t2, t3);
 330 
 331   // Handle inflated monitor.
 332   Label inflated, inflated_load_mark;
 333   // Finish fast unlock successfully. MUST branch to this label with flags == EQ
 334   Label unlocked;
 335   // Finish fast unlock unsuccessfully. MUST branch to this label with flags == NE
 336   Label slow_path;
 337 
 338   const Register t1_mark = t1;
 339   const Register t2_top = t2;
 340   const Register t3_t = t3;
 341 
 342   { // Lightweight unlock
 343 
 344     Label push_and_slow_path;
 345 
 346     // Check if obj is top of lock-stack.
 347     ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 348     subw(t2_top, t2_top, oopSize);
 349     ldr(t3_t, Address(rthread, t2_top));
 350     cmp(obj, t3_t);
 351     // Top of lock stack was not obj. Must be monitor.
 352     br(Assembler::NE, inflated_load_mark);
 353 
 354     // Pop lock-stack.
 355     DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
 356     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 357 
 358     // Check if recursive.
 359     subw(t3_t, t2_top, oopSize);
 360     ldr(t3_t, Address(rthread, t3_t));
 361     cmp(obj, t3_t);
 362     br(Assembler::EQ, unlocked);
 363 
 364     // Not recursive.
 365     // Load Mark.
 366     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 367 
 368     // Check header for monitor (0b10).
 369     // Because we got here by popping (meaning we pushed in locked)
 370     // there will be no monitor in the box. So we need to push back the obj
 371     // so that the runtime can fix any potential anonymous owner.
 372     tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);
 373 
 374     // Try to unlock. Transition lock bits 0b00 => 0b01
 375     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
 376     orr(t3_t, t1_mark, markWord::unlocked_value);
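         // t1_mark holds the current (locked, lock bits 0b00) mark word; the orr sets the unlocked
         // bit so t3_t is the new (unlocked, 0b01) value the CAS publishes if the mark is unchanged.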
 377     cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
 378             /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
 379     br(Assembler::EQ, unlocked);
 380 
 381     bind(push_and_slow_path);
 382     // Compare and exchange failed.
 383     // Restore lock-stack and handle the unlock in runtime.
 384     DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
 385     addw(t2_top, t2_top, oopSize);
 386     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 387     b(slow_path);
 388   }
 389 
 390 
 391   { // Handle inflated monitor.
 392     bind(inflated_load_mark);
 393     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 394 #ifdef ASSERT
 395     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 396     stop("Fast Unlock not monitor");
 397 #endif
 398 
 399     bind(inflated);
 400 
 401 #ifdef ASSERT
 402     Label check_done;
 403     subw(t2_top, t2_top, oopSize);
 404     cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
 405     br(Assembler::LT, check_done);
 406     ldr(t3_t, Address(rthread, t2_top));
 407     cmp(obj, t3_t);
 408     br(Assembler::NE, inflated);
 409     stop("Fast Unlock lock on stack");
 410     bind(check_done);
 411 #endif
 412 
 413     const Register t1_monitor = t1;
 414 
 415     if (!UseObjectMonitorTable) {
 416       assert(t1_monitor == t1_mark, "should be the same here");
 417 
 418       // Untag the monitor.
 419       add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
 420     } else {
 421       ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 422       // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
 423       cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
 424       br(Assembler::LO, slow_path);
 425     }
 426 
 427     const Register t2_recursions = t2;
 428     Label not_recursive;
 429 
 430     // Check if recursive.
 431     ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
 432     cbz(t2_recursions, not_recursive);
 433 
 434     // Recursive unlock.
 435     sub(t2_recursions, t2_recursions, 1u);
 436     str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
 437     // Set flag == EQ
 438     cmp(t2_recursions, t2_recursions);
 439     b(unlocked);
 440 
 441     bind(not_recursive);
 442 
 443     const Register t2_owner_addr = t2;
 444 
 445     // Compute owner address.
 446     lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));
 447 
 448     // Set owner to null.
 449     // Release to satisfy the JMM
 450     stlr(zr, t2_owner_addr);
 451     // We need a full fence after clearing owner to avoid stranding.
 452     // StoreLoad achieves this.
 453     membar(StoreLoad);
 454 
 455     // Check if the entry_list is empty.
 456     ldr(rscratch1, Address(t1_monitor, ObjectMonitor::entry_list_offset()));
 457     cmp(rscratch1, zr);
 458     br(Assembler::EQ, unlocked);  // If so we are done.
 459 
 460     // Check if there is a successor.
 461     ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
 462     cmp(rscratch1, zr);
 463     br(Assembler::NE, unlocked);  // If so we are done.
 464 
 465     // Save the monitor pointer in the current thread, so we can try to
 466     // reacquire the lock in SharedRuntime::monitor_exit_helper().
 467     str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));
 468 
 469     cmp(zr, rthread); // Set Flag to NE => slow path
 470     b(slow_path);
 471   }
 472 
 473   bind(unlocked);
 474   cmp(zr, zr); // Set Flags to EQ => fast path
 475 
 476 #ifdef ASSERT
 477   // Check that unlocked label is reached with Flags == EQ.
 478   Label flag_correct;
 479   br(Assembler::EQ, flag_correct);
 480   stop("Fast Unlock Flag != EQ");
 481 #endif
 482 
 483   bind(slow_path);
 484 #ifdef ASSERT
 485   // Check that slow_path label is reached with Flags == NE.
 486   br(Assembler::NE, flag_correct);
 487   stop("Fast Unlock Flag != NE");
 488   bind(flag_correct);
 489 #endif
 490   // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
 491 }
 492 
 493 // Search for str1 in str2 and return index or -1
 494 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
 495 void C2_MacroAssembler::string_indexof(Register str2, Register str1,
 496                                        Register cnt2, Register cnt1,
 497                                        Register tmp1, Register tmp2,
 498                                        Register tmp3, Register tmp4,
 499                                        Register tmp5, Register tmp6,
 500                                        int icnt1, Register result, int ae) {
 501   // NOTE: tmp5, tmp6 can be zr depending on specific method version
 502   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
 503 
 504   Register ch1 = rscratch1;
 505   Register ch2 = rscratch2;
 506   Register cnt1tmp = tmp1;
 507   Register cnt2tmp = tmp2;
 508   Register cnt1_neg = cnt1;
 509   Register cnt2_neg = cnt2;
 510   Register result_tmp = tmp4;
 511 
 512   bool isL = ae == StrIntrinsicNode::LL;
 513 
 514   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
 515   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
 516   int str1_chr_shift = str1_isL ? 0:1;
 517   int str2_chr_shift = str2_isL ? 0:1;
 518   int str1_chr_size = str1_isL ? 1:2;
 519   int str2_chr_size = str2_isL ? 1:2;
 520   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
 521                                       (chr_insn)&MacroAssembler::ldrh;
 522   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
 523                                       (chr_insn)&MacroAssembler::ldrh;
 524   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
 525   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
 526 
 527   // Note, inline_string_indexOf() generates checks:
 528   // if (substr.count > string.count) return -1;
 529   // if (substr.count == 0) return 0;
 530 
 531   // We have two strings, a source string in str2, cnt2 and a pattern string
 532   // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
 533 
 534   // For larger pattern and source we use a simplified Boyer Moore algorithm.
 535   // With a small pattern and source we use linear scan.
 536 
 537   if (icnt1 == -1) {
 538     sub(result_tmp, cnt2, cnt1);
 539     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
 540     br(LT, LINEARSEARCH);
 541     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
 542     subs(zr, cnt1, 256);
 543     lsr(tmp1, cnt2, 2);
 544     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
 545     br(GE, LINEARSTUB);
 546   }
 547 
 548 // The Boyer-Moore algorithm is based on the description here:-
 549 //
 550 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
 551 //
 552 // This describes an algorithm with 2 shift rules. The 'Bad Character' rule
 553 // and the 'Good Suffix' rule.
 554 //
 555 // These rules are essentially heuristics for how far we can shift the
 556 // pattern along the search string.
 557 //
 558 // The implementation here uses the 'Bad Character' rule only because of the
 559 // complexity of initialisation for the 'Good Suffix' rule.
 560 //
 561 // This is also known as the Boyer-Moore-Horspool algorithm:-
 562 //
 563 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
 564 //
 565 // This particular implementation has few java-specific optimizations.
 566 //
 567 // #define ASIZE 256
 568 //
 569 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
 570 //       int i, j;
 571 //       unsigned c;
 572 //       unsigned char bc[ASIZE];
 573 //
 574 //       /* Preprocessing */
 575 //       for (i = 0; i < ASIZE; ++i)
 576 //          bc[i] = m;
 577 //       for (i = 0; i < m - 1; ) {
 578 //          c = x[i];
 579 //          ++i;
 580 //          // c < 256 for Latin1 string, so, no need for branch
 581 //          #ifdef PATTERN_STRING_IS_LATIN1
 582 //          bc[c] = m - i;
 583 //          #else
 584 //          if (c < ASIZE) bc[c] = m - i;
 585 //          #endif
 586 //       }
 587 //
 588 //       /* Searching */
 589 //       j = 0;
 590 //       while (j <= n - m) {
 591 //          c = y[i+j];
 592 //          if (x[m-1] == c)
 593 //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
 594 //          if (i < 0) return j;
 595 //          // c < 256 for Latin1 string, so, no need for branch
 596 //          #ifdef SOURCE_STRING_IS_LATIN1
 597 //          // LL case: (c< 256) always true. Remove branch
 598 //          j += bc[y[j+m-1]];
 599 //          #endif
 600 //          #ifndef PATTERN_STRING_IS_UTF
 601 //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
 602 //          if (c < ASIZE)
 603 //            j += bc[y[j+m-1]];
 604 //          else
 605 //            j += 1
 606 //          #endif
 607 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
 608 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
 609 //          if (c < ASIZE)
 610 //            j += bc[y[j+m-1]];
 611 //          else
 612 //            j += m
 613 //          #endif
 614 //       }
 615 //    }
 616 
 617   if (icnt1 == -1) {
 618     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
 619         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
 620     Register cnt1end = tmp2;
 621     Register str2end = cnt2;
 622     Register skipch = tmp2;
 623 
 624     // str1 length is >= 8, so we can read at least 1 register for cases when
 625     // UTF->Latin1 conversion is not needed (8 LL or 4 UU symbols) and half a register for the
 626     // UL case. We'll re-read the last character in the inner pre-loop code to have a
 627     // single outer pre-loop load
 628     const int firstStep = isL ? 7 : 3;
 629 
 630     const int ASIZE = 256;
 631     const int STORED_BYTES = 32; // amount of bytes stored per instruction
 632     sub(sp, sp, ASIZE);
 633     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
 634     mov(ch1, sp);
 635     BIND(BM_INIT_LOOP);
 636       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
 637       subs(tmp5, tmp5, 1);
 638       br(GT, BM_INIT_LOOP);
 639 
 640       sub(cnt1tmp, cnt1, 1);
 641       mov(tmp5, str2);
 642       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
 643       sub(ch2, cnt1, 1);
 644       mov(tmp3, str1);
 645     BIND(BCLOOP);
 646       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
 647       if (!str1_isL) {
 648         subs(zr, ch1, ASIZE);
 649         br(HS, BCSKIP);
 650       }
 651       strb(ch2, Address(sp, ch1));
 652     BIND(BCSKIP);
 653       subs(ch2, ch2, 1);
 654       br(GT, BCLOOP);
 655 
 656       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
 657       if (str1_isL == str2_isL) {
 658         // load last 8 bytes (8LL/4UU symbols)
 659         ldr(tmp6, Address(tmp6, -wordSize));
 660       } else {
 661         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
 662         // convert Latin1 to UTF. We'll have to wait until load completed, but
 663         // it's still faster than per-character loads+checks
 664         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
 665         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
 666         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
 667         andr(tmp6, tmp6, 0xFF); // str1[N-4]
 668         orr(ch2, ch1, ch2, LSL, 16);
 669         orr(tmp6, tmp6, tmp3, LSL, 48);
 670         orr(tmp6, tmp6, ch2, LSL, 16);
 671       }
 672     BIND(BMLOOPSTR2);
 673       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 674       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
 675       if (str1_isL == str2_isL) {
 676         // re-init tmp3. It's for free because it's executed in parallel with
 677         // load above. Alternative is to initialize it before loop, but it'll
 678         // affect performance on in-order systems with 2 or more ld/st pipelines
 679         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
 680       }
 681       if (!isL) { // UU/UL case
 682         lsl(ch2, cnt1tmp, 1); // offset in bytes
 683       }
 684       cmp(tmp3, skipch);
 685       br(NE, BMSKIP);
 686       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
 687       mov(ch1, tmp6);
 688       if (isL) {
 689         b(BMLOOPSTR1_AFTER_LOAD);
 690       } else {
 691         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
 692         b(BMLOOPSTR1_CMP);
 693       }
 694     BIND(BMLOOPSTR1);
 695       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
 696       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 697     BIND(BMLOOPSTR1_AFTER_LOAD);
 698       subs(cnt1tmp, cnt1tmp, 1);
 699       br(LT, BMLOOPSTR1_LASTCMP);
 700     BIND(BMLOOPSTR1_CMP);
 701       cmp(ch1, ch2);
 702       br(EQ, BMLOOPSTR1);
 703     BIND(BMSKIP);
 704       if (!isL) {
 705         // if we've met UTF symbol while searching Latin1 pattern, then we can
 706         // skip cnt1 symbols
 707         if (str1_isL != str2_isL) {
 708           mov(result_tmp, cnt1);
 709         } else {
 710           mov(result_tmp, 1);
 711         }
 712         subs(zr, skipch, ASIZE);
 713         br(HS, BMADV);
 714       }
 715       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
 716     BIND(BMADV);
 717       sub(cnt1tmp, cnt1, 1);
 718       add(str2, str2, result_tmp, LSL, str2_chr_shift);
 719       cmp(str2, str2end);
 720       br(LE, BMLOOPSTR2);
 721       add(sp, sp, ASIZE);
 722       b(NOMATCH);
 723     BIND(BMLOOPSTR1_LASTCMP);
 724       cmp(ch1, ch2);
 725       br(NE, BMSKIP);
 726     BIND(BMMATCH);
 727       sub(result, str2, tmp5);
 728       if (!str2_isL) lsr(result, result, 1);
 729       add(sp, sp, ASIZE);
 730       b(DONE);
 731 
 732     BIND(LINEARSTUB);
 733     cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
 734     br(LT, LINEAR_MEDIUM);
 735     mov(result, zr);
 736     RuntimeAddress stub = nullptr;
 737     if (isL) {
 738       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
 739       assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
 740     } else if (str1_isL) {
 741       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
 742       assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
 743     } else {
 744       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
 745       assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
 746     }
 747     address call = trampoline_call(stub);
 748     if (call == nullptr) {
 749       DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
 750       ciEnv::current()->record_failure("CodeCache is full");
 751       return;
 752     }
 753     b(DONE);
 754   }
 755 
 756   BIND(LINEARSEARCH);
 757   {
 758     Label DO1, DO2, DO3;
 759 
 760     Register str2tmp = tmp2;
 761     Register first = tmp3;
 762 
 763     if (icnt1 == -1)
 764     {
 765         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
 766 
 767         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
 768         br(LT, DOSHORT);
 769       BIND(LINEAR_MEDIUM);
 770         (this->*str1_load_1chr)(first, Address(str1));
 771         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
 772         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
 773         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 774         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 775 
 776       BIND(FIRST_LOOP);
 777         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 778         cmp(first, ch2);
 779         br(EQ, STR1_LOOP);
 780       BIND(STR2_NEXT);
 781         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 782         br(LE, FIRST_LOOP);
 783         b(NOMATCH);
 784 
 785       BIND(STR1_LOOP);
 786         adds(cnt1tmp, cnt1_neg, str1_chr_size);
 787         add(cnt2tmp, cnt2_neg, str2_chr_size);
 788         br(GE, MATCH);
 789 
 790       BIND(STR1_NEXT);
 791         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
 792         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 793         cmp(ch1, ch2);
 794         br(NE, STR2_NEXT);
 795         adds(cnt1tmp, cnt1tmp, str1_chr_size);
 796         add(cnt2tmp, cnt2tmp, str2_chr_size);
 797         br(LT, STR1_NEXT);
 798         b(MATCH);
 799 
 800       BIND(DOSHORT);
 801       if (str1_isL == str2_isL) {
 802         cmp(cnt1, (u1)2);
 803         br(LT, DO1);
 804         br(GT, DO3);
 805       }
 806     }
 807 
 808     if (icnt1 == 4) {
 809       Label CH1_LOOP;
 810 
 811         (this->*load_4chr)(ch1, str1);
 812         sub(result_tmp, cnt2, 4);
 813         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 814         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 815 
 816       BIND(CH1_LOOP);
 817         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
 818         cmp(ch1, ch2);
 819         br(EQ, MATCH);
 820         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 821         br(LE, CH1_LOOP);
 822         b(NOMATCH);
 823       }
 824 
 825     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
 826       Label CH1_LOOP;
 827 
 828       BIND(DO2);
 829         (this->*load_2chr)(ch1, str1);
 830         if (icnt1 == 2) {
 831           sub(result_tmp, cnt2, 2);
 832         }
 833         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 834         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 835       BIND(CH1_LOOP);
 836         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 837         cmp(ch1, ch2);
 838         br(EQ, MATCH);
 839         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 840         br(LE, CH1_LOOP);
 841         b(NOMATCH);
 842     }
 843 
 844     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
 845       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
 846 
 847       BIND(DO3);
 848         (this->*load_2chr)(first, str1);
 849         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
 850         if (icnt1 == 3) {
 851           sub(result_tmp, cnt2, 3);
 852         }
 853         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 854         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 855       BIND(FIRST_LOOP);
 856         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 857         cmpw(first, ch2);
 858         br(EQ, STR1_LOOP);
 859       BIND(STR2_NEXT);
 860         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 861         br(LE, FIRST_LOOP);
 862         b(NOMATCH);
 863 
 864       BIND(STR1_LOOP);
 865         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
 866         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 867         cmp(ch1, ch2);
 868         br(NE, STR2_NEXT);
 869         b(MATCH);
 870     }
 871 
 872     if (icnt1 == -1 || icnt1 == 1) {
 873       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
 874 
 875       BIND(DO1);
 876         (this->*str1_load_1chr)(ch1, str1);
 877         cmp(cnt2, (u1)8);
 878         br(LT, DO1_SHORT);
 879 
 880         sub(result_tmp, cnt2, 8/str2_chr_size);
 881         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 882         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 883         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 884 
 885         if (str2_isL) {
 886           orr(ch1, ch1, ch1, LSL, 8);
 887         }
 888         orr(ch1, ch1, ch1, LSL, 16);
 889         orr(ch1, ch1, ch1, LSL, 32);
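             // SWAR search: ch1 now holds the pattern character replicated into every byte or
             // halfword lane (depending on str2's encoding), and the classic
             // (x - 0x01..01) & ~x & 0x80..80 zero-lane test, expressed below with sub/orr/bics,
             // flags any lane of the loaded str2 word that equals that character.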
 890       BIND(CH1_LOOP);
 891         ldr(ch2, Address(str2, cnt2_neg));
 892         eor(ch2, ch1, ch2);
 893         sub(tmp1, ch2, tmp3);
 894         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 895         bics(tmp1, tmp1, tmp2);
 896         br(NE, HAS_ZERO);
 897         adds(cnt2_neg, cnt2_neg, 8);
 898         br(LT, CH1_LOOP);
 899 
 900         cmp(cnt2_neg, (u1)8);
 901         mov(cnt2_neg, 0);
 902         br(LT, CH1_LOOP);
 903         b(NOMATCH);
 904 
 905       BIND(HAS_ZERO);
 906         rev(tmp1, tmp1);
 907         clz(tmp1, tmp1);
 908         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
 909         b(MATCH);
 910 
 911       BIND(DO1_SHORT);
 912         mov(result_tmp, cnt2);
 913         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
 914         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
 915       BIND(DO1_LOOP);
 916         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 917         cmpw(ch1, ch2);
 918         br(EQ, MATCH);
 919         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 920         br(LT, DO1_LOOP);
 921     }
 922   }
 923   BIND(NOMATCH);
 924     mov(result, -1);
 925     b(DONE);
 926   BIND(MATCH);
 927     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
 928   BIND(DONE);
 929 }
 930 
 931 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
 932 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
 933 
 934 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
 935                                             Register ch, Register result,
 936                                             Register tmp1, Register tmp2, Register tmp3)
 937 {
 938   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
 939   Register cnt1_neg = cnt1;
 940   Register ch1 = rscratch1;
 941   Register result_tmp = rscratch2;
 942 
 943   cbz(cnt1, NOMATCH);
 944 
 945   cmp(cnt1, (u1)4);
 946   br(LT, DO1_SHORT);
 947 
 948   orr(ch, ch, ch, LSL, 16);
 949   orr(ch, ch, ch, LSL, 32);
 950 
 951   sub(cnt1, cnt1, 4);
 952   mov(result_tmp, cnt1);
 953   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
 954   sub(cnt1_neg, zr, cnt1, LSL, 1);
 955 
 956   mov(tmp3, 0x0001000100010001);
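       // ch is the target char replicated into all four halfword lanes and tmp3 holds 0x0001 per
       // lane; the loop below applies the SWAR zero-halfword test (x - 0x0001..) & ~x & 0x8000..
       // to spot a matching UTF-16 char eight bytes at a time.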
 957 
 958   BIND(CH1_LOOP);
 959     ldr(ch1, Address(str1, cnt1_neg));
 960     eor(ch1, ch, ch1);
 961     sub(tmp1, ch1, tmp3);
 962     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
 963     bics(tmp1, tmp1, tmp2);
 964     br(NE, HAS_ZERO);
 965     adds(cnt1_neg, cnt1_neg, 8);
 966     br(LT, CH1_LOOP);
 967 
 968     cmp(cnt1_neg, (u1)8);
 969     mov(cnt1_neg, 0);
 970     br(LT, CH1_LOOP);
 971     b(NOMATCH);
 972 
 973   BIND(HAS_ZERO);
 974     rev(tmp1, tmp1);
 975     clz(tmp1, tmp1);
 976     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
 977     b(MATCH);
 978 
 979   BIND(DO1_SHORT);
 980     mov(result_tmp, cnt1);
 981     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
 982     sub(cnt1_neg, zr, cnt1, LSL, 1);
 983   BIND(DO1_LOOP);
 984     ldrh(ch1, Address(str1, cnt1_neg));
 985     cmpw(ch, ch1);
 986     br(EQ, MATCH);
 987     adds(cnt1_neg, cnt1_neg, 2);
 988     br(LT, DO1_LOOP);
 989   BIND(NOMATCH);
 990     mov(result, -1);
 991     b(DONE);
 992   BIND(MATCH);
 993     add(result, result_tmp, cnt1_neg, ASR, 1);
 994   BIND(DONE);
 995 }
 996 
 997 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
 998                                                 Register ch, Register result,
 999                                                 FloatRegister ztmp1,
1000                                                 FloatRegister ztmp2,
1001                                                 PRegister tmp_pg,
1002                                                 PRegister tmp_pdn, bool isL)
1003 {
1004   // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
1005   assert(tmp_pg->is_governing(),
1006          "this register has to be a governing predicate register");
1007 
1008   Label LOOP, MATCH, DONE, NOMATCH;
1009   Register vec_len = rscratch1;
1010   Register idx = rscratch2;
1011 
1012   SIMD_RegVariant T = isL ? B : H;
1013 
1014   cbz(cnt1, NOMATCH);
1015 
1016   // Assign the particular char throughout the vector.
1017   sve_dup(ztmp2, T, ch);
1018   if (isL) {
1019     sve_cntb(vec_len);
1020   } else {
1021     sve_cnth(vec_len);
1022   }
1023   mov(idx, 0);
1024 
1025   // Generate a predicate to control the reading of input string.
1026   sve_whilelt(tmp_pg, T, idx, cnt1);
1027 
1028   BIND(LOOP);
1029     // Read a vector of 8- or 16-bit data depending on the string type. Note
1030     // that inactive elements indicated by the predicate register won't cause
1031     // a data read from memory to the destination vector.
1032     if (isL) {
1033       sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
1034     } else {
1035       sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
1036     }
1037     add(idx, idx, vec_len);
1038 
1039     // Perform the comparison. An element of the destination predicate is set
1040     // to active if the particular char is matched.
1041     sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);
1042 
1043     // Branch if the particular char is found.
1044     br(NE, MATCH);
1045 
1046     sve_whilelt(tmp_pg, T, idx, cnt1);
1047 
1048     // Loop back if the particular char is not found.
1049     br(MI, LOOP);
1050 
1051   BIND(NOMATCH);
1052     mov(result, -1);
1053     b(DONE);
1054 
1055   BIND(MATCH);
1056     // Undo the index increment.
1057     sub(idx, idx, vec_len);
1058 
1059     // Crop the vector to find its location.
1060     sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
1061     add(result, idx, -1);
1062     sve_incp(result, T, tmp_pdn);
1063   BIND(DONE);
1064 }
1065 
1066 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
1067                                             Register ch, Register result,
1068                                             Register tmp1, Register tmp2, Register tmp3)
1069 {
1070   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
1071   Register cnt1_neg = cnt1;
1072   Register ch1 = rscratch1;
1073   Register result_tmp = rscratch2;
1074 
1075   cbz(cnt1, NOMATCH);
1076 
1077   cmp(cnt1, (u1)8);
1078   br(LT, DO1_SHORT);
1079 
1080   orr(ch, ch, ch, LSL, 8);
1081   orr(ch, ch, ch, LSL, 16);
1082   orr(ch, ch, ch, LSL, 32);
1083 
1084   sub(cnt1, cnt1, 8);
1085   mov(result_tmp, cnt1);
1086   lea(str1, Address(str1, cnt1));
1087   sub(cnt1_neg, zr, cnt1);
1088 
1089   mov(tmp3, 0x0101010101010101);
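       // Latin1 variant of the same SWAR search: ch is replicated into all eight byte lanes, and
       // the (x - 0x01..) & ~x & 0x80.. test below finds a zero byte in ch ^ str1, i.e. a match,
       // eight characters per iteration.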
1090 
1091   BIND(CH1_LOOP);
1092     ldr(ch1, Address(str1, cnt1_neg));
1093     eor(ch1, ch, ch1);
1094     sub(tmp1, ch1, tmp3);
1095     orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
1096     bics(tmp1, tmp1, tmp2);
1097     br(NE, HAS_ZERO);
1098     adds(cnt1_neg, cnt1_neg, 8);
1099     br(LT, CH1_LOOP);
1100 
1101     cmp(cnt1_neg, (u1)8);
1102     mov(cnt1_neg, 0);
1103     br(LT, CH1_LOOP);
1104     b(NOMATCH);
1105 
1106   BIND(HAS_ZERO);
1107     rev(tmp1, tmp1);
1108     clz(tmp1, tmp1);
1109     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
1110     b(MATCH);
1111 
1112   BIND(DO1_SHORT);
1113     mov(result_tmp, cnt1);
1114     lea(str1, Address(str1, cnt1));
1115     sub(cnt1_neg, zr, cnt1);
1116   BIND(DO1_LOOP);
1117     ldrb(ch1, Address(str1, cnt1_neg));
1118     cmp(ch, ch1);
1119     br(EQ, MATCH);
1120     adds(cnt1_neg, cnt1_neg, 1);
1121     br(LT, DO1_LOOP);
1122   BIND(NOMATCH);
1123     mov(result, -1);
1124     b(DONE);
1125   BIND(MATCH);
1126     add(result, result_tmp, cnt1_neg);
1127   BIND(DONE);
1128 }
1129 
1130 // Compare strings.
1131 void C2_MacroAssembler::string_compare(Register str1, Register str2,
1132     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
1133     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
1134     PRegister pgtmp1, PRegister pgtmp2, int ae) {
1135   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
1136       DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
1137       SHORT_LOOP_START, TAIL_CHECK;
1138 
1139   bool isLL = ae == StrIntrinsicNode::LL;
1140   bool isLU = ae == StrIntrinsicNode::LU;
1141   bool isUL = ae == StrIntrinsicNode::UL;
1142 
1143   // The stub threshold for LL strings is: 72 (64 + 8) chars
1144   // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
1145   // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
1146   const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);
1147 
1148   bool str1_isL = isLL || isLU;
1149   bool str2_isL = isLL || isUL;
1150 
1151   int str1_chr_shift = str1_isL ? 0 : 1;
1152   int str2_chr_shift = str2_isL ? 0 : 1;
1153   int str1_chr_size = str1_isL ? 1 : 2;
1154   int str2_chr_size = str2_isL ? 1 : 2;
1155   int minCharsInWord = isLL ? wordSize : wordSize/2;
1156 
1157   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
1158   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
1159                                       (chr_insn)&MacroAssembler::ldrh;
1160   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
1161                                       (chr_insn)&MacroAssembler::ldrh;
1162   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
1163                             (uxt_insn)&MacroAssembler::uxthw;
1164 
1165   BLOCK_COMMENT("string_compare {");
1166 
1167   // Bizarrely, the counts are passed in bytes, regardless of whether they
1168   // are L or U strings, however the result is always in characters.
1169   if (!str1_isL) asrw(cnt1, cnt1, 1);
1170   if (!str2_isL) asrw(cnt2, cnt2, 1);
1171 
1172   // Compute the minimum of the string lengths and save the difference.
1173   subsw(result, cnt1, cnt2);
1174   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
1175 
1176   // A very short string
1177   cmpw(cnt2, minCharsInWord);
1178   br(Assembler::LE, SHORT_STRING);
1179 
1180   // Compare longwords
1181   // load first parts of strings and finish initialization while loading
1182   {
1183     if (str1_isL == str2_isL) { // LL or UU
1184       ldr(tmp1, Address(str1));
1185       cmp(str1, str2);
1186       br(Assembler::EQ, DONE);
1187       ldr(tmp2, Address(str2));
1188       cmp(cnt2, stub_threshold);
1189       br(GE, STUB);
1190       subsw(cnt2, cnt2, minCharsInWord);
1191       br(EQ, TAIL_CHECK);
1192       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1193       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1194       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1195     } else if (isLU) {
1196       ldrs(vtmp, Address(str1));
1197       ldr(tmp2, Address(str2));
1198       cmp(cnt2, stub_threshold);
1199       br(GE, STUB);
1200       subw(cnt2, cnt2, 4);
1201       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1202       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1203       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1204       zip1(vtmp, T8B, vtmp, vtmpZ);
1205       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1206       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1207       add(cnt1, cnt1, 4);
1208       fmovd(tmp1, vtmp);
1209     } else { // UL case
1210       ldr(tmp1, Address(str1));
1211       ldrs(vtmp, Address(str2));
1212       cmp(cnt2, stub_threshold);
1213       br(GE, STUB);
1214       subw(cnt2, cnt2, 4);
1215       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1216       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1217       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1218       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1219       zip1(vtmp, T8B, vtmp, vtmpZ);
1220       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1221       add(cnt1, cnt1, 8);
1222       fmovd(tmp2, vtmp);
1223     }
1224     adds(cnt2, cnt2, isUL ? 4 : 8);
1225     br(GE, TAIL);
1226     eor(rscratch2, tmp1, tmp2);
1227     cbnz(rscratch2, DIFF);
1228     // main loop
1229     bind(NEXT_WORD);
1230     if (str1_isL == str2_isL) {
1231       ldr(tmp1, Address(str1, cnt2));
1232       ldr(tmp2, Address(str2, cnt2));
1233       adds(cnt2, cnt2, 8);
1234     } else if (isLU) {
1235       ldrs(vtmp, Address(str1, cnt1));
1236       ldr(tmp2, Address(str2, cnt2));
1237       add(cnt1, cnt1, 4);
1238       zip1(vtmp, T8B, vtmp, vtmpZ);
1239       fmovd(tmp1, vtmp);
1240       adds(cnt2, cnt2, 8);
1241     } else { // UL
1242       ldrs(vtmp, Address(str2, cnt2));
1243       ldr(tmp1, Address(str1, cnt1));
1244       zip1(vtmp, T8B, vtmp, vtmpZ);
1245       add(cnt1, cnt1, 8);
1246       fmovd(tmp2, vtmp);
1247       adds(cnt2, cnt2, 4);
1248     }
1249     br(GE, TAIL);
1250 
1251     eor(rscratch2, tmp1, tmp2);
1252     cbz(rscratch2, NEXT_WORD);
1253     b(DIFF);
1254     bind(TAIL);
1255     eor(rscratch2, tmp1, tmp2);
1256     cbnz(rscratch2, DIFF);
1257     // Last longword.  In the case where length == 4 we compare the
1258     // same longword twice, but that's still faster than another
1259     // conditional branch.
1260     if (str1_isL == str2_isL) {
1261       ldr(tmp1, Address(str1));
1262       ldr(tmp2, Address(str2));
1263     } else if (isLU) {
1264       ldrs(vtmp, Address(str1));
1265       ldr(tmp2, Address(str2));
1266       zip1(vtmp, T8B, vtmp, vtmpZ);
1267       fmovd(tmp1, vtmp);
1268     } else { // UL
1269       ldrs(vtmp, Address(str2));
1270       ldr(tmp1, Address(str1));
1271       zip1(vtmp, T8B, vtmp, vtmpZ);
1272       fmovd(tmp2, vtmp);
1273     }
1274     bind(TAIL_CHECK);
1275     eor(rscratch2, tmp1, tmp2);
1276     cbz(rscratch2, DONE);
1277 
1278     // Find the first different characters in the longwords and
1279     // compute their difference.
1280     bind(DIFF);
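         // rscratch2 = tmp1 ^ tmp2 is non-zero here. rev reverses the byte order so that clz
         // effectively counts the identical low-order (first in memory) bytes; masking down to a
         // multiple of 8 bits (LL) or 16 bits (otherwise) yields the shift that brings the first
         // differing character of each word into the low bits before they are extracted and subtracted.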
1281     rev(rscratch2, rscratch2);
1282     clz(rscratch2, rscratch2);
1283     andr(rscratch2, rscratch2, isLL ? -8 : -16);
1284     lsrv(tmp1, tmp1, rscratch2);
1285     (this->*ext_chr)(tmp1, tmp1);
1286     lsrv(tmp2, tmp2, rscratch2);
1287     (this->*ext_chr)(tmp2, tmp2);
1288     subw(result, tmp1, tmp2);
1289     b(DONE);
1290   }
1291 
1292   bind(STUB);
1293     RuntimeAddress stub = nullptr;
1294     switch(ae) {
1295       case StrIntrinsicNode::LL:
1296         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
1297         break;
1298       case StrIntrinsicNode::UU:
1299         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
1300         break;
1301       case StrIntrinsicNode::LU:
1302         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
1303         break;
1304       case StrIntrinsicNode::UL:
1305         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
1306         break;
1307       default:
1308         ShouldNotReachHere();
1309      }
1310     assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
1311     address call = trampoline_call(stub);
1312     if (call == nullptr) {
1313       DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
1314       ciEnv::current()->record_failure("CodeCache is full");
1315       return;
1316     }
1317     b(DONE);
1318 
1319   bind(SHORT_STRING);
1320   // Is the minimum length zero?
1321   cbz(cnt2, DONE);
1322   // arrange the code so that most branches are taken while loading, and the next
1323   // characters are loaded while the previous ones are being compared
1324   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1325   subs(cnt2, cnt2, 1);
1326   br(EQ, SHORT_LAST_INIT);
1327   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1328   b(SHORT_LOOP_START);
1329   bind(SHORT_LOOP);
1330   subs(cnt2, cnt2, 1);
1331   br(EQ, SHORT_LAST);
1332   bind(SHORT_LOOP_START);
1333   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
1334   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
1335   cmp(tmp1, cnt1);
1336   br(NE, SHORT_LOOP_TAIL);
1337   subs(cnt2, cnt2, 1);
1338   br(EQ, SHORT_LAST2);
1339   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1340   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1341   cmp(tmp2, rscratch1);
1342   br(EQ, SHORT_LOOP);
1343   sub(result, tmp2, rscratch1);
1344   b(DONE);
1345   bind(SHORT_LOOP_TAIL);
1346   sub(result, tmp1, cnt1);
1347   b(DONE);
1348   bind(SHORT_LAST2);
1349   cmp(tmp2, rscratch1);
1350   br(EQ, DONE);
1351   sub(result, tmp2, rscratch1);
1352 
1353   b(DONE);
1354   bind(SHORT_LAST_INIT);
1355   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1356   bind(SHORT_LAST);
1357   cmp(tmp1, cnt1);
1358   br(EQ, DONE);
1359   sub(result, tmp1, cnt1);
1360 
1361   bind(DONE);
1362 
1363   BLOCK_COMMENT("} string_compare");
1364 }
1365 
1366 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
1367                                      FloatRegister src2, Condition cond, bool isQ) {
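       // NEON has no direct LT/LE/LO/LS register-register compares and no NE compare, so those
       // conditions are mapped onto their mirrored counterparts (GT/GE/HI/HS) by swapping the two
       // sources, and NE is computed as EQ followed by a bitwise NOT of the result.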
1368   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1369   FloatRegister zn = src1, zm = src2;
1370   bool needs_negation = false;
1371   switch (cond) {
1372     case LT: cond = GT; zn = src2; zm = src1; break;
1373     case LE: cond = GE; zn = src2; zm = src1; break;
1374     case LO: cond = HI; zn = src2; zm = src1; break;
1375     case LS: cond = HS; zn = src2; zm = src1; break;
1376     case NE: cond = EQ; needs_negation = true; break;
1377     default:
1378       break;
1379   }
1380 
1381   if (is_floating_point_type(bt)) {
1382     fcm(cond, dst, size, zn, zm);
1383   } else {
1384     cm(cond, dst, size, zn, zm);
1385   }
1386 
1387   if (needs_negation) {
1388     notr(dst, isQ ? T16B : T8B, dst);
1389   }
1390 }
1391 
1392 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1393                                           Condition cond, bool isQ) {
1394   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1395   if (bt == T_FLOAT || bt == T_DOUBLE) {
1396     if (cond == Assembler::NE) {
1397       fcm(Assembler::EQ, dst, size, src);
1398       notr(dst, isQ ? T16B : T8B, dst);
1399     } else {
1400       fcm(cond, dst, size, src);
1401     }
1402   } else {
1403     if (cond == Assembler::NE) {
1404       cm(Assembler::EQ, dst, size, src);
1405       notr(dst, isQ ? T16B : T8B, dst);
1406     } else {
1407       cm(cond, dst, size, src);
1408     }
1409   }
1410 }
1411 
1412 // Compress the least significant bit of each byte to the rightmost and clear
1413 // the higher garbage bits.
1414 void C2_MacroAssembler::bytemask_compress(Register dst) {
1415   // Example input, dst = 0x01 00 00 00 01 01 00 01
1416   // The "??" bytes are garbage.
1417   orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
1418   orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x??????08 ??????0D
1419   orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x??????????????8D
1420   andr(dst, dst, 0xff);                   // dst = 0x8D
1421 }
1422 
1423 // Pack the lowest-numbered bit of each mask element in src into a long value
1424 // in dst, at most the first 64 lane elements.
1425 // Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM.
1426 void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
1427                                          FloatRegister vtmp1, FloatRegister vtmp2) {
1428   assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1429   assert_different_registers(dst, rscratch1);
1430   assert_different_registers(vtmp1, vtmp2);
1431 
1432   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1433   // Example:   src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
1434   // Expected:  dst = 0x658D
1435 
1436   // Convert the mask into vector with sequential bytes.
1437   // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
1438   sve_cpy(vtmp1, size, src, 1, false);
1439   if (bt != T_BYTE) {
1440     sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
1441   }
1442 
1443   if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
1444     // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
1445     // is to compress each significant bit of the byte in a cross-lane way. Due
1446     // to the lack of a cross-lane bit-compress instruction, we use BEXT
1447     // (bit-compress in each lane) with the biggest lane size (T = D) then
1448     // concatenate the results.
1449 
1450     // The second source input of BEXT, initialized with 0x01 in each byte.
1451     // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1452     sve_dup(vtmp2, B, 1);
1453 
1454     // BEXT vtmp1.D, vtmp1.D, vtmp2.D
1455     // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1456     // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1457     //         ---------------------------------------
1458     // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1459     sve_bext(vtmp1, D, vtmp1, vtmp2);
1460 
1461     // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the
1462     // result to dst.
1463     // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1464     // dst   = 0x658D
1465     if (lane_cnt <= 8) {
1466       // No need to concatenate.
1467       umov(dst, vtmp1, B, 0);
1468     } else if (lane_cnt <= 16) {
1469       ins(vtmp1, B, vtmp1, 1, 8);
1470       umov(dst, vtmp1, H, 0);
1471     } else {
1472       // As the lane count is 64 at most, the final expected value must be in
1473       // the lowest 64 bits after narrowing vtmp1 from D to B.
1474       sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
1475       umov(dst, vtmp1, D, 0);
1476     }
1477   } else if (UseSVE > 0) {
1478     // Compress the lowest 8 bytes.
1479     fmovd(dst, vtmp1);
1480     bytemask_compress(dst);
1481     if (lane_cnt <= 8) return;
1482 
1483     // Repeat on higher bytes and join the results.
1484     // Compress 8 bytes in each iteration.
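         // E.g., continuing the example above (lane_cnt = 16): the first compress
         // leaves dst = 0x8D; iteration idx = 1 extracts the next 8 bytes, compresses
         // them to 0x65 and ORs them in at bit offset 8, giving dst = 0x658D.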
1485     for (int idx = 1; idx < (lane_cnt / 8); idx++) {
1486       sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
1487       bytemask_compress(rscratch1);
1488       orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
1489     }
1490   } else {
1491     assert(false, "unsupported");
1492     ShouldNotReachHere();
1493   }
1494 }
1495 
1496 // Unpack the mask, a long value in src, into predicate register dst based on the
1497 // corresponding data type. Note that dst can support at most 64 lanes.
1498 // Below example gives the expected dst predicate register in different types, with
1499 // a valid src(0x658D) on a 1024-bit vector size machine.
1500 // BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
1501 // SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
1502 // INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
1503 // LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1504 //
1505 // The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D, which
1506 // has 24 significant bits, would be an invalid input if the dst predicate register refers to
1507 // a LONG type vector on a 1024-bit machine, which has at most 16 lanes.
1508 void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
1509                                            FloatRegister vtmp1, FloatRegister vtmp2) {
1510   assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
1511          lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
1512   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1513   // Example:   src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
1514   // Expected:  dst = 0b01100101 10001101
1515 
1516   // Put the long value from the general purpose register into the first lane of the vector.
1517   // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1518   sve_dup(vtmp1, B, 0);
1519   mov(vtmp1, D, 0, src);
1520 
1521   // As sve_cmp generates the mask with a minimum unit of one byte, we need to
1522   // transform the bit-level mask currently held in the first lane into a
1523   // byte-level mask, which can be done with SVE2's BDEP instruction.
1524 
1525   // The first source input of the BDEP instruction. Place each byte of the mask into its own 8-byte lane.
1526   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1527   if (lane_cnt <= 8) {
1528     // Nothing to do, as only one byte exists.
1529   } else if (lane_cnt <= 16) {
1530     ins(vtmp1, B, vtmp1, 8, 1);
1531     mov(vtmp1, B, 1, zr);
1532   } else {
1533     sve_vector_extend(vtmp1, D, vtmp1, B);
1534   }
1535 
1536   // The second source input of BDEP instruction, initialized with 0x01 for each byte.
1537   // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1538   sve_dup(vtmp2, B, 1);
1539 
1540   // BDEP vtmp1.D, vtmp1.D, vtmp2.D
1541   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1542   // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1543   //         ---------------------------------------
1544   // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1545   sve_bdep(vtmp1, D, vtmp1, vtmp2);
1546 
1547   if (bt != T_BYTE) {
1548     sve_vector_extend(vtmp1, size, vtmp1, B);
1549   }
1550   // Generate the mask according to the given vector, in which the elements have been
1551   // extended to the expected type.
1552   // dst = 0b01100101 10001101
1553   sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
1554 }
1555 
1556 // Clobbers: rflags
1557 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1558                                     FloatRegister zn, FloatRegister zm, Condition cond) {
1559   assert(pg->is_governing(), "This register has to be a governing predicate register");
1560   FloatRegister z1 = zn, z2 = zm;
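       // Map LE/LT/LO/LS to GE/GT/HI/HS by swapping the two source operands
       // (a <= b iff b >= a, and similarly for the unsigned variants).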
1561   switch (cond) {
1562     case LE: z1 = zm; z2 = zn; cond = GE; break;
1563     case LT: z1 = zm; z2 = zn; cond = GT; break;
1564     case LO: z1 = zm; z2 = zn; cond = HI; break;
1565     case LS: z1 = zm; z2 = zn; cond = HS; break;
1566     default:
1567       break;
1568   }
1569 
1570   SIMD_RegVariant size = elemType_to_regVariant(bt);
1571   if (is_floating_point_type(bt)) {
1572     sve_fcm(cond, pd, size, pg, z1, z2);
1573   } else {
1574     assert(is_integral_type(bt), "unsupported element type");
1575     sve_cmp(cond, pd, size, pg, z1, z2);
1576   }
1577 }
1578 
1579 // Get index of the last mask lane that is set
1580 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
1581   SIMD_RegVariant size = elemType_to_regVariant(bt);
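       // Reverse the lanes, then BRKB keeps only the lanes before the first active
       // one, so CNTP yields the number of lanes after the last active lane of the
       // original mask. E.g., with 16 byte lanes and the last set lane at index 13,
       // CNTP returns 2 and the result is (16 - 1) - 2 = 13.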
1582   sve_rev(ptmp, size, src);
1583   sve_brkb(ptmp, ptrue, ptmp, false);
1584   sve_cntp(dst, size, ptrue, ptmp);
1585   movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
1586   subw(dst, rscratch1, dst);
1587 }
1588 
1589 // Extend integer vector src to dst with the same lane count
1590 // but larger element size, e.g. 4B -> 4I
1591 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1592                                            FloatRegister src, BasicType src_bt, bool is_unsigned) {
1593   if (src_bt == T_BYTE) {
1594     // 4B to 4S/4I, 8B to 8S
1595     assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1596     assert(dst_bt == T_SHORT || dst_bt == T_INT, "unsupported");
1597     _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1598     if (dst_bt == T_INT) {
1599       _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1600     }
1601   } else if (src_bt == T_SHORT) {
1602     // 2S to 2I/2L, 4S to 4I
1603     assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1604     assert(dst_bt == T_INT || dst_bt == T_LONG, "unsupported");
1605     _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1606     if (dst_bt == T_LONG) {
1607       _xshll(is_unsigned, dst, T2D, dst, T2S, 0);
1608     }
1609   } else if (src_bt == T_INT) {
1610     // 2I to 2L
1611     assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1612     _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1613   } else {
1614     ShouldNotReachHere();
1615   }
1616 }
1617 
1618 // Narrow integer vector src down to dst with the same lane count
1619 // but smaller element size, e.g. 4I -> 4B
1620 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1621                                            FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1622   if (src_bt == T_SHORT) {
1623     // 4S/8S to 4B/8B
1624     assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1625     assert(dst_bt == T_BYTE, "unsupported");
1626     xtn(dst, T8B, src, T8H);
1627   } else if (src_bt == T_INT) {
1628     // 2I to 2S, 4I to 4B/4S
1629     assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1630     assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1631     xtn(dst, T4H, src, T4S);
1632     if (dst_bt == T_BYTE) {
1633       xtn(dst, T8B, dst, T8H);
1634     }
1635   } else if (src_bt == T_LONG) {
1636     // 2L to 2S/2I
1637     assert(src_vlen_in_bytes == 16, "unsupported");
1638     assert(dst_bt == T_INT || dst_bt == T_SHORT, "unsupported");
1639     xtn(dst, T2S, src, T2D);
1640     if (dst_bt == T_SHORT) {
1641       xtn(dst, T4H, dst, T4S);
1642     }
1643   } else {
1644     ShouldNotReachHere();
1645   }
1646 }
1647 
1648 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1649                                           FloatRegister src, SIMD_RegVariant src_size,
1650                                           bool is_unsigned) {
1651   assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
1652 
1653   if (src_size == B) {
1654     switch (dst_size) {
1655     case H:
1656       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1657       break;
1658     case S:
1659       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1660       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1661       break;
1662     case D:
1663       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1664       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1665       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1666       break;
1667     default:
1668       ShouldNotReachHere();
1669     }
1670   } else if (src_size == H) {
1671     if (dst_size == S) {
1672       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1673     } else { // D
1674       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1675       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1676     }
1677   } else if (src_size == S) {
1678     _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
1679   }
1680 }
1681 
1682 // Vector narrow from src to dst with specified element sizes.
1683 // The high part of the dst vector will be filled with zero.
1684 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1685                                           FloatRegister src, SIMD_RegVariant src_size,
1686                                           FloatRegister tmp) {
1687   assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1688   assert_different_registers(src, tmp);
1689   sve_dup(tmp, src_size, 0);
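       // "tmp" is an all-zero vector. UZP1 concatenates the even-numbered elements
       // of its two sources, so each step below halves the element size of the data
       // while the upper half of dst is filled from the zero vector.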
1690   if (src_size == D) {
1691     switch (dst_size) {
1692     case S:
1693       sve_uzp1(dst, S, src, tmp);
1694       break;
1695     case H:
1696       assert_different_registers(dst, tmp);
1697       sve_uzp1(dst, S, src, tmp);
1698       sve_uzp1(dst, H, dst, tmp);
1699       break;
1700     case B:
1701       assert_different_registers(dst, tmp);
1702       sve_uzp1(dst, S, src, tmp);
1703       sve_uzp1(dst, H, dst, tmp);
1704       sve_uzp1(dst, B, dst, tmp);
1705       break;
1706     default:
1707       ShouldNotReachHere();
1708     }
1709   } else if (src_size == S) {
1710     if (dst_size == H) {
1711       sve_uzp1(dst, H, src, tmp);
1712     } else { // B
1713       assert_different_registers(dst, tmp);
1714       sve_uzp1(dst, H, src, tmp);
1715       sve_uzp1(dst, B, dst, tmp);
1716     }
1717   } else if (src_size == H) {
1718     sve_uzp1(dst, B, src, tmp);
1719   }
1720 }
1721 
1722 // Extend src predicate to dst predicate with the same lane count but larger
1723 // element size, e.g. 64Byte -> 512Long
1724 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1725                                              uint dst_element_length_in_bytes,
1726                                              uint src_element_length_in_bytes) {
1727   if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1728     sve_punpklo(dst, src);
1729   } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1730     sve_punpklo(dst, src);
1731     sve_punpklo(dst, dst);
1732   } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1733     sve_punpklo(dst, src);
1734     sve_punpklo(dst, dst);
1735     sve_punpklo(dst, dst);
1736   } else {
1737     assert(false, "unsupported");
1738     ShouldNotReachHere();
1739   }
1740 }
1741 
1742 // Narrow src predicate to dst predicate with the same lane count but
1743 // smaller element size, e.g. 512Long -> 64Byte
1744 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1745                                              uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
1746   // The insignificant bits in src predicate are expected to be zero.
1747   // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
1748   // passed as the second argument. An example narrowing operation with a given mask would be -
1749 // 128Long -> 64Int on a 128-bit machine, i.e. 2L -> 2I
1750   // Mask (for 2 Longs) : TF
1751   // Predicate register for the above mask (16 bits) : 00000001 00000000
1752   // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
1753 // which translates to a mask for 2 integers : TF (the lower half is considered while the upper half is 0)
1754   assert_different_registers(src, ptmp);
1755   assert_different_registers(dst, ptmp);
1756   sve_pfalse(ptmp);
1757   if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1758     sve_uzp1(dst, B, src, ptmp);
1759   } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1760     sve_uzp1(dst, H, src, ptmp);
1761     sve_uzp1(dst, B, dst, ptmp);
1762   } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1763     sve_uzp1(dst, S, src, ptmp);
1764     sve_uzp1(dst, H, dst, ptmp);
1765     sve_uzp1(dst, B, dst, ptmp);
1766   } else {
1767     assert(false, "unsupported");
1768     ShouldNotReachHere();
1769   }
1770 }
1771 
1772 // Vector reduction add for integral type with ASIMD instructions.
1773 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1774                                                  Register isrc, FloatRegister vsrc,
1775                                                  unsigned vector_length_in_bytes,
1776                                                  FloatRegister vtmp) {
1777   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1778   assert_different_registers(dst, isrc);
1779   bool isQ = vector_length_in_bytes == 16;
1780 
1781   BLOCK_COMMENT("neon_reduce_add_integral {");
1782     switch(bt) {
1783       case T_BYTE:
1784         addv(vtmp, isQ ? T16B : T8B, vsrc);
1785         smov(dst, vtmp, B, 0);
1786         addw(dst, dst, isrc, ext::sxtb);
1787         break;
1788       case T_SHORT:
1789         addv(vtmp, isQ ? T8H : T4H, vsrc);
1790         smov(dst, vtmp, H, 0);
1791         addw(dst, dst, isrc, ext::sxth);
1792         break;
1793       case T_INT:
1794         isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1795         umov(dst, vtmp, S, 0);
1796         addw(dst, dst, isrc);
1797         break;
1798       case T_LONG:
1799         assert(isQ, "unsupported");
1800         addpd(vtmp, vsrc);
1801         umov(dst, vtmp, D, 0);
1802         add(dst, dst, isrc);
1803         break;
1804       default:
1805         assert(false, "unsupported");
1806         ShouldNotReachHere();
1807     }
1808   BLOCK_COMMENT("} neon_reduce_add_integral");
1809 }
1810 
1811 // Vector reduction multiply for integral type with ASIMD instructions.
1812 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
1813 // Clobbers: rscratch1
1814 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
1815                                                  Register isrc, FloatRegister vsrc,
1816                                                  unsigned vector_length_in_bytes,
1817                                                  FloatRegister vtmp1, FloatRegister vtmp2) {
1818   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1819   bool isQ = vector_length_in_bytes == 16;
1820 
1821   BLOCK_COMMENT("neon_reduce_mul_integral {");
1822     switch(bt) {
1823       case T_BYTE:
1824         if (isQ) {
1825           // Multiply the lower half and higher half of vector iteratively.
1826           // vtmp1 = vsrc[8:15]
1827           ins(vtmp1, D, vsrc, 0, 1);
1828           // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
1829           mulv(vtmp1, T8B, vtmp1, vsrc);
1830           // vtmp2 = vtmp1[4:7]
1831           ins(vtmp2, S, vtmp1, 0, 1);
1832           // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
1833           mulv(vtmp1, T8B, vtmp2, vtmp1);
1834         } else {
1835           ins(vtmp1, S, vsrc, 0, 1);
1836           mulv(vtmp1, T8B, vtmp1, vsrc);
1837         }
1838         // vtmp2 = vtmp1[2:3]
1839         ins(vtmp2, H, vtmp1, 0, 1);
1840         // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
1841         mulv(vtmp2, T8B, vtmp2, vtmp1);
1842         // dst = vtmp2[0] * isrc * vtmp2[1]
1843         umov(rscratch1, vtmp2, B, 0);
1844         mulw(dst, rscratch1, isrc);
1845         sxtb(dst, dst);
1846         umov(rscratch1, vtmp2, B, 1);
1847         mulw(dst, rscratch1, dst);
1848         sxtb(dst, dst);
1849         break;
1850       case T_SHORT:
1851         if (isQ) {
1852           ins(vtmp2, D, vsrc, 0, 1);
1853           mulv(vtmp2, T4H, vtmp2, vsrc);
1854           ins(vtmp1, S, vtmp2, 0, 1);
1855           mulv(vtmp1, T4H, vtmp1, vtmp2);
1856         } else {
1857           ins(vtmp1, S, vsrc, 0, 1);
1858           mulv(vtmp1, T4H, vtmp1, vsrc);
1859         }
1860         umov(rscratch1, vtmp1, H, 0);
1861         mulw(dst, rscratch1, isrc);
1862         sxth(dst, dst);
1863         umov(rscratch1, vtmp1, H, 1);
1864         mulw(dst, rscratch1, dst);
1865         sxth(dst, dst);
1866         break;
1867       case T_INT:
1868         if (isQ) {
1869           ins(vtmp1, D, vsrc, 0, 1);
1870           mulv(vtmp1, T2S, vtmp1, vsrc);
1871         } else {
1872           vtmp1 = vsrc;
1873         }
1874         umov(rscratch1, vtmp1, S, 0);
1875         mul(dst, rscratch1, isrc);
1876         umov(rscratch1, vtmp1, S, 1);
1877         mul(dst, rscratch1, dst);
1878         break;
1879       case T_LONG:
1880         umov(rscratch1, vsrc, D, 0);
1881         mul(dst, isrc, rscratch1);
1882         umov(rscratch1, vsrc, D, 1);
1883         mul(dst, dst, rscratch1);
1884         break;
1885       default:
1886         assert(false, "unsupported");
1887         ShouldNotReachHere();
1888     }
1889   BLOCK_COMMENT("} neon_reduce_mul_integral");
1890 }
1891 
1892 // Vector reduction multiply for floating-point type with ASIMD instructions.
1893 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
1894                                            FloatRegister fsrc, FloatRegister vsrc,
1895                                            unsigned vector_length_in_bytes,
1896                                            FloatRegister vtmp) {
1897   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1898   bool isQ = vector_length_in_bytes == 16;
1899 
1900   BLOCK_COMMENT("neon_reduce_mul_fp {");
1901     switch(bt) {
1902       case T_FLOAT:
1903         fmuls(dst, fsrc, vsrc);
1904         ins(vtmp, S, vsrc, 0, 1);
1905         fmuls(dst, dst, vtmp);
1906         if (isQ) {
1907           ins(vtmp, S, vsrc, 0, 2);
1908           fmuls(dst, dst, vtmp);
1909           ins(vtmp, S, vsrc, 0, 3);
1910           fmuls(dst, dst, vtmp);
1911         }
1912         break;
1913       case T_DOUBLE:
1914         assert(isQ, "unsupported");
1915         fmuld(dst, fsrc, vsrc);
1916         ins(vtmp, D, vsrc, 0, 1);
1917         fmuld(dst, dst, vtmp);
1918         break;
1919       default:
1920         assert(false, "unsupported");
1921         ShouldNotReachHere();
1922     }
1923   BLOCK_COMMENT("} neon_reduce_mul_fp");
1924 }
1925 
1926 // Helper to select logical instruction
1927 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
1928                                                    Register Rn, Register Rm,
1929                                                    enum shift_kind kind, unsigned shift) {
1930   switch(opc) {
1931     case Op_AndReductionV:
1932       is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
1933       break;
1934     case Op_OrReductionV:
1935       is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
1936       break;
1937     case Op_XorReductionV:
1938       is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
1939       break;
1940     default:
1941       assert(false, "unsupported");
1942       ShouldNotReachHere();
1943   }
1944 }
1945 
1946 // Vector reduction logical operations And, Or, Xor
1947 // Clobbers: rscratch1
1948 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
1949                                             Register isrc, FloatRegister vsrc,
1950                                             unsigned vector_length_in_bytes) {
1951   assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
1952          "unsupported");
1953   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1954   assert_different_registers(dst, isrc);
1955   bool isQ = vector_length_in_bytes == 16;
1956 
1957   BLOCK_COMMENT("neon_reduce_logical {");
1958     umov(rscratch1, vsrc, isQ ? D : S, 0);
1959     umov(dst, vsrc, isQ ? D : S, 1);
1960     neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
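         // dst now holds the lane-wise combination of the low and high halves of the
         // vector, packed into 64 bits (32 bits for 8-byte vectors). Each case below
         // keeps folding it in half with shifted operations until one element remains
         // (T_LONG is already a single element), then combines it with isrc and
         // sign-extends sub-word results.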
1961     switch(bt) {
1962       case T_BYTE:
1963         if (isQ) {
1964           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1965         }
1966         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1967         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
1968         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1969         sxtb(dst, dst);
1970         break;
1971       case T_SHORT:
1972         if (isQ) {
1973           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1974         }
1975         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1976         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1977         sxth(dst, dst);
1978         break;
1979       case T_INT:
1980         if (isQ) {
1981           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1982         }
1983         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1984         break;
1985       case T_LONG:
1986         assert(isQ, "unsupported");
1987         neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
1988         break;
1989       default:
1990         assert(false, "unsupported");
1991         ShouldNotReachHere();
1992     }
1993   BLOCK_COMMENT("} neon_reduce_logical");
1994 }
1995 
1996 // Vector reduction min/max for integral type with ASIMD instructions.
1997 // Note: vtmp is not used and expected to be fnoreg for T_LONG case.
1998 // Clobbers: rscratch1, rflags
1999 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
2000                                                     Register isrc, FloatRegister vsrc,
2001                                                     unsigned vector_length_in_bytes,
2002                                                     FloatRegister vtmp) {
2003   assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
2004   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2005   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
2006   assert_different_registers(dst, isrc);
2007   bool isQ = vector_length_in_bytes == 16;
2008   bool is_min = opc == Op_MinReductionV;
2009 
2010   BLOCK_COMMENT("neon_reduce_minmax_integral {");
2011     if (bt == T_LONG) {
2012       assert(vtmp == fnoreg, "should be");
2013       assert(isQ, "should be");
2014       umov(rscratch1, vsrc, D, 0);
2015       cmp(isrc, rscratch1);
2016       csel(dst, isrc, rscratch1, is_min ? LT : GT);
2017       umov(rscratch1, vsrc, D, 1);
2018       cmp(dst, rscratch1);
2019       csel(dst, dst, rscratch1, is_min ? LT : GT);
2020     } else {
2021       SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
2022       if (size == T2S) {
2023         is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
2024       } else {
2025         is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
2026       }
2027       if (bt == T_INT) {
2028         umov(dst, vtmp, S, 0);
2029       } else {
2030         smov(dst, vtmp, elemType_to_regVariant(bt), 0);
2031       }
2032       cmpw(dst, isrc);
2033       cselw(dst, dst, isrc, is_min ? LT : GT);
2034     }
2035   BLOCK_COMMENT("} neon_reduce_minmax_integral");
2036 }
2037 
2038 // Vector reduction for integral type with SVE instruction.
2039 // Supported operations are Add, And, Or, Xor, Max, Min.
2040 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
2041 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
2042                                             FloatRegister src2, PRegister pg, FloatRegister tmp) {
2043   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2044   assert(pg->is_governing(), "This register has to be a governing predicate register");
2045   assert_different_registers(src1, dst);
2046   // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
2047   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
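       // Sub-word results are extracted with smov (sign-extending) so that the
       // scalar compare/combine below sees a properly sign-extended value, while
       // T_INT/T_LONG results are extracted with umov.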
2048   switch (opc) {
2049     case Op_AddReductionVI: {
2050       sve_uaddv(tmp, size, pg, src2);
2051       if (bt == T_BYTE) {
2052         smov(dst, tmp, size, 0);
2053         addw(dst, src1, dst, ext::sxtb);
2054       } else if (bt == T_SHORT) {
2055         smov(dst, tmp, size, 0);
2056         addw(dst, src1, dst, ext::sxth);
2057       } else {
2058         umov(dst, tmp, size, 0);
2059         addw(dst, dst, src1);
2060       }
2061       break;
2062     }
2063     case Op_AddReductionVL: {
2064       sve_uaddv(tmp, size, pg, src2);
2065       umov(dst, tmp, size, 0);
2066       add(dst, dst, src1);
2067       break;
2068     }
2069     case Op_AndReductionV: {
2070       sve_andv(tmp, size, pg, src2);
2071       if (bt == T_INT || bt == T_LONG) {
2072         umov(dst, tmp, size, 0);
2073       } else {
2074         smov(dst, tmp, size, 0);
2075       }
2076       if (bt == T_LONG) {
2077         andr(dst, dst, src1);
2078       } else {
2079         andw(dst, dst, src1);
2080       }
2081       break;
2082     }
2083     case Op_OrReductionV: {
2084       sve_orv(tmp, size, pg, src2);
2085       if (bt == T_INT || bt == T_LONG) {
2086         umov(dst, tmp, size, 0);
2087       } else {
2088         smov(dst, tmp, size, 0);
2089       }
2090       if (bt == T_LONG) {
2091         orr(dst, dst, src1);
2092       } else {
2093         orrw(dst, dst, src1);
2094       }
2095       break;
2096     }
2097     case Op_XorReductionV: {
2098       sve_eorv(tmp, size, pg, src2);
2099       if (bt == T_INT || bt == T_LONG) {
2100         umov(dst, tmp, size, 0);
2101       } else {
2102         smov(dst, tmp, size, 0);
2103       }
2104       if (bt == T_LONG) {
2105         eor(dst, dst, src1);
2106       } else {
2107         eorw(dst, dst, src1);
2108       }
2109       break;
2110     }
2111     case Op_MaxReductionV: {
2112       sve_smaxv(tmp, size, pg, src2);
2113       if (bt == T_INT || bt == T_LONG) {
2114         umov(dst, tmp, size, 0);
2115       } else {
2116         smov(dst, tmp, size, 0);
2117       }
2118       if (bt == T_LONG) {
2119         cmp(dst, src1);
2120         csel(dst, dst, src1, Assembler::GT);
2121       } else {
2122         cmpw(dst, src1);
2123         cselw(dst, dst, src1, Assembler::GT);
2124       }
2125       break;
2126     }
2127     case Op_MinReductionV: {
2128       sve_sminv(tmp, size, pg, src2);
2129       if (bt == T_INT || bt == T_LONG) {
2130         umov(dst, tmp, size, 0);
2131       } else {
2132         smov(dst, tmp, size, 0);
2133       }
2134       if (bt == T_LONG) {
2135         cmp(dst, src1);
2136         csel(dst, dst, src1, Assembler::LT);
2137       } else {
2138         cmpw(dst, src1);
2139         cselw(dst, dst, src1, Assembler::LT);
2140       }
2141       break;
2142     }
2143     default:
2144       assert(false, "unsupported");
2145       ShouldNotReachHere();
2146   }
2147 
2148   if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
2149     if (bt == T_BYTE) {
2150       sxtb(dst, dst);
2151     } else if (bt == T_SHORT) {
2152       sxth(dst, dst);
2153     }
2154   }
2155 }
2156 
2157 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
2158 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
2159 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
2160 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
2161   uint32_t max_vector_length = Matcher::max_vector_size(bt);
2162   assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
2163 
2164   // Set all elements to false if the input "lane_cnt" is zero.
2165   if (lane_cnt == 0) {
2166     sve_pfalse(dst);
2167     return;
2168   }
2169 
2170   SIMD_RegVariant size = elemType_to_regVariant(bt);
2171   assert(size != Q, "invalid size");
2172 
2173   // Set all elements to true if "lane_cnt" equals the max lane count.
2174   if (lane_cnt == max_vector_length) {
2175     sve_ptrue(dst, size, /* ALL */ 0b11111);
2176     return;
2177   }
2178 
2179   // Fixed numbers for "ptrue".
2180   switch(lane_cnt) {
2181   case 1: /* VL1 */
2182   case 2: /* VL2 */
2183   case 3: /* VL3 */
2184   case 4: /* VL4 */
2185   case 5: /* VL5 */
2186   case 6: /* VL6 */
2187   case 7: /* VL7 */
2188   case 8: /* VL8 */
2189     sve_ptrue(dst, size, lane_cnt);
2190     return;
2191   case 16:
2192     sve_ptrue(dst, size, /* VL16 */ 0b01001);
2193     return;
2194   case 32:
2195     sve_ptrue(dst, size, /* VL32 */ 0b01010);
2196     return;
2197   case 64:
2198     sve_ptrue(dst, size, /* VL64 */ 0b01011);
2199     return;
2200   case 128:
2201     sve_ptrue(dst, size, /* VL128 */ 0b01100);
2202     return;
2203   case 256:
2204     sve_ptrue(dst, size, /* VL256 */ 0b01101);
2205     return;
2206   default:
2207     break;
2208   }
2209 
2210   // Special patterns for "ptrue".
2211   if (lane_cnt == round_down_power_of_2(max_vector_length)) {
2212     sve_ptrue(dst, size, /* POW2 */ 0b00000);
2213   } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
2214     sve_ptrue(dst, size, /* MUL4 */ 0b11101);
2215   } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
2216     sve_ptrue(dst, size, /* MUL3 */ 0b11110);
2217   } else {
2218     // Encode to "whileltw" for the remaining cases.
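         // WHILELT with operands zr and lane_cnt marks lane i active while i < lane_cnt,
         // i.e. it sets exactly the first lane_cnt lanes of dst.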
2219     mov(rscratch1, lane_cnt);
2220     sve_whileltw(dst, size, zr, rscratch1);
2221   }
2222 }
2223 
2224 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
2225 // Any remaining elements of dst will be filled with zero.
2226 // Clobbers: rscratch1
2227 // Preserves: mask, vzr
2228 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
2229                                            FloatRegister vzr, FloatRegister vtmp,
2230                                            PRegister pgtmp, unsigned vector_length_in_bytes) {
2231   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2232   // When called by sve_compress_byte, src and vtmp may be the same register.
2233   assert_different_registers(dst, src, vzr);
2234   assert_different_registers(dst, vtmp, vzr);
2235   assert_different_registers(mask, pgtmp);
2236   // high <-- low
2237   // Example input:   src   = hh gg ff ee dd cc bb aa, one character is 8 bits.
2238   //                  mask  = 01 00 00 01 01 00 01 01, one character is 1 bit.
2239   // Expected result: dst   = 00 00 00 hh ee dd bb aa
2240 
2241   // Extend lowest half to type INT.
2242   // dst   =  00dd  00cc  00bb  00aa
2243   sve_uunpklo(dst, S, src);
2244   // pgtmp =  0001  0000  0001  0001
2245   sve_punpklo(pgtmp, mask);
2246   // Pack the active elements, now of INT size, to the right,
2247   // and fill the remaining elements with zero.
2248   // dst   =  0000  00dd  00bb  00aa
2249   sve_compact(dst, S, dst, pgtmp);
2250   // Narrow the result back to type SHORT.
2251   // dst   = 00 00 00 00 00 dd bb aa
2252   sve_uzp1(dst, H, dst, vzr);
2253 
2254   // Return if the vector length is no more than MaxVectorSize/2, since the
2255   // highest half is invalid.
2256   if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
2257     return;
2258   }
2259 
2260   // Count the active elements of lowest half.
2261   // rscratch1 = 3
2262   sve_cntp(rscratch1, S, ptrue, pgtmp);
2263 
2264   // Repeat to the highest half.
2265   // pgtmp =  0001  0000  0000  0001
2266   sve_punpkhi(pgtmp, mask);
2267   // vtmp  =  00hh  00gg  00ff  00ee
2268   sve_uunpkhi(vtmp, S, src);
2269   // vtmp  =  0000  0000  00hh  00ee
2270   sve_compact(vtmp, S, vtmp, pgtmp);
2271   // vtmp  = 00 00 00 00 00 00 hh ee
2272   sve_uzp1(vtmp, H, vtmp, vzr);
2273 
2274   // pgtmp = 00 00 00 00 00 01 01 01
2275   sve_whilelt(pgtmp, H, zr, rscratch1);
2276   // Compressed low:  dst  = 00 00 00 00 00 dd bb aa
2277   // Compressed high: vtmp = 00 00 00 00 00 00 hh ee
2278   // Combine the compressed low with the compressed high:
2279   //                  dst  = 00 00 00 hh ee dd bb aa
2280   sve_splice(dst, H, pgtmp, vtmp);
2281 }
2282 
2283 // Clobbers: rscratch1, rscratch2
2284 // Preserves: src, mask
2285 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
2286                                           FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
2287                                           PRegister ptmp, PRegister pgtmp, unsigned vector_length_in_bytes) {
2288   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2289   assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3);
2290   assert_different_registers(mask, ptmp, pgtmp);
2291   // high <-- low
2292   // Example input:   src   = q p n m l k j i h g f e d c b a, one character is 8 bits.
2293   //                  mask  = 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 1, one character is 1 bit.
2294   // Expected result: dst   = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
2295   FloatRegister vzr = vtmp3;
2296   sve_dup(vzr, B, 0);
2297 
2298   // Extend lowest half to type SHORT.
2299   // vtmp1 =  0h  0g  0f  0e  0d  0c  0b  0a
2300   sve_uunpklo(vtmp1, H, src);
2301   // ptmp  =  00  01  00  00  00  01  00  01
2302   sve_punpklo(ptmp, mask);
2303   // Pack the active elements, now of SHORT size, to the right,
2304   // and fill the remaining elements with zero.
2305   // dst   =  00  00  00  00  00  0g  0c  0a
2306   unsigned extended_size = vector_length_in_bytes << 1;
2307   sve_compress_short(dst, vtmp1, ptmp, vzr, vtmp2, pgtmp, extended_size > MaxVectorSize ? MaxVectorSize : extended_size);
2308   // Narrow the result back to type BYTE.
2309   // dst   = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
2310   sve_uzp1(dst, B, dst, vzr);
2311 
2312   // Return if the vector length is no more than MaxVectorSize/2, since the
2313   // highest half is invalid.
2314   if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
2315     return;
2316   }
2317   // Count the active elements of lowest half.
2318   // rscratch2 = 3
2319   sve_cntp(rscratch2, H, ptrue, ptmp);
2320 
2321   // Repeat to the highest half.
2322   // ptmp  =  00  01  00  00  00  00  00  01
2323   sve_punpkhi(ptmp, mask);
2324   // vtmp2 =  0q  0p  0n  0m  0l  0k  0j  0i
2325   sve_uunpkhi(vtmp2, H, src);
2326   // vtmp1 =  00  00  00  00  00  00  0p  0i
2327   sve_compress_short(vtmp1, vtmp2, ptmp, vzr, vtmp2, pgtmp, extended_size - MaxVectorSize);
2328   // vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
2329   sve_uzp1(vtmp1, B, vtmp1, vzr);
2330 
2331   // ptmp  = 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
2332   sve_whilelt(ptmp, B, zr, rscratch2);
2333   // Compressed low:  dst   = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
2334   // Compressed high: vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
2335   // Combine the compressed low with the compressed high:
2336   //                  dst   = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
2337   sve_splice(dst, B, ptmp, vtmp1);
2338 }
2339 
2340 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2341   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2342   SIMD_Arrangement size = isQ ? T16B : T8B;
2343   if (bt == T_BYTE) {
2344     rbit(dst, size, src);
2345   } else {
2346     neon_reverse_bytes(dst, src, bt, isQ);
2347     rbit(dst, size, dst);
2348   }
2349 }
2350 
2351 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2352   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2353   SIMD_Arrangement size = isQ ? T16B : T8B;
2354   switch (bt) {
2355     case T_BYTE:
2356       if (dst != src) {
2357         orr(dst, size, src, src);
2358       }
2359       break;
2360     case T_SHORT:
2361       rev16(dst, size, src);
2362       break;
2363     case T_INT:
2364       rev32(dst, size, src);
2365       break;
2366     case T_LONG:
2367       rev64(dst, size, src);
2368       break;
2369     default:
2370       assert(false, "unsupported");
2371       ShouldNotReachHere();
2372   }
2373 }
2374 
2375 // VectorRearrange implementation for short/int/float/long/double types with NEON
2376 // instructions. For VectorRearrange short/int/float, we use NEON tbl instruction.
2377 // But since it supports byte tables only, we need to look up 2/4 bytes as a group.
2378 // For VectorRearrange long/double, we compare the shuffle input with iota indices,
2379 // and use bsl to implement the operation.
2380 void C2_MacroAssembler::neon_rearrange_hsd(FloatRegister dst, FloatRegister src,
2381                                            FloatRegister shuffle, FloatRegister tmp,
2382                                            BasicType bt, bool isQ) {
2383   assert_different_registers(dst, src, shuffle, tmp);
2384   SIMD_Arrangement size1 = isQ ? T16B : T8B;
2385   SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2386 
2387   // Here is an example that rearranges a NEON vector with 4 ints:
2388   // Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1]
2389   //   1. We assume the shuffle input is Vi int[2, 3, 0, 1].
2390   //   2. Multiply Vi int[2, 3, 0, 1] with constant int vector
2391   //      [0x04040404, 0x04040404, 0x04040404, 0x04040404], and get
2392   //      tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404].
2393   //   3. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100],
2394   //      and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504]
2395   //   4. Use Vm as index register, and use V1 as table register.
2396   //      Then get V2 as the result by tbl NEON instructions.
2397   switch (bt) {
2398     case T_SHORT:
2399       mov(tmp, size1, 0x02);
2400       mulv(dst, size2, shuffle, tmp);
2401       mov(tmp, size2, 0x0100);
2402       addv(dst, size1, dst, tmp);
2403       tbl(dst, size1, src, 1, dst);
2404       break;
2405     case T_INT:
2406     case T_FLOAT:
2407       mov(tmp, size1, 0x04);
2408       mulv(dst, size2, shuffle, tmp);
2409       mov(tmp, size2, 0x03020100);
2410       addv(dst, size1, dst, tmp);
2411       tbl(dst, size1, src, 1, dst);
2412       break;
2413     case T_LONG:
2414     case T_DOUBLE:
2415       // Load the iota indices for Long type. The indices are ordered by
2416       // type B/S/I/L/F/D, and the offset between two types is 16; hence
2417       // the offset for L is 48.
2418       lea(rscratch1,
2419           ExternalAddress(StubRoutines::aarch64::vector_iota_indices() + 48));
2420       ldrq(tmp, rscratch1);
2421       // Check whether the input "shuffle" is the same with iota indices.
2422       // Return "src" if true, otherwise swap the two elements of "src".
2423       cm(EQ, dst, size2, shuffle, tmp);
2424       ext(tmp, size1, src, src, 8);
2425       bsl(dst, size1, src, tmp);
2426       break;
2427     default:
2428       assert(false, "unsupported element type");
2429       ShouldNotReachHere();
2430   }
2431 }
2432 
2433 // Extract a scalar element from an sve vector at position 'idx'.
2434 // The input elements in src are expected to be of integral type.
2435 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2436                                              int idx, FloatRegister vtmp) {
2437   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2438   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2439   if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
2440     if (bt == T_INT || bt == T_LONG) {
2441       umov(dst, src, size, idx);
2442     } else {
2443       smov(dst, src, size, idx);
2444     }
2445   } else {
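         // The element lies beyond the lowest 128 bits, which umov/smov cannot reach.
         // Copy the vector and use EXT to shift the wanted element down to lane 0;
         // the EXT immediate is a byte offset, i.e. idx * element size in bytes.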
2446     sve_orr(vtmp, src, src);
2447     sve_ext(vtmp, vtmp, idx << size);
2448     if (bt == T_INT || bt == T_LONG) {
2449       umov(dst, vtmp, size, 0);
2450     } else {
2451       smov(dst, vtmp, size, 0);
2452     }
2453   }
2454 }
2455 
2456 // java.lang.Math::round intrinsics
2457 
2458 // Clobbers: rscratch1, rflags
2459 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2460                                           FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
2461   assert_different_registers(tmp1, tmp2, tmp3, src, dst);
2462   switch (T) {
2463     case T2S:
2464     case T4S:
2465       fmovs(tmp1, T, 0.5f);
2466       mov(rscratch1, jint_cast(0x1.0p23f));
2467       break;
2468     case T2D:
2469       fmovd(tmp1, T, 0.5);
2470       mov(rscratch1, julong_cast(0x1.0p52));
2471       break;
2472     default:
2473       assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2474   }
2475   fadd(tmp1, T, tmp1, src);
2476   fcvtms(tmp1, T, tmp1);
2477   // tmp1 = floor(src + 0.5, ties to even)
2478 
2479   fcvtas(dst, T, src);
2480   // dst = round(src), ties to away
2481 
2482   fneg(tmp3, T, src);
2483   dup(tmp2, T, rscratch1);
2484   cm(HS, tmp3, T, tmp3, tmp2);
2485   // tmp3 is now a set of flags
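       // The compare above treats the raw float bits as unsigned integers: it is
       // all-ones for lanes where src is positive, NaN, or negative with magnitude
       // >= 2^23 (2^52 for T2D) and hence already integral; it is zero for small
       // negative lanes. The BIF below keeps the fcvtas result for the former and
       // inserts floor(src + 0.5) for the latter, matching Math.round semantics.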
2486 
2487   bif(dst, T16B, tmp1, tmp3);
2488   // result in dst
2489 }
2490 
2491 // Clobbers: rscratch1, rflags
2492 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2493                                          FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
2494   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2495   assert_different_registers(tmp1, tmp2, src, dst);
2496 
2497   switch (T) {
2498     case S:
2499       mov(rscratch1, jint_cast(0x1.0p23f));
2500       break;
2501     case D:
2502       mov(rscratch1, julong_cast(0x1.0p52));
2503       break;
2504     default:
2505       assert(T == S || T == D, "invalid register variant");
2506   }
2507 
2508   sve_frinta(dst, T, ptrue, src);
2509   // dst = round(src), ties to away
2510 
2511   Label none;
2512 
2513   sve_fneg(tmp1, T, ptrue, src);
2514   sve_dup(tmp2, T, rscratch1);
2515   sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
2516   br(EQ, none);
2517   {
2518     sve_cpy(tmp1, T, pgtmp, 0.5);
2519     sve_fadd(tmp1, T, pgtmp, src);
2520     sve_frintm(dst, T, pgtmp, tmp1);
2521     // dst = floor(src + 0.5, ties to even)
2522   }
2523   bind(none);
2524 
2525   sve_fcvtzs(dst, T, ptrue, dst, T);
2526   // result in dst
2527 }
2528 
2529 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2530                                            FloatRegister one, SIMD_Arrangement T) {
2531   assert_different_registers(dst, src, zero, one);
2532   assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2533 
2534   facgt(dst, T, src, zero);
2535   ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
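       // BSL computes dst = (dst & one) | (~dst & src). Assuming "one" holds 1.0 in
       // each lane (as its name suggests), lanes where dst is 0x7FF..F combine the
       // magnitude of 1.0 with the sign bit of src, giving +-1.0, while lanes where
       // dst is 0 keep src unchanged (+-0.0 or NaN).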
2536   bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2537 }
2538 
2539 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2540                                           FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
2541     assert_different_registers(dst, src, zero, one, vtmp);
2542     assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2543 
2544     sve_orr(vtmp, src, src);
2545     sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise
2546     switch (T) {
2547     case S:
2548       sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
2549       sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
2550                                         // on the sign of the float value
2551       break;
2552     case D:
2553       sve_and(vtmp, T, min_jlong);
2554       sve_orr(vtmp, T, jlong_cast(1.0));
2555       break;
2556     default:
2557       assert(false, "unsupported");
2558       ShouldNotReachHere();
2559     }
2560     sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
2561                                        // Result in dst
2562 }
2563 
2564 bool C2_MacroAssembler::in_scratch_emit_size() {
2565   if (ciEnv::current()->task() != nullptr) {
2566     PhaseOutput* phase_output = Compile::current()->output();
2567     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2568       return true;
2569     }
2570   }
2571   return MacroAssembler::in_scratch_emit_size();
2572 }
2573 
2574 static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
2575   fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
2576 }
2577 
2578 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register rval, Register rtmp) {
2579   assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
2580   if (t == TypeInt::INT) {
2581     return;
2582   }
2583   BLOCK_COMMENT("verify_int_in_range {");
2584   Label L_success, L_failure;
2585 
2586   jint lo = t->_lo;
2587   jint hi = t->_hi;
2588 
2589   if (lo != min_jint && hi != max_jint) {
2590     subsw(rtmp, rval, lo);
2591     br(Assembler::LT, L_failure);
2592     subsw(rtmp, rval, hi);
2593     br(Assembler::LE, L_success);
2594   } else if (lo != min_jint) {
2595     subsw(rtmp, rval, lo);
2596     br(Assembler::GE, L_success);
2597   } else if (hi != max_jint) {
2598     subsw(rtmp, rval, hi);
2599     br(Assembler::LE, L_success);
2600   } else {
2601     ShouldNotReachHere();
2602   }
2603 
2604   bind(L_failure);
2605   movw(c_rarg0, idx);
2606   mov(c_rarg1, rval);
2607   movw(c_rarg2, lo);
2608   movw(c_rarg3, hi);
2609   reconstruct_frame_pointer(rtmp);
2610   rt_call(CAST_FROM_FN_PTR(address, abort_verify_int_in_range), rtmp);
2611   hlt(0);
2612 
2613   bind(L_success);
2614   BLOCK_COMMENT("} verify_int_in_range");
2615 }
2616 
2617 static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
2618   fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
2619 }
2620 
2621 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register rval, Register rtmp) {
2622   assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
2623   if (t == TypeLong::LONG) {
2624     return;
2625   }
2626   BLOCK_COMMENT("verify_long_in_range {");
2627   Label L_success, L_failure;
2628 
2629   jlong lo = t->_lo;
2630   jlong hi = t->_hi;
2631 
2632   if (lo != min_jlong && hi != max_jlong) {
2633     subs(rtmp, rval, lo);
2634     br(Assembler::LT, L_failure);
2635     subs(rtmp, rval, hi);
2636     br(Assembler::LE, L_success);
2637   } else if (lo != min_jlong) {
2638     subs(rtmp, rval, lo);
2639     br(Assembler::GE, L_success);
2640   } else if (hi != max_jlong) {
2641     subs(rtmp, rval, hi);
2642     br(Assembler::LE, L_success);
2643   } else {
2644     ShouldNotReachHere();
2645   }
2646 
2647   bind(L_failure);
2648   movw(c_rarg0, idx);
2649   mov(c_rarg1, rval);
2650   mov(c_rarg2, lo);
2651   mov(c_rarg3, hi);
2652   reconstruct_frame_pointer(rtmp);
2653   rt_call(CAST_FROM_FN_PTR(address, abort_verify_long_in_range), rtmp);
2654   hlt(0);
2655 
2656   bind(L_success);
2657   BLOCK_COMMENT("} verify_long_in_range");
2658 }
2659 
2660 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
2661   const int framesize = Compile::current()->output()->frame_size_in_bytes();
2662   if (PreserveFramePointer) {
2663     // frame pointer is valid
2664 #ifdef ASSERT
2665     // Verify frame pointer value in rfp.
2666     add(rtmp, sp, framesize - 2 * wordSize);
2667     Label L_success;
2668     cmp(rfp, rtmp);
2669     br(Assembler::EQ, L_success);
2670     stop("frame pointer mismatch");
2671     bind(L_success);
2672 #endif // ASSERT
2673   } else {
2674     add(rfp, sp, framesize - 2 * wordSize);
2675   }
2676 }
2677 
2678 // Selects elements from two source vectors (src1, src2) based on index values in the index register
2679 // using Neon instructions and places them in the destination vector elements corresponding to the
2680 // index vector elements. Each index in the index register must be in the range [0, 2 * NUM_ELEM),
2681 // where NUM_ELEM is the number of BasicType elements per vector.
2682 // If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
2683 // Otherwise, selects src2[idx - NUM_ELEM]
2684 void C2_MacroAssembler::select_from_two_vectors_neon(FloatRegister dst, FloatRegister src1,
2685                                                      FloatRegister src2, FloatRegister index,
2686                                                      FloatRegister tmp, unsigned vector_length_in_bytes) {
2687   assert_different_registers(dst, src1, src2, tmp);
2688   SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
2689 
2690   if (vector_length_in_bytes == 16) {
2691     assert(UseSVE <= 1, "sve must be <= 1");
2692     assert(src1->successor() == src2, "Source registers must be ordered");
2693     // If the vector length is 16B, then use the Neon "tbl" instruction with a two-vector table
2694     tbl(dst, size, src1, 2, index);
2695   } else { // vector length == 8
2696     assert(UseSVE == 0, "must be Neon only");
2697     // We need to fit both the source vectors (src1, src2) in a 128-bit register because the
2698     // Neon "tbl" instruction supports only looking up 16B vectors. We then use the Neon "tbl"
2699     // instruction with one vector lookup
2700     ins(tmp, D, src1, 0, 0);
2701     ins(tmp, D, src2, 1, 0);
2702     tbl(dst, size, tmp, 1, index);
2703   }
2704 }
2705 
2706 // Selects elements from two source vectors (src1, src2) based on index values in the index register
2707 // using SVE/SVE2 instructions and places them in the destination vector elements corresponding to the
2708 // index vector elements. Each index in the index register must be in the range [0, 2 * NUM_ELEM),
2709 // where NUM_ELEM is the number of BasicType elements per vector.
2710 // If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
2711 // Otherwise, selects src2[idx - NUM_ELEM]
2712 void C2_MacroAssembler::select_from_two_vectors_sve(FloatRegister dst, FloatRegister src1,
2713                                                     FloatRegister src2, FloatRegister index,
2714                                                     FloatRegister tmp, SIMD_RegVariant T,
2715                                                     unsigned vector_length_in_bytes) {
2716   assert_different_registers(dst, src1, src2, index, tmp);
2717 
2718   if (vector_length_in_bytes == 8) {
2719     // We need to fit both the source vectors (src1, src2) in a single vector register because the
2720     // SVE "tbl" instruction is unpredicated and works on the entire vector which can lead to
2721     // incorrect results if each source vector is only partially filled. We then use the SVE "tbl"
2722     // instruction with one vector lookup
2723     assert(UseSVE >= 1, "sve must be >= 1");
2724     ins(tmp, D, src1, 0, 0);
2725     ins(tmp, D, src2, 1, 0);
2726     sve_tbl(dst, T, tmp, index);
2727   } else {  // UseSVE == 2 and vector_length_in_bytes > 8
2728     // If the vector length is > 8, then use the SVE2 "tbl" instruction with a two-vector table.
2729     // The assertion - vector_length_in_bytes == MaxVectorSize ensures that this operation
2730     // is not executed on machines where vector_length_in_bytes < MaxVectorSize
2731     // with the only exception of 8B vector length.
2732     assert(UseSVE == 2 && vector_length_in_bytes == MaxVectorSize, "must be");
2733     assert(src1->successor() == src2, "Source registers must be ordered");
2734     sve_tbl(dst, T, src1, src2, index);
2735   }
2736 }
2737 
2738 void C2_MacroAssembler::select_from_two_vectors(FloatRegister dst, FloatRegister src1,
2739                                                 FloatRegister src2, FloatRegister index,
2740                                                 FloatRegister tmp, BasicType bt,
2741                                                 unsigned vector_length_in_bytes) {
2742 
2743   assert_different_registers(dst, src1, src2, index, tmp);
2744 
2745   // The cases that can reach this method are -
2746   // - UseSVE = 0, vector_length_in_bytes = 8 or 16
2747   // - UseSVE = 1, vector_length_in_bytes = 8 or 16
2748   // - UseSVE = 2, vector_length_in_bytes >= 8
2749   //
2750   // SVE/SVE2 tbl instructions are generated when UseSVE = 1 with vector_length_in_bytes = 8
2751   // and UseSVE = 2 with vector_length_in_bytes >= 8
2752   //
2753   // Neon instructions are generated when UseSVE = 0 with vector_length_in_bytes = 8 or 16 and
2754   // UseSVE = 1 with vector_length_in_bytes = 16
2755 
2756   if ((UseSVE == 1 && vector_length_in_bytes == 8) || UseSVE == 2) {
2757     SIMD_RegVariant T = elemType_to_regVariant(bt);
2758     select_from_two_vectors_sve(dst, src1, src2, index, tmp, T, vector_length_in_bytes);
2759     return;
2760   }
2761 
2762   // The only BasicTypes that can reach here are T_SHORT, T_BYTE, T_INT and T_FLOAT
2763   assert(bt != T_DOUBLE && bt != T_LONG, "unsupported basic type");
2764   assert(vector_length_in_bytes <= 16, "length_in_bytes must be <= 16");
2765 
2766   bool isQ = vector_length_in_bytes == 16;
2767 
2768   SIMD_Arrangement size1 = isQ ? T16B : T8B;
2769   SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2770 
2771   // Neon "tbl" instruction only supports byte tables, so we need to look at chunks of
2772   // 2B for selecting shorts or chunks of 4B for selecting ints/floats from the table.
2773   // The index values in "index" register are in the range of [0, 2 * NUM_ELEM) where NUM_ELEM
2774   // is the number of elements that can fit in a vector. For ex. for T_SHORT with 64-bit vector length,
2775   // the indices can range from [0, 8).
2776   // As an example with 64-bit vector length and T_SHORT type - let index = [2, 5, 1, 0]
2777   // Move a constant 0x02 in every byte of tmp - tmp = [0x0202, 0x0202, 0x0202, 0x0202]
2778   // Multiply index vector with tmp to yield - dst = [0x0404, 0x0a0a, 0x0202, 0x0000]
2779   // Move a constant 0x0100 in every 2B of tmp - tmp = [0x0100, 0x0100, 0x0100, 0x0100]
2780   // Add the multiplied result to the vector in tmp to obtain the byte level
2781   // offsets - dst = [0x0504, 0x0b0a, 0x0302, 0x0100]
2782   // Use these offsets in the "tbl" instruction to select chunks of 2B.
2783 
2784   if (bt == T_BYTE) {
2785     select_from_two_vectors_neon(dst, src1, src2, index, tmp, vector_length_in_bytes);
2786   } else {
2787     int elem_size = (bt == T_SHORT) ? 2 : 4;
2788     uint64_t tbl_offset = (bt == T_SHORT) ? 0x0100u : 0x03020100u;
2789 
2790     mov(tmp, size1, elem_size);
2791     mulv(dst, size2, index, tmp);
2792     mov(tmp, size2, tbl_offset);
2793     addv(dst, size1, dst, tmp); // "dst" now contains the processed index elements
2794                                 // to select a set of 2B/4B
2795     select_from_two_vectors_neon(dst, src1, src2, dst, tmp, vector_length_in_bytes);
2796   }
2797 }
2798 
2799 // Vector expand implementation. Elements from the src vector are expanded into
2800 // the dst vector under the control of the vector mask.
2801 // Since there are no native instructions directly corresponding to expand before
2802 // SVE2p2, the following implementations mainly leverage the TBL instruction to
2803 // implement expand. To compute the index input for TBL, the prefix sum algorithm
2804 // (https://en.wikipedia.org/wiki/Prefix_sum) is used. The same algorithm is used
2805 // for NEON and SVE, but with different instructions where appropriate.
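     // With a 0/1 mask vector, the inclusive prefix sum at an active lane equals one plus the
     // number of preceding active lanes, i.e. one more than the index of the source element
     // that should be expanded into that lane. Subtracting one therefore yields the TBL index,
     // while inactive lanes end up with an out-of-range index that the TBL lookup turns into zero.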
2806 
2807 // Vector expand implementation for NEON.
2808 //
2809 // An example of 128-bit Byte vector:
2810 //   Data direction: high <== low
2811 //   Input:
2812 //         src   = g  f  e  d  c  b  a  9  8  7  6  5  4  3  2  1
2813 //         mask  = 0  0 -1 -1  0  0 -1 -1  0  0 -1 -1  0  0 -1 -1
2814 //   Expected result:
2815 //         dst   = 0  0  8  7  0  0  6  5  0  0  4  3  0  0  2  1
2816 void C2_MacroAssembler::vector_expand_neon(FloatRegister dst, FloatRegister src, FloatRegister mask,
2817                                            FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
2818                                            int vector_length_in_bytes) {
2819   assert(vector_length_in_bytes <= 16, "the vector length in bytes for NEON must be <= 16");
2820   assert_different_registers(dst, src, mask, tmp1, tmp2);
2821   // Since the TBL instruction only supports byte tables, we need to
2822   // compute byte-granularity indices for all element types.
2823   SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
2824   // tmp1 =  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
2825   dup(tmp1, size, zr);
2826   // dst  =  0  0  1  1  0  0  1  1  0  0  1  1  0  0  1  1
2827   negr(dst, size, mask);
2828   // Calculate vector index for TBL with prefix sum algorithm.
2829   // dst  =  8  8  8  7  6  6  6  5  4  4  4  3  2  2  2  1
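       // Each iteration adds to "dst" a copy of itself shifted up by i bytes (with zeroes
       // shifted in at the low end), doubling the summed span each time, so the loop takes
       // log2(vector_length_in_bytes) steps.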
2830   for (int i = 1; i < vector_length_in_bytes; i <<= 1) {
2831     ext(tmp2, size, tmp1, dst, vector_length_in_bytes - i);
2832     addv(dst, size, tmp2, dst);
2833   }
2834   // tmp2 =  0  0 -1 -1  0  0 -1 -1  0  0 -1 -1  0  0 -1 -1
2835   orr(tmp2, size, mask, mask);
2836   // tmp2 =  0  0  8  7  0  0  6  5  0  0  4  3  0  0  2  1
2837   bsl(tmp2, size, dst, tmp1);
2838   // tmp1 =  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
2839   movi(tmp1, size, 1);
2840   // dst  = -1 -1  7  6 -1 -1  5  4 -1 -1  3  2 -1 -1  1  0
2841   subv(dst, size, tmp2, tmp1);
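       // The -1 entries are out-of-range indices for "tbl", so the corresponding lanes of
       // the result are set to zero.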
2842   // dst  =  0  0  8  7  0  0  6  5  0  0  4  3  0  0  2  1
2843   tbl(dst, size, src, 1, dst);
2844 }
2845 
2846 // Vector expand implementation for SVE.
2847 //
2848 // An example of 128-bit Short vector:
2849 //   Data direction: high <== low
2850 //   Input:
2851 //         src   = gf ed cb a9 87 65 43 21
2852 //         pg    = 00 01 00 01 00 01 00 01
2853 //   Expected result:
2854 //         dst   = 00 87 00 65 00 43 00 21
2855 void C2_MacroAssembler::vector_expand_sve(FloatRegister dst, FloatRegister src, PRegister pg,
2856                                           FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
2857                                           int vector_length_in_bytes) {
2858   assert(UseSVE > 0, "expand implementation only for SVE");
2859   assert_different_registers(dst, src, tmp1, tmp2);
2860   SIMD_RegVariant size = elemType_to_regVariant(bt);
2861 
2862   // tmp1 = 00 00 00 00 00 00 00 00
2863   sve_dup(tmp1, size, 0);
2864   sve_movprfx(tmp2, tmp1);
2865   // tmp2 = 00 01 00 01 00 01 00 01
2866   sve_cpy(tmp2, size, pg, 1, true);
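       // The "movprfx" + predicated "cpy" pair leaves the inactive lanes holding the zero
       // from tmp1 and sets the active lanes to 1.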
2867   // Calculate vector index for TBL with prefix sum algorithm.
2868   // tmp2 = 04 04 03 03 02 02 01 01
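       // Each iteration adds to tmp2 a copy of itself shifted up by i bytes (with zeroes
       // shifted in at the low end), doubling the summed span each time, so tmp2 holds the
       // inclusive prefix sum of the 0/1 vector after log2(#elements) steps.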
2869   for (int i = type2aelembytes(bt); i < vector_length_in_bytes; i <<= 1) {
2870     sve_movprfx(dst, tmp1);
2871     // The EXT instruction operates on the full-width SVE register, so the correct
2872     // shift index is computed relative to the full register width:
2873     // vector_length_in_bytes - i + (MaxVectorSize - vector_length_in_bytes) =>
2874     // MaxVectorSize - i.
2875     sve_ext(dst, tmp2, MaxVectorSize - i);
2876     sve_add(tmp2, size, dst, tmp2);
2877   }
2878   // dst  = 00 04 00 03 00 02 00 01
2879   sve_sel(dst, size, pg, tmp2, tmp1);
2880   // dst  = -1 03 -1 02 -1 01 -1 00
2881   sve_sub(dst, size, 1);
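       // Active lanes now hold zero-based source element indices, while inactive lanes hold -1,
       // an out-of-range index that the TBL lookup below turns into zero (see the expected
       // result above).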
2882   // dst  = 00 87 00 65 00 43 00 21
2883   sve_tbl(dst, size, src, dst);
2884 }