1 /*
   2  * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright 2026 Arm Limited and/or its affiliates.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "opto/c2_MacroAssembler.hpp"
  29 #include "opto/compile.hpp"
  30 #include "opto/intrinsicnode.hpp"
  31 #include "opto/matcher.hpp"
  32 #include "opto/output.hpp"
  33 #include "opto/subnode.hpp"
  34 #include "runtime/objectMonitorTable.hpp"
  35 #include "runtime/stubRoutines.hpp"
  36 #include "runtime/synchronizer.hpp"
  37 #include "utilities/globalDefinitions.hpp"
  38 #include "utilities/powerOfTwo.hpp"
  39 
  40 #ifdef PRODUCT
  41 #define BLOCK_COMMENT(str) /* nothing */
  42 #define STOP(error) stop(error)
  43 #else
  44 #define BLOCK_COMMENT(str) block_comment(str)
  45 #define STOP(error) block_comment(error); stop(error)
  46 #endif
  47 
  48 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  49 
  50 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
  51 
  52 void C2_MacroAssembler::entry_barrier() {
  53   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  54   // Dummy labels for just measuring the code size
  55   Label dummy_slow_path;
  56   Label dummy_continuation;
  57   Label dummy_guard;
  58   Label* slow_path = &dummy_slow_path;
  59   Label* continuation = &dummy_continuation;
  60   Label* guard = &dummy_guard;
  61   if (!Compile::current()->output()->in_scratch_emit_size()) {
  62     // Use real labels from actual stub when not emitting code for the purpose of measuring its size
  63     C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
  64     Compile::current()->output()->add_stub(stub);
  65     slow_path = &stub->entry();
  66     continuation = &stub->continuation();
  67     guard = &stub->guard();
  68   }
  69   // In the C2 code, we move the non-hot part of nmethod entry barriers out-of-line to a stub.
  70   bs->nmethod_entry_barrier(this, slow_path, continuation, guard);
  71 }
  72 
  73 // jdk.internal.util.ArraysSupport.vectorizedHashCode
  74 address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
  75                                            FloatRegister vdata0, FloatRegister vdata1,
  76                                            FloatRegister vdata2, FloatRegister vdata3,
  77                                            FloatRegister vmul0, FloatRegister vmul1,
  78                                            FloatRegister vmul2, FloatRegister vmul3,
  79                                            FloatRegister vpow, FloatRegister vpowm,
  80                                            BasicType eltype) {
  81   ARRAYS_HASHCODE_REGISTERS;
  82 
  83   Register tmp1 = rscratch1, tmp2 = rscratch2;
  84 
  85   Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE;
  86 
  87   // Vectorization factor. Number of array elements loaded to one SIMD&FP registers by the stubs. We
  88   // use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible to
  89   // use 4H for chars and shorts instead, but using 8H gives better performance.
  90   const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8
  91                     : eltype == T_CHAR || eltype == T_SHORT ? 8
  92                     : eltype == T_INT                       ? 4
  93                                                             : 0;
  94   guarantee(vf, "unsupported eltype");
  95 
  96   // Unroll factor for the scalar loop below. The value is chosen based on performance analysis.
  97   const size_t unroll_factor = 4;
  98 
  99   switch (eltype) {
 100   case T_BOOLEAN:
 101     BLOCK_COMMENT("arrays_hashcode(unsigned byte) {");
 102     break;
 103   case T_CHAR:
 104     BLOCK_COMMENT("arrays_hashcode(char) {");
 105     break;
 106   case T_BYTE:
 107     BLOCK_COMMENT("arrays_hashcode(byte) {");
 108     break;
 109   case T_SHORT:
 110     BLOCK_COMMENT("arrays_hashcode(short) {");
 111     break;
 112   case T_INT:
 113     BLOCK_COMMENT("arrays_hashcode(int) {");
 114     break;
 115   default:
 116     ShouldNotReachHere();
 117   }
 118 
 119   // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop
 120   // implemented by the stub executes just once. Call the stub only if at least two iterations will
 121   // be executed.
 122   const size_t large_threshold = eltype == T_INT ? vf * 2 : vf;
 123   cmpw(cnt, large_threshold);
 124   br(Assembler::HS, LARGE);
 125 
 126   bind(TAIL);
 127 
 128   // The andr performs cnt % uf where uf = unroll_factor. The subtract shifted by 3 offsets past
 129   // uf - (cnt % uf) pairs of load + madd insns i.e. it only executes cnt % uf load + madd pairs.
 130   // Iteration eats up the remainder, uf elements at a time.
 131   assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC");
 132   andr(tmp2, cnt, unroll_factor - 1);
 133   adr(tmp1, BR_BASE);
 134   // For Cortex-A53 offset is 4 because 2 nops are generated.
 135   sub(tmp1, tmp1, tmp2, ext::sxtw, VM_Version::supports_a53mac() ? 4 : 3);
 136   movw(tmp2, 0x1f);
 137   br(tmp1);
 138 
 139   bind(LOOP);
 140   for (size_t i = 0; i < unroll_factor; ++i) {
 141     load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype);
 142     maddw(result, result, tmp2, tmp1);
 143     // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
 144     // Generate 2nd nop to have 4 instructions per iteration.
 145     if (VM_Version::supports_a53mac()) {
 146       nop();
 147     }
 148   }
 149   bind(BR_BASE);
 150   subsw(cnt, cnt, unroll_factor);
 151   br(Assembler::HS, LOOP);
 152 
 153   b(DONE);
 154 
 155   bind(LARGE);
 156 
 157   RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype));
 158   assert(stub.target() != nullptr, "array_hashcode stub has not been generated");
 159   address tpc = trampoline_call(stub);
 160   if (tpc == nullptr) {
 161     DEBUG_ONLY(reset_labels(TAIL, BR_BASE));
 162     postcond(pc() == badAddress);
 163     return nullptr;
 164   }
 165 
 166   bind(DONE);
 167 
 168   BLOCK_COMMENT("} // arrays_hashcode");
 169 
 170   postcond(pc() != badAddress);
 171   return pc();
 172 }
 173 
 174 void C2_MacroAssembler::fast_lock(Register obj, Register box, Register t1,
 175                                   Register t2, Register t3) {
 176   assert_different_registers(obj, box, t1, t2, t3, rscratch2);
 177 
 178   // Handle inflated monitor.
 179   Label inflated;
 180   // Finish fast lock successfully. MUST branch to with flag == EQ
 181   Label locked;
 182   // Finish fast lock unsuccessfully. MUST branch to with flag == NE
 183   Label slow_path;
 184 
 185   if (UseObjectMonitorTable) {
 186     // Clear cache in case fast locking succeeds or we need to take the slow-path.
 187     str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 188   }
 189 
 190   if (DiagnoseSyncOnValueBasedClasses != 0) {
 191     load_klass(t1, obj);
 192     ldrb(t1, Address(t1, Klass::misc_flags_offset()));
 193     tst(t1, KlassFlags::_misc_is_value_based_class);
 194     br(Assembler::NE, slow_path);
 195   }
 196 
 197   const Register t1_mark = t1;
 198   const Register t3_t = t3;
 199 
 200   { // Fast locking
 201 
 202     // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ
 203     Label push;
 204 
 205     const Register t2_top = t2;
 206 
 207     // Check if lock-stack is full.
 208     ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 209     cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
 210     br(Assembler::GT, slow_path);
 211 
 212     // Check if recursive.
 213     subw(t3_t, t2_top, oopSize);
 214     ldr(t3_t, Address(rthread, t3_t));
 215     cmp(obj, t3_t);
 216     br(Assembler::EQ, push);
 217 
 218     // Relaxed normal load to check for monitor. Optimization for monitor case.
 219     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 220     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 221 
 222     // Not inflated
 223     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");
 224 
 225     // Try to lock. Transition lock-bits 0b01 => 0b00
 226     orr(t1_mark, t1_mark, markWord::unlocked_value);
 227     eor(t3_t, t1_mark, markWord::unlocked_value);
 228     cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword, memory_order_acquire);
 229     br(Assembler::NE, slow_path);
 230 
 231     bind(push);
 232     // After successful lock, push object on lock-stack.
 233     str(obj, Address(rthread, t2_top));
 234     addw(t2_top, t2_top, oopSize);
 235     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 236     b(locked);
 237   }
 238 
 239   { // Handle inflated monitor.
 240     bind(inflated);
 241 
 242     const Register t1_monitor = t1;
 243 
 244     if (!UseObjectMonitorTable) {
 245       assert(t1_monitor == t1_mark, "should be the same here");
 246     } else {
 247       const Register t1_hash = t1;
 248       Label monitor_found;
 249 
 250       // Save the mark, we might need it to extract the hash.
 251       mov(t3, t1_mark);
 252 
 253       // Look for the monitor in the om_cache.
 254 
 255       ByteSize cache_offset   = JavaThread::om_cache_oops_offset();
 256       ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
 257       const int num_unrolled  = OMCache::CAPACITY;
 258       for (int i = 0; i < num_unrolled; i++) {
 259         ldr(t1_monitor, Address(rthread, cache_offset + monitor_offset));
 260         ldr(t2, Address(rthread, cache_offset));
 261         cmp(obj, t2);
 262         br(Assembler::EQ, monitor_found);
 263         cache_offset = cache_offset + OMCache::oop_to_oop_difference();
 264       }
 265 
 266       // Look for the monitor in the table.
 267 
 268       // Get the hash code.
 269       ubfx(t1_hash, t3, markWord::hash_shift, markWord::hash_bits);
 270 
 271       // Get the table and calculate the bucket's address
 272       lea(t3, ExternalAddress(ObjectMonitorTable::current_table_address()));
 273       ldr(t3, Address(t3));
 274       ldr(t2, Address(t3, ObjectMonitorTable::table_capacity_mask_offset()));
 275       ands(t1_hash, t1_hash, t2);
 276       ldr(t3, Address(t3, ObjectMonitorTable::table_buckets_offset()));
 277 
 278       // Read the monitor from the bucket.
 279       ldr(t1_monitor, Address(t3, t1_hash, Address::lsl(LogBytesPerWord)));
 280 
 281       // Check if the monitor in the bucket is special (empty, tombstone or removed).
 282       cmp(t1_monitor, (unsigned char)ObjectMonitorTable::SpecialPointerValues::below_is_special);
 283       br(Assembler::LO, slow_path);
 284 
 285       // Check if object matches.
 286       ldr(t3, Address(t1_monitor, ObjectMonitor::object_offset()));
 287       BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 288       bs_asm->try_peek_weak_handle_in_nmethod(this, t3, t3, t2, slow_path);
 289       cmp(t3, obj);
 290       br(Assembler::NE, slow_path);
 291 
 292       bind(monitor_found);
 293     }
 294 
 295     const Register t2_owner_addr = t2;
 296     const Register t3_owner = t3;
 297     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 298     const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
 299     const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);
 300 
 301     Label monitor_locked;
 302 
 303     // Compute owner address.
 304     lea(t2_owner_addr, owner_address);
 305 
 306     // Try to CAS owner (no owner => current thread's _monitor_owner_id).
 307     ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
 308     cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, memory_order_acquire, t3_owner);
 309     br(Assembler::EQ, monitor_locked);
 310 
 311     // Check if recursive.
 312     cmp(t3_owner, rscratch2);
 313     br(Assembler::NE, slow_path);
 314 
 315     // Recursive.
 316     increment(recursions_address, 1);
 317 
 318     bind(monitor_locked);
 319     if (UseObjectMonitorTable) {
 320       str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 321     }
 322   }
 323 
 324   bind(locked);
 325 
 326 #ifdef ASSERT
 327   // Check that locked label is reached with Flags == EQ.
 328   Label flag_correct;
 329   br(Assembler::EQ, flag_correct);
 330   stop("Fast Lock Flag != EQ");
 331 #endif
 332 
 333   bind(slow_path);
 334 #ifdef ASSERT
 335   // Check that slow_path label is reached with Flags == NE.
 336   br(Assembler::NE, flag_correct);
 337   stop("Fast Lock Flag != NE");
 338   bind(flag_correct);
 339 #endif
 340   // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
 341 }
 342 
 343 void C2_MacroAssembler::fast_unlock(Register obj, Register box, Register t1,
 344                                     Register t2, Register t3) {
 345   assert_different_registers(obj, box, t1, t2, t3);
 346 
 347   // Handle inflated monitor.
 348   Label inflated, inflated_load_mark;
 349   // Finish fast unlock successfully. MUST branch to with flag == EQ
 350   Label unlocked;
 351   // Finish fast unlock unsuccessfully. MUST branch to with flag == NE
 352   Label slow_path;
 353 
 354   const Register t1_mark = t1;
 355   const Register t2_top = t2;
 356   const Register t3_t = t3;
 357 
 358   { // Fast unlock
 359 
 360     Label push_and_slow_path;
 361 
 362     // Check if obj is top of lock-stack.
 363     ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 364     subw(t2_top, t2_top, oopSize);
 365     ldr(t3_t, Address(rthread, t2_top));
 366     cmp(obj, t3_t);
 367     // Top of lock stack was not obj. Must be monitor.
 368     br(Assembler::NE, inflated_load_mark);
 369 
 370     // Pop lock-stack.
 371     DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
 372     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 373 
 374     // Check if recursive.
 375     subw(t3_t, t2_top, oopSize);
 376     ldr(t3_t, Address(rthread, t3_t));
 377     cmp(obj, t3_t);
 378     br(Assembler::EQ, unlocked);
 379 
 380     // Not recursive.
 381     // Load Mark.
 382     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 383 
 384     // Check header for monitor (0b10).
 385     // Because we got here by popping (meaning we pushed in locked)
 386     // there will be no monitor in the box. So we need to push back the obj
 387     // so that the runtime can fix any potential anonymous owner.
 388     tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);
 389 
 390     // Try to unlock. Transition lock bits 0b00 => 0b01
 391     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
 392     orr(t3_t, t1_mark, markWord::unlocked_value);
 393     cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword, memory_order_release);
 394     br(Assembler::EQ, unlocked);
 395 
 396     bind(push_and_slow_path);
 397     // Compare and exchange failed.
 398     // Restore lock-stack and handle the unlock in runtime.
 399     DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
 400     addw(t2_top, t2_top, oopSize);
 401     str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 402     b(slow_path);
 403   }
 404 
 405 
 406   { // Handle inflated monitor.
 407     bind(inflated_load_mark);
 408     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 409 #ifdef ASSERT
 410     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 411     stop("Fast Unlock not monitor");
 412 #endif
 413 
 414     bind(inflated);
 415 
 416 #ifdef ASSERT
 417     Label check_done;
 418     subw(t2_top, t2_top, oopSize);
 419     cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
 420     br(Assembler::LT, check_done);
 421     ldr(t3_t, Address(rthread, t2_top));
 422     cmp(obj, t3_t);
 423     br(Assembler::NE, inflated);
 424     stop("Fast Unlock lock on stack");
 425     bind(check_done);
 426 #endif
 427 
 428     const Register t1_monitor = t1;
 429 
 430     if (!UseObjectMonitorTable) {
 431       assert(t1_monitor == t1_mark, "should be the same here");
 432 
 433       // Untag the monitor.
 434       add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
 435     } else {
 436       ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 437       // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
 438       cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
 439       br(Assembler::LO, slow_path);
 440     }
 441 
 442     const Register t2_recursions = t2;
 443     Label not_recursive;
 444 
 445     // Check if recursive.
 446     ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
 447     cbz(t2_recursions, not_recursive);
 448 
 449     // Recursive unlock.
 450     sub(t2_recursions, t2_recursions, 1u);
 451     str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
 452     // Set flag == EQ
 453     cmp(t2_recursions, t2_recursions);
 454     b(unlocked);
 455 
 456     bind(not_recursive);
 457 
 458     const Register t2_owner_addr = t2;
 459 
 460     // Compute owner address.
 461     lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));
 462 
 463     // Set owner to null.
 464     // Release to satisfy the JMM
 465     stlr(zr, t2_owner_addr);
 466     // We need a full fence after clearing owner to avoid stranding.
 467     // StoreLoad achieves this.
 468     membar(StoreLoad);
 469 
 470     // Check if the entry_list is empty.
 471     ldr(rscratch1, Address(t1_monitor, ObjectMonitor::entry_list_offset()));
 472     cmp(rscratch1, zr);
 473     br(Assembler::EQ, unlocked);  // If so we are done.
 474 
 475     // Check if there is a successor.
 476     ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
 477     cmp(rscratch1, zr);
 478     br(Assembler::NE, unlocked);  // If so we are done.
 479 
 480     // Save the monitor pointer in the current thread, so we can try to
 481     // reacquire the lock in SharedRuntime::monitor_exit_helper().
 482     str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));
 483 
 484     cmp(zr, rthread); // Set Flag to NE => slow path
 485     b(slow_path);
 486   }
 487 
 488   bind(unlocked);
 489   cmp(zr, zr); // Set Flags to EQ => fast path
 490 
 491 #ifdef ASSERT
 492   // Check that unlocked label is reached with Flags == EQ.
 493   Label flag_correct;
 494   br(Assembler::EQ, flag_correct);
 495   stop("Fast Unlock Flag != EQ");
 496 #endif
 497 
 498   bind(slow_path);
 499 #ifdef ASSERT
 500   // Check that slow_path label is reached with Flags == NE.
 501   br(Assembler::NE, flag_correct);
 502   stop("Fast Unlock Flag != NE");
 503   bind(flag_correct);
 504 #endif
 505   // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
 506 }
 507 
 508 // Search for str1 in str2 and return index or -1
 509 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
 510 void C2_MacroAssembler::string_indexof(Register str2, Register str1,
 511                                        Register cnt2, Register cnt1,
 512                                        Register tmp1, Register tmp2,
 513                                        Register tmp3, Register tmp4,
 514                                        Register tmp5, Register tmp6,
 515                                        int icnt1, Register result, int ae) {
 516   // NOTE: tmp5, tmp6 can be zr depending on specific method version
 517   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
 518 
 519   Register ch1 = rscratch1;
 520   Register ch2 = rscratch2;
 521   Register cnt1tmp = tmp1;
 522   Register cnt2tmp = tmp2;
 523   Register cnt1_neg = cnt1;
 524   Register cnt2_neg = cnt2;
 525   Register result_tmp = tmp4;
 526 
 527   bool isL = ae == StrIntrinsicNode::LL;
 528 
 529   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
 530   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
 531   int str1_chr_shift = str1_isL ? 0:1;
 532   int str2_chr_shift = str2_isL ? 0:1;
 533   int str1_chr_size = str1_isL ? 1:2;
 534   int str2_chr_size = str2_isL ? 1:2;
 535   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
 536                                       (chr_insn)&MacroAssembler::ldrh;
 537   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
 538                                       (chr_insn)&MacroAssembler::ldrh;
 539   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
 540   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
 541 
 542   // Note, inline_string_indexOf() generates checks:
 543   // if (substr.count > string.count) return -1;
 544   // if (substr.count == 0) return 0;
 545 
 546   // We have two strings, a source string in str2, cnt2 and a pattern string
 547   // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
 548 
 549   // For larger pattern and source we use a simplified Boyer Moore algorithm.
 550   // With a small pattern and source we use linear scan.
 551 
 552   if (icnt1 == -1) {
 553     sub(result_tmp, cnt2, cnt1);
 554     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
 555     br(LT, LINEARSEARCH);
 556     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
 557     subs(zr, cnt1, 256);
 558     lsr(tmp1, cnt2, 2);
 559     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
 560     br(GE, LINEARSTUB);
 561   }
 562 
// The Boyer Moore algorithm is based on the description here:-
 564 //
 565 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
 566 //
// This describes an algorithm with 2 shift rules. The 'Bad Character' rule
 568 // and the 'Good Suffix' rule.
 569 //
 570 // These rules are essentially heuristics for how far we can shift the
 571 // pattern along the search string.
 572 //
 573 // The implementation here uses the 'Bad Character' rule only because of the
 574 // complexity of initialisation for the 'Good Suffix' rule.
 575 //
 576 // This is also known as the Boyer-Moore-Horspool algorithm:-
 577 //
 578 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
 579 //
// This particular implementation has a few Java-specific optimizations.
 581 //
 582 // #define ASIZE 256
 583 //
 584 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
 585 //       int i, j;
 586 //       unsigned c;
 587 //       unsigned char bc[ASIZE];
 588 //
 589 //       /* Preprocessing */
 590 //       for (i = 0; i < ASIZE; ++i)
 591 //          bc[i] = m;
 592 //       for (i = 0; i < m - 1; ) {
 593 //          c = x[i];
 594 //          ++i;
 595 //          // c < 256 for Latin1 string, so, no need for branch
 596 //          #ifdef PATTERN_STRING_IS_LATIN1
 597 //          bc[c] = m - i;
 598 //          #else
 599 //          if (c < ASIZE) bc[c] = m - i;
 600 //          #endif
 601 //       }
 602 //
 603 //       /* Searching */
 604 //       j = 0;
 605 //       while (j <= n - m) {
 606 //          c = y[i+j];
 607 //          if (x[m-1] == c)
 608 //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
 609 //          if (i < 0) return j;
 610 //          // c < 256 for Latin1 string, so, no need for branch
 611 //          #ifdef SOURCE_STRING_IS_LATIN1
 612 //          // LL case: (c< 256) always true. Remove branch
 613 //          j += bc[y[j+m-1]];
 614 //          #endif
 615 //          #ifndef PATTERN_STRING_IS_UTF
 616 //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
 617 //          if (c < ASIZE)
 618 //            j += bc[y[j+m-1]];
 619 //          else
 620 //            j += 1
 621 //          #endif
 622 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
 623 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
 624 //          if (c < ASIZE)
 625 //            j += bc[y[j+m-1]];
 626 //          else
 627 //            j += m
 628 //          #endif
 629 //       }
 630 //    }
 631 
 632   if (icnt1 == -1) {
 633     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
 634         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
 635     Register cnt1end = tmp2;
 636     Register str2end = cnt2;
 637     Register skipch = tmp2;
 638 
 639     // str1 length is >=8, so, we can read at least 1 register for cases when
 640     // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
 641     // UL case. We'll re-read last character in inner pre-loop code to have
 642     // single outer pre-loop load
 643     const int firstStep = isL ? 7 : 3;
 644 
 645     const int ASIZE = 256;
 646     const int STORED_BYTES = 32; // amount of bytes stored per instruction
 647     sub(sp, sp, ASIZE);
 648     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
 649     mov(ch1, sp);
 650     BIND(BM_INIT_LOOP);
 651       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
 652       subs(tmp5, tmp5, 1);
 653       br(GT, BM_INIT_LOOP);
 654 
 655       sub(cnt1tmp, cnt1, 1);
 656       mov(tmp5, str2);
 657       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
 658       sub(ch2, cnt1, 1);
 659       mov(tmp3, str1);
 660     BIND(BCLOOP);
 661       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
 662       if (!str1_isL) {
 663         subs(zr, ch1, ASIZE);
 664         br(HS, BCSKIP);
 665       }
 666       strb(ch2, Address(sp, ch1));
 667     BIND(BCSKIP);
 668       subs(ch2, ch2, 1);
 669       br(GT, BCLOOP);
 670 
 671       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
 672       if (str1_isL == str2_isL) {
 673         // load last 8 bytes (8LL/4UU symbols)
 674         ldr(tmp6, Address(tmp6, -wordSize));
 675       } else {
 676         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
 677         // convert Latin1 to UTF. We'll have to wait until load completed, but
 678         // it's still faster than per-character loads+checks
 679         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
 680         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
 681         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
 682         andr(tmp6, tmp6, 0xFF); // str1[N-4]
 683         orr(ch2, ch1, ch2, LSL, 16);
 684         orr(tmp6, tmp6, tmp3, LSL, 48);
 685         orr(tmp6, tmp6, ch2, LSL, 16);
 686       }
 687     BIND(BMLOOPSTR2);
 688       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 689       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
 690       if (str1_isL == str2_isL) {
 691         // re-init tmp3. It's for free because it's executed in parallel with
 692         // load above. Alternative is to initialize it before loop, but it'll
 693         // affect performance on in-order systems with 2 or more ld/st pipelines
 694         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
 695       }
 696       if (!isL) { // UU/UL case
 697         lsl(ch2, cnt1tmp, 1); // offset in bytes
 698       }
 699       cmp(tmp3, skipch);
 700       br(NE, BMSKIP);
 701       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
 702       mov(ch1, tmp6);
 703       if (isL) {
 704         b(BMLOOPSTR1_AFTER_LOAD);
 705       } else {
 706         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
 707         b(BMLOOPSTR1_CMP);
 708       }
 709     BIND(BMLOOPSTR1);
 710       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
 711       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 712     BIND(BMLOOPSTR1_AFTER_LOAD);
 713       subs(cnt1tmp, cnt1tmp, 1);
 714       br(LT, BMLOOPSTR1_LASTCMP);
 715     BIND(BMLOOPSTR1_CMP);
 716       cmp(ch1, ch2);
 717       br(EQ, BMLOOPSTR1);
 718     BIND(BMSKIP);
 719       if (!isL) {
 720         // if we've met UTF symbol while searching Latin1 pattern, then we can
 721         // skip cnt1 symbols
 722         if (str1_isL != str2_isL) {
 723           mov(result_tmp, cnt1);
 724         } else {
 725           mov(result_tmp, 1);
 726         }
 727         subs(zr, skipch, ASIZE);
 728         br(HS, BMADV);
 729       }
 730       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
 731     BIND(BMADV);
 732       sub(cnt1tmp, cnt1, 1);
 733       add(str2, str2, result_tmp, LSL, str2_chr_shift);
 734       cmp(str2, str2end);
 735       br(LE, BMLOOPSTR2);
 736       add(sp, sp, ASIZE);
 737       b(NOMATCH);
 738     BIND(BMLOOPSTR1_LASTCMP);
 739       cmp(ch1, ch2);
 740       br(NE, BMSKIP);
 741     BIND(BMMATCH);
 742       sub(result, str2, tmp5);
 743       if (!str2_isL) lsr(result, result, 1);
 744       add(sp, sp, ASIZE);
 745       b(DONE);
 746 
 747     BIND(LINEARSTUB);
 748     cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
 749     br(LT, LINEAR_MEDIUM);
 750     mov(result, zr);
 751     RuntimeAddress stub = nullptr;
 752     if (isL) {
 753       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
 754       assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
 755     } else if (str1_isL) {
 756       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
 757        assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
 758     } else {
 759       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
 760       assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
 761     }
 762     address call = trampoline_call(stub);
 763     if (call == nullptr) {
 764       DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
 765       ciEnv::current()->record_failure("CodeCache is full");
 766       return;
 767     }
 768     b(DONE);
 769   }
 770 
 771   BIND(LINEARSEARCH);
 772   {
 773     Label DO1, DO2, DO3;
 774 
 775     Register str2tmp = tmp2;
 776     Register first = tmp3;
 777 
 778     if (icnt1 == -1)
 779     {
 780         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
 781 
 782         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
 783         br(LT, DOSHORT);
 784       BIND(LINEAR_MEDIUM);
 785         (this->*str1_load_1chr)(first, Address(str1));
 786         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
 787         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
 788         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 789         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 790 
 791       BIND(FIRST_LOOP);
 792         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 793         cmp(first, ch2);
 794         br(EQ, STR1_LOOP);
 795       BIND(STR2_NEXT);
 796         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 797         br(LE, FIRST_LOOP);
 798         b(NOMATCH);
 799 
 800       BIND(STR1_LOOP);
 801         adds(cnt1tmp, cnt1_neg, str1_chr_size);
 802         add(cnt2tmp, cnt2_neg, str2_chr_size);
 803         br(GE, MATCH);
 804 
 805       BIND(STR1_NEXT);
 806         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
 807         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 808         cmp(ch1, ch2);
 809         br(NE, STR2_NEXT);
 810         adds(cnt1tmp, cnt1tmp, str1_chr_size);
 811         add(cnt2tmp, cnt2tmp, str2_chr_size);
 812         br(LT, STR1_NEXT);
 813         b(MATCH);
 814 
 815       BIND(DOSHORT);
 816       if (str1_isL == str2_isL) {
 817         cmp(cnt1, (u1)2);
 818         br(LT, DO1);
 819         br(GT, DO3);
 820       }
 821     }
 822 
 823     if (icnt1 == 4) {
 824       Label CH1_LOOP;
 825 
 826         (this->*load_4chr)(ch1, str1);
 827         sub(result_tmp, cnt2, 4);
 828         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 829         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 830 
 831       BIND(CH1_LOOP);
 832         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
 833         cmp(ch1, ch2);
 834         br(EQ, MATCH);
 835         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 836         br(LE, CH1_LOOP);
 837         b(NOMATCH);
 838       }
 839 
 840     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
 841       Label CH1_LOOP;
 842 
 843       BIND(DO2);
 844         (this->*load_2chr)(ch1, str1);
 845         if (icnt1 == 2) {
 846           sub(result_tmp, cnt2, 2);
 847         }
 848         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 849         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 850       BIND(CH1_LOOP);
 851         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 852         cmp(ch1, ch2);
 853         br(EQ, MATCH);
 854         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 855         br(LE, CH1_LOOP);
 856         b(NOMATCH);
 857     }
 858 
 859     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
 860       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
 861 
 862       BIND(DO3);
 863         (this->*load_2chr)(first, str1);
 864         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
 865         if (icnt1 == 3) {
 866           sub(result_tmp, cnt2, 3);
 867         }
 868         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 869         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 870       BIND(FIRST_LOOP);
 871         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 872         cmpw(first, ch2);
 873         br(EQ, STR1_LOOP);
 874       BIND(STR2_NEXT);
 875         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 876         br(LE, FIRST_LOOP);
 877         b(NOMATCH);
 878 
 879       BIND(STR1_LOOP);
 880         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
 881         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 882         cmp(ch1, ch2);
 883         br(NE, STR2_NEXT);
 884         b(MATCH);
 885     }
 886 
 887     if (icnt1 == -1 || icnt1 == 1) {
 888       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
 889 
 890       BIND(DO1);
 891         (this->*str1_load_1chr)(ch1, str1);
 892         cmp(cnt2, (u1)8);
 893         br(LT, DO1_SHORT);
 894 
 895         sub(result_tmp, cnt2, 8/str2_chr_size);
 896         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 897         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 898         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 899 
 900         if (str2_isL) {
 901           orr(ch1, ch1, ch1, LSL, 8);
 902         }
 903         orr(ch1, ch1, ch1, LSL, 16);
 904         orr(ch1, ch1, ch1, LSL, 32);
 905       BIND(CH1_LOOP);
 906         ldr(ch2, Address(str2, cnt2_neg));
 907         eor(ch2, ch1, ch2);
 908         sub(tmp1, ch2, tmp3);
 909         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 910         bics(tmp1, tmp1, tmp2);
 911         br(NE, HAS_ZERO);
 912         adds(cnt2_neg, cnt2_neg, 8);
 913         br(LT, CH1_LOOP);
 914 
 915         cmp(cnt2_neg, (u1)8);
 916         mov(cnt2_neg, 0);
 917         br(LT, CH1_LOOP);
 918         b(NOMATCH);
 919 
 920       BIND(HAS_ZERO);
 921         rev(tmp1, tmp1);
 922         clz(tmp1, tmp1);
 923         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
 924         b(MATCH);
 925 
 926       BIND(DO1_SHORT);
 927         mov(result_tmp, cnt2);
 928         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
 929         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
 930       BIND(DO1_LOOP);
 931         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 932         cmpw(ch1, ch2);
 933         br(EQ, MATCH);
 934         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 935         br(LT, DO1_LOOP);
 936     }
 937   }
 938   BIND(NOMATCH);
 939     mov(result, -1);
 940     b(DONE);
 941   BIND(MATCH);
 942     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
 943   BIND(DONE);
 944 }
 945 
 946 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
     // chr_insn (above): pointer to a MacroAssembler member taking a register and
     // an address. string_compare points it at ldrb or ldrh, depending on
     // whether the string it reads has 1-byte or 2-byte characters.
     // uxt_insn (below): pointer to a MacroAssembler member taking two
     // registers. string_compare points it at uxtbw or uxthw.
 947 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
 948 
 949 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
 950                                             Register ch, Register result,
 951                                             Register tmp1, Register tmp2, Register tmp3)
 952 {
     // Emits code that looks for the 16-bit value in "ch" among the "cnt1"
     // 2-byte elements starting at "str1". "result" receives the element index
     // of the match, or -1 when cnt1 is zero or nothing matches.
     //
     // Registers written by the emitted code: str1, cnt1 (reused as cnt1_neg),
     // ch (on the word-at-a-time path), result, tmp1, tmp2, tmp3, rscratch1,
     // rscratch2 and the condition flags.
 953   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
 954   Register cnt1_neg = cnt1;
 955   Register ch1 = rscratch1;
 956   Register result_tmp = rscratch2;
 957
 958   cbz(cnt1, NOMATCH);
 959
     // Fewer than 4 elements cannot fill one 8-byte load: scan them one by one.
 960   cmp(cnt1, (u1)4);
 961   br(LT, DO1_SHORT);
 962
     // Copy the low 16 bits of ch into all four 16-bit lanes.
     // NOTE(review): this relies on bits above 15 of ch being zero on entry —
     // confirm at the call site.
 963   orr(ch, ch, ch, LSL, 16);
 964   orr(ch, ch, ch, LSL, 32);
 965
     // Point str1 at the last 8-byte window (element cnt1 - 4) and approach it
     // with a negative byte offset; result_tmp keeps that window's element index.
 966   sub(cnt1, cnt1, 4);
 967   mov(result_tmp, cnt1);
 968   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
 969   sub(cnt1_neg, zr, cnt1, LSL, 1);
 970
 971   mov(tmp3, 0x0001000100010001);
 972
 973   BIND(CH1_LOOP);
 974     ldr(ch1, Address(str1, cnt1_neg));
     // Lanes equal to ch become zero after the XOR. The sub/orr/bics sequence
     // then computes (x - 0x0001..0001) & ~(x | 0x7fff..7fff) and sets the
     // flags; a non-zero value sends control to HAS_ZERO.
 975     eor(ch1, ch, ch1);
 976     sub(tmp1, ch1, tmp3);
 977     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
 978     bics(tmp1, tmp1, tmp2);
 979     br(NE, HAS_ZERO);
 980     adds(cnt1_neg, cnt1_neg, 8);
 981     br(LT, CH1_LOOP);
 982
     // The offset is no longer negative. If it is below 8, part of the final
     // window is still unchecked: reset the offset to 0 and run the loop body
     // once more on that window. The branch uses the flags set by the cmp.
 983     cmp(cnt1_neg, (u1)8);
 984     mov(cnt1_neg, 0);
 985     br(LT, CH1_LOOP);
 986     b(NOMATCH);
 987
 988   BIND(HAS_ZERO);
     // Byte-reverse the marker word and count its leading zero bits; LSR 3
     // turns that bit count into a byte offset added to the window offset.
 989     rev(tmp1, tmp1);
 990     clz(tmp1, tmp1);
 991     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
 992     b(MATCH);
 993
 994   BIND(DO1_SHORT);
     // Short input: str1 is moved past the last element and each element is
     // loaded with ldrh at a negative byte offset that counts up to zero.
 995     mov(result_tmp, cnt1);
 996     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
 997     sub(cnt1_neg, zr, cnt1, LSL, 1);
 998   BIND(DO1_LOOP);
 999     ldrh(ch1, Address(str1, cnt1_neg));
1000     cmpw(ch, ch1);
1001     br(EQ, MATCH);
1002     adds(cnt1_neg, cnt1_neg, 2);
1003     br(LT, DO1_LOOP);
1004   BIND(NOMATCH);
1005     mov(result, -1);
1006     b(DONE);
1007   BIND(MATCH);
     // Byte offset -> element offset (ASR 1), added to the base index.
1008     add(result, result_tmp, cnt1_neg, ASR, 1);
1009   BIND(DONE);
1010 }
1011 
1012 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
1013                                                 Register ch, Register result,
1014                                                 FloatRegister ztmp1,
1015                                                 FloatRegister ztmp2,
1016                                                 PRegister tmp_pg,
1017                                                 PRegister tmp_pdn, bool isL)
1018 {
     // Emits an SVE loop that looks for "ch" among the first "cnt1" elements at
     // "str1". Elements are bytes when isL is true and halfwords otherwise.
     // "result" receives the element index of the match, or -1 when cnt1 is
     // zero or nothing matches.
     //
     // Written by the emitted code: result, rscratch1 (vec_len), rscratch2
     // (idx), ztmp1, ztmp2, tmp_pg, tmp_pdn and the condition flags. str1, cnt1
     // and ch are only read.
1019   // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
1020   assert(tmp_pg->is_governing(),
1021          "this register has to be a governing predicate register");
1022
1023   Label LOOP, MATCH, DONE, NOMATCH;
1024   Register vec_len = rscratch1;
1025   Register idx = rscratch2;
1026
1027   SIMD_RegVariant T = (isL == true) ? B : H;
1028
1029   cbz(cnt1, NOMATCH);
1030
1031   // Assign the particular char throughout the vector.
1032   sve_dup(ztmp2, T, ch);
     // vec_len is the per-iteration step for idx, taken from CNTB or CNTH to
     // match the element size.
1033   if (isL) {
1034     sve_cntb(vec_len);
1035   } else {
1036     sve_cnth(vec_len);
1037   }
1038   mov(idx, 0);
1039
1040   // Generate a predicate to control the reading of input string.
1041   sve_whilelt(tmp_pg, T, idx, cnt1);
1042
1043   BIND(LOOP);
1044     // Read a vector of 8- or 16-bit data depending on the string type. Note
1045     // that inactive elements indicated by the predicate register won't cause
1046     // a data read from memory to the destination vector.
1047     if (isL) {
1048       sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
1049     } else {
1050       sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
1051     }
     // idx is advanced before the compare; the MATCH path subtracts vec_len
     // again to recover the start of the vector that matched.
1052     add(idx, idx, vec_len);
1053
1054     // Perform the comparison. An element of the destination predicate is set
1055     // to active if the particular char is matched.
1056     sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);
1057
1058     // Branch if the particular char is found.
1059     br(NE, MATCH);
1060
1061     sve_whilelt(tmp_pg, T, idx, cnt1);
1062
1063     // Loop back if the particular char not found.
1064     br(MI, LOOP);
1065
1066   BIND(NOMATCH);
1067     mov(result, -1);
1068     b(DONE);
1069
1070   BIND(MATCH);
1071     // Undo the index increment.
1072     sub(idx, idx, vec_len);
1073
1074     // Crop the vector to find its location.
1075     sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
     // result = idx - 1 + (number of active elements left in tmp_pdn).
1076     add(result, idx, -1);
1077     sve_incp(result, T, tmp_pdn);
1078   BIND(DONE);
1079 }
1080 
1081 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
1082                                             Register ch, Register result,
1083                                             Register tmp1, Register tmp2, Register tmp3)
1084 {
     // Emits code that looks for the 8-bit value in "ch" among the "cnt1" bytes
     // starting at "str1". "result" receives the byte index of the match, or -1
     // when cnt1 is zero or nothing matches.
     //
     // Registers written by the emitted code: str1, cnt1 (reused as cnt1_neg),
     // ch (on the word-at-a-time path), result, tmp1, tmp2, tmp3, rscratch1,
     // rscratch2 and the condition flags.
1085   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
1086   Register cnt1_neg = cnt1;
1087   Register ch1 = rscratch1;
1088   Register result_tmp = rscratch2;
1089
1090   cbz(cnt1, NOMATCH);
1091
     // Fewer than 8 bytes cannot fill one 8-byte load: scan them one by one.
1092   cmp(cnt1, (u1)8);
1093   br(LT, DO1_SHORT);
1094
     // Copy the low byte of ch into all eight byte lanes.
     // NOTE(review): this relies on bits above 7 of ch being zero on entry —
     // confirm at the call site.
1095   orr(ch, ch, ch, LSL, 8);
1096   orr(ch, ch, ch, LSL, 16);
1097   orr(ch, ch, ch, LSL, 32);
1098
     // Point str1 at the last 8-byte window (byte cnt1 - 8) and approach it
     // with a negative offset; result_tmp keeps that window's index.
1099   sub(cnt1, cnt1, 8);
1100   mov(result_tmp, cnt1);
1101   lea(str1, Address(str1, cnt1));
1102   sub(cnt1_neg, zr, cnt1);
1103
1104   mov(tmp3, 0x0101010101010101);
1105
1106   BIND(CH1_LOOP);
1107     ldr(ch1, Address(str1, cnt1_neg));
     // Bytes equal to ch become zero after the XOR. The sub/orr/bics sequence
     // then computes (x - 0x01..01) & ~(x | 0x7f..7f) and sets the flags; a
     // non-zero value sends control to HAS_ZERO.
1108     eor(ch1, ch, ch1);
1109     sub(tmp1, ch1, tmp3);
1110     orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
1111     bics(tmp1, tmp1, tmp2);
1112     br(NE, HAS_ZERO);
1113     adds(cnt1_neg, cnt1_neg, 8);
1114     br(LT, CH1_LOOP);
1115
     // The offset is no longer negative. If it is below 8, part of the final
     // window is still unchecked: reset the offset to 0 and run the loop body
     // once more on that window. The branch uses the flags set by the cmp.
1116     cmp(cnt1_neg, (u1)8);
1117     mov(cnt1_neg, 0);
1118     br(LT, CH1_LOOP);
1119     b(NOMATCH);
1120
1121   BIND(HAS_ZERO);
     // Byte-reverse the marker word and count its leading zero bits; LSR 3
     // turns that bit count into a byte offset added to the window offset.
1122     rev(tmp1, tmp1);
1123     clz(tmp1, tmp1);
1124     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
1125     b(MATCH);
1126
1127   BIND(DO1_SHORT);
     // Short input: str1 is moved past the last byte and each byte is loaded
     // with ldrb at a negative offset that counts up to zero.
1128     mov(result_tmp, cnt1);
1129     lea(str1, Address(str1, cnt1));
1130     sub(cnt1_neg, zr, cnt1);
1131   BIND(DO1_LOOP);
1132     ldrb(ch1, Address(str1, cnt1_neg));
1133     cmp(ch, ch1);
1134     br(EQ, MATCH);
1135     adds(cnt1_neg, cnt1_neg, 1);
1136     br(LT, DO1_LOOP);
1137   BIND(NOMATCH);
1138     mov(result, -1);
1139     b(DONE);
1140   BIND(MATCH);
     // Base index plus the (non-positive) offset of the matching byte.
1141     add(result, result_tmp, cnt1_neg);
1142   BIND(DONE);
1143 }
1144 
1145 // Compare strings.
     //
     // Emits code comparing the string at str1 (length cnt1) with the string at
     // str2 (length cnt2). "ae" is a StrIntrinsicNode encoding: LL, LU, UL, or
     // (any other value) UU. The first letter describes str1 and the second
     // str2; L strings are read as 1-byte and U strings as 2-byte characters.
     //
     // On the inline paths "result" receives the difference (str1 character
     // minus str2 character) of the first pair that differs within the common
     // prefix, or cnt1 - cnt2 in characters when no pair differs.
     //
     // When the shorter length reaches stub_threshold, the work is handed to a
     // compare_long_string_* stub through a trampoline call. If that call
     // cannot be emitted, the failure "CodeCache is full" is recorded on the
     // current ciEnv and generation stops.
     //
     // Written by the emitted code: str1, str2, cnt1, cnt2, result, tmp1, tmp2,
     // rscratch1, rscratch2, vtmp1, vtmp2 and the condition flags. vtmp3,
     // pgtmp1 and pgtmp2 are not referenced in this function body.
1146 void C2_MacroAssembler::string_compare(Register str1, Register str2,
1147     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
1148     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
1149     PRegister pgtmp1, PRegister pgtmp2, int ae) {
1150   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
1151       DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
1152       SHORT_LOOP_START, TAIL_CHECK;
1153
1154   bool isLL = ae == StrIntrinsicNode::LL;
1155   bool isLU = ae == StrIntrinsicNode::LU;
1156   bool isUL = ae == StrIntrinsicNode::UL;
1157
1158   // The stub threshold for LL strings is: 72 (64 + 8) chars
1159   // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
1160   // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
1161   const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);
1162
1163   bool str1_isL = isLL || isLU;
1164   bool str2_isL = isLL || isUL;
1165
1166   int str1_chr_shift = str1_isL ? 0 : 1;
1167   int str2_chr_shift = str2_isL ? 0 : 1;
1168   int str1_chr_size = str1_isL ? 1 : 2;
1169   int str2_chr_size = str2_isL ? 1 : 2;
1170   int minCharsInWord = isLL ? wordSize : wordSize/2;
1171
     // vtmpZ is zeroed and zipped with 4 loaded L bytes to widen them to four
     // 16-bit characters in the mixed-encoding (LU/UL) paths.
1172   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
1173   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
1174                                       (chr_insn)&MacroAssembler::ldrh;
1175   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
1176                                       (chr_insn)&MacroAssembler::ldrh;
1177   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
1178                             (uxt_insn)&MacroAssembler::uxthw;
1179
1180   BLOCK_COMMENT("string_compare {");
1181
1182   // Bizarrely, the counts are passed in bytes, regardless of whether they
1183   // are L or U strings, however the result is always in characters.
1184   if (!str1_isL) asrw(cnt1, cnt1, 1);
1185   if (!str2_isL) asrw(cnt2, cnt2, 1);
1186
1187   // Compute the minimum of the string lengths and save the difference.
1188   subsw(result, cnt1, cnt2);
1189   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
1190
1191   // A very short string
1192   cmpw(cnt2, minCharsInWord);
1193   br(Assembler::LE, SHORT_STRING);
1194
1195   // Compare longwords
1196   // load first parts of strings and finish initialization while loading
1197   {
1198     if (str1_isL == str2_isL) { // LL or UU
1199       ldr(tmp1, Address(str1));
     // Same address on both sides: nothing can differ, and result already
     // holds the length difference.
1200       cmp(str1, str2);
1201       br(Assembler::EQ, DONE);
1202       ldr(tmp2, Address(str2));
1203       cmp(cnt2, stub_threshold);
1204       br(GE, STUB);
     // Both pointers are moved to the last full word and cnt2 becomes a
     // negative byte offset from there. If exactly one word remains, the
     // words already loaded are compared at TAIL_CHECK.
1205       subsw(cnt2, cnt2, minCharsInWord);
1206       br(EQ, TAIL_CHECK);
1207       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1208       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1209       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1210     } else if (isLU) {
     // str1 supplies 4 bytes per step (offset in cnt1), str2 supplies 8 bytes
     // per step (offset in cnt2).
1211       ldrs(vtmp, Address(str1));
1212       ldr(tmp2, Address(str2));
1213       cmp(cnt2, stub_threshold);
1214       br(GE, STUB);
1215       subw(cnt2, cnt2, 4);
1216       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1217       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1218       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1219       zip1(vtmp, T8B, vtmp, vtmpZ);
1220       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1221       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1222       add(cnt1, cnt1, 4);
1223       fmovd(tmp1, vtmp);
1224     } else { // UL case
     // Mirror image of the LU case: str1 supplies 8 bytes per step (offset in
     // cnt1), str2 supplies 4 bytes per step (offset in cnt2).
1225       ldr(tmp1, Address(str1));
1226       ldrs(vtmp, Address(str2));
1227       cmp(cnt2, stub_threshold);
1228       br(GE, STUB);
1229       subw(cnt2, cnt2, 4);
1230       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1231       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1232       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1233       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1234       zip1(vtmp, T8B, vtmp, vtmpZ);
1235       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1236       add(cnt1, cnt1, 8);
1237       fmovd(tmp2, vtmp);
1238     }
1239     adds(cnt2, cnt2, isUL ? 4 : 8);
1240     br(GE, TAIL);
1241     eor(rscratch2, tmp1, tmp2);
1242     cbnz(rscratch2, DIFF);
1243     // main loop
1244     bind(NEXT_WORD);
1245     if (str1_isL == str2_isL) {
1246       ldr(tmp1, Address(str1, cnt2));
1247       ldr(tmp2, Address(str2, cnt2));
1248       adds(cnt2, cnt2, 8);
1249     } else if (isLU) {
1250       ldrs(vtmp, Address(str1, cnt1));
1251       ldr(tmp2, Address(str2, cnt2));
1252       add(cnt1, cnt1, 4);
1253       zip1(vtmp, T8B, vtmp, vtmpZ);
1254       fmovd(tmp1, vtmp);
1255       adds(cnt2, cnt2, 8);
1256     } else { // UL
1257       ldrs(vtmp, Address(str2, cnt2));
1258       ldr(tmp1, Address(str1, cnt1));
1259       zip1(vtmp, T8B, vtmp, vtmpZ);
1260       add(cnt1, cnt1, 8);
1261       fmovd(tmp2, vtmp);
1262       adds(cnt2, cnt2, 4);
1263     }
1264     br(GE, TAIL);
1265
1266     eor(rscratch2, tmp1, tmp2);
1267     cbz(rscratch2, NEXT_WORD);
1268     b(DIFF);
1269     bind(TAIL);
1270     eor(rscratch2, tmp1, tmp2);
1271     cbnz(rscratch2, DIFF);
1272     // Last longword.  In the case where length == 4 we compare the
1273     // same longword twice, but that's still faster than another
1274     // conditional branch.
1275     if (str1_isL == str2_isL) {
1276       ldr(tmp1, Address(str1));
1277       ldr(tmp2, Address(str2));
1278     } else if (isLU) {
1279       ldrs(vtmp, Address(str1));
1280       ldr(tmp2, Address(str2));
1281       zip1(vtmp, T8B, vtmp, vtmpZ);
1282       fmovd(tmp1, vtmp);
1283     } else { // UL
1284       ldrs(vtmp, Address(str2));
1285       ldr(tmp1, Address(str1));
1286       zip1(vtmp, T8B, vtmp, vtmpZ);
1287       fmovd(tmp2, vtmp);
1288     }
1289     bind(TAIL_CHECK);
1290     eor(rscratch2, tmp1, tmp2);
1291     cbz(rscratch2, DONE);
1292
1293     // Find the first different characters in the longwords and
1294     // compute their difference.
1295     bind(DIFF);
     // rscratch2 holds tmp1 ^ tmp2. After rev + clz the bit count is rounded
     // down to a character boundary (multiple of 8 for LL, of 16 otherwise) and
     // used to shift both words so the differing character is in the low bits.
1296     rev(rscratch2, rscratch2);
1297     clz(rscratch2, rscratch2);
1298     andr(rscratch2, rscratch2, isLL ? -8 : -16);
1299     lsrv(tmp1, tmp1, rscratch2);
1300     (this->*ext_chr)(tmp1, tmp1);
1301     lsrv(tmp2, tmp2, rscratch2);
1302     (this->*ext_chr)(tmp2, tmp2);
1303     subw(result, tmp1, tmp2);
1304     b(DONE);
1305   }
1306
1307   bind(STUB);
1308     RuntimeAddress stub = nullptr;
1309     switch(ae) {
1310       case StrIntrinsicNode::LL:
1311         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
1312         break;
1313       case StrIntrinsicNode::UU:
1314         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
1315         break;
1316       case StrIntrinsicNode::LU:
1317         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
1318         break;
1319       case StrIntrinsicNode::UL:
1320         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
1321         break;
1322       default:
1323         ShouldNotReachHere();
1324      }
1325     assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
1326     address call = trampoline_call(stub);
1327     if (call == nullptr) {
     // Generation is abandoned here; the labels listed below have not been
     // bound yet at this point, and they are reset in debug builds only.
1328       DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
1329       ciEnv::current()->record_failure("CodeCache is full");
1330       return;
1331     }
1332     b(DONE);
1333
1334   bind(SHORT_STRING);
1335   // Is the minimum length zero?
1336   cbz(cnt2, DONE);
1337   // arrange code to do most branches while loading and loading next characters
1338   // while comparing previous
     // From here on cnt1 is reused as a scratch register for characters loaded
     // from str2; the pairs compared are (tmp1, cnt1) and (tmp2, rscratch1).
1339   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1340   subs(cnt2, cnt2, 1);
1341   br(EQ, SHORT_LAST_INIT);
1342   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1343   b(SHORT_LOOP_START);
1344   bind(SHORT_LOOP);
1345   subs(cnt2, cnt2, 1);
1346   br(EQ, SHORT_LAST);
1347   bind(SHORT_LOOP_START);
1348   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
1349   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
1350   cmp(tmp1, cnt1);
1351   br(NE, SHORT_LOOP_TAIL);
1352   subs(cnt2, cnt2, 1);
1353   br(EQ, SHORT_LAST2);
1354   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1355   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1356   cmp(tmp2, rscratch1);
1357   br(EQ, SHORT_LOOP);
1358   sub(result, tmp2, rscratch1);
1359   b(DONE);
1360   bind(SHORT_LOOP_TAIL);
1361   sub(result, tmp1, cnt1);
1362   b(DONE);
1363   bind(SHORT_LAST2);
1364   cmp(tmp2, rscratch1);
1365   br(EQ, DONE);
1366   sub(result, tmp2, rscratch1);
1367
1368   b(DONE);
1369   bind(SHORT_LAST_INIT);
1370   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1371   bind(SHORT_LAST);
1372   cmp(tmp1, cnt1);
1373   br(EQ, DONE);
1374   sub(result, tmp1, cnt1);
1375
1376   bind(DONE);
1377
1378   BLOCK_COMMENT("} string_compare");
1379 }
1380 
1381 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
1382                                      FloatRegister src2, Condition cond, bool isQ) {
1383   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1384   FloatRegister zn = src1, zm = src2;
1385   bool needs_negation = false;
1386   switch (cond) {
1387     case LT: cond = GT; zn = src2; zm = src1; break;
1388     case LE: cond = GE; zn = src2; zm = src1; break;
1389     case LO: cond = HI; zn = src2; zm = src1; break;
1390     case LS: cond = HS; zn = src2; zm = src1; break;
1391     case NE: cond = EQ; needs_negation = true; break;
1392     default:
1393       break;
1394   }
1395 
1396   if (is_floating_point_type(bt)) {
1397     fcm(cond, dst, size, zn, zm);
1398   } else {
1399     cm(cond, dst, size, zn, zm);
1400   }
1401 
1402   if (needs_negation) {
1403     notr(dst, isQ ? T16B : T8B, dst);
1404   }
1405 }
1406 
1407 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1408                                           Condition cond, bool isQ) {
1409   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1410   if (bt == T_FLOAT || bt == T_DOUBLE) {
1411     if (cond == Assembler::NE) {
1412       fcm(Assembler::EQ, dst, size, src);
1413       notr(dst, isQ ? T16B : T8B, dst);
1414     } else {
1415       fcm(cond, dst, size, src);
1416     }
1417   } else {
1418     if (cond == Assembler::NE) {
1419       cm(Assembler::EQ, dst, size, src);
1420       notr(dst, isQ ? T16B : T8B, dst);
1421     } else {
1422       cm(cond, dst, size, src);
1423     }
1424   }
1425 }
1426 
1427 // Compress the least significant bit of each byte to the rightmost and clear
1428 // the higher garbage bits.
1429 void C2_MacroAssembler::bytemask_compress(Register dst) {
1430   // Example input, dst = 0x01 00 00 00 01 01 00 01
1431   // The "??" bytes are garbage.
1432   orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
1433   orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
1434   orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
1435   andr(dst, dst, 0xff);                   // dst = 0x8D
1436 }
1437 
1438 // Pack the value of each mask element in "src" into a long value in "dst", at most
1439 // the first 64 lane elements. The input "src" is a vector of boolean represented as
1440 // bytes with 0x00/0x01 as element values. Each lane value from "src" is packed into
1441 // one bit in "dst".
1442 //
1443 // Example:   src = 0x0001010000010001 0100000001010001, lane_cnt = 16
1444 // Expected:  dst = 0x658D
1445 //
1446 // Clobbers: rscratch1
1447 void C2_MacroAssembler::sve_vmask_tolong(Register dst, FloatRegister src,
1448                                          FloatRegister vtmp, int lane_cnt) {
1449   assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1450   assert_different_registers(dst, rscratch1);
1451   assert_different_registers(src, vtmp);
1452   assert(UseSVE > 0, "must be");
1453 
1454   // Compress the lowest 8 bytes.
1455   fmovd(dst, src);
1456   bytemask_compress(dst);
1457   if (lane_cnt <= 8) return;
1458 
1459   // Repeat on higher bytes and join the results.
1460   // Compress 8 bytes in each iteration.
1461   for (int idx = 1; idx < (lane_cnt / 8); idx++) {
1462     sve_extract_integral(rscratch1, T_LONG, src, idx, vtmp);
1463     bytemask_compress(rscratch1);
1464     orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
1465   }
1466 }
1467 
1468 // The function is same as above "sve_vmask_tolong", but it uses SVE2's BEXT
1469 // instruction which requires the FEAT_BITPERM feature.
1470 void C2_MacroAssembler::sve2_vmask_tolong(Register dst, FloatRegister src,
1471                                           FloatRegister vtmp1, FloatRegister vtmp2,
1472                                           int lane_cnt) {
1473   assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1474   assert_different_registers(src, vtmp1, vtmp2);
1475   assert(UseSVE > 1 && VM_Version::supports_svebitperm(), "must be");
1476 
1477   // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
1478   // is to compress each significant bit of the byte in a cross-lane way. Due
1479   // to the lack of a cross-lane bit-compress instruction, we use BEXT
1480   // (bit-compress in each lane) with the biggest lane size (T = D) then
1481   // concatenate the results.
1482 
1483   // The second source input of BEXT, initialized with 0x01 in each byte.
1484   // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1485   sve_dup(vtmp2, B, 1);
1486 
1487   // BEXT vtmp1.D, src.D, vtmp2.D
1488   // src   = 0x0001010000010001 | 0x0100000001010001
1489   // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1490   //         ---------------------------------------
1491   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1492   sve_bext(vtmp1, D, src, vtmp2);
1493 
1494   // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the
1495   // result to dst.
1496   // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1497   // dst   = 0x658D
1498   if (lane_cnt <= 8) {
1499     // No need to concatenate.
1500     umov(dst, vtmp1, B, 0);
1501   } else if (lane_cnt <= 16) {
1502     ins(vtmp1, B, vtmp1, 1, 8);
1503     umov(dst, vtmp1, H, 0);
1504   } else {
1505     // As the lane count is 64 at most, the final expected value must be in
1506     // the lowest 64 bits after narrowing vtmp1 from D to B.
1507     sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
1508     umov(dst, vtmp1, D, 0);
1509   }
1510 }
1511 
1512 // Unpack the mask, a long value in "src", into a vector register of boolean
1513 // represented as bytes with 0x00/0x01 as element values in "dst".  Each bit in
1514 // "src" is unpacked into one byte lane in "dst". Note that "dst" can support at
1515 // most 64 lanes.
1516 //
1517 // Below example gives the expected dst vector register, with a valid src(0x658D)
1518 // on a 128-bit vector size machine.
1519 // dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1520 void C2_MacroAssembler::sve_vmask_fromlong(FloatRegister dst, Register src,
1521                                            FloatRegister vtmp, int lane_cnt) {
1522   assert_different_registers(dst, vtmp);
1523   assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
1524          lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
1525 
1526   // Example:   src = 0x658D, lane_cnt = 16
1527   // Expected:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1528 
1529   // Put long value from general purpose register into the first lane of vector.
1530   // vtmp = 0x0000000000000000 | 0x000000000000658D
1531   sve_dup(vtmp, B, 0);
1532   mov(vtmp, D, 0, src);
1533 
1534   // Transform the value in the first lane which is mask in bit now to the mask in
1535   // byte, which can be done by SVE2's BDEP instruction.
1536 
1537   // The first source input of BDEP instruction. Deposite each byte in every 8 bytes.
1538   // vtmp = 0x0000000000000065 | 0x000000000000008D
1539   if (lane_cnt <= 8) {
1540     // Nothing. As only one byte exsits.
1541   } else if (lane_cnt <= 16) {
1542     ins(vtmp, B, vtmp, 8, 1);
1543   } else {
1544     sve_vector_extend(vtmp, D, vtmp, B);
1545   }
1546 
1547   // The second source input of BDEP instruction, initialized with 0x01 for each byte.
1548   // dst = 0x01010101 0x01010101 0x01010101 0x01010101
1549   sve_dup(dst, B, 1);
1550 
1551   // BDEP dst.D, vtmp.D, dst.D
1552   // vtmp = 0x0000000000000065 | 0x000000000000008D
1553   // dst  = 0x0101010101010101 | 0x0101010101010101
1554   //        ---------------------------------------
1555   // dst  = 0x0001010000010001 | 0x0100000001010001
1556   sve_bdep(dst, D, vtmp, dst);
1557 }
1558 
1559 // Clobbers: rflags
1560 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1561                                     FloatRegister zn, FloatRegister zm, Condition cond) {
1562   assert(pg->is_governing(), "This register has to be a governing predicate register");
1563   FloatRegister z1 = zn, z2 = zm;
1564   switch (cond) {
1565     case LE: z1 = zm; z2 = zn; cond = GE; break;
1566     case LT: z1 = zm; z2 = zn; cond = GT; break;
1567     case LO: z1 = zm; z2 = zn; cond = HI; break;
1568     case LS: z1 = zm; z2 = zn; cond = HS; break;
1569     default:
1570       break;
1571   }
1572 
1573   SIMD_RegVariant size = elemType_to_regVariant(bt);
1574   if (is_floating_point_type(bt)) {
1575     sve_fcm(cond, pd, size, pg, z1, z2);
1576   } else {
1577     assert(is_integral_type(bt), "unsupported element type");
1578     sve_cmp(cond, pd, size, pg, z1, z2);
1579   }
1580 }
1581 
1582 // Get index of the last mask lane that is set
1583 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
1584   SIMD_RegVariant size = elemType_to_regVariant(bt);
1585   sve_rev(ptmp, size, src);
1586   sve_brkb(ptmp, ptrue, ptmp, false);
1587   sve_cntp(dst, size, ptrue, ptmp);
1588   movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
1589   subw(dst, rscratch1, dst);
1590 }
1591 
1592 // Extend integer vector src to dst with the same lane count
1593 // but larger element size, e.g. 4B -> 4I
1594 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1595                                            FloatRegister src, BasicType src_bt, bool is_unsigned) {
1596   if (src_bt == T_BYTE) {
1597     // 4B to 4S/4I, 8B to 8S
1598     assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1599     assert(dst_bt == T_SHORT || dst_bt == T_INT, "unsupported");
1600     _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1601     if (dst_bt == T_INT) {
1602       _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1603     }
1604   } else if (src_bt == T_SHORT) {
1605     // 2S to 2I/2L, 4S to 4I
1606     assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1607     assert(dst_bt == T_INT || dst_bt == T_LONG, "unsupported");
1608     _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1609     if (dst_bt == T_LONG) {
1610       _xshll(is_unsigned, dst, T2D, dst, T2S, 0);
1611     }
1612   } else if (src_bt == T_INT) {
1613     // 2I to 2L
1614     assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1615     _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1616   } else {
1617     ShouldNotReachHere();
1618   }
1619 }
1620 
1621 // Narrow integer vector src down to dst with the same lane count
1622 // but smaller element size, e.g. 4I -> 4B
1623 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1624                                            FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1625   if (src_bt == T_SHORT) {
1626     // 4S/8S to 4B/8B
1627     assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1628     assert(dst_bt == T_BYTE, "unsupported");
1629     xtn(dst, T8B, src, T8H);
1630   } else if (src_bt == T_INT) {
1631     // 2I to 2S, 4I to 4B/4S
1632     assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1633     assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1634     xtn(dst, T4H, src, T4S);
1635     if (dst_bt == T_BYTE) {
1636       xtn(dst, T8B, dst, T8H);
1637     }
1638   } else if (src_bt == T_LONG) {
1639     // 2L to 2S/2I
1640     assert(src_vlen_in_bytes == 16, "unsupported");
1641     assert(dst_bt == T_INT || dst_bt == T_SHORT, "unsupported");
1642     xtn(dst, T2S, src, T2D);
1643     if (dst_bt == T_SHORT) {
1644       xtn(dst, T4H, dst, T4S);
1645     }
1646   } else {
1647     ShouldNotReachHere();
1648   }
1649 }
1650 
1651 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1652                                           FloatRegister src, SIMD_RegVariant src_size,
1653                                           bool is_unsigned) {
1654   assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
1655 
1656   if (src_size == B) {
1657     switch (dst_size) {
1658     case H:
1659       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1660       break;
1661     case S:
1662       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1663       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1664       break;
1665     case D:
1666       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1667       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1668       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1669       break;
1670     default:
1671       ShouldNotReachHere();
1672     }
1673   } else if (src_size == H) {
1674     if (dst_size == S) {
1675       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1676     } else { // D
1677       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1678       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1679     }
1680   } else if (src_size == S) {
1681     _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
1682   }
1683 }
1684 
1685 // Vector narrow from src to dst with specified element sizes.
1686 // High part of dst vector will be filled with zero.
1687 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1688                                           FloatRegister src, SIMD_RegVariant src_size,
1689                                           FloatRegister tmp) {
1690   assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1691   assert_different_registers(src, tmp);
1692   sve_dup(tmp, src_size, 0);
1693   if (src_size == D) {
1694     switch (dst_size) {
1695     case S:
1696       sve_uzp1(dst, S, src, tmp);
1697       break;
1698     case H:
1699       assert_different_registers(dst, tmp);
1700       sve_uzp1(dst, S, src, tmp);
1701       sve_uzp1(dst, H, dst, tmp);
1702       break;
1703     case B:
1704       assert_different_registers(dst, tmp);
1705       sve_uzp1(dst, S, src, tmp);
1706       sve_uzp1(dst, H, dst, tmp);
1707       sve_uzp1(dst, B, dst, tmp);
1708       break;
1709     default:
1710       ShouldNotReachHere();
1711     }
1712   } else if (src_size == S) {
1713     if (dst_size == H) {
1714       sve_uzp1(dst, H, src, tmp);
1715     } else { // B
1716       assert_different_registers(dst, tmp);
1717       sve_uzp1(dst, H, src, tmp);
1718       sve_uzp1(dst, B, dst, tmp);
1719     }
1720   } else if (src_size == H) {
1721     sve_uzp1(dst, B, src, tmp);
1722   }
1723 }
1724 
1725 // Extend src predicate to dst predicate with the same lane count but larger
1726 // element size, e.g. 64Byte -> 512Long
1727 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1728                                              uint dst_element_length_in_bytes,
1729                                              uint src_element_length_in_bytes) {
1730   if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1731     sve_punpklo(dst, src);
1732   } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1733     sve_punpklo(dst, src);
1734     sve_punpklo(dst, dst);
1735   } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1736     sve_punpklo(dst, src);
1737     sve_punpklo(dst, dst);
1738     sve_punpklo(dst, dst);
1739   } else {
1740     assert(false, "unsupported");
1741     ShouldNotReachHere();
1742   }
1743 }
1744 
1745 // Narrow src predicate to dst predicate with the same lane count but
1746 // smaller element size, e.g. 512Long -> 64Byte
1747 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1748                                              uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
1749   // The insignificant bits in src predicate are expected to be zero.
1750   // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
1751   // passed as the second argument. An example narrowing operation with a given mask would be -
1752   // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I
1753   // Mask (for 2 Longs) : TF
1754   // Predicate register for the above mask (16 bits) : 00000001 00000000
1755   // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
1756   // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0)
1757   assert_different_registers(src, ptmp);
1758   assert_different_registers(dst, ptmp);
1759   sve_pfalse(ptmp);
1760   if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1761     sve_uzp1(dst, B, src, ptmp);
1762   } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1763     sve_uzp1(dst, H, src, ptmp);
1764     sve_uzp1(dst, B, dst, ptmp);
1765   } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1766     sve_uzp1(dst, S, src, ptmp);
1767     sve_uzp1(dst, H, dst, ptmp);
1768     sve_uzp1(dst, B, dst, ptmp);
1769   } else {
1770     assert(false, "unsupported");
1771     ShouldNotReachHere();
1772   }
1773 }
1774 
1775 // Vector reduction add for integral type with ASIMD instructions.
1776 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1777                                                  Register isrc, FloatRegister vsrc,
1778                                                  unsigned vector_length_in_bytes,
1779                                                  FloatRegister vtmp) {
1780   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1781   assert_different_registers(dst, isrc);
1782   bool isQ = vector_length_in_bytes == 16;
1783 
1784   BLOCK_COMMENT("neon_reduce_add_integral {");
1785     switch(bt) {
1786       case T_BYTE:
1787         addv(vtmp, isQ ? T16B : T8B, vsrc);
1788         smov(dst, vtmp, B, 0);
1789         addw(dst, dst, isrc, ext::sxtb);
1790         break;
1791       case T_SHORT:
1792         addv(vtmp, isQ ? T8H : T4H, vsrc);
1793         smov(dst, vtmp, H, 0);
1794         addw(dst, dst, isrc, ext::sxth);
1795         break;
1796       case T_INT:
1797         isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1798         umov(dst, vtmp, S, 0);
1799         addw(dst, dst, isrc);
1800         break;
1801       case T_LONG:
1802         assert(isQ, "unsupported");
1803         addpd(vtmp, vsrc);
1804         umov(dst, vtmp, D, 0);
1805         add(dst, dst, isrc);
1806         break;
1807       default:
1808         assert(false, "unsupported");
1809         ShouldNotReachHere();
1810     }
1811   BLOCK_COMMENT("} neon_reduce_add_integral");
1812 }
1813 
1814 // Vector reduction multiply for integral type with ASIMD instructions.
1815 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
1816 // Clobbers: rscratch1
1817 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
1818                                                  Register isrc, FloatRegister vsrc,
1819                                                  unsigned vector_length_in_bytes,
1820                                                  FloatRegister vtmp1, FloatRegister vtmp2) {
1821   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1822   bool isQ = vector_length_in_bytes == 16;
1823 
1824   BLOCK_COMMENT("neon_reduce_mul_integral {");
1825     switch(bt) {
1826       case T_BYTE:
1827         if (isQ) {
1828           // Multiply the lower half and higher half of vector iteratively.
1829           // vtmp1 = vsrc[8:15]
1830           ins(vtmp1, D, vsrc, 0, 1);
1831           // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
1832           mulv(vtmp1, T8B, vtmp1, vsrc);
1833           // vtmp2 = vtmp1[4:7]
1834           ins(vtmp2, S, vtmp1, 0, 1);
1835           // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
1836           mulv(vtmp1, T8B, vtmp2, vtmp1);
1837         } else {
1838           ins(vtmp1, S, vsrc, 0, 1);
1839           mulv(vtmp1, T8B, vtmp1, vsrc);
1840         }
1841         // vtmp2 = vtmp1[2:3]
1842         ins(vtmp2, H, vtmp1, 0, 1);
1843         // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
1844         mulv(vtmp2, T8B, vtmp2, vtmp1);
1845         // dst = vtmp2[0] * isrc * vtmp2[1]
1846         umov(rscratch1, vtmp2, B, 0);
1847         mulw(dst, rscratch1, isrc);
1848         sxtb(dst, dst);
1849         umov(rscratch1, vtmp2, B, 1);
1850         mulw(dst, rscratch1, dst);
1851         sxtb(dst, dst);
1852         break;
1853       case T_SHORT:
1854         if (isQ) {
1855           ins(vtmp2, D, vsrc, 0, 1);
1856           mulv(vtmp2, T4H, vtmp2, vsrc);
1857           ins(vtmp1, S, vtmp2, 0, 1);
1858           mulv(vtmp1, T4H, vtmp1, vtmp2);
1859         } else {
1860           ins(vtmp1, S, vsrc, 0, 1);
1861           mulv(vtmp1, T4H, vtmp1, vsrc);
1862         }
1863         umov(rscratch1, vtmp1, H, 0);
1864         mulw(dst, rscratch1, isrc);
1865         sxth(dst, dst);
1866         umov(rscratch1, vtmp1, H, 1);
1867         mulw(dst, rscratch1, dst);
1868         sxth(dst, dst);
1869         break;
1870       case T_INT:
1871         if (isQ) {
1872           ins(vtmp1, D, vsrc, 0, 1);
1873           mulv(vtmp1, T2S, vtmp1, vsrc);
1874         } else {
1875           vtmp1 = vsrc;
1876         }
1877         umov(rscratch1, vtmp1, S, 0);
1878         mul(dst, rscratch1, isrc);
1879         umov(rscratch1, vtmp1, S, 1);
1880         mul(dst, rscratch1, dst);
1881         break;
1882       case T_LONG:
1883         umov(rscratch1, vsrc, D, 0);
1884         mul(dst, isrc, rscratch1);
1885         umov(rscratch1, vsrc, D, 1);
1886         mul(dst, dst, rscratch1);
1887         break;
1888       default:
1889         assert(false, "unsupported");
1890         ShouldNotReachHere();
1891     }
1892   BLOCK_COMMENT("} neon_reduce_mul_integral");
1893 }
1894 
1895 // Vector reduction multiply for floating-point type with ASIMD instructions.
1896 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
1897                                            FloatRegister fsrc, FloatRegister vsrc,
1898                                            unsigned vector_length_in_bytes,
1899                                            FloatRegister vtmp) {
1900   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1901   bool isQ = vector_length_in_bytes == 16;
1902 
1903   BLOCK_COMMENT("neon_reduce_mul_fp {");
1904     switch(bt) {
1905       // The T_SHORT type below is for Float16 type which also uses floating-point
1906       // instructions.
1907       case T_SHORT:
1908         fmulh(dst, fsrc, vsrc);
1909         ext(vtmp, T8B, vsrc, vsrc, 2);
1910         fmulh(dst, dst, vtmp);
1911         ext(vtmp, T8B, vsrc, vsrc, 4);
1912         fmulh(dst, dst, vtmp);
1913         ext(vtmp, T8B, vsrc, vsrc, 6);
1914         fmulh(dst, dst, vtmp);
1915         if (isQ) {
1916           ext(vtmp, T16B, vsrc, vsrc, 8);
1917           fmulh(dst, dst, vtmp);
1918           ext(vtmp, T16B, vsrc, vsrc, 10);
1919           fmulh(dst, dst, vtmp);
1920           ext(vtmp, T16B, vsrc, vsrc, 12);
1921           fmulh(dst, dst, vtmp);
1922           ext(vtmp, T16B, vsrc, vsrc, 14);
1923           fmulh(dst, dst, vtmp);
1924         }
1925         break;
1926       case T_FLOAT:
1927         fmuls(dst, fsrc, vsrc);
1928         ins(vtmp, S, vsrc, 0, 1);
1929         fmuls(dst, dst, vtmp);
1930         if (isQ) {
1931           ins(vtmp, S, vsrc, 0, 2);
1932           fmuls(dst, dst, vtmp);
1933           ins(vtmp, S, vsrc, 0, 3);
1934           fmuls(dst, dst, vtmp);
1935          }
1936         break;
1937       case T_DOUBLE:
1938         assert(isQ, "unsupported");
1939         fmuld(dst, fsrc, vsrc);
1940         ins(vtmp, D, vsrc, 0, 1);
1941         fmuld(dst, dst, vtmp);
1942         break;
1943       default:
1944         assert(false, "unsupported");
1945         ShouldNotReachHere();
1946     }
1947   BLOCK_COMMENT("} neon_reduce_mul_fp");
1948 }
1949 
1950 // Vector reduction add for half float type with ASIMD instructions.
1951 void C2_MacroAssembler::neon_reduce_add_fp16(FloatRegister dst, FloatRegister fsrc, FloatRegister vsrc,
1952                                              unsigned vector_length_in_bytes, FloatRegister vtmp) {
1953   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1954   bool isQ = vector_length_in_bytes == 16;
1955 
1956   BLOCK_COMMENT("neon_reduce_add_fp16 {");
1957     faddh(dst, fsrc, vsrc);
1958     ext(vtmp, T8B, vsrc, vsrc, 2);
1959     faddh(dst, dst, vtmp);
1960     ext(vtmp, T8B, vsrc, vsrc, 4);
1961     faddh(dst, dst, vtmp);
1962     ext(vtmp, T8B, vsrc, vsrc, 6);
1963     faddh(dst, dst, vtmp);
1964     if (isQ) {
1965       ext(vtmp, T16B, vsrc, vsrc, 8);
1966       faddh(dst, dst, vtmp);
1967       ext(vtmp, T16B, vsrc, vsrc, 10);
1968       faddh(dst, dst, vtmp);
1969       ext(vtmp, T16B, vsrc, vsrc, 12);
1970       faddh(dst, dst, vtmp);
1971       ext(vtmp, T16B, vsrc, vsrc, 14);
1972       faddh(dst, dst, vtmp);
1973     }
1974   BLOCK_COMMENT("} neon_reduce_add_fp16");
1975 }
1976 
1977 // Helper to select logical instruction
1978 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
1979                                                    Register Rn, Register Rm,
1980                                                    enum shift_kind kind, unsigned shift) {
1981   switch(opc) {
1982     case Op_AndReductionV:
1983       is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
1984       break;
1985     case Op_OrReductionV:
1986       is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
1987       break;
1988     case Op_XorReductionV:
1989       is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
1990       break;
1991     default:
1992       assert(false, "unsupported");
1993       ShouldNotReachHere();
1994   }
1995 }
1996 
1997 // Vector reduction logical operations And, Or, Xor
1998 // Clobbers: rscratch1
1999 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
2000                                             Register isrc, FloatRegister vsrc,
2001                                             unsigned vector_length_in_bytes) {
2002   assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
2003          "unsupported");
2004   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2005   assert_different_registers(dst, isrc);
2006   bool isQ = vector_length_in_bytes == 16;
2007 
2008   BLOCK_COMMENT("neon_reduce_logical {");
2009     umov(rscratch1, vsrc, isQ ? D : S, 0);
2010     umov(dst, vsrc, isQ ? D : S, 1);
2011     neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
2012     switch(bt) {
2013       case T_BYTE:
2014         if (isQ) {
2015           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2016         }
2017         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
2018         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
2019         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2020         sxtb(dst, dst);
2021         break;
2022       case T_SHORT:
2023         if (isQ) {
2024           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2025         }
2026         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
2027         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2028         sxth(dst, dst);
2029         break;
2030       case T_INT:
2031         if (isQ) {
2032           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2033         }
2034         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2035         break;
2036       case T_LONG:
2037         assert(isQ, "unsupported");
2038         neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
2039         break;
2040       default:
2041         assert(false, "unsupported");
2042         ShouldNotReachHere();
2043     }
2044   BLOCK_COMMENT("} neon_reduce_logical");
2045 }
2046 
2047 // Helper function to decode min/max reduction operation properties
2048 void C2_MacroAssembler::decode_minmax_reduction_opc(int opc, bool* is_min,
2049                                                     bool* is_unsigned,
2050                                                     Condition* cond) {
2051   switch(opc) {
2052     case Op_MinReductionV:
2053       *is_min = true;  *is_unsigned = false; *cond = LT; break;
2054     case Op_MaxReductionV:
2055       *is_min = false; *is_unsigned = false; *cond = GT; break;
2056     case Op_UMinReductionV:
2057       *is_min = true;  *is_unsigned = true;  *cond = LO; break;
2058     case Op_UMaxReductionV:
2059       *is_min = false; *is_unsigned = true;  *cond = HI; break;
2060     default:
2061       ShouldNotReachHere();
2062   }
2063 }
2064 
2065 // Vector reduction min/max/umin/umax for integral type with ASIMD instructions.
2066 // Note: vtmp is not used and expected to be fnoreg for T_LONG case.
2067 // Clobbers: rscratch1, rflags
2068 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
2069                                                     Register isrc, FloatRegister vsrc,
2070                                                     unsigned vector_length_in_bytes,
2071                                                     FloatRegister vtmp) {
2072   assert(opc == Op_MinReductionV || opc == Op_MaxReductionV ||
2073          opc == Op_UMinReductionV || opc == Op_UMaxReductionV, "unsupported");
2074   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2075   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
2076   assert_different_registers(dst, isrc);
2077   bool isQ = vector_length_in_bytes == 16;
2078   bool is_min;
2079   bool is_unsigned;
2080   Condition cond;
2081   decode_minmax_reduction_opc(opc, &is_min, &is_unsigned, &cond);
2082   BLOCK_COMMENT("neon_reduce_minmax_integral {");
2083     if (bt == T_LONG) {
2084       assert(vtmp == fnoreg, "should be");
2085       assert(isQ, "should be");
2086       umov(rscratch1, vsrc, D, 0);
2087       cmp(isrc, rscratch1);
2088       csel(dst, isrc, rscratch1, cond);
2089       umov(rscratch1, vsrc, D, 1);
2090       cmp(dst, rscratch1);
2091       csel(dst, dst, rscratch1, cond);
2092     } else {
2093       SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
2094       if (size == T2S) {
2095         // For T2S (2x32-bit elements), use pairwise instructions because
2096         // uminv/umaxv/sminv/smaxv don't support arrangement 2S.
2097         neon_minmaxp(is_unsigned, is_min, vtmp, size, vsrc, vsrc);
2098       } else {
2099         // For other sizes, use reduction to scalar instructions.
2100         neon_minmaxv(is_unsigned, is_min, vtmp, size, vsrc);
2101       }
2102       if (bt == T_INT) {
2103         umov(dst, vtmp, S, 0);
2104       } else if (is_unsigned) {
2105         umov(dst, vtmp, elemType_to_regVariant(bt), 0);
2106       } else {
2107         smov(dst, vtmp, elemType_to_regVariant(bt), 0);
2108       }
2109       cmpw(dst, isrc);
2110       cselw(dst, dst, isrc, cond);
2111     }
2112   BLOCK_COMMENT("} neon_reduce_minmax_integral");
2113 }
2114 
2115 // Vector reduction for integral type with SVE instruction.
2116 // Supported operations are Add, And, Or, Xor, Max, Min, UMax, UMin.
2117 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
2118 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
2119                                             FloatRegister src2, PRegister pg, FloatRegister tmp) {
2120   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2121   assert(pg->is_governing(), "This register has to be a governing predicate register");
2122   assert_different_registers(src1, dst);
2123   // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
2124   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2125   switch (opc) {
2126     case Op_AddReductionVI: {
2127       sve_uaddv(tmp, size, pg, src2);
2128       if (bt == T_BYTE) {
2129         smov(dst, tmp, size, 0);
2130         addw(dst, src1, dst, ext::sxtb);
2131       } else if (bt == T_SHORT) {
2132         smov(dst, tmp, size, 0);
2133         addw(dst, src1, dst, ext::sxth);
2134       } else {
2135         umov(dst, tmp, size, 0);
2136         addw(dst, dst, src1);
2137       }
2138       break;
2139     }
2140     case Op_AddReductionVL: {
2141       sve_uaddv(tmp, size, pg, src2);
2142       umov(dst, tmp, size, 0);
2143       add(dst, dst, src1);
2144       break;
2145     }
2146     case Op_AndReductionV: {
2147       sve_andv(tmp, size, pg, src2);
2148       if (bt == T_INT || bt == T_LONG) {
2149         umov(dst, tmp, size, 0);
2150       } else {
2151         smov(dst, tmp, size, 0);
2152       }
2153       if (bt == T_LONG) {
2154         andr(dst, dst, src1);
2155       } else {
2156         andw(dst, dst, src1);
2157       }
2158       break;
2159     }
2160     case Op_OrReductionV: {
2161       sve_orv(tmp, size, pg, src2);
2162       if (bt == T_INT || bt == T_LONG) {
2163         umov(dst, tmp, size, 0);
2164       } else {
2165         smov(dst, tmp, size, 0);
2166       }
2167       if (bt == T_LONG) {
2168         orr(dst, dst, src1);
2169       } else {
2170         orrw(dst, dst, src1);
2171       }
2172       break;
2173     }
2174     case Op_XorReductionV: {
2175       sve_eorv(tmp, size, pg, src2);
2176       if (bt == T_INT || bt == T_LONG) {
2177         umov(dst, tmp, size, 0);
2178       } else {
2179         smov(dst, tmp, size, 0);
2180       }
2181       if (bt == T_LONG) {
2182         eor(dst, dst, src1);
2183       } else {
2184         eorw(dst, dst, src1);
2185       }
2186       break;
2187     }
2188     case Op_MaxReductionV:
2189     case Op_MinReductionV:
2190     case Op_UMaxReductionV:
2191     case Op_UMinReductionV: {
2192       bool is_min;
2193       bool is_unsigned;
2194       Condition cond;
2195       decode_minmax_reduction_opc(opc, &is_min, &is_unsigned, &cond);
2196       sve_minmaxv(is_unsigned, is_min, tmp, size, pg, src2);
2197       // Move result from vector to general register
2198       if (is_unsigned || bt == T_INT || bt == T_LONG) {
2199         umov(dst, tmp, size, 0);
2200       } else {
2201         smov(dst, tmp, size, 0);
2202       }
2203       if (bt == T_LONG) {
2204         cmp(dst, src1);
2205         csel(dst, dst, src1, cond);
2206       } else {
2207         cmpw(dst, src1);
2208         cselw(dst, dst, src1, cond);
2209       }
2210       break;
2211     }
2212     default:
2213       assert(false, "unsupported");
2214       ShouldNotReachHere();
2215   }
2216 
2217   if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
2218     if (bt == T_BYTE) {
2219       sxtb(dst, dst);
2220     } else if (bt == T_SHORT) {
2221       sxth(dst, dst);
2222     }
2223   }
2224 }
2225 
2226 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
2227 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
2228 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
2229 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
2230   uint32_t max_vector_length = Matcher::max_vector_size(bt);
2231   assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
2232 
2233   // Set all elements to false if the input "lane_cnt" is zero.
2234   if (lane_cnt == 0) {
2235     sve_pfalse(dst);
2236     return;
2237   }
2238 
2239   SIMD_RegVariant size = elemType_to_regVariant(bt);
2240   assert(size != Q, "invalid size");
2241 
2242   // Set all true if "lane_cnt" equals to the max lane count.
2243   if (lane_cnt == max_vector_length) {
2244     sve_ptrue(dst, size, /* ALL */ 0b11111);
2245     return;
2246   }
2247 
2248   // Fixed numbers for "ptrue".
2249   switch(lane_cnt) {
2250   case 1: /* VL1 */
2251   case 2: /* VL2 */
2252   case 3: /* VL3 */
2253   case 4: /* VL4 */
2254   case 5: /* VL5 */
2255   case 6: /* VL6 */
2256   case 7: /* VL7 */
2257   case 8: /* VL8 */
2258     sve_ptrue(dst, size, lane_cnt);
2259     return;
2260   case 16:
2261     sve_ptrue(dst, size, /* VL16 */ 0b01001);
2262     return;
2263   case 32:
2264     sve_ptrue(dst, size, /* VL32 */ 0b01010);
2265     return;
2266   case 64:
2267     sve_ptrue(dst, size, /* VL64 */ 0b01011);
2268     return;
2269   case 128:
2270     sve_ptrue(dst, size, /* VL128 */ 0b01100);
2271     return;
2272   case 256:
2273     sve_ptrue(dst, size, /* VL256 */ 0b01101);
2274     return;
2275   default:
2276     break;
2277   }
2278 
2279   // Special patterns for "ptrue".
2280   if (lane_cnt == round_down_power_of_2(max_vector_length)) {
2281     sve_ptrue(dst, size, /* POW2 */ 0b00000);
2282   } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
2283     sve_ptrue(dst, size, /* MUL4 */ 0b11101);
2284   } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
2285     sve_ptrue(dst, size, /* MUL3 */ 0b11110);
2286   } else {
2287     // Encode to "whileltw" for the remaining cases.
2288     mov(rscratch1, lane_cnt);
2289     sve_whileltw(dst, size, zr, rscratch1);
2290   }
2291 }
2292 
2293 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
2294 // Any remaining elements of dst will be filled with zero.
2295 // Clobbers: rscratch1
2296 // Preserves: mask, vzr
2297 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
2298                                            FloatRegister vzr, FloatRegister vtmp,
2299                                            PRegister pgtmp, unsigned vector_length_in_bytes) {
2300   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2301   // When called by sve_compress_byte, src and vtmp may be the same register.
2302   assert_different_registers(dst, src, vzr);
2303   assert_different_registers(dst, vtmp, vzr);
2304   assert_different_registers(mask, pgtmp);
2305   // high <-- low
2306   // Example input:   src   = hh gg ff ee dd cc bb aa, one character is 8 bits.
2307   //                  mask  = 01 00 00 01 01 00 01 01, one character is 1 bit.
2308   // Expected result: dst   = 00 00 00 hh ee dd bb aa
2309 
2310   // Extend lowest half to type INT.
2311   // dst   =  00dd  00cc  00bb  00aa
2312   sve_uunpklo(dst, S, src);
2313   // pgtmp =  0001  0000  0001  0001
2314   sve_punpklo(pgtmp, mask);
2315   // Pack the active elements in size of type INT to the right,
2316   // and fill the remainings with zero.
2317   // dst   =  0000  00dd  00bb  00aa
2318   sve_compact(dst, S, dst, pgtmp);
2319   // Narrow the result back to type SHORT.
2320   // dst   = 00 00 00 00 00 dd bb aa
2321   sve_uzp1(dst, H, dst, vzr);
2322 
2323   // Return if the vector length is no more than MaxVectorSize/2, since the
2324   // highest half is invalid.
2325   if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
2326     return;
2327   }
2328 
2329   // Count the active elements of lowest half.
2330   // rscratch1 = 3
2331   sve_cntp(rscratch1, S, ptrue, pgtmp);
2332 
2333   // Repeat to the highest half.
2334   // pgtmp =  0001  0000  0000  0001
2335   sve_punpkhi(pgtmp, mask);
2336   // vtmp  =  00hh  00gg  00ff  00ee
2337   sve_uunpkhi(vtmp, S, src);
2338   // vtmp  =  0000  0000  00hh  00ee
2339   sve_compact(vtmp, S, vtmp, pgtmp);
2340   // vtmp  = 00 00 00 00 00 00 hh ee
2341   sve_uzp1(vtmp, H, vtmp, vzr);
2342 
2343   // pgtmp = 00 00 00 00 00 01 01 01
2344   sve_whilelt(pgtmp, H, zr, rscratch1);
2345   // Compressed low:  dst  = 00 00 00 00 00 dd bb aa
2346   // Compressed high: vtmp = 00 00 00 00 00 00 hh ee
2347   // Combine the compressed low with the compressed high:
2348   //                  dst  = 00 00 00 hh ee dd bb aa
2349   sve_splice(dst, H, pgtmp, vtmp);
2350 }
2351 
// Compresses the BYTE elements of "src" that are selected by "mask" into the
// low end of "dst"; the remaining high elements of "dst" are zero (see the
// example below).
//
// The work is done on SHORT elements: each half of "src" is widened to SHORT,
// packed by sve_compress_short and narrowed back to BYTE. The two packed halves
// are then joined with a splice.
//
// vtmp1, vtmp2, vtmp3, ptmp and pgtmp are temporaries; pgtmp has to be a
// governing predicate register. vector_length_in_bytes is compared against
// MaxVectorSize to decide whether the high half has to be processed at all.
//
// Clobbers: rscratch1, rscratch2
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
                                          FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
                                          PRegister ptmp, PRegister pgtmp, unsigned vector_length_in_bytes) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3);
  assert_different_registers(mask, ptmp, pgtmp);
  // high <-- low
  // Example input:   src   = q p n m l k j i h g f e d c b a, one character is 8 bits.
  //                  mask  = 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 1, one character is 1 bit.
  // Expected result: dst   = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
  // vtmp3 is dedicated to holding an all-zero vector for the whole sequence.
  FloatRegister vzr = vtmp3;
  sve_dup(vzr, B, 0);

  // Extend lowest half to type SHORT.
  // vtmp1 =  0h  0g  0f  0e  0d  0c  0b  0a
  sve_uunpklo(vtmp1, H, src);
  // ptmp  =  00  01  00  00  00  01  00  01
  sve_punpklo(ptmp, mask);
  // Pack the active elements in size of type SHORT to the right,
  // and fill the remaining elements with zero.
  // dst   =  00  00  00  00  00  0g  0c  0a
  // The widened data is twice as long as the input; the length handed to
  // sve_compress_short for the low half is capped at MaxVectorSize.
  unsigned extended_size = vector_length_in_bytes << 1;
  sve_compress_short(dst, vtmp1, ptmp, vzr, vtmp2, pgtmp, extended_size > MaxVectorSize ? MaxVectorSize : extended_size);
  // Narrow the result back to type BYTE.
  // dst   = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
  sve_uzp1(dst, B, dst, vzr);

  // Return if the vector length is no more than MaxVectorSize/2, since the
  // highest half is invalid.
  if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
    return;
  }
  // Count the active elements of lowest half.
  // rscratch2 = 3
  // This has to happen before ptmp is overwritten with the high half of the mask.
  sve_cntp(rscratch2, H, ptrue, ptmp);

  // Repeat to the highest half.
  // ptmp  =  00  01  00  00  00  00  00  01
  sve_punpkhi(ptmp, mask);
  // vtmp2 =  0q  0p  0n  0m  0l  0k  0j  0i
  sve_uunpkhi(vtmp2, H, src);
  // vtmp1 =  00  00  00  00  00  00  0p  0i
  // Only the part of the widened data beyond MaxVectorSize belongs to the high half.
  sve_compress_short(vtmp1, vtmp2, ptmp, vzr, vtmp2, pgtmp, extended_size - MaxVectorSize);
  // vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
  sve_uzp1(vtmp1, B, vtmp1, vzr);

  // Mark the lanes already occupied by the compressed low half, using the
  // element count saved in rscratch2.
  // ptmp  = 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
  sve_whilelt(ptmp, B, zr, rscratch2);
  // Compressed low:  dst   = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
  // Compressed high: vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
  // Combine the compressed low with the compressed high:
  //                  dst   = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
  sve_splice(dst, B, ptmp, vtmp1);
}
2408 
2409 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2410   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2411   SIMD_Arrangement size = isQ ? T16B : T8B;
2412   if (bt == T_BYTE) {
2413     rbit(dst, size, src);
2414   } else {
2415     neon_reverse_bytes(dst, src, bt, isQ);
2416     rbit(dst, size, dst);
2417   }
2418 }
2419 
2420 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2421   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2422   SIMD_Arrangement size = isQ ? T16B : T8B;
2423   switch (bt) {
2424     case T_BYTE:
2425       if (dst != src) {
2426         orr(dst, size, src, src);
2427       }
2428       break;
2429     case T_SHORT:
2430       rev16(dst, size, src);
2431       break;
2432     case T_INT:
2433       rev32(dst, size, src);
2434       break;
2435     case T_LONG:
2436       rev64(dst, size, src);
2437       break;
2438     default:
2439       assert(false, "unsupported");
2440       ShouldNotReachHere();
2441   }
2442 }
2443 
2444 // VectorRearrange implementation for short/int/float/long/double types with NEON
2445 // instructions. For VectorRearrange short/int/float, we use NEON tbl instruction.
2446 // But since it supports bytes table only, we need to lookup 2/4 bytes as a group.
2447 // For VectorRearrange long/double, we compare the shuffle input with iota indices,
2448 // and use bsl to implement the operation.
2449 void C2_MacroAssembler::neon_rearrange_hsd(FloatRegister dst, FloatRegister src,
2450                                            FloatRegister shuffle, FloatRegister tmp,
2451                                            BasicType bt, bool isQ) {
2452   assert_different_registers(dst, src, shuffle, tmp);
2453   SIMD_Arrangement size1 = isQ ? T16B : T8B;
2454   SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2455 
2456   // Here is an example that rearranges a NEON vector with 4 ints:
2457   // Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1]
2458   //   1. We assume the shuffle input is Vi int[2, 3, 0, 1].
2459   //   2. Multiply Vi int[2, 3, 0, 1] with constant int vector
2460   //      [0x04040404, 0x04040404, 0x04040404, 0x04040404], and get
2461   //      tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404].
2462   //   3. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100],
2463   //      and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504]
2464   //   4. Use Vm as index register, and use V1 as table register.
2465   //      Then get V2 as the result by tbl NEON instructions.
2466   switch (bt) {
2467     case T_SHORT:
2468       mov(tmp, size1, 0x02);
2469       mulv(dst, size2, shuffle, tmp);
2470       mov(tmp, size2, 0x0100);
2471       addv(dst, size1, dst, tmp);
2472       tbl(dst, size1, src, 1, dst);
2473       break;
2474     case T_INT:
2475     case T_FLOAT:
2476       mov(tmp, size1, 0x04);
2477       mulv(dst, size2, shuffle, tmp);
2478       mov(tmp, size2, 0x03020100);
2479       addv(dst, size1, dst, tmp);
2480       tbl(dst, size1, src, 1, dst);
2481       break;
2482     case T_LONG:
2483     case T_DOUBLE:
2484       {
2485         int idx = vector_iota_entry_index(T_LONG);
2486         lea(rscratch1,
2487             ExternalAddress(StubRoutines::aarch64::vector_iota_indices(idx)));
2488         ldrq(tmp, rscratch1);
2489         // Check whether the input "shuffle" is the same with iota indices.
2490         // Return "src" if true, otherwise swap the two elements of "src".
2491         cm(EQ, dst, size2, shuffle, tmp);
2492         ext(tmp, size1, src, src, 8);
2493         bsl(dst, size1, src, tmp);
2494       }
2495       break;
2496     default:
2497       assert(false, "unsupported element type");
2498       ShouldNotReachHere();
2499   }
2500 }
2501 
2502 // Extract a scalar element from an sve vector at position 'idx'.
2503 // The input elements in src are expected to be of integral type.
2504 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2505                                              int idx, FloatRegister vtmp) {
2506   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2507   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2508   if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
2509     if (bt == T_INT || bt == T_LONG) {
2510       umov(dst, src, size, idx);
2511     } else {
2512       smov(dst, src, size, idx);
2513     }
2514   } else {
2515     sve_movprfx(vtmp, src);
2516     // Although vtmp and src hold the same value after movprfx, we must use src
2517     // (not vtmp) as the second source of ext. The movprfx destination register
2518     // must not appear in any source operand of the following instruction except
2519     // as the destructive operand.
2520     sve_ext(vtmp, src, idx << size);
2521     if (bt == T_INT || bt == T_LONG) {
2522       umov(dst, vtmp, size, 0);
2523     } else {
2524       smov(dst, vtmp, size, 0);
2525     }
2526   }
2527 }
2528 
2529 // java.lang.Math::round intrinsics
2530 
// Vector rounding for float (T2S/T4S) and double (T2D) lanes using NEON.
//
// Two candidate results are computed per lane and then blended:
//   - tmp1: floor(src + 0.5), converted to integer by fcvtms;
//   - dst:  src converted to integer, rounding to nearest with ties away (fcvtas).
// The lane mask in tmp3 is an unsigned higher-or-same compare of the raw bits of
// -src against the raw bits of 2^23 (float) or 2^52 (double). bif then inserts
// tmp1 into dst for the lanes where that mask is clear.
//
// tmp1, tmp2 and tmp3 are temporaries; all five vector registers must differ.
//
// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                          FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
  assert_different_registers(tmp1, tmp2, tmp3, src, dst);
  // tmp1 gets 0.5 in every lane, rscratch1 the bit pattern of the threshold
  // constant (2^23 for float, 2^52 for double).
  switch (T) {
    case T2S:
    case T4S:
      fmovs(tmp1, T, 0.5f);
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case T2D:
      fmovd(tmp1, T, 0.5);
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
  }
  fadd(tmp1, T, tmp1, src);
  fcvtms(tmp1, T, tmp1);
  // tmp1 = floor(src + 0.5, ties to even)

  fcvtas(dst, T, src);
  // dst = round(src), ties to away

  fneg(tmp3, T, src);
  dup(tmp2, T, rscratch1);
  // Unsigned compare on the raw bit patterns: bits(-src) >= bits(threshold).
  cm(HS, tmp3, T, tmp3, tmp2);
  // tmp3 is now a set of flags

  // Lanes whose flag is clear take the floor(src + 0.5) result from tmp1.
  bif(dst, T16B, tmp1, tmp3);
  // result in dst
}
2563 
// Vector rounding for float (S) and double (D) lanes using SVE.
//
// dst is first set to src rounded to nearest with ties away (sve_frinta).
// pgtmp then selects the lanes for which the raw bits of 2^23 (S) or 2^52 (D)
// are unsigned higher-or-same than the raw bits of -src; only for those lanes is
// dst recomputed as floor(src + 0.5). Finally all lanes of dst are converted to
// integers.
//
// tmp1 and tmp2 are temporaries; pgtmp has to be a governing predicate register.
//
// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                         FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(tmp1, tmp2, src, dst);

  // rscratch1 = bit pattern of the threshold constant for the element size.
  switch (T) {
    case S:
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case D:
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == S || T == D, "invalid register variant");
  }

  sve_frinta(dst, T, ptrue, src);
  // dst = round(src), ties to away

  Label none;

  sve_fneg(tmp1, T, ptrue, src);
  sve_dup(tmp2, T, rscratch1);
  // pgtmp = lanes where bits(threshold) >= bits(-src), compared as unsigned.
  sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
  // Skip the fix-up below when the compare selected nothing.
  // NOTE(review): relies on the compare leaving EQ set when no lane is active - confirm.
  br(EQ, none);
  {
    sve_cpy(tmp1, T, pgtmp, 0.5);
    sve_fadd(tmp1, T, pgtmp, src);
    sve_frintm(dst, T, pgtmp, tmp1);
    // dst = floor(src + 0.5, ties to even)
  }
  bind(none);

  sve_fcvtzs(dst, T, ptrue, dst, T);
  // result in dst
}
2601 
// Vector signum for float (T2S/T4S) and double (T2D) lanes using NEON.
//
// A per-lane bit mask is built in dst and used by bsl to merge "one" and "src":
// where a mask bit is set the bit comes from "one", otherwise from "src". The
// mask never covers the top bit of a lane, so that bit is always taken from src.
//
// NOTE(review): "zero" and "one" are presumably preloaded by the caller with 0.0
// and 1.0 in every lane - this is not visible here.
void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                           FloatRegister one, SIMD_Arrangement T) {
  assert_different_registers(dst, src, zero, one);
  assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");

  // Absolute compare |src| > |zero|, producing a per-lane mask in dst.
  facgt(dst, T, src, zero);
  ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
  bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
}
2611 
// Vector signum for float (S) and double (D) lanes using SVE.
//
// vtmp is built from a copy of src by keeping only the sign bit and OR-ing in the
// bit pattern of 1.0, i.e. it holds +1.0 or -1.0 with the sign of src. pgtmp
// selects the lanes that take this value; all other lanes keep the original src.
//
// vtmp is a temporary and pgtmp has to be a governing predicate register. The
// "one" register takes part only in the register-distinctness assert.
void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                          FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
    assert_different_registers(dst, src, zero, one, vtmp);
    assert(pgtmp->is_governing(), "This register has to be a governing predicate register");

    // vtmp = src (register copy expressed as "src | src").
    sve_orr(vtmp, src, src);
    sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise
    switch (T) {
    case S:
      sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
      sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
                                        // on the sign of the float value
      break;
    case D:
      // Same as above for double lanes: keep the sign bit, then OR in the bits of 1.0.
      sve_and(vtmp, T, min_jlong);
      sve_orr(vtmp, T, jlong_cast(1.0));
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
    }
    sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
                                       // Result in dst
}
2636 
2637 bool C2_MacroAssembler::in_scratch_emit_size() {
2638   if (ciEnv::current()->task() != nullptr) {
2639     PhaseOutput* phase_output = Compile::current()->output();
2640     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2641       return true;
2642     }
2643   }
2644   return MacroAssembler::in_scratch_emit_size();
2645 }
2646 
// Runtime target called from the code emitted by verify_int_in_range when the
// check fails. Reports the node index, the offending value and the expected
// bounds through fatal().
static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
  fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
}
2650 
2651 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register rval, Register rtmp) {
2652   assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
2653   if (t == TypeInt::INT) {
2654     return;
2655   }
2656 
2657   BLOCK_COMMENT("verify_int_in_range {");
2658   Label L_success, L_failure;
2659 
2660   jint lo = t->_lo;
2661   jint hi = t->_hi;
2662 
2663   if (lo != min_jint) {
2664     subsw(rtmp, rval, lo);
2665     br(Assembler::LT, L_failure);
2666   }
2667   if (hi != max_jint) {
2668     subsw(rtmp, rval, hi);
2669     br(Assembler::GT, L_failure);
2670   }
2671   b(L_success);
2672 
2673   bind(L_failure);
2674   movw(c_rarg0, idx);
2675   mov(c_rarg1, rval);
2676   movw(c_rarg2, lo);
2677   movw(c_rarg3, hi);
2678   reconstruct_frame_pointer(rtmp);
2679   rt_call(CAST_FROM_FN_PTR(address, abort_verify_int_in_range), rtmp);
2680   hlt(0);
2681 
2682   bind(L_success);
2683   BLOCK_COMMENT("} verify_int_in_range");
2684 }
2685 
// Runtime target called from the code emitted by verify_long_in_range when the
// check fails. Reports the node index, the offending value and the expected
// bounds through fatal().
static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
  fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
}
2689 
2690 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register rval, Register rtmp) {
2691   assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
2692   if (t == TypeLong::LONG) {
2693     return;
2694   }
2695 
2696   BLOCK_COMMENT("verify_long_in_range {");
2697   Label L_success, L_failure;
2698 
2699   jlong lo = t->_lo;
2700   jlong hi = t->_hi;
2701 
2702   if (lo != min_jlong) {
2703     subs(rtmp, rval, lo);
2704     br(Assembler::LT, L_failure);
2705   }
2706   if (hi != max_jlong) {
2707     subs(rtmp, rval, hi);
2708     br(Assembler::GT, L_failure);
2709   }
2710   b(L_success);
2711 
2712   bind(L_failure);
2713   movw(c_rarg0, idx);
2714   mov(c_rarg1, rval);
2715   mov(c_rarg2, lo);
2716   mov(c_rarg3, hi);
2717   reconstruct_frame_pointer(rtmp);
2718   rt_call(CAST_FROM_FN_PTR(address, abort_verify_long_in_range), rtmp);
2719   hlt(0);
2720 
2721   bind(L_success);
2722   BLOCK_COMMENT("} verify_long_in_range");
2723 }
2724 
2725 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
2726   const int framesize = Compile::current()->output()->frame_size_in_bytes();
2727   if (PreserveFramePointer) {
2728     // frame pointer is valid
2729 #ifdef ASSERT
2730     // Verify frame pointer value in rfp.
2731     add(rtmp, sp, framesize - 2 * wordSize);
2732     Label L_success;
2733     cmp(rfp, rtmp);
2734     br(Assembler::EQ, L_success);
2735     stop("frame pointer mismatch");
2736     bind(L_success);
2737 #endif // ASSERT
2738   } else {
2739     add(rfp, sp, framesize - 2 * wordSize);
2740   }
2741 }
2742 
2743 // Selects elements from two source vectors (src1, src2) based on index values in the index register
2744 // using Neon instructions and places it in the destination vector element corresponding to the
2745 // index vector element. Each index in the index register must be in the range - [0, 2 * NUM_ELEM),
2746 // where NUM_ELEM is the number of BasicType elements per vector.
2747 // If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
2748 // Otherwise, selects src2[idx – NUM_ELEM]
2749 void C2_MacroAssembler::select_from_two_vectors_neon(FloatRegister dst, FloatRegister src1,
2750                                                      FloatRegister src2, FloatRegister index,
2751                                                      FloatRegister tmp, unsigned vector_length_in_bytes) {
2752   assert_different_registers(dst, src1, src2, tmp);
2753   SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
2754 
2755   if (vector_length_in_bytes == 16) {
2756     assert(UseSVE <= 1, "sve must be <= 1");
2757     assert(src1->successor() == src2, "Source registers must be ordered");
2758     // If the vector length is 16B, then use the Neon "tbl" instruction with two vector table
2759     tbl(dst, size, src1, 2, index);
2760   } else { // vector length == 8
2761     assert(UseSVE == 0, "must be Neon only");
2762     // We need to fit both the source vectors (src1, src2) in a 128-bit register because the
2763     // Neon "tbl" instruction supports only looking up 16B vectors. We then use the Neon "tbl"
2764     // instruction with one vector lookup
2765     ins(tmp, D, src1, 0, 0);
2766     ins(tmp, D, src2, 1, 0);
2767     tbl(dst, size, tmp, 1, index);
2768   }
2769 }
2770 
2771 // Selects elements from two source vectors (src1, src2) based on index values in the index register
2772 // using SVE/SVE2 instructions and places it in the destination vector element corresponding to the
2773 // index vector element. Each index in the index register must be in the range - [0, 2 * NUM_ELEM),
2774 // where NUM_ELEM is the number of BasicType elements per vector.
2775 // If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
2776 // Otherwise, selects src2[idx – NUM_ELEM]
2777 void C2_MacroAssembler::select_from_two_vectors_sve(FloatRegister dst, FloatRegister src1,
2778                                                     FloatRegister src2, FloatRegister index,
2779                                                     FloatRegister tmp, SIMD_RegVariant T,
2780                                                     unsigned vector_length_in_bytes) {
2781   assert_different_registers(dst, src1, src2, index, tmp);
2782 
2783   if (vector_length_in_bytes == 8) {
2784     // We need to fit both the source vectors (src1, src2) in a single vector register because the
2785     // SVE "tbl" instruction is unpredicated and works on the entire vector which can lead to
2786     // incorrect results if each source vector is only partially filled. We then use the SVE "tbl"
2787     // instruction with one vector lookup
2788     assert(UseSVE >= 1, "sve must be >= 1");
2789     ins(tmp, D, src1, 0, 0);
2790     ins(tmp, D, src2, 1, 0);
2791     sve_tbl(dst, T, tmp, index);
2792   } else {  // UseSVE == 2 and vector_length_in_bytes > 8
2793     // If the vector length is > 8, then use the SVE2 "tbl" instruction with the two vector table.
2794     // The assertion - vector_length_in_bytes == MaxVectorSize ensures that this operation
2795     // is not executed on machines where vector_length_in_bytes < MaxVectorSize
2796     // with the only exception of 8B vector length.
2797     assert(UseSVE == 2 && vector_length_in_bytes == MaxVectorSize, "must be");
2798     assert(src1->successor() == src2, "Source registers must be ordered");
2799     sve_tbl(dst, T, src1, src2, index);
2800   }
2801 }
2802 
2803 void C2_MacroAssembler::select_from_two_vectors(FloatRegister dst, FloatRegister src1,
2804                                                 FloatRegister src2, FloatRegister index,
2805                                                 FloatRegister tmp, BasicType bt,
2806                                                 unsigned vector_length_in_bytes) {
2807 
2808   assert_different_registers(dst, src1, src2, index, tmp);
2809 
2810   // The cases that can reach this method are -
2811   // - UseSVE = 0/1, vector_length_in_bytes = 8 or 16, excluding double and long types
2812   // - UseSVE = 2, vector_length_in_bytes >= 8, for all types
2813   //
2814   // SVE/SVE2 tbl instructions are generated when UseSVE = 1 with vector_length_in_bytes = 8
2815   // and UseSVE = 2 with vector_length_in_bytes >= 8
2816   //
2817   // Neon instructions are generated when UseSVE = 0 with vector_length_in_bytes = 8 or 16 and
2818   // UseSVE = 1 with vector_length_in_bytes = 16
2819 
2820   if ((UseSVE == 1 && vector_length_in_bytes == 8) || UseSVE == 2) {
2821     SIMD_RegVariant T = elemType_to_regVariant(bt);
2822     select_from_two_vectors_sve(dst, src1, src2, index, tmp, T, vector_length_in_bytes);
2823     return;
2824   }
2825 
2826   // The only BasicTypes that can reach here are T_SHORT, T_BYTE, T_INT and T_FLOAT
2827   assert(bt != T_DOUBLE && bt != T_LONG, "unsupported basic type");
2828   assert(vector_length_in_bytes <= 16, "length_in_bytes must be <= 16");
2829 
2830   bool isQ = vector_length_in_bytes == 16;
2831 
2832   SIMD_Arrangement size1 = isQ ? T16B : T8B;
2833   SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2834 
2835   // Neon "tbl" instruction only supports byte tables, so we need to look at chunks of
2836   // 2B for selecting shorts or chunks of 4B for selecting ints/floats from the table.
2837   // The index values in "index" register are in the range of [0, 2 * NUM_ELEM) where NUM_ELEM
2838   // is the number of elements that can fit in a vector. For ex. for T_SHORT with 64-bit vector length,
2839   // the indices can range from [0, 8).
2840   // As an example with 64-bit vector length and T_SHORT type - let index = [2, 5, 1, 0]
2841   // Move a constant 0x02 in every byte of tmp - tmp = [0x0202, 0x0202, 0x0202, 0x0202]
2842   // Multiply index vector with tmp to yield - dst = [0x0404, 0x0a0a, 0x0202, 0x0000]
2843   // Move a constant 0x0100 in every 2B of tmp - tmp = [0x0100, 0x0100, 0x0100, 0x0100]
2844   // Add the multiplied result to the vector in tmp to obtain the byte level
2845   // offsets - dst = [0x0504, 0x0b0a, 0x0302, 0x0100]
2846   // Use these offsets in the "tbl" instruction to select chunks of 2B.
2847 
2848   if (bt == T_BYTE) {
2849     select_from_two_vectors_neon(dst, src1, src2, index, tmp, vector_length_in_bytes);
2850   } else {
2851     int elem_size = (bt == T_SHORT) ? 2 : 4;
2852     uint64_t tbl_offset = (bt == T_SHORT) ? 0x0100u : 0x03020100u;
2853 
2854     mov(tmp, size1, elem_size);
2855     mulv(dst, size2, index, tmp);
2856     mov(tmp, size2, tbl_offset);
2857     addv(dst, size1, dst, tmp); // "dst" now contains the processed index elements
2858                                 // to select a set of 2B/4B
2859     select_from_two_vectors_neon(dst, src1, src2, dst, tmp, vector_length_in_bytes);
2860   }
2861 }
2862 
2863 // Vector expand implementation. Elements from the src vector are expanded into
2864 // the dst vector under the control of the vector mask.
// Since there are no native instructions directly corresponding to expand before
// SVE2p2, the following implementations mainly leverage the TBL instruction to
// implement expand. To compute the index input for TBL, the prefix sum algorithm
2868 // (https://en.wikipedia.org/wiki/Prefix_sum) is used. The same algorithm is used
2869 // for NEON and SVE, but with different instructions where appropriate.
2870 
// Vector expand implementation for NEON.
//
// The lanes of "mask" are expected to be 0 or -1 (see the example). An inclusive
// prefix sum over the negated mask gives, for every selected byte, its 1-based
// position among the selected bytes; subtracting 1 turns that into a TBL byte
// index into "src", while unselected bytes end up with index -1 and produce 0.
//
// tmp1 and tmp2 are temporaries; all five vector registers must differ.
// "bt" is not read by this implementation because the indices are always
// computed per byte.
//
// An example of 128-bit Byte vector:
//   Data direction: high <== low
//   Input:
//         src   = g  f  e  d  c  b  a  9  8  7  6  5  4  3  2  1
//         mask  = 0  0 -1 -1  0  0 -1 -1  0  0 -1 -1  0  0 -1 -1
//   Expected result:
//         dst   = 0  0  8  7  0  0  6  5  0  0  4  3  0  0  2  1
void C2_MacroAssembler::vector_expand_neon(FloatRegister dst, FloatRegister src, FloatRegister mask,
                                           FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
                                           int vector_length_in_bytes) {
  assert(vector_length_in_bytes <= 16, "the vector length in bytes for NEON must be <= 16");
  assert_different_registers(dst, src, mask, tmp1, tmp2);
  // Since the TBL instruction only supports byte table, we need to
  // compute indices in byte type for all types.
  SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
  // tmp1 =  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  dup(tmp1, size, zr);
  // dst  =  0  0  1  1  0  0  1  1  0  0  1  1  0  0  1  1
  negr(dst, size, mask);
  // Calculate vector index for TBL with prefix sum algorithm.
  // Each round adds a copy of dst shifted up by i bytes (zeros from tmp1 are
  // shifted in), with i doubling every round.
  // dst  =  8  8  8  7  6  6  6  5  4  4  4  3  2  2  2  1
  for (int i = 1; i < vector_length_in_bytes; i <<= 1) {
    ext(tmp2, size, tmp1, dst, vector_length_in_bytes - i);
    addv(dst, size, tmp2, dst);
  }
  // tmp2 =  0  0 -1 -1  0  0 -1 -1  0  0 -1 -1  0  0 -1 -1
  orr(tmp2, size, mask, mask);
  // Keep the prefix sum only in the selected lanes, zero elsewhere.
  // tmp2 =  0  0  8  7  0  0  6  5  0  0  4  3  0  0  2  1
  bsl(tmp2, size, dst, tmp1);
  // tmp1 =  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  movi(tmp1, size, 1);
  // dst  = -1 -1  7  6 -1 -1  5  4 -1 -1  3  2 -1 -1  1  0
  subv(dst, size, tmp2, tmp1);
  // dst  =  0  0  8  7  0  0  6  5  0  0  4  3  0  0  2  1
  tbl(dst, size, src, 1, dst);
}
2909 
// Vector expand implementation for SVE.
//
// Same prefix sum approach as the NEON version, but working on elements of type
// "bt" and with the mask given as the predicate "pg". tmp1 and tmp2 are
// temporaries; dst, src, tmp1 and tmp2 must be different registers.
//
// An example of 128-bit Short vector:
//   Data direction: high <== low
//   Input:
//         src   = gf ed cb a9 87 65 43 21
//         pg    = 00 01 00 01 00 01 00 01
//   Expected result:
//         dst   = 00 87 00 65 00 43 00 21
void C2_MacroAssembler::vector_expand_sve(FloatRegister dst, FloatRegister src, PRegister pg,
                                          FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
                                          int vector_length_in_bytes) {
  assert(UseSVE > 0, "expand implementation only for SVE");
  assert_different_registers(dst, src, tmp1, tmp2);
  SIMD_RegVariant size = elemType_to_regVariant(bt);

  // tmp1 = 00 00 00 00 00 00 00 00
  sve_dup(tmp1, size, 0);
  // Start tmp2 from zero so that the merging cpy below leaves 0 in inactive lanes.
  sve_movprfx(tmp2, tmp1);
  // tmp2 = 00 01 00 01 00 01 00 01
  sve_cpy(tmp2, size, pg, 1, true);
  // Calculate vector index for TBL with prefix sum algorithm.
  // The step i starts at one element and doubles every round.
  // tmp2 = 04 04 03 03 02 02 01 01
  for (int i = type2aelembytes(bt); i < vector_length_in_bytes; i <<= 1) {
    sve_movprfx(dst, tmp1);
    // The EXT instruction operates on the full-width sve register. The correct
    // index calculation method is:
    // vector_length_in_bytes - i + MaxVectorSize - vector_length_in_bytes =>
    // MaxVectorSize - i.
    sve_ext(dst, tmp2, MaxVectorSize - i);
    sve_add(tmp2, size, dst, tmp2);
  }
  // Keep the prefix sum only in the lanes selected by pg, zero elsewhere.
  // dst  = 00 04 00 03 00 02 00 01
  sve_sel(dst, size, pg, tmp2, tmp1);
  // dst  = -1 03 -1 02 -1 01 -1 00
  sve_sub(dst, size, 1);
  // dst  = 00 87 00 65 00 43 00 21
  sve_tbl(dst, size, src, dst);
}
2949 
2950 // Optimized SVE cpy (imm, zeroing) instruction.
2951 //
2952 // `movi; cpy(imm, merging)` and `cpy(imm, zeroing)` have the same
2953 // functionality, but test results show that `movi; cpy(imm, merging)` has
2954 // higher throughput on some microarchitectures. This would depend on
2955 // microarchitecture and so may vary between implementations.
2956 void C2_MacroAssembler::sve_cpy(FloatRegister dst, SIMD_RegVariant T,
2957                                 PRegister pg, int imm8, bool isMerge) {
2958   if (VM_Version::prefer_sve_merging_mode_cpy() && !isMerge) {
2959     // Generates a NEON instruction `movi V<dst>.2d, #0`.
2960     // On AArch64, Z and V registers alias in the low 128 bits, so V<dst> is
2961     // the low 128 bits of Z<dst>. A write to V<dst> also clears all bits of
2962     // Z<dst> above 128, so this `movi` instruction effectively zeroes the
2963     // entire Z<dst> register. According to the Arm Software Optimization
2964     // Guide, `movi` is zero latency.
2965     movi(dst, T2D, 0);
2966     isMerge = true;
2967   }
2968   Assembler::sve_cpy(dst, T, pg, imm8, isMerge);
2969 }
2970 
// Maps an element type to the index of its entry in the vector iota entries
// array: B = 0, S = 1, I = 2, L = 3, F = 4, D = 5. Any other type is a
// programming error and ends in ShouldNotReachHere().
int C2_MacroAssembler::vector_iota_entry_index(BasicType bt) {
  // The vector iota entries array is ordered by type B/S/I/L/F/D, and
  // the offset between two types is 16.
  switch(bt) {
  case T_BYTE:
    return 0;
  case T_SHORT:
    return 1;
  case T_INT:
    return 2;
  case T_LONG:
    return 3;
  case T_FLOAT:
    return 4;
  case T_DOUBLE:
    return 5;
  default:
    ShouldNotReachHere();
  }
}