1 /*
   2  * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "asm/assembler.hpp"
  26 #include "asm/assembler.inline.hpp"
  27 #include "opto/c2_MacroAssembler.hpp"
  28 #include "opto/compile.hpp"
  29 #include "opto/intrinsicnode.hpp"
  30 #include "opto/matcher.hpp"
  31 #include "opto/output.hpp"
  32 #include "opto/subnode.hpp"
  33 #include "runtime/objectMonitorTable.hpp"
  34 #include "runtime/stubRoutines.hpp"
  35 #include "runtime/synchronizer.hpp"
  36 #include "utilities/globalDefinitions.hpp"
  37 #include "utilities/powerOfTwo.hpp"
  38 
  39 #ifdef PRODUCT
  40 #define BLOCK_COMMENT(str) /* nothing */
  41 #define STOP(error) stop(error)
  42 #else
  43 #define BLOCK_COMMENT(str) block_comment(str)
  44 #define STOP(error) block_comment(error); stop(error)
  45 #endif
  46 
  47 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  48 
  49 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
  50 
  51 void C2_MacroAssembler::entry_barrier() {
  52   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  53   // Dummy labels for just measuring the code size
  54   Label dummy_slow_path;
  55   Label dummy_continuation;
  56   Label dummy_guard;
  57   Label* slow_path = &dummy_slow_path;
  58   Label* continuation = &dummy_continuation;
  59   Label* guard = &dummy_guard;
  60   if (!Compile::current()->output()->in_scratch_emit_size()) {
  61     // Use real labels from actual stub when not emitting code for the purpose of measuring its size
  62     C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
  63     Compile::current()->output()->add_stub(stub);
  64     slow_path = &stub->entry();
  65     continuation = &stub->continuation();
  66     guard = &stub->guard();
  67   }
  68   // In the C2 code, we move the non-hot part of nmethod entry barriers out-of-line to a stub.
  69   bs->nmethod_entry_barrier(this, slow_path, continuation, guard);
  70 }
  71 
  72 // jdk.internal.util.ArraysSupport.vectorizedHashCode
  73 address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
  74                                            FloatRegister vdata0, FloatRegister vdata1,
  75                                            FloatRegister vdata2, FloatRegister vdata3,
  76                                            FloatRegister vmul0, FloatRegister vmul1,
  77                                            FloatRegister vmul2, FloatRegister vmul3,
  78                                            FloatRegister vpow, FloatRegister vpowm,
  79                                            BasicType eltype) {
  80   ARRAYS_HASHCODE_REGISTERS;
  81 
  82   Register tmp1 = rscratch1, tmp2 = rscratch2;
  83 
  84   Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE;
  85 
  86   // Vectorization factor. Number of array elements loaded to one SIMD&FP registers by the stubs. We
  87   // use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible to
  88   // use 4H for chars and shorts instead, but using 8H gives better performance.
  89   const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8
  90                     : eltype == T_CHAR || eltype == T_SHORT ? 8
  91                     : eltype == T_INT                       ? 4
  92                                                             : 0;
  93   guarantee(vf, "unsupported eltype");
  94 
  95   // Unroll factor for the scalar loop below. The value is chosen based on performance analysis.
  96   const size_t unroll_factor = 4;
  97 
  98   switch (eltype) {
  99   case T_BOOLEAN:
 100     BLOCK_COMMENT("arrays_hashcode(unsigned byte) {");
 101     break;
 102   case T_CHAR:
 103     BLOCK_COMMENT("arrays_hashcode(char) {");
 104     break;
 105   case T_BYTE:
 106     BLOCK_COMMENT("arrays_hashcode(byte) {");
 107     break;
 108   case T_SHORT:
 109     BLOCK_COMMENT("arrays_hashcode(short) {");
 110     break;
 111   case T_INT:
 112     BLOCK_COMMENT("arrays_hashcode(int) {");
 113     break;
 114   default:
 115     ShouldNotReachHere();
 116   }
 117 
 118   // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop
 119   // implemented by the stub executes just once. Call the stub only if at least two iterations will
 120   // be executed.
 121   const size_t large_threshold = eltype == T_INT ? vf * 2 : vf;
 122   cmpw(cnt, large_threshold);
 123   br(Assembler::HS, LARGE);
 124 
 125   bind(TAIL);
 126 
 127   // The andr performs cnt % uf where uf = unroll_factor. The subtract shifted by 3 offsets past
 128   // uf - (cnt % uf) pairs of load + madd insns i.e. it only executes cnt % uf load + madd pairs.
 129   // Iteration eats up the remainder, uf elements at a time.
 130   assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC");
 131   andr(tmp2, cnt, unroll_factor - 1);
 132   adr(tmp1, BR_BASE);
 133   // For Cortex-A53 offset is 4 because 2 nops are generated.
 134   sub(tmp1, tmp1, tmp2, ext::sxtw, VM_Version::supports_a53mac() ? 4 : 3);
 135   movw(tmp2, 0x1f);
 136   br(tmp1);
 137 
 138   bind(LOOP);
 139   for (size_t i = 0; i < unroll_factor; ++i) {
 140     load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype);
 141     maddw(result, result, tmp2, tmp1);
 142     // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
 143     // Generate 2nd nop to have 4 instructions per iteration.
 144     if (VM_Version::supports_a53mac()) {
 145       nop();
 146     }
 147   }
 148   bind(BR_BASE);
 149   subsw(cnt, cnt, unroll_factor);
 150   br(Assembler::HS, LOOP);
 151 
 152   b(DONE);
 153 
 154   bind(LARGE);
 155 
 156   RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype));
 157   assert(stub.target() != nullptr, "array_hashcode stub has not been generated");
 158   address tpc = trampoline_call(stub);
 159   if (tpc == nullptr) {
 160     DEBUG_ONLY(reset_labels(TAIL, BR_BASE));
 161     postcond(pc() == badAddress);
 162     return nullptr;
 163   }
 164 
 165   bind(DONE);
 166 
 167   BLOCK_COMMENT("} // arrays_hashcode");
 168 
 169   postcond(pc() != badAddress);
 170   return pc();
 171 }
 172 
// Implements the fast path of monitorenter for lightweight locking.
//
//   obj - the object being locked
//   box - the on-stack BasicLock; with UseObjectMonitorTable it is used to
//         cache the resolved ObjectMonitor* for the matching fast_unlock
//   t1, t2, t3 - temporaries; rscratch2 is clobbered as well
//
// On exit the condition flags encode the outcome: EQ means the lock was
// acquired, NE means the runtime slow path must complete the operation.
void C2_MacroAssembler::fast_lock(Register obj, Register box, Register t1,
                                  Register t2, Register t3) {
  assert_different_registers(obj, box, t1, t2, t3, rscratch2);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. MUST branch to with flag == EQ
  Label locked;
  // Finish fast lock unsuccessfully. MUST branch to with flag == NE
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Locking on a value-based class is diagnosed by the runtime: bail to
    // the slow path whenever the klass carries the value-based flag.
    load_klass(t1, obj);
    ldrb(t1, Address(t1, Klass::misc_flags_offset()));
    tst(t1, KlassFlags::_misc_is_value_based_class);
    br(Assembler::NE, slow_path);
  }

  const Register t1_mark = t1;
  const Register t3_t = t3;

  { // Fast locking

    // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ
    Label push;

    const Register t2_top = t2;

    // Check if lock-stack is full.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
    br(Assembler::GT, slow_path);

    // Check if recursive: the entry just below the top already holds obj.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, push);

    // Relaxed normal load to check for monitor. Optimization for monitor case.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);

    // Not inflated
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");

    // Try to lock. Transition lock-bits 0b01 => 0b00
    // expected = mark with unlocked bit set, new = mark with unlocked bit cleared.
    orr(t1_mark, t1_mark, markWord::unlocked_value);
    eor(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
    br(Assembler::NE, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    str(obj, Address(rthread, t2_top));
    addw(t2_top, t2_top, oopSize);
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register t1_monitor = t1;

    if (!UseObjectMonitorTable) {
      // Without the table the (tagged) mark word is the monitor pointer.
      assert(t1_monitor == t1_mark, "should be the same here");
    } else {
      const Register t1_hash = t1;
      Label monitor_found;

      // Save the mark, we might need it to extract the hash.
      mov(t3, t1_mark);

      // Look for the monitor in the om_cache (unrolled linear scan of the
      // per-thread cache of (oop, monitor) pairs).

      ByteSize cache_offset   = JavaThread::om_cache_oops_offset();
      ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
      const int num_unrolled  = OMCache::CAPACITY;
      for (int i = 0; i < num_unrolled; i++) {
        ldr(t1_monitor, Address(rthread, cache_offset + monitor_offset));
        ldr(t2, Address(rthread, cache_offset));
        cmp(obj, t2);
        br(Assembler::EQ, monitor_found);
        cache_offset = cache_offset + OMCache::oop_to_oop_difference();
      }

      // Cache miss: look for the monitor in the table.

      // Get the hash code.
      ubfx(t1_hash, t3, markWord::hash_shift, markWord::hash_bits);

      // Get the table and calculate the bucket's address
      // (bucket index = hash & capacity_mask).
      lea(t3, ExternalAddress(ObjectMonitorTable::current_table_address()));
      ldr(t3, Address(t3));
      ldr(t2, Address(t3, ObjectMonitorTable::table_capacity_mask_offset()));
      ands(t1_hash, t1_hash, t2);
      ldr(t3, Address(t3, ObjectMonitorTable::table_buckets_offset()));

      // Read the monitor from the bucket.
      ldr(t1_monitor, Address(t3, t1_hash, Address::lsl(LogBytesPerWord)));

      // Check if the monitor in the bucket is special (empty, tombstone or removed).
      cmp(t1_monitor, (unsigned char)ObjectMonitorTable::SpecialPointerValues::below_is_special);
      br(Assembler::LO, slow_path);

      // Check if object matches (resolve the monitor's weak handle and
      // compare against obj; mismatch means a different object hashed here).
      ldr(t3, Address(t1_monitor, ObjectMonitor::object_offset()));
      BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
      bs_asm->try_resolve_weak_handle_in_c2(this, t3, t2, slow_path);
      cmp(t3, obj);
      br(Assembler::NE, slow_path);

      bind(monitor_found);
    }

    const Register t2_owner_addr = t2;
    const Register t3_owner = t3;
    // Without the table t1_monitor still carries the markWord monitor tag;
    // fold the untagging into the field offsets.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
    const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);

    Label monitor_locked;

    // Compute owner address.
    lea(t2_owner_addr, owner_address);

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
    cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, /*acquire*/ true,
            /*release*/ false, /*weak*/ false, t3_owner);
    br(Assembler::EQ, monitor_locked);

    // Check if recursive (owner is already this thread's _monitor_owner_id).
    cmp(t3_owner, rscratch2);
    br(Assembler::NE, slow_path);

    // Recursive.
    increment(recursions_address, 1);

    bind(monitor_locked);
    if (UseObjectMonitorTable) {
      // Cache the monitor in the box so fast_unlock can skip the lookup.
      str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
    }
  }

  bind(locked);

#ifdef ASSERT
  // Check that locked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Lock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Lock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}
 343 
// Implements the fast path of monitorexit for lightweight locking.
//
//   obj - the object being unlocked
//   box - the on-stack BasicLock; with UseObjectMonitorTable it holds the
//         ObjectMonitor* cached by fast_lock (null/invalid => slow path)
//   t1, t2, t3 - temporaries; rscratch1 is clobbered as well
//
// On exit the condition flags encode the outcome: EQ means the unlock
// completed, NE means the runtime slow path must finish the operation.
void C2_MacroAssembler::fast_unlock(Register obj, Register box, Register t1,
                                    Register t2, Register t3) {
  assert_different_registers(obj, box, t1, t2, t3);

  // Handle inflated monitor.
  Label inflated, inflated_load_mark;
  // Finish fast unlock successfully. MUST branch to with flag == EQ
  Label unlocked;
  // Finish fast unlock unsuccessfully. MUST branch to with flag == NE
  Label slow_path;

  const Register t1_mark = t1;
  const Register t2_top = t2;
  const Register t3_t = t3;

  { // Fast unlock

    Label push_and_slow_path;

    // Check if obj is top of lock-stack.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    subw(t2_top, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    // Top of lock stack was not obj. Must be monitor.
    br(Assembler::NE, inflated_load_mark);

    // Pop lock-stack.
    DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));

    // Check if recursive: the next entry down also holds obj.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, unlocked);

    // Not recursive.
    // Load Mark.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Check header for monitor (0b10).
    // Because we got here by popping (meaning we pushed in locked)
    // there will be no monitor in the box. So we need to push back the obj
    // so that the runtime can fix any potential anonymous owner.
    tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);

    // Try to unlock. Transition lock bits 0b00 => 0b01
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
    orr(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
    br(Assembler::EQ, unlocked);

    bind(push_and_slow_path);
    // Compare and exchange failed.
    // Restore lock-stack and handle the unlock in runtime.
    DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
    addw(t2_top, t2_top, oopSize);
    str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(slow_path);
  }


  { // Handle inflated monitor.
    bind(inflated_load_mark);
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
#ifdef ASSERT
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

#ifdef ASSERT
    // Debug-only loop (it branches back to 'inflated'): scan the lock-stack
    // downwards; obj must not appear in any remaining entry.
    Label check_done;
    subw(t2_top, t2_top, oopSize);
    cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
    br(Assembler::LT, check_done);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    br(Assembler::NE, inflated);
    stop("Fast Unlock lock on stack");
    bind(check_done);
#endif

    const Register t1_monitor = t1;

    if (!UseObjectMonitorTable) {
      assert(t1_monitor == t1_mark, "should be the same here");

      // Untag the monitor.
      add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
    } else {
      // Load the monitor cached by fast_lock.
      ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
      cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
      br(Assembler::LO, slow_path);
    }

    const Register t2_recursions = t2;
    Label not_recursive;

    // Check if recursive.
    ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
    cbz(t2_recursions, not_recursive);

    // Recursive unlock.
    sub(t2_recursions, t2_recursions, 1u);
    str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
    // Set flag == EQ
    cmp(t2_recursions, t2_recursions);
    b(unlocked);

    bind(not_recursive);

    const Register t2_owner_addr = t2;

    // Compute owner address.
    lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));

    // Set owner to null.
    // Release to satisfy the JMM
    stlr(zr, t2_owner_addr);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry_list is empty.
    ldr(rscratch1, Address(t1_monitor, ObjectMonitor::entry_list_offset()));
    cmp(rscratch1, zr);
    br(Assembler::EQ, unlocked);  // If so we are done.

    // Check if there is a successor.
    ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
    cmp(rscratch1, zr);
    br(Assembler::NE, unlocked);  // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));

    cmp(zr, rthread); // Set Flag to NE => slow path
    b(slow_path);
  }

  bind(unlocked);
  cmp(zr, zr); // Set Flags to EQ => fast path

#ifdef ASSERT
  // Check that unlocked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Unlock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Unlock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}
 509 
 510 // Search for str1 in str2 and return index or -1
 511 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
 512 void C2_MacroAssembler::string_indexof(Register str2, Register str1,
 513                                        Register cnt2, Register cnt1,
 514                                        Register tmp1, Register tmp2,
 515                                        Register tmp3, Register tmp4,
 516                                        Register tmp5, Register tmp6,
 517                                        int icnt1, Register result, int ae) {
 518   // NOTE: tmp5, tmp6 can be zr depending on specific method version
 519   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
 520 
 521   Register ch1 = rscratch1;
 522   Register ch2 = rscratch2;
 523   Register cnt1tmp = tmp1;
 524   Register cnt2tmp = tmp2;
 525   Register cnt1_neg = cnt1;
 526   Register cnt2_neg = cnt2;
 527   Register result_tmp = tmp4;
 528 
 529   bool isL = ae == StrIntrinsicNode::LL;
 530 
 531   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
 532   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
 533   int str1_chr_shift = str1_isL ? 0:1;
 534   int str2_chr_shift = str2_isL ? 0:1;
 535   int str1_chr_size = str1_isL ? 1:2;
 536   int str2_chr_size = str2_isL ? 1:2;
 537   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
 538                                       (chr_insn)&MacroAssembler::ldrh;
 539   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
 540                                       (chr_insn)&MacroAssembler::ldrh;
 541   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
 542   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
 543 
 544   // Note, inline_string_indexOf() generates checks:
 545   // if (substr.count > string.count) return -1;
 546   // if (substr.count == 0) return 0;
 547 
 548   // We have two strings, a source string in str2, cnt2 and a pattern string
 549   // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
 550 
 551   // For larger pattern and source we use a simplified Boyer Moore algorithm.
 552   // With a small pattern and source we use linear scan.
 553 
 554   if (icnt1 == -1) {
 555     sub(result_tmp, cnt2, cnt1);
 556     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
 557     br(LT, LINEARSEARCH);
 558     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
 559     subs(zr, cnt1, 256);
 560     lsr(tmp1, cnt2, 2);
 561     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
 562     br(GE, LINEARSTUB);
 563   }
 564 
 565 // The Boyer Moore alogorithm is based on the description here:-
 566 //
 567 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
 568 //
 569 // This describes and algorithm with 2 shift rules. The 'Bad Character' rule
 570 // and the 'Good Suffix' rule.
 571 //
 572 // These rules are essentially heuristics for how far we can shift the
 573 // pattern along the search string.
 574 //
 575 // The implementation here uses the 'Bad Character' rule only because of the
 576 // complexity of initialisation for the 'Good Suffix' rule.
 577 //
 578 // This is also known as the Boyer-Moore-Horspool algorithm:-
 579 //
 580 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
 581 //
 582 // This particular implementation has few java-specific optimizations.
 583 //
 584 // #define ASIZE 256
 585 //
 586 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
 587 //       int i, j;
 588 //       unsigned c;
 589 //       unsigned char bc[ASIZE];
 590 //
 591 //       /* Preprocessing */
 592 //       for (i = 0; i < ASIZE; ++i)
 593 //          bc[i] = m;
 594 //       for (i = 0; i < m - 1; ) {
 595 //          c = x[i];
 596 //          ++i;
 597 //          // c < 256 for Latin1 string, so, no need for branch
 598 //          #ifdef PATTERN_STRING_IS_LATIN1
 599 //          bc[c] = m - i;
 600 //          #else
 601 //          if (c < ASIZE) bc[c] = m - i;
 602 //          #endif
 603 //       }
 604 //
 605 //       /* Searching */
 606 //       j = 0;
 607 //       while (j <= n - m) {
 608 //          c = y[i+j];
 609 //          if (x[m-1] == c)
 610 //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
 611 //          if (i < 0) return j;
 612 //          // c < 256 for Latin1 string, so, no need for branch
 613 //          #ifdef SOURCE_STRING_IS_LATIN1
 614 //          // LL case: (c< 256) always true. Remove branch
 615 //          j += bc[y[j+m-1]];
 616 //          #endif
 617 //          #ifndef PATTERN_STRING_IS_UTF
 618 //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
 619 //          if (c < ASIZE)
 620 //            j += bc[y[j+m-1]];
 621 //          else
 622 //            j += 1
 623 //          #endif
 624 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
 625 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
 626 //          if (c < ASIZE)
 627 //            j += bc[y[j+m-1]];
 628 //          else
 629 //            j += m
 630 //          #endif
 631 //       }
 632 //    }
 633 
 634   if (icnt1 == -1) {
 635     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
 636         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
 637     Register cnt1end = tmp2;
 638     Register str2end = cnt2;
 639     Register skipch = tmp2;
 640 
 641     // str1 length is >=8, so, we can read at least 1 register for cases when
 642     // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
 643     // UL case. We'll re-read last character in inner pre-loop code to have
 644     // single outer pre-loop load
 645     const int firstStep = isL ? 7 : 3;
 646 
 647     const int ASIZE = 256;
 648     const int STORED_BYTES = 32; // amount of bytes stored per instruction
 649     sub(sp, sp, ASIZE);
 650     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
 651     mov(ch1, sp);
 652     BIND(BM_INIT_LOOP);
 653       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
 654       subs(tmp5, tmp5, 1);
 655       br(GT, BM_INIT_LOOP);
 656 
 657       sub(cnt1tmp, cnt1, 1);
 658       mov(tmp5, str2);
 659       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
 660       sub(ch2, cnt1, 1);
 661       mov(tmp3, str1);
 662     BIND(BCLOOP);
 663       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
 664       if (!str1_isL) {
 665         subs(zr, ch1, ASIZE);
 666         br(HS, BCSKIP);
 667       }
 668       strb(ch2, Address(sp, ch1));
 669     BIND(BCSKIP);
 670       subs(ch2, ch2, 1);
 671       br(GT, BCLOOP);
 672 
 673       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
 674       if (str1_isL == str2_isL) {
 675         // load last 8 bytes (8LL/4UU symbols)
 676         ldr(tmp6, Address(tmp6, -wordSize));
 677       } else {
 678         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
 679         // convert Latin1 to UTF. We'll have to wait until load completed, but
 680         // it's still faster than per-character loads+checks
 681         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
 682         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
 683         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
 684         andr(tmp6, tmp6, 0xFF); // str1[N-4]
 685         orr(ch2, ch1, ch2, LSL, 16);
 686         orr(tmp6, tmp6, tmp3, LSL, 48);
 687         orr(tmp6, tmp6, ch2, LSL, 16);
 688       }
 689     BIND(BMLOOPSTR2);
 690       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 691       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
 692       if (str1_isL == str2_isL) {
 693         // re-init tmp3. It's for free because it's executed in parallel with
 694         // load above. Alternative is to initialize it before loop, but it'll
 695         // affect performance on in-order systems with 2 or more ld/st pipelines
 696         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
 697       }
 698       if (!isL) { // UU/UL case
 699         lsl(ch2, cnt1tmp, 1); // offset in bytes
 700       }
 701       cmp(tmp3, skipch);
 702       br(NE, BMSKIP);
 703       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
 704       mov(ch1, tmp6);
 705       if (isL) {
 706         b(BMLOOPSTR1_AFTER_LOAD);
 707       } else {
 708         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
 709         b(BMLOOPSTR1_CMP);
 710       }
 711     BIND(BMLOOPSTR1);
 712       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
 713       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 714     BIND(BMLOOPSTR1_AFTER_LOAD);
 715       subs(cnt1tmp, cnt1tmp, 1);
 716       br(LT, BMLOOPSTR1_LASTCMP);
 717     BIND(BMLOOPSTR1_CMP);
 718       cmp(ch1, ch2);
 719       br(EQ, BMLOOPSTR1);
 720     BIND(BMSKIP);
 721       if (!isL) {
 722         // if we've met UTF symbol while searching Latin1 pattern, then we can
 723         // skip cnt1 symbols
 724         if (str1_isL != str2_isL) {
 725           mov(result_tmp, cnt1);
 726         } else {
 727           mov(result_tmp, 1);
 728         }
 729         subs(zr, skipch, ASIZE);
 730         br(HS, BMADV);
 731       }
 732       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
 733     BIND(BMADV);
 734       sub(cnt1tmp, cnt1, 1);
 735       add(str2, str2, result_tmp, LSL, str2_chr_shift);
 736       cmp(str2, str2end);
 737       br(LE, BMLOOPSTR2);
 738       add(sp, sp, ASIZE);
 739       b(NOMATCH);
 740     BIND(BMLOOPSTR1_LASTCMP);
 741       cmp(ch1, ch2);
 742       br(NE, BMSKIP);
 743     BIND(BMMATCH);
 744       sub(result, str2, tmp5);
 745       if (!str2_isL) lsr(result, result, 1);
 746       add(sp, sp, ASIZE);
 747       b(DONE);
 748 
 749     BIND(LINEARSTUB);
 750     cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
 751     br(LT, LINEAR_MEDIUM);
 752     mov(result, zr);
 753     RuntimeAddress stub = nullptr;
 754     if (isL) {
 755       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
 756       assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
 757     } else if (str1_isL) {
 758       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
 759        assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
 760     } else {
 761       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
 762       assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
 763     }
 764     address call = trampoline_call(stub);
 765     if (call == nullptr) {
 766       DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
 767       ciEnv::current()->record_failure("CodeCache is full");
 768       return;
 769     }
 770     b(DONE);
 771   }
 772 
 773   BIND(LINEARSEARCH);
 774   {
 775     Label DO1, DO2, DO3;
 776 
 777     Register str2tmp = tmp2;
 778     Register first = tmp3;
 779 
 780     if (icnt1 == -1)
 781     {
 782         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
 783 
 784         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
 785         br(LT, DOSHORT);
 786       BIND(LINEAR_MEDIUM);
 787         (this->*str1_load_1chr)(first, Address(str1));
 788         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
 789         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
 790         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 791         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 792 
 793       BIND(FIRST_LOOP);
 794         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 795         cmp(first, ch2);
 796         br(EQ, STR1_LOOP);
 797       BIND(STR2_NEXT);
 798         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 799         br(LE, FIRST_LOOP);
 800         b(NOMATCH);
 801 
 802       BIND(STR1_LOOP);
 803         adds(cnt1tmp, cnt1_neg, str1_chr_size);
 804         add(cnt2tmp, cnt2_neg, str2_chr_size);
 805         br(GE, MATCH);
 806 
 807       BIND(STR1_NEXT);
 808         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
 809         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 810         cmp(ch1, ch2);
 811         br(NE, STR2_NEXT);
 812         adds(cnt1tmp, cnt1tmp, str1_chr_size);
 813         add(cnt2tmp, cnt2tmp, str2_chr_size);
 814         br(LT, STR1_NEXT);
 815         b(MATCH);
 816 
 817       BIND(DOSHORT);
 818       if (str1_isL == str2_isL) {
 819         cmp(cnt1, (u1)2);
 820         br(LT, DO1);
 821         br(GT, DO3);
 822       }
 823     }
 824 
 825     if (icnt1 == 4) {
 826       Label CH1_LOOP;
 827 
 828         (this->*load_4chr)(ch1, str1);
 829         sub(result_tmp, cnt2, 4);
 830         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 831         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 832 
 833       BIND(CH1_LOOP);
 834         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
 835         cmp(ch1, ch2);
 836         br(EQ, MATCH);
 837         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 838         br(LE, CH1_LOOP);
 839         b(NOMATCH);
 840       }
 841 
 842     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
 843       Label CH1_LOOP;
 844 
 845       BIND(DO2);
 846         (this->*load_2chr)(ch1, str1);
 847         if (icnt1 == 2) {
 848           sub(result_tmp, cnt2, 2);
 849         }
 850         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 851         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 852       BIND(CH1_LOOP);
 853         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 854         cmp(ch1, ch2);
 855         br(EQ, MATCH);
 856         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 857         br(LE, CH1_LOOP);
 858         b(NOMATCH);
 859     }
 860 
 861     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
 862       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
 863 
 864       BIND(DO3);
 865         (this->*load_2chr)(first, str1);
 866         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
 867         if (icnt1 == 3) {
 868           sub(result_tmp, cnt2, 3);
 869         }
 870         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 871         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 872       BIND(FIRST_LOOP);
 873         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 874         cmpw(first, ch2);
 875         br(EQ, STR1_LOOP);
 876       BIND(STR2_NEXT);
 877         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 878         br(LE, FIRST_LOOP);
 879         b(NOMATCH);
 880 
 881       BIND(STR1_LOOP);
 882         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
 883         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 884         cmp(ch1, ch2);
 885         br(NE, STR2_NEXT);
 886         b(MATCH);
 887     }
 888 
 889     if (icnt1 == -1 || icnt1 == 1) {
 890       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
 891 
 892       BIND(DO1);
 893         (this->*str1_load_1chr)(ch1, str1);
 894         cmp(cnt2, (u1)8);
 895         br(LT, DO1_SHORT);
 896 
 897         sub(result_tmp, cnt2, 8/str2_chr_size);
 898         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 899         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 900         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 901 
 902         if (str2_isL) {
 903           orr(ch1, ch1, ch1, LSL, 8);
 904         }
 905         orr(ch1, ch1, ch1, LSL, 16);
 906         orr(ch1, ch1, ch1, LSL, 32);
 907       BIND(CH1_LOOP);
 908         ldr(ch2, Address(str2, cnt2_neg));
 909         eor(ch2, ch1, ch2);
 910         sub(tmp1, ch2, tmp3);
 911         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 912         bics(tmp1, tmp1, tmp2);
 913         br(NE, HAS_ZERO);
 914         adds(cnt2_neg, cnt2_neg, 8);
 915         br(LT, CH1_LOOP);
 916 
 917         cmp(cnt2_neg, (u1)8);
 918         mov(cnt2_neg, 0);
 919         br(LT, CH1_LOOP);
 920         b(NOMATCH);
 921 
 922       BIND(HAS_ZERO);
 923         rev(tmp1, tmp1);
 924         clz(tmp1, tmp1);
 925         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
 926         b(MATCH);
 927 
 928       BIND(DO1_SHORT);
 929         mov(result_tmp, cnt2);
 930         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
 931         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
 932       BIND(DO1_LOOP);
 933         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 934         cmpw(ch1, ch2);
 935         br(EQ, MATCH);
 936         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 937         br(LT, DO1_LOOP);
 938     }
 939   }
 940   BIND(NOMATCH);
 941     mov(result, -1);
 942     b(DONE);
 943   BIND(MATCH);
 944     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
 945   BIND(DONE);
 946 }
 947 
// Pointer-to-member types used to pick the per-character load (ldrb for
// Latin-1 bytes, ldrh for UTF-16 halfwords) and the matching zero-extend
// (uxtbw/uxthw) at code-emission time.
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
 950 
// Find the first occurrence of the 16-bit char `ch` in the UTF-16 string
// `str1` of `cnt1` characters. On exit `result` holds the character index
// of the first match, or -1 if `ch` does not occur.
// Clobbers: str1, cnt1, tmp1, tmp2, tmp3, rscratch1, rscratch2, rflags.
// `ch` is also clobbered (replicated across lanes below).
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  // Empty string: no match possible.
  cbz(cnt1, NOMATCH);

  // Fewer than 4 chars: use the simple one-char-at-a-time loop.
  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  // Replicate ch into all four 16-bit lanes of the 64-bit register so a
  // whole word can be tested at once.
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  // Bias str1 to the start of the last 4-char group and scan forward with
  // a negative byte offset (cnt1_neg) that counts up towards zero.
  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    // SWAR zero-lane test on (word ^ ch): a 16-bit lane of ch1 is zero iff
    // the corresponding char equals ch. After the sub/orr/bics sequence,
    // tmp1 is non-zero (NE) iff some lane of ch1 was zero.
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7fff7fff7fff7fff);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    // Re-check the final (possibly overlapping) word once, then give up.
    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    // Locate the first matching lane: byte-reverse, count leading zeros,
    // then turn the bit count into a byte offset (LSR 3).
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    // Short-string path: plain per-character compare loop.
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1, Address::uxtw(1)));
    sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
    (void)0; // (label body follows)
    ldrh(ch1, Address(str1, cnt1_neg));
    cmpw(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 2);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // Index = chars before the scanned range + byte offset within it,
    // converted to a char count (ASR 1).
    add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}
1013 
// SVE variant of string_indexof_char: find the first occurrence of `ch` in
// the string `str1` of `cnt1` elements. `isL` selects Latin-1 (byte) vs
// UTF-16 (halfword) elements. On exit `result` is the element index of the
// first match, or -1 when not found.
// Clobbers: rscratch1, rscratch2, ztmp1, ztmp2, tmp_pg, tmp_pdn, rflags.
void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
                                                Register ch, Register result,
                                                FloatRegister ztmp1,
                                                FloatRegister ztmp2,
                                                PRegister tmp_pg,
                                                PRegister tmp_pdn, bool isL)
{
  // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
  assert(tmp_pg->is_governing(),
         "this register has to be a governing predicate register");

  Label LOOP, MATCH, DONE, NOMATCH;
  Register vec_len = rscratch1;
  Register idx = rscratch2;

  // Element size for the compares/loads: bytes for Latin-1, halfwords else.
  SIMD_RegVariant T = (isL == true) ? B : H;

  // Empty string: no match possible.
  cbz(cnt1, NOMATCH);

  // Assign the particular char throughout the vector.
  sve_dup(ztmp2, T, ch);
  // vec_len = number of elements processed per iteration (vector length
  // in bytes or halfwords).
  if (isL) {
    sve_cntb(vec_len);
  } else {
    sve_cnth(vec_len);
  }
  mov(idx, 0);

  // Generate a predicate to control the reading of input string.
  sve_whilelt(tmp_pg, T, idx, cnt1);

  BIND(LOOP);
    // Read a vector of 8- or 16-bit data depending on the string type. Note
    // that inactive elements indicated by the predicate register won't cause
    // a data read from memory to the destination vector.
    if (isL) {
      sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
    } else {
      sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
    }
    add(idx, idx, vec_len);

    // Perform the comparison. An element of the destination predicate is set
    // to active if the particular char is matched.
    sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);

    // Branch if the particular char is found.
    br(NE, MATCH);

    // Refresh the predicate for the next chunk; WHILELT also sets the
    // condition flags.
    sve_whilelt(tmp_pg, T, idx, cnt1);

    // Loop back if the particular char not found.
    br(MI, LOOP);

  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);

  BIND(MATCH);
    // Undo the index increment.
    sub(idx, idx, vec_len);

    // Crop the vector to find its location: after BRKA, tmp_pdn covers the
    // elements up to and including the first match, so INCP lands result
    // exactly on the match index (starting from idx - 1).
    sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
    add(result, idx, -1);
    sve_incp(result, T, tmp_pdn);
  BIND(DONE);
}
1082 
// Latin-1 counterpart of string_indexof_char: find the first occurrence of
// the byte `ch` in the byte string `str1` of `cnt1` bytes. On exit `result`
// holds the index of the first match, or -1 if not found.
// Clobbers: str1, cnt1, tmp1, tmp2, tmp3, rscratch1, rscratch2, rflags.
// `ch` is also clobbered (replicated across lanes below).
void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  // Empty string: no match possible.
  cbz(cnt1, NOMATCH);

  // Fewer than 8 bytes: use the simple byte-at-a-time loop.
  cmp(cnt1, (u1)8);
  br(LT, DO1_SHORT);

  // Replicate ch into all eight byte lanes of the 64-bit register.
  orr(ch, ch, ch, LSL, 8);
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  // Bias str1 to the start of the last 8-byte group and scan with a
  // negative offset (cnt1_neg) counting up towards zero.
  sub(cnt1, cnt1, 8);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);

  mov(tmp3, 0x0101010101010101);

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    // SWAR zero-byte test on (word ^ ch): a byte lane of ch1 is zero iff
    // that byte equals ch; tmp1 is non-zero (NE) iff some lane is zero.
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    // Re-check the final (possibly overlapping) word once, then give up.
    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    // Locate the first matching byte: reverse, count leading zero bits,
    // and convert the bit index to a byte offset (LSR 3).
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    // Short-string path: plain per-byte compare loop.
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1));
    sub(cnt1_neg, zr, cnt1);
  BIND(DO1_LOOP);
    ldrb(ch1, Address(str1, cnt1_neg));
    cmp(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 1);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // Index = bytes before the scanned range + offset within it.
    add(result, result_tmp, cnt1_neg);
  BIND(DONE);
}
1146 
// Compare strings.
// Lexicographic comparison of str1 (length cnt1) against str2 (length
// cnt2). On exit `result` is negative, zero or positive as str1 is less
// than, equal to, or greater than str2. `ae` (StrIntrinsicNode::LL/LU/
// UL/UU) selects the Latin-1/UTF-16 encoding of each operand. Counts
// arrive in BYTES; the result is in CHARACTERS (see comment below).
// Long inputs are delegated to per-encoding stubs; can bail out when the
// code cache is full.
// Clobbers: cnt1, cnt2, tmp1, tmp2, vtmp1-vtmp3, rscratch1, rscratch2,
// rflags (pgtmp1/pgtmp2 are unused on this path).
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  // Per-operand shift/size of one character in bytes.
  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  // Characters held by one 8-byte word of the narrower encoding.
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  // Per-character load and zero-extend selected by encoding.
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  // The length difference is the final result when one string is a
  // prefix of the other.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      // Identical addresses compare equal: result already holds the
      // length difference.
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      // Bias the pointers to the end and iterate with negative offsets.
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      // Mixed encodings: widen the Latin-1 side to UTF-16 via zip1 with
      // a zero vector, then compare 8 bytes' worth of chars at a time.
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      ldr(tmp1, Address(str1));
      ldrs(vtmp, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      // Widen the next 4 Latin-1 chars while the UTF-16 word loads.
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFF);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFF);
    // rev+clz yields the bit offset of the lowest-addressed differing
    // byte; round down to a character boundary, shift both words so the
    // differing char is in the low bits, and subtract.
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  // Long strings: hand off to the per-encoding comparison stub.
  bind(STUB);
    RuntimeAddress stub = nullptr;
    switch(ae) {
      case StrIntrinsicNode::LL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
        break;
      case StrIntrinsicNode::UU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
        break;
      case StrIntrinsicNode::LU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
        break;
      case StrIntrinsicNode::UL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
        break;
      default:
        ShouldNotReachHere();
     }
    assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
    address call = trampoline_call(stub);
    if (call == nullptr) {
      // Trampoline allocation failed: abandon compilation of this method.
      DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // arrange code to do most branches while loading and loading next characters
  // while comparing previous
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}
1382 
1383 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
1384                                      FloatRegister src2, Condition cond, bool isQ) {
1385   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1386   FloatRegister zn = src1, zm = src2;
1387   bool needs_negation = false;
1388   switch (cond) {
1389     case LT: cond = GT; zn = src2; zm = src1; break;
1390     case LE: cond = GE; zn = src2; zm = src1; break;
1391     case LO: cond = HI; zn = src2; zm = src1; break;
1392     case LS: cond = HS; zn = src2; zm = src1; break;
1393     case NE: cond = EQ; needs_negation = true; break;
1394     default:
1395       break;
1396   }
1397 
1398   if (is_floating_point_type(bt)) {
1399     fcm(cond, dst, size, zn, zm);
1400   } else {
1401     cm(cond, dst, size, zn, zm);
1402   }
1403 
1404   if (needs_negation) {
1405     notr(dst, isQ ? T16B : T8B, dst);
1406   }
1407 }
1408 
1409 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1410                                           Condition cond, bool isQ) {
1411   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1412   if (bt == T_FLOAT || bt == T_DOUBLE) {
1413     if (cond == Assembler::NE) {
1414       fcm(Assembler::EQ, dst, size, src);
1415       notr(dst, isQ ? T16B : T8B, dst);
1416     } else {
1417       fcm(cond, dst, size, src);
1418     }
1419   } else {
1420     if (cond == Assembler::NE) {
1421       cm(Assembler::EQ, dst, size, src);
1422       notr(dst, isQ ? T16B : T8B, dst);
1423     } else {
1424       cm(cond, dst, size, src);
1425     }
1426   }
1427 }
1428 
// Compress the least significant bit of each byte to the rightmost and clear
// the higher garbage bits.
// Used to turn a byte-per-lane 0x00/0x01 mask into an 8-bit bitmask.
void C2_MacroAssembler::bytemask_compress(Register dst) {
  // Example input, dst = 0x01 00 00 00 01 01 00 01
  // The "??" bytes are garbage.
  // Successive shifted ORs gather pairs, nibbles, then all eight bits
  // into the low byte; the final mask discards everything else.
  orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
  orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
  orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
  andr(dst, dst, 0xff);                   // dst = 0x8D
}
1439 
// Pack the value of each mask element in "src" into a long value in "dst", at most
// the first 64 lane elements. The input "src" is a vector of boolean represented as
// bytes with 0x00/0x01 as element values. Each lane value from "src" is packed into
// one bit in "dst".
//
// Example:   src = 0x0001010000010001 0100000001010001, lane_cnt = 16
// Expected:  dst = 0x658D
//
// Clobbers: rscratch1
void C2_MacroAssembler::sve_vmask_tolong(Register dst, FloatRegister src,
                                         FloatRegister vtmp, int lane_cnt) {
  assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
  assert_different_registers(dst, rscratch1);
  assert_different_registers(src, vtmp);
  assert(UseSVE > 0, "must be");

  // Compress the lowest 8 bytes.
  fmovd(dst, src);
  bytemask_compress(dst);
  if (lane_cnt <= 8) return;

  // Repeat on higher bytes and join the results.
  // Compress 8 bytes in each iteration.
  for (int idx = 1; idx < (lane_cnt / 8); idx++) {
    // Extract the idx-th 64-bit chunk of the mask into rscratch1,
    // compress its 8 lane bits, and OR them into bit position idx*8.
    sve_extract_integral(rscratch1, T_LONG, src, idx, vtmp);
    bytemask_compress(rscratch1);
    orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
  }
}
1469 
// The function is same as above "sve_vmask_tolong", but it uses SVE2's BEXT
// instruction which requires the FEAT_BITPERM feature.
// Clobbers: vtmp1, vtmp2
void C2_MacroAssembler::sve2_vmask_tolong(Register dst, FloatRegister src,
                                          FloatRegister vtmp1, FloatRegister vtmp2,
                                          int lane_cnt) {
  assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
  assert_different_registers(src, vtmp1, vtmp2);
  assert(UseSVE > 1 && VM_Version::supports_svebitperm(), "must be");

  // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
  // is to compress each significant bit of the byte in a cross-lane way. Due
  // to the lack of a cross-lane bit-compress instruction, we use BEXT
  // (bit-compress in each lane) with the biggest lane size (T = D) then
  // concatenate the results.

  // The second source input of BEXT, initialized with 0x01 in each byte.
  // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
  sve_dup(vtmp2, B, 1);

  // BEXT vtmp1.D, src.D, vtmp2.D
  // src   = 0x0001010000010001 | 0x0100000001010001
  // vtmp2 = 0x0101010101010101 | 0x0101010101010101
  //         ---------------------------------------
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  sve_bext(vtmp1, D, src, vtmp2);

  // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the
  // result to dst.
  // vtmp1 = 0x0000000000000000 | 0x000000000000658D
  // dst   = 0x658D
  if (lane_cnt <= 8) {
    // No need to concatenate.
    umov(dst, vtmp1, B, 0);
  } else if (lane_cnt <= 16) {
    // Move byte 8 (the low byte of the second D lane) next to byte 0,
    // then read the combined 16 bits.
    ins(vtmp1, B, vtmp1, 1, 8);
    umov(dst, vtmp1, H, 0);
  } else {
    // As the lane count is 64 at most, the final expected value must be in
    // the lowest 64 bits after narrowing vtmp1 from D to B.
    sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
    umov(dst, vtmp1, D, 0);
  }
}
1513 
// Unpack the mask, a long value in "src", into a vector register of boolean
// represented as bytes with 0x00/0x01 as element values in "dst".  Each bit in
// "src" is unpacked into one byte lane in "dst". Note that "dst" can support at
// most 64 lanes.
//
// Below example gives the expected dst vector register, with a valid src(0x658D)
// on a 128-bit vector size machine.
// dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
//
// Clobbers: vtmp
void C2_MacroAssembler::sve_vmask_fromlong(FloatRegister dst, Register src,
                                           FloatRegister vtmp, int lane_cnt) {
  assert_different_registers(dst, vtmp);
  assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
         lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");

  // Example:   src = 0x658D, lane_cnt = 16
  // Expected:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01

  // Put long value from general purpose register into the first lane of vector.
  // vtmp = 0x0000000000000000 | 0x000000000000658D
  sve_dup(vtmp, B, 0);
  mov(vtmp, D, 0, src);

  // Transform the value in the first lane which is mask in bit now to the mask in
  // byte, which can be done by SVE2's BDEP instruction.

  // The first source input of BDEP instruction. Deposit each byte into every 8 bytes.
  // vtmp = 0x0000000000000065 | 0x000000000000008D
  if (lane_cnt <= 8) {
    // Nothing. As only one byte exists.
  } else if (lane_cnt <= 16) {
    // Split the 16 mask bits: move the upper byte into the second D lane.
    ins(vtmp, B, vtmp, 8, 1);
  } else {
    // Spread each byte of the mask into its own D lane.
    sve_vector_extend(vtmp, D, vtmp, B);
  }

  // The second source input of BDEP instruction, initialized with 0x01 for each byte.
  // dst = 0x01010101 0x01010101 0x01010101 0x01010101
  sve_dup(dst, B, 1);

  // BDEP dst.D, vtmp.D, dst.D
  // vtmp = 0x0000000000000065 | 0x000000000000008D
  // dst  = 0x0101010101010101 | 0x0101010101010101
  //        ---------------------------------------
  // dst  = 0x0001010000010001 | 0x0100000001010001
  sve_bdep(dst, D, vtmp, dst);
}
1560 
1561 // Clobbers: rflags
1562 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1563                                     FloatRegister zn, FloatRegister zm, Condition cond) {
1564   assert(pg->is_governing(), "This register has to be a governing predicate register");
1565   FloatRegister z1 = zn, z2 = zm;
1566   switch (cond) {
1567     case LE: z1 = zm; z2 = zn; cond = GE; break;
1568     case LT: z1 = zm; z2 = zn; cond = GT; break;
1569     case LO: z1 = zm; z2 = zn; cond = HI; break;
1570     case LS: z1 = zm; z2 = zn; cond = HS; break;
1571     default:
1572       break;
1573   }
1574 
1575   SIMD_RegVariant size = elemType_to_regVariant(bt);
1576   if (is_floating_point_type(bt)) {
1577     sve_fcm(cond, pd, size, pg, z1, z2);
1578   } else {
1579     assert(is_integral_type(bt), "unsupported element type");
1580     sve_cmp(cond, pd, size, pg, z1, z2);
1581   }
1582 }
1583 
// Get index of the last mask lane that is set
// Clobbers: rscratch1, ptmp, rflags
void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
  SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Reverse lane order so the last set lane becomes the first.
  sve_rev(ptmp, size, src);
  // Break-before: keep only the lanes strictly before the first set lane.
  sve_brkb(ptmp, ptrue, ptmp, false);
  // dst = number of unset lanes after the last set lane in the original.
  sve_cntp(dst, size, ptrue, ptmp);
  // index = (lane_count - 1) - trailing-unset count.
  movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
  subw(dst, rscratch1, dst);
}
1593 
// Extend integer vector src to dst with the same lane count
// but larger element size, e.g. 4B -> 4I
// Each step is a widening shift-left-long by 0 (_xshll), signed or
// unsigned according to is_unsigned; multi-step extensions chain through
// the intermediate width.
void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
                                           FloatRegister src, BasicType src_bt, bool is_unsigned) {
  if (src_bt == T_BYTE) {
    // 4B to 4S/4I, 8B to 8S
    assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_SHORT || dst_bt == T_INT, "unsupported");
    _xshll(is_unsigned, dst, T8H, src, T8B, 0);
    if (dst_bt == T_INT) {
      // Second widening step: S -> I.
      _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
    }
  } else if (src_bt == T_SHORT) {
    // 2S to 2I/2L, 4S to 4I
    assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_INT || dst_bt == T_LONG, "unsupported");
    _xshll(is_unsigned, dst, T4S, src, T4H, 0);
    if (dst_bt == T_LONG) {
      // Second widening step: I -> L.
      _xshll(is_unsigned, dst, T2D, dst, T2S, 0);
    }
  } else if (src_bt == T_INT) {
    // 2I to 2L
    assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
    _xshll(is_unsigned, dst, T2D, src, T2S, 0);
  } else {
    ShouldNotReachHere();
  }
}
1622 
1623 // Narrow integer vector src down to dst with the same lane count
1624 // but smaller element size, e.g. 4I -> 4B
void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
                                           FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
  // xtn keeps the low-order bits of each source lane and writes the
  // narrowed lanes into the low half of dst; chained calls narrow by
  // more than one element-size step. Lanes beyond the requested count
  // are insignificant.
  if (src_bt == T_SHORT) {
    // 4S/8S to 4B/8B
    assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_BYTE, "unsupported");
    xtn(dst, T8B, src, T8H);
  } else if (src_bt == T_INT) {
    // 2I to 2S, 4I to 4B/4S
    assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
    xtn(dst, T4H, src, T4S);
    if (dst_bt == T_BYTE) {
      // Second step: halfwords down to bytes.
      xtn(dst, T8B, dst, T8H);
    }
  } else if (src_bt == T_LONG) {
    // 2L to 2S/2I
    assert(src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_INT || dst_bt == T_SHORT, "unsupported");
    xtn(dst, T2S, src, T2D);
    if (dst_bt == T_SHORT) {
      xtn(dst, T4H, dst, T4S);
    }
  } else {
    ShouldNotReachHere();
  }
}
1652 
1653 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1654                                           FloatRegister src, SIMD_RegVariant src_size,
1655                                           bool is_unsigned) {
1656   assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
1657 
1658   if (src_size == B) {
1659     switch (dst_size) {
1660     case H:
1661       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1662       break;
1663     case S:
1664       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1665       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1666       break;
1667     case D:
1668       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1669       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1670       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1671       break;
1672     default:
1673       ShouldNotReachHere();
1674     }
1675   } else if (src_size == H) {
1676     if (dst_size == S) {
1677       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1678     } else { // D
1679       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1680       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1681     }
1682   } else if (src_size == S) {
1683     _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
1684   }
1685 }
1686 
1687 // Vector narrow from src to dst with specified element sizes.
1688 // High part of dst vector will be filled with zero.
1689 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1690                                           FloatRegister src, SIMD_RegVariant src_size,
1691                                           FloatRegister tmp) {
1692   assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1693   assert_different_registers(src, tmp);
1694   sve_dup(tmp, src_size, 0);
1695   if (src_size == D) {
1696     switch (dst_size) {
1697     case S:
1698       sve_uzp1(dst, S, src, tmp);
1699       break;
1700     case H:
1701       assert_different_registers(dst, tmp);
1702       sve_uzp1(dst, S, src, tmp);
1703       sve_uzp1(dst, H, dst, tmp);
1704       break;
1705     case B:
1706       assert_different_registers(dst, tmp);
1707       sve_uzp1(dst, S, src, tmp);
1708       sve_uzp1(dst, H, dst, tmp);
1709       sve_uzp1(dst, B, dst, tmp);
1710       break;
1711     default:
1712       ShouldNotReachHere();
1713     }
1714   } else if (src_size == S) {
1715     if (dst_size == H) {
1716       sve_uzp1(dst, H, src, tmp);
1717     } else { // B
1718       assert_different_registers(dst, tmp);
1719       sve_uzp1(dst, H, src, tmp);
1720       sve_uzp1(dst, B, dst, tmp);
1721     }
1722   } else if (src_size == H) {
1723     sve_uzp1(dst, B, src, tmp);
1724   }
1725 }
1726 
1727 // Extend src predicate to dst predicate with the same lane count but larger
1728 // element size, e.g. 64Byte -> 512Long
1729 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1730                                              uint dst_element_length_in_bytes,
1731                                              uint src_element_length_in_bytes) {
1732   if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1733     sve_punpklo(dst, src);
1734   } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1735     sve_punpklo(dst, src);
1736     sve_punpklo(dst, dst);
1737   } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1738     sve_punpklo(dst, src);
1739     sve_punpklo(dst, dst);
1740     sve_punpklo(dst, dst);
1741   } else {
1742     assert(false, "unsupported");
1743     ShouldNotReachHere();
1744   }
1745 }
1746 
1747 // Narrow src predicate to dst predicate with the same lane count but
1748 // smaller element size, e.g. 512Long -> 64Byte
void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
                                             uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
  // The insignificant bits in src predicate are expected to be zero.
  // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
  // passed as the second argument. An example narrowing operation with a given mask would be -
  // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I
  // Mask (for 2 Longs) : TF
  // Predicate register for the above mask (16 bits) : 00000001 00000000
  // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
  // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0)
  assert_different_registers(src, ptmp);
  assert_different_registers(dst, ptmp);
  // ptmp supplies the all-zero second operand of every uzp1 below.
  sve_pfalse(ptmp);
  if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
    // 2x narrow: one concatenate-even step.
    sve_uzp1(dst, B, src, ptmp);
  } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
    // 4x narrow: two steps, halving the per-lane predicate width each time.
    sve_uzp1(dst, H, src, ptmp);
    sve_uzp1(dst, B, dst, ptmp);
  } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
    // 8x narrow: three steps.
    sve_uzp1(dst, S, src, ptmp);
    sve_uzp1(dst, H, dst, ptmp);
    sve_uzp1(dst, B, dst, ptmp);
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}
1776 
1777 // Vector reduction add for integral type with ASIMD instructions.
void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
                                                 Register isrc, FloatRegister vsrc,
                                                 unsigned vector_length_in_bytes,
                                                 FloatRegister vtmp) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_add_integral {");
    switch(bt) {
      case T_BYTE:
        // Across-lanes add into lane 0, sign-extend it out, then fold in
        // the scalar input; sxtb keeps Java byte wrap-around semantics.
        addv(vtmp, isQ ? T16B : T8B, vsrc);
        smov(dst, vtmp, B, 0);
        addw(dst, dst, isrc, ext::sxtb);
        break;
      case T_SHORT:
        addv(vtmp, isQ ? T8H : T4H, vsrc);
        smov(dst, vtmp, H, 0);
        addw(dst, dst, isrc, ext::sxth);
        break;
      case T_INT:
        // addv has no 2S arrangement, so a 64-bit vector is reduced with
        // a pairwise add instead.
        isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
        umov(dst, vtmp, S, 0);
        addw(dst, dst, isrc);
        break;
      case T_LONG:
        assert(isQ, "unsupported");
        // Scalar pairwise add of the two D lanes.
        addpd(vtmp, vsrc);
        umov(dst, vtmp, D, 0);
        add(dst, dst, isrc);
        break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  BLOCK_COMMENT("} neon_reduce_add_integral");
}
1815 
1816 // Vector reduction multiply for integral type with ASIMD instructions.
1817 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
1818 // Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
                                                 Register isrc, FloatRegister vsrc,
                                                 unsigned vector_length_in_bytes,
                                                 FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_mul_integral {");
    switch(bt) {
      case T_BYTE:
        if (isQ) {
          // Multiply the lower half and higher half of vector iteratively.
          // vtmp1 = vsrc[8:15]
          ins(vtmp1, D, vsrc, 0, 1);
          // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
          mulv(vtmp1, T8B, vtmp1, vsrc);
          // vtmp2 = vtmp1[4:7]
          ins(vtmp2, S, vtmp1, 0, 1);
          // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
          mulv(vtmp1, T8B, vtmp2, vtmp1);
        } else {
          ins(vtmp1, S, vsrc, 0, 1);
          mulv(vtmp1, T8B, vtmp1, vsrc);
        }
        // vtmp2 = vtmp1[2:3]
        ins(vtmp2, H, vtmp1, 0, 1);
        // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
        mulv(vtmp2, T8B, vtmp2, vtmp1);
        // dst = vtmp2[0] * isrc * vtmp2[1]
        // sxtb after each 32-bit multiply keeps Java byte wrap-around
        // semantics for the running product.
        umov(rscratch1, vtmp2, B, 0);
        mulw(dst, rscratch1, isrc);
        sxtb(dst, dst);
        umov(rscratch1, vtmp2, B, 1);
        mulw(dst, rscratch1, dst);
        sxtb(dst, dst);
        break;
      case T_SHORT:
        // Same halving strategy as T_BYTE, on halfword lanes.
        if (isQ) {
          // vtmp2 = vsrc[4:7]; vtmp2[n] = vsrc[n] * vsrc[n + 4], n=[0, 3]
          ins(vtmp2, D, vsrc, 0, 1);
          mulv(vtmp2, T4H, vtmp2, vsrc);
          // vtmp1 = vtmp2[2:3]; vtmp1[n] = vtmp2[n] * vtmp2[n + 2], n=[0, 1]
          ins(vtmp1, S, vtmp2, 0, 1);
          mulv(vtmp1, T4H, vtmp1, vtmp2);
        } else {
          ins(vtmp1, S, vsrc, 0, 1);
          mulv(vtmp1, T4H, vtmp1, vsrc);
        }
        // dst = vtmp1[0] * isrc * vtmp1[1], re-truncated to short.
        umov(rscratch1, vtmp1, H, 0);
        mulw(dst, rscratch1, isrc);
        sxth(dst, dst);
        umov(rscratch1, vtmp1, H, 1);
        mulw(dst, rscratch1, dst);
        sxth(dst, dst);
        break;
      case T_INT:
        if (isQ) {
          // vtmp1 = vsrc[2:3]; vtmp1[n] = vsrc[n] * vsrc[n + 2], n=[0, 1]
          ins(vtmp1, D, vsrc, 0, 1);
          mulv(vtmp1, T2S, vtmp1, vsrc);
        } else {
          // A 2I vector needs no folding step; read its lanes directly.
          vtmp1 = vsrc;
        }
        // dst = vtmp1[0] * isrc * vtmp1[1]
        umov(rscratch1, vtmp1, S, 0);
        mul(dst, rscratch1, isrc);
        umov(rscratch1, vtmp1, S, 1);
        mul(dst, rscratch1, dst);
        break;
      case T_LONG:
        // Only two lanes: multiply them with the scalar input directly.
        umov(rscratch1, vsrc, D, 0);
        mul(dst, isrc, rscratch1);
        umov(rscratch1, vsrc, D, 1);
        mul(dst, dst, rscratch1);
        break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  BLOCK_COMMENT("} neon_reduce_mul_integral");
}
1896 
1897 // Vector reduction multiply for floating-point type with ASIMD instructions.
void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
                                           FloatRegister fsrc, FloatRegister vsrc,
                                           unsigned vector_length_in_bytes,
                                           FloatRegister vtmp) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_mul_fp {");
    switch(bt) {
      case T_FLOAT:
        // FP multiplication is not associative, so the lanes must be
        // multiplied strictly in order: dst = fsrc * v[0] * v[1] (* v[2] * v[3]).
        // Scalar fmuls reads lane 0 of its source registers; ins moves each
        // higher lane down to lane 0 of vtmp first.
        fmuls(dst, fsrc, vsrc);
        ins(vtmp, S, vsrc, 0, 1);
        fmuls(dst, dst, vtmp);
        if (isQ) {
          ins(vtmp, S, vsrc, 0, 2);
          fmuls(dst, dst, vtmp);
          ins(vtmp, S, vsrc, 0, 3);
          fmuls(dst, dst, vtmp);
         }
        break;
      case T_DOUBLE:
        assert(isQ, "unsupported");
        // dst = fsrc * v[0] * v[1]
        fmuld(dst, fsrc, vsrc);
        ins(vtmp, D, vsrc, 0, 1);
        fmuld(dst, dst, vtmp);
        break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  BLOCK_COMMENT("} neon_reduce_mul_fp");
}
1930 
1931 // Helper to select logical instruction
1932 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
1933                                                    Register Rn, Register Rm,
1934                                                    enum shift_kind kind, unsigned shift) {
1935   switch(opc) {
1936     case Op_AndReductionV:
1937       is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
1938       break;
1939     case Op_OrReductionV:
1940       is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
1941       break;
1942     case Op_XorReductionV:
1943       is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
1944       break;
1945     default:
1946       assert(false, "unsupported");
1947       ShouldNotReachHere();
1948   }
1949 }
1950 
1951 // Vector reduction logical operations And, Or, Xor
1952 // Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
                                            Register isrc, FloatRegister vsrc,
                                            unsigned vector_length_in_bytes) {
  assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
         "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_logical {");
    // First fold the two vector halves into one general register:
    // dst = lane1 op lane0 (D lanes for 16-byte vectors, S lanes for 8-byte).
    umov(rscratch1, vsrc, isQ ? D : S, 0);
    umov(dst, vsrc, isQ ? D : S, 1);
    neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
    // Then keep folding dst with itself shifted right, halving the width
    // each time until one element remains, fold in the scalar input, and
    // sign-extend subword results.
    switch(bt) {
      case T_BYTE:
        if (isQ) {
          neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
        }
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
        sxtb(dst, dst);
        break;
      case T_SHORT:
        if (isQ) {
          neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
        }
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
        sxth(dst, dst);
        break;
      case T_INT:
        if (isQ) {
          neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
        }
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
        break;
      case T_LONG:
        // The initial half-fold already reduced both D lanes.
        assert(isQ, "unsupported");
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
        break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  BLOCK_COMMENT("} neon_reduce_logical");
}
2000 
2001 // Helper function to decode min/max reduction operation properties
2002 void C2_MacroAssembler::decode_minmax_reduction_opc(int opc, bool* is_min,
2003                                                     bool* is_unsigned,
2004                                                     Condition* cond) {
2005   switch(opc) {
2006     case Op_MinReductionV:
2007       *is_min = true;  *is_unsigned = false; *cond = LT; break;
2008     case Op_MaxReductionV:
2009       *is_min = false; *is_unsigned = false; *cond = GT; break;
2010     case Op_UMinReductionV:
2011       *is_min = true;  *is_unsigned = true;  *cond = LO; break;
2012     case Op_UMaxReductionV:
2013       *is_min = false; *is_unsigned = true;  *cond = HI; break;
2014     default:
2015       ShouldNotReachHere();
2016   }
2017 }
2018 
2019 // Vector reduction min/max/umin/umax for integral type with ASIMD instructions.
2020 // Note: vtmp is not used and expected to be fnoreg for T_LONG case.
2021 // Clobbers: rscratch1, rflags
void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
                                                    Register isrc, FloatRegister vsrc,
                                                    unsigned vector_length_in_bytes,
                                                    FloatRegister vtmp) {
  assert(opc == Op_MinReductionV || opc == Op_MaxReductionV ||
         opc == Op_UMinReductionV || opc == Op_UMaxReductionV, "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;
  bool is_min;
  bool is_unsigned;
  Condition cond;
  decode_minmax_reduction_opc(opc, &is_min, &is_unsigned, &cond);
  BLOCK_COMMENT("neon_reduce_minmax_integral {");
    if (bt == T_LONG) {
      // No across-lanes min/max for D lanes: compare the two lanes and
      // the scalar input with scalar cmp/csel. "cond" is chosen so that
      // csel picks its first operand when it is the min (or max).
      assert(vtmp == fnoreg, "should be");
      assert(isQ, "should be");
      umov(rscratch1, vsrc, D, 0);
      cmp(isrc, rscratch1);
      csel(dst, isrc, rscratch1, cond);
      umov(rscratch1, vsrc, D, 1);
      cmp(dst, rscratch1);
      csel(dst, dst, rscratch1, cond);
    } else {
      SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
      if (size == T2S) {
        // For T2S (2x32-bit elements), use pairwise instructions because
        // uminv/umaxv/sminv/smaxv don't support arrangement 2S.
        neon_minmaxp(is_unsigned, is_min, vtmp, size, vsrc, vsrc);
      } else {
        // For other sizes, use reduction to scalar instructions.
        neon_minmaxv(is_unsigned, is_min, vtmp, size, vsrc);
      }
      // Extract the reduced lane: zero-extend for T_INT (full word) and
      // for unsigned subword ops, sign-extend for signed subword ops.
      if (bt == T_INT) {
        umov(dst, vtmp, S, 0);
      } else if (is_unsigned) {
        umov(dst, vtmp, elemType_to_regVariant(bt), 0);
      } else {
        smov(dst, vtmp, elemType_to_regVariant(bt), 0);
      }
      // Finally fold in the scalar input.
      cmpw(dst, isrc);
      cselw(dst, dst, isrc, cond);
    }
  BLOCK_COMMENT("} neon_reduce_minmax_integral");
}
2068 
2069 // Vector reduction for integral type with SVE instruction.
2070 // Supported operations are Add, And, Or, Xor, Max, Min, UMax, UMin.
2071 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
                                            FloatRegister src2, PRegister pg, FloatRegister tmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  assert(pg->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(src1, dst);
  // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  switch (opc) {
    case Op_AddReductionVI: {
      // Unsigned across-lanes add into tmp[0]; sign-extension on the
      // subword extract restores Java byte/short wrap-around semantics.
      sve_uaddv(tmp, size, pg, src2);
      if (bt == T_BYTE) {
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxtb);
      } else if (bt == T_SHORT) {
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxth);
      } else {
        umov(dst, tmp, size, 0);
        addw(dst, dst, src1);
      }
      break;
    }
    case Op_AddReductionVL: {
      sve_uaddv(tmp, size, pg, src2);
      umov(dst, tmp, size, 0);
      add(dst, dst, src1);
      break;
    }
    case Op_AndReductionV: {
      // Across-lanes AND, then fold in the scalar input at the
      // appropriate register width.
      sve_andv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        andr(dst, dst, src1);
      } else {
        andw(dst, dst, src1);
      }
      break;
    }
    case Op_OrReductionV: {
      sve_orv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        orr(dst, dst, src1);
      } else {
        orrw(dst, dst, src1);
      }
      break;
    }
    case Op_XorReductionV: {
      sve_eorv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        eor(dst, dst, src1);
      } else {
        eorw(dst, dst, src1);
      }
      break;
    }
    case Op_MaxReductionV:
    case Op_MinReductionV:
    case Op_UMaxReductionV:
    case Op_UMinReductionV: {
      bool is_min;
      bool is_unsigned;
      Condition cond;
      decode_minmax_reduction_opc(opc, &is_min, &is_unsigned, &cond);
      sve_minmaxv(is_unsigned, is_min, tmp, size, pg, src2);
      // Move result from vector to general register
      if (is_unsigned || bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      // Fold in the scalar input: "cond" is chosen so csel keeps dst
      // when it is the min (or max) of the two.
      if (bt == T_LONG) {
        cmp(dst, src1);
        csel(dst, dst, src1, cond);
      } else {
        cmpw(dst, src1);
        cselw(dst, dst, src1, cond);
      }
      break;
    }
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }

  // The logical reductions combined subword lanes at 32-bit width, so
  // re-truncate byte/short results to their element size.
  if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
    if (bt == T_BYTE) {
      sxtb(dst, dst);
    } else if (bt == T_SHORT) {
      sxth(dst, dst);
    }
  }
}
2179 
2180 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
2181 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
2182 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
  uint32_t max_vector_length = Matcher::max_vector_size(bt);
  assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");

  // Set all elements to false if the input "lane_cnt" is zero.
  if (lane_cnt == 0) {
    sve_pfalse(dst);
    return;
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  assert(size != Q, "invalid size");

  // Set all true if "lane_cnt" equals to the max lane count.
  if (lane_cnt == max_vector_length) {
    sve_ptrue(dst, size, /* ALL */ 0b11111);
    return;
  }

  // Fixed numbers for "ptrue".
  // VL1..VL8 are encoded directly as their lane count (patterns 1-8).
  switch(lane_cnt) {
  case 1: /* VL1 */
  case 2: /* VL2 */
  case 3: /* VL3 */
  case 4: /* VL4 */
  case 5: /* VL5 */
  case 6: /* VL6 */
  case 7: /* VL7 */
  case 8: /* VL8 */
    sve_ptrue(dst, size, lane_cnt);
    return;
  case 16:
    sve_ptrue(dst, size, /* VL16 */ 0b01001);
    return;
  case 32:
    sve_ptrue(dst, size, /* VL32 */ 0b01010);
    return;
  case 64:
    sve_ptrue(dst, size, /* VL64 */ 0b01011);
    return;
  case 128:
    sve_ptrue(dst, size, /* VL128 */ 0b01100);
    return;
  case 256:
    sve_ptrue(dst, size, /* VL256 */ 0b01101);
    return;
  default:
    break;
  }

  // Special patterns for "ptrue".
  if (lane_cnt == round_down_power_of_2(max_vector_length)) {
    sve_ptrue(dst, size, /* POW2 */ 0b00000);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
    sve_ptrue(dst, size, /* MUL4 */ 0b11101);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
    sve_ptrue(dst, size, /* MUL3 */ 0b11110);
  } else {
    // Encode to "whileltw" for the remaining cases: lane i of dst is set
    // iff i < lane_cnt, comparing 32-bit counters zr and rscratch1.
    mov(rscratch1, lane_cnt);
    sve_whileltw(dst, size, zr, rscratch1);
  }
}
2246 
2247 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
2248 // Any remaining elements of dst will be filled with zero.
2249 // Clobbers: rscratch1
2250 // Preserves: mask, vzr
void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
                                           FloatRegister vzr, FloatRegister vtmp,
                                           PRegister pgtmp, unsigned vector_length_in_bytes) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  // When called by sve_compress_byte, src and vtmp may be the same register.
  assert_different_registers(dst, src, vzr);
  assert_different_registers(dst, vtmp, vzr);
  assert_different_registers(mask, pgtmp);
  // SVE compact only supports S and D lanes, so SHORT lanes are widened
  // to INT, compacted, and narrowed back; each half of src is processed
  // separately and the two compacted halves are spliced together.
  // high <-- low
  // Example input:   src   = hh gg ff ee dd cc bb aa, one character is 8 bits.
  //                  mask  = 01 00 00 01 01 00 01 01, one character is 1 bit.
  // Expected result: dst   = 00 00 00 hh ee dd bb aa

  // Extend lowest half to type INT.
  // dst   =  00dd  00cc  00bb  00aa
  sve_uunpklo(dst, S, src);
  // pgtmp =  0001  0000  0001  0001
  sve_punpklo(pgtmp, mask);
  // Pack the active elements in size of type INT to the right,
  // and fill the remainings with zero.
  // dst   =  0000  00dd  00bb  00aa
  sve_compact(dst, S, dst, pgtmp);
  // Narrow the result back to type SHORT.
  // dst   = 00 00 00 00 00 dd bb aa
  sve_uzp1(dst, H, dst, vzr);

  // Return if the vector length is no more than MaxVectorSize/2, since the
  // highest half is invalid.
  if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
    return;
  }

  // Count the active elements of lowest half.
  // rscratch1 = 3
  // Must happen before pgtmp is reused for the high half below.
  sve_cntp(rscratch1, S, ptrue, pgtmp);

  // Repeat to the highest half.
  // pgtmp =  0001  0000  0000  0001
  sve_punpkhi(pgtmp, mask);
  // vtmp  =  00hh  00gg  00ff  00ee
  sve_uunpkhi(vtmp, S, src);
  // vtmp  =  0000  0000  00hh  00ee
  sve_compact(vtmp, S, vtmp, pgtmp);
  // vtmp  = 00 00 00 00 00 00 hh ee
  sve_uzp1(vtmp, H, vtmp, vzr);

  // pgtmp = 00 00 00 00 00 01 01 01
  sve_whilelt(pgtmp, H, zr, rscratch1);
  // Compressed low:  dst  = 00 00 00 00 00 dd bb aa
  // Compressed high: vtmp = 00 00 00 00 00 00 hh ee
  // Combine the compressed low with the compressed high:
  //                  dst  = 00 00 00 hh ee dd bb aa
  sve_splice(dst, H, pgtmp, vtmp);
}
2305 
2306 // Clobbers: rscratch1, rscratch2
2307 // Preserves: src, mask
void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
                                          FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
                                          PRegister ptmp, PRegister pgtmp, unsigned vector_length_in_bytes) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3);
  assert_different_registers(mask, ptmp, pgtmp);
  // BYTE lanes are widened to SHORT and compacted via sve_compress_short
  // (which in turn widens to INT for sve_compact), then narrowed back.
  // high <-- low
  // Example input:   src   = q p n m l k j i h g f e d c b a, one character is 8 bits.
  //                  mask  = 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 1, one character is 1 bit.
  // Expected result: dst   = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
  FloatRegister vzr = vtmp3;
  sve_dup(vzr, B, 0);

  // Extend lowest half to type SHORT.
  // vtmp1 =  0h  0g  0f  0e  0d  0c  0b  0a
  sve_uunpklo(vtmp1, H, src);
  // ptmp  =  00  01  00  00  00  01  00  01
  sve_punpklo(ptmp, mask);
  // Pack the active elements in size of type SHORT to the right,
  // and fill the remainings with zero.
  // dst   =  00  00  00  00  00  0g  0c  0a
  // The widened data is twice as long; cap at MaxVectorSize since that is
  // the hardware vector register width.
  unsigned extended_size = vector_length_in_bytes << 1;
  sve_compress_short(dst, vtmp1, ptmp, vzr, vtmp2, pgtmp, extended_size > MaxVectorSize ? MaxVectorSize : extended_size);
  // Narrow the result back to type BYTE.
  // dst   = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
  sve_uzp1(dst, B, dst, vzr);

  // Return if the vector length is no more than MaxVectorSize/2, since the
  // highest half is invalid.
  if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
    return;
  }
  // Count the active elements of lowest half.
  // rscratch2 = 3
  // Must happen before ptmp is reused for the high half below.
  sve_cntp(rscratch2, H, ptrue, ptmp);

  // Repeat to the highest half.
  // ptmp  =  00  01  00  00  00  00  00  01
  sve_punpkhi(ptmp, mask);
  // vtmp2 =  0q  0p  0n  0m  0l  0k  0j  0i
  sve_uunpkhi(vtmp2, H, src);
  // vtmp1 =  00  00  00  00  00  00  0p  0i
  // extended_size - MaxVectorSize is the significant length remaining in
  // the widened high half.
  sve_compress_short(vtmp1, vtmp2, ptmp, vzr, vtmp2, pgtmp, extended_size - MaxVectorSize);
  // vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
  sve_uzp1(vtmp1, B, vtmp1, vzr);

  // ptmp  = 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
  sve_whilelt(ptmp, B, zr, rscratch2);
  // Compressed low:  dst   = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
  // Compressed high: vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
  // Combine the compressed low with the compressed high:
  //                  dst   = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
  sve_splice(dst, B, ptmp, vtmp1);
}
2362 
2363 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2364   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2365   SIMD_Arrangement size = isQ ? T16B : T8B;
2366   if (bt == T_BYTE) {
2367     rbit(dst, size, src);
2368   } else {
2369     neon_reverse_bytes(dst, src, bt, isQ);
2370     rbit(dst, size, dst);
2371   }
2372 }
2373 
2374 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2375   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2376   SIMD_Arrangement size = isQ ? T16B : T8B;
2377   switch (bt) {
2378     case T_BYTE:
2379       if (dst != src) {
2380         orr(dst, size, src, src);
2381       }
2382       break;
2383     case T_SHORT:
2384       rev16(dst, size, src);
2385       break;
2386     case T_INT:
2387       rev32(dst, size, src);
2388       break;
2389     case T_LONG:
2390       rev64(dst, size, src);
2391       break;
2392     default:
2393       assert(false, "unsupported");
2394       ShouldNotReachHere();
2395   }
2396 }
2397 
// VectorRearrange implementation for short/int/float/long/double types with NEON
// instructions. For VectorRearrange short/int/float, we use NEON tbl instruction.
// But since it supports bytes table only, we need to lookup 2/4 bytes as a group.
// For VectorRearrange long/double, we compare the shuffle input with iota indices,
// and use bsl to implement the operation.
// Clobbers: rscratch1 (on the T_LONG/T_DOUBLE path only).
void C2_MacroAssembler::neon_rearrange_hsd(FloatRegister dst, FloatRegister src,
                                           FloatRegister shuffle, FloatRegister tmp,
                                           BasicType bt, bool isQ) {
  assert_different_registers(dst, src, shuffle, tmp);
  SIMD_Arrangement size1 = isQ ? T16B : T8B;
  SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);

  // Here is an example that rearranges a NEON vector with 4 ints:
  // Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1]
  //   1. We assume the shuffle input is Vi int[2, 3, 0, 1].
  //   2. Multiply Vi int[2, 3, 0, 1] with constant int vector
  //      [0x04040404, 0x04040404, 0x04040404, 0x04040404], and get
  //      tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404].
  //   3. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100],
  //      and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504]
  //   4. Use Vm as index register, and use V1 as table register.
  //      Then get V2 as the result by tbl NEON instructions.
  switch (bt) {
    case T_SHORT:
      // Same scheme as the worked int example above, but with 2-byte groups:
      // scale each short index by 2, then fan out with the 0x0100 pattern.
      mov(tmp, size1, 0x02);
      mulv(dst, size2, shuffle, tmp);
      mov(tmp, size2, 0x0100);
      addv(dst, size1, dst, tmp);
      tbl(dst, size1, src, 1, dst);
      break;
    case T_INT:
    case T_FLOAT:
      // 4-byte groups, exactly as in the worked example above.
      mov(tmp, size1, 0x04);
      mulv(dst, size2, shuffle, tmp);
      mov(tmp, size2, 0x03020100);
      addv(dst, size1, dst, tmp);
      tbl(dst, size1, src, 1, dst);
      break;
    case T_LONG:
    case T_DOUBLE:
      // Load the iota indices for Long type. The indices are ordered by
      // type B/S/I/L/F/D, and the offset between two types is 16; Hence
      // the offset for L is 48.
      lea(rscratch1,
          ExternalAddress(StubRoutines::aarch64::vector_iota_indices() + 48));
      ldrq(tmp, rscratch1);
      // Check whether the input "shuffle" is the same with iota indices.
      // Return "src" if true, otherwise swap the two elements of "src".
      cm(EQ, dst, size2, shuffle, tmp);
      ext(tmp, size1, src, src, 8);
      bsl(dst, size1, src, tmp);
      break;
    default:
      assert(false, "unsupported element type");
      ShouldNotReachHere();
  }
}
2455 
2456 // Extract a scalar element from an sve vector at position 'idx'.
2457 // The input elements in src are expected to be of integral type.
2458 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2459                                              int idx, FloatRegister vtmp) {
2460   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2461   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2462   if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
2463     if (bt == T_INT || bt == T_LONG) {
2464       umov(dst, src, size, idx);
2465     } else {
2466       smov(dst, src, size, idx);
2467     }
2468   } else {
2469     sve_orr(vtmp, src, src);
2470     sve_ext(vtmp, vtmp, idx << size);
2471     if (bt == T_INT || bt == T_LONG) {
2472       umov(dst, vtmp, size, 0);
2473     } else {
2474       smov(dst, vtmp, size, 0);
2475     }
2476   }
2477 }
2478 
2479 // java.lang.Math::round intrinsics
2480 
// Clobbers: rscratch1, rflags
//
// Vector variant of java.lang.Math::round (see the section comment above).
// Two candidate results are produced and merged per lane:
//   - tmp1 = fcvtms(src + 0.5): floor of src + 0.5
//   - dst  = fcvtas(src): round to nearest, ties away from zero
// A per-lane mask built from an unsigned bit-pattern compare of -src against
// 2^23 (floats) / 2^52 (doubles) selects between the two.
void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                          FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
  assert_different_registers(tmp1, tmp2, tmp3, src, dst);
  switch (T) {
    case T2S:
    case T4S:
      fmovs(tmp1, T, 0.5f);
      // 2^23: threshold bit pattern for the lane-selection compare below.
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case T2D:
      fmovd(tmp1, T, 0.5);
      // 2^52: threshold bit pattern for the lane-selection compare below.
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
  }
  fadd(tmp1, T, tmp1, src);
  fcvtms(tmp1, T, tmp1);
  // tmp1 = floor(src + 0.5, ties to even)

  fcvtas(dst, T, src);
  // dst = round(src), ties to away

  // Build the selection mask: negate src's sign bit, then compare the lanes
  // as unsigned integers against the threshold. NOTE(review): this bit-pattern
  // trick appears to set the mask for positive inputs, NaNs and large-magnitude
  // values, leaving only the lanes where the floor(src + 0.5) form is needed
  // unset — confirm against the fcvtas/fcvtms semantics above.
  fneg(tmp3, T, src);
  dup(tmp2, T, rscratch1);
  cm(HS, tmp3, T, tmp3, tmp2);
  // tmp3 is now a set of flags

  // Where the mask is clear, insert the floor(src + 0.5) result from tmp1;
  // where set, keep the ties-away result already in dst.
  bif(dst, T16B, tmp1, tmp3);
  // result in dst
}
2513 
// Clobbers: rscratch1, rflags
//
// SVE variant of java.lang.Math::round (see the section comment above).
// dst is first rounded ties-away; lanes selected by a bit-pattern compare are
// then recomputed as floor(src + 0.5) under the predicate, and the result is
// converted to integers.
void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                         FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(tmp1, tmp2, src, dst);

  switch (T) {
    case S:
      // 2^23: threshold bit pattern for the lane-selection compare below.
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case D:
      // 2^52: threshold bit pattern for the lane-selection compare below.
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == S || T == D, "invalid register variant");
  }

  sve_frinta(dst, T, ptrue, src);
  // dst = round(src), ties to away

  Label none;

  // Select the lanes that need the floor(src + 0.5) correction: compare the
  // bit pattern of -src, as unsigned integers, against the 2^23/2^52
  // threshold. NOTE(review): the operand order (tmp2, tmp1) is the reverse of
  // the NEON variant's compare — confirm the selected lanes are the ones
  // below the threshold.
  sve_fneg(tmp1, T, ptrue, src);
  sve_dup(tmp2, T, rscratch1);
  sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
  // EQ after the SVE compare means no lane was selected (label "none"):
  // skip the correction entirely.
  br(EQ, none);
  {
    // For the selected lanes only: dst = floor(src + 0.5).
    sve_cpy(tmp1, T, pgtmp, 0.5);
    sve_fadd(tmp1, T, pgtmp, src);
    sve_frintm(dst, T, pgtmp, tmp1);
    // dst = floor(src + 0.5, ties to even)
  }
  bind(none);

  // Convert the rounded floating-point values to integers.
  sve_fcvtzs(dst, T, ptrue, dst, T);
  // result in dst
}
2551 
// Vector signum: per lane, produce +-1.0 carrying the sign of src for nonzero
// finite inputs, and pass src through unchanged (+-0.0 or NaN) otherwise.
// Assumes 'zero' holds 0.0 and 'one' holds +1.0 in every lane (caller contract).
void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                           FloatRegister one, SIMD_Arrangement T) {
  assert_different_registers(dst, src, zero, one);
  assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");

  // Absolute compare greater-than: lanes with |src| > 0 become all-ones.
  facgt(dst, T, src, zero);
  ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
  // Bit select: for active lanes the magnitude bits come from 'one' and the
  // sign bit stays from src, yielding +-1.0; inactive lanes keep src.
  bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
}
2561 
// Vector signum for SVE: per lane, produce +-1.0 carrying the sign of src for
// nonzero finite inputs, and pass src through unchanged (+-0.0 or NaN) otherwise.
// Assumes 'zero' holds 0.0 in every lane (caller contract); 'one' is unused here
// beyond register bookkeeping. Clobbers vtmp and pgtmp.
void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                          FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
    assert_different_registers(dst, src, zero, one, vtmp);
    assert(pgtmp->is_governing(), "This register has to be a governing predicate register");

    // Work on a copy of src so the sign bit can be isolated in place.
    sve_orr(vtmp, src, src);
    sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise
    switch (T) {
    case S:
      sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
      sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
                                        // on the sign of the float value
      break;
    case D:
      // Same as the float case: keep only the sign bit, then OR in +1.0.
      sve_and(vtmp, T, min_jlong);
      sve_orr(vtmp, T, jlong_cast(1.0));
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
    }
    sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
                                       // Result in dst
}
2586 
2587 bool C2_MacroAssembler::in_scratch_emit_size() {
2588   if (ciEnv::current()->task() != nullptr) {
2589     PhaseOutput* phase_output = Compile::current()->output();
2590     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2591       return true;
2592     }
2593   }
2594   return MacroAssembler::in_scratch_emit_size();
2595 }
2596 
// Out-of-line failure handler for verify_int_in_range: report the violating
// value and the expected [lo, hi] range, then abort the VM.
static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
  fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
}
2600 
// Emit a runtime check that rval lies within the int type's [lo, hi] range.
// On violation, control branches to an aborting call into the VM.
// Clobbers: rtmp, rflags; c_rarg0..c_rarg3 on the failure path.
void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register rval, Register rtmp) {
  assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
  if (t == TypeInt::INT) {
    // The full int range needs no check.
    return;
  }

  BLOCK_COMMENT("verify_int_in_range {");
  Label L_success, L_failure;

  jint lo = t->_lo;
  jint hi = t->_hi;

  // Only emit the comparisons that are not vacuously true.
  if (lo != min_jint) {
    subsw(rtmp, rval, lo);
    br(Assembler::LT, L_failure);
  }
  if (hi != max_jint) {
    subsw(rtmp, rval, hi);
    br(Assembler::GT, L_failure);
  }
  b(L_success);

  bind(L_failure);
  // Marshal the arguments for the aborting runtime call.
  movw(c_rarg0, idx);
  mov(c_rarg1, rval);
  movw(c_rarg2, lo);
  movw(c_rarg3, hi);
  // Restore rfp (see reconstruct_frame_pointer) before calling out.
  reconstruct_frame_pointer(rtmp);
  rt_call(CAST_FROM_FN_PTR(address, abort_verify_int_in_range), rtmp);
  hlt(0);

  bind(L_success);
  BLOCK_COMMENT("} verify_int_in_range");
}
2635 
// Out-of-line failure handler for verify_long_in_range: report the violating
// value and the expected [lo, hi] range, then abort the VM.
static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
  fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
}
2639 
// Emit a runtime check that rval lies within the long type's [lo, hi] range.
// On violation, control branches to an aborting call into the VM.
// Clobbers: rtmp, rflags; c_rarg0..c_rarg3 on the failure path.
void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register rval, Register rtmp) {
  assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
  if (t == TypeLong::LONG) {
    // The full long range needs no check.
    return;
  }

  BLOCK_COMMENT("verify_long_in_range {");
  Label L_success, L_failure;

  jlong lo = t->_lo;
  jlong hi = t->_hi;

  // Only emit the comparisons that are not vacuously true.
  if (lo != min_jlong) {
    subs(rtmp, rval, lo);
    br(Assembler::LT, L_failure);
  }
  if (hi != max_jlong) {
    subs(rtmp, rval, hi);
    br(Assembler::GT, L_failure);
  }
  b(L_success);

  bind(L_failure);
  // Marshal the arguments for the aborting runtime call.
  movw(c_rarg0, idx);
  mov(c_rarg1, rval);
  mov(c_rarg2, lo);
  mov(c_rarg3, hi);
  // Restore rfp (see reconstruct_frame_pointer) before calling out.
  reconstruct_frame_pointer(rtmp);
  rt_call(CAST_FROM_FN_PTR(address, abort_verify_long_in_range), rtmp);
  hlt(0);

  bind(L_success);
  BLOCK_COMMENT("} verify_long_in_range");
}
2674 
2675 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
2676   const int framesize = Compile::current()->output()->frame_size_in_bytes();
2677   if (PreserveFramePointer) {
2678     // frame pointer is valid
2679 #ifdef ASSERT
2680     // Verify frame pointer value in rfp.
2681     add(rtmp, sp, framesize - 2 * wordSize);
2682     Label L_success;
2683     cmp(rfp, rtmp);
2684     br(Assembler::EQ, L_success);
2685     stop("frame pointer mismatch");
2686     bind(L_success);
2687 #endif // ASSERT
2688   } else {
2689     add(rfp, sp, framesize - 2 * wordSize);
2690   }
2691 }
2692 
2693 // Selects elements from two source vectors (src1, src2) based on index values in the index register
2694 // using Neon instructions and places it in the destination vector element corresponding to the
2695 // index vector element. Each index in the index register must be in the range - [0, 2 * NUM_ELEM),
2696 // where NUM_ELEM is the number of BasicType elements per vector.
2697 // If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
2698 // Otherwise, selects src2[idx – NUM_ELEM]
2699 void C2_MacroAssembler::select_from_two_vectors_neon(FloatRegister dst, FloatRegister src1,
2700                                                      FloatRegister src2, FloatRegister index,
2701                                                      FloatRegister tmp, unsigned vector_length_in_bytes) {
2702   assert_different_registers(dst, src1, src2, tmp);
2703   SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
2704 
2705   if (vector_length_in_bytes == 16) {
2706     assert(UseSVE <= 1, "sve must be <= 1");
2707     assert(src1->successor() == src2, "Source registers must be ordered");
2708     // If the vector length is 16B, then use the Neon "tbl" instruction with two vector table
2709     tbl(dst, size, src1, 2, index);
2710   } else { // vector length == 8
2711     assert(UseSVE == 0, "must be Neon only");
2712     // We need to fit both the source vectors (src1, src2) in a 128-bit register because the
2713     // Neon "tbl" instruction supports only looking up 16B vectors. We then use the Neon "tbl"
2714     // instruction with one vector lookup
2715     ins(tmp, D, src1, 0, 0);
2716     ins(tmp, D, src2, 1, 0);
2717     tbl(dst, size, tmp, 1, index);
2718   }
2719 }
2720 
2721 // Selects elements from two source vectors (src1, src2) based on index values in the index register
2722 // using SVE/SVE2 instructions and places it in the destination vector element corresponding to the
2723 // index vector element. Each index in the index register must be in the range - [0, 2 * NUM_ELEM),
2724 // where NUM_ELEM is the number of BasicType elements per vector.
2725 // If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
2726 // Otherwise, selects src2[idx – NUM_ELEM]
2727 void C2_MacroAssembler::select_from_two_vectors_sve(FloatRegister dst, FloatRegister src1,
2728                                                     FloatRegister src2, FloatRegister index,
2729                                                     FloatRegister tmp, SIMD_RegVariant T,
2730                                                     unsigned vector_length_in_bytes) {
2731   assert_different_registers(dst, src1, src2, index, tmp);
2732 
2733   if (vector_length_in_bytes == 8) {
2734     // We need to fit both the source vectors (src1, src2) in a single vector register because the
2735     // SVE "tbl" instruction is unpredicated and works on the entire vector which can lead to
2736     // incorrect results if each source vector is only partially filled. We then use the SVE "tbl"
2737     // instruction with one vector lookup
2738     assert(UseSVE >= 1, "sve must be >= 1");
2739     ins(tmp, D, src1, 0, 0);
2740     ins(tmp, D, src2, 1, 0);
2741     sve_tbl(dst, T, tmp, index);
2742   } else {  // UseSVE == 2 and vector_length_in_bytes > 8
2743     // If the vector length is > 8, then use the SVE2 "tbl" instruction with the two vector table.
2744     // The assertion - vector_length_in_bytes == MaxVectorSize ensures that this operation
2745     // is not executed on machines where vector_length_in_bytes < MaxVectorSize
2746     // with the only exception of 8B vector length.
2747     assert(UseSVE == 2 && vector_length_in_bytes == MaxVectorSize, "must be");
2748     assert(src1->successor() == src2, "Source registers must be ordered");
2749     sve_tbl(dst, T, src1, src2, index);
2750   }
2751 }
2752 
2753 void C2_MacroAssembler::select_from_two_vectors(FloatRegister dst, FloatRegister src1,
2754                                                 FloatRegister src2, FloatRegister index,
2755                                                 FloatRegister tmp, BasicType bt,
2756                                                 unsigned vector_length_in_bytes) {
2757 
2758   assert_different_registers(dst, src1, src2, index, tmp);
2759 
2760   // The cases that can reach this method are -
2761   // - UseSVE = 0/1, vector_length_in_bytes = 8 or 16, excluding double and long types
2762   // - UseSVE = 2, vector_length_in_bytes >= 8, for all types
2763   //
2764   // SVE/SVE2 tbl instructions are generated when UseSVE = 1 with vector_length_in_bytes = 8
2765   // and UseSVE = 2 with vector_length_in_bytes >= 8
2766   //
2767   // Neon instructions are generated when UseSVE = 0 with vector_length_in_bytes = 8 or 16 and
2768   // UseSVE = 1 with vector_length_in_bytes = 16
2769 
2770   if ((UseSVE == 1 && vector_length_in_bytes == 8) || UseSVE == 2) {
2771     SIMD_RegVariant T = elemType_to_regVariant(bt);
2772     select_from_two_vectors_sve(dst, src1, src2, index, tmp, T, vector_length_in_bytes);
2773     return;
2774   }
2775 
2776   // The only BasicTypes that can reach here are T_SHORT, T_BYTE, T_INT and T_FLOAT
2777   assert(bt != T_DOUBLE && bt != T_LONG, "unsupported basic type");
2778   assert(vector_length_in_bytes <= 16, "length_in_bytes must be <= 16");
2779 
2780   bool isQ = vector_length_in_bytes == 16;
2781 
2782   SIMD_Arrangement size1 = isQ ? T16B : T8B;
2783   SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2784 
2785   // Neon "tbl" instruction only supports byte tables, so we need to look at chunks of
2786   // 2B for selecting shorts or chunks of 4B for selecting ints/floats from the table.
2787   // The index values in "index" register are in the range of [0, 2 * NUM_ELEM) where NUM_ELEM
2788   // is the number of elements that can fit in a vector. For ex. for T_SHORT with 64-bit vector length,
2789   // the indices can range from [0, 8).
2790   // As an example with 64-bit vector length and T_SHORT type - let index = [2, 5, 1, 0]
2791   // Move a constant 0x02 in every byte of tmp - tmp = [0x0202, 0x0202, 0x0202, 0x0202]
2792   // Multiply index vector with tmp to yield - dst = [0x0404, 0x0a0a, 0x0202, 0x0000]
2793   // Move a constant 0x0100 in every 2B of tmp - tmp = [0x0100, 0x0100, 0x0100, 0x0100]
2794   // Add the multiplied result to the vector in tmp to obtain the byte level
2795   // offsets - dst = [0x0504, 0x0b0a, 0x0302, 0x0100]
2796   // Use these offsets in the "tbl" instruction to select chunks of 2B.
2797 
2798   if (bt == T_BYTE) {
2799     select_from_two_vectors_neon(dst, src1, src2, index, tmp, vector_length_in_bytes);
2800   } else {
2801     int elem_size = (bt == T_SHORT) ? 2 : 4;
2802     uint64_t tbl_offset = (bt == T_SHORT) ? 0x0100u : 0x03020100u;
2803 
2804     mov(tmp, size1, elem_size);
2805     mulv(dst, size2, index, tmp);
2806     mov(tmp, size2, tbl_offset);
2807     addv(dst, size1, dst, tmp); // "dst" now contains the processed index elements
2808                                 // to select a set of 2B/4B
2809     select_from_two_vectors_neon(dst, src1, src2, dst, tmp, vector_length_in_bytes);
2810   }
2811 }
2812 
2813 // Vector expand implementation. Elements from the src vector are expanded into
2814 // the dst vector under the control of the vector mask.
2815 // Since there are no native instructions directly corresponding to expand before
2816 // SVE2p2, the following implementations mainly leverages the TBL instruction to
2817 // implement expand. To compute the index input for TBL, the prefix sum algorithm
2818 // (https://en.wikipedia.org/wiki/Prefix_sum) is used. The same algorithm is used
2819 // for NEON and SVE, but with different instructions where appropriate.
2820 
// Vector expand implementation for NEON.
//
// Elements of "src" are placed, in order, into the dst lanes whose mask lane
// is set; lanes with a clear mask are zeroed. "tmp1" and "tmp2" are clobbered.
// Mask lanes are expected to be all-ones (-1) or all-zeros.
//
// An example of 128-bit Byte vector:
//   Data direction: high <== low
//   Input:
//         src   = g  f  e  d  c  b  a  9  8  7  6  5  4  3  2  1
//         mask  = 0  0 -1 -1  0  0 -1 -1  0  0 -1 -1  0  0 -1 -1
//   Expected result:
//         dst   = 0  0  8  7  0  0  6  5  0  0  4  3  0  0  2  1
void C2_MacroAssembler::vector_expand_neon(FloatRegister dst, FloatRegister src, FloatRegister mask,
                                           FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
                                           int vector_length_in_bytes) {
  assert(vector_length_in_bytes <= 16, "the vector length in bytes for NEON must be <= 16");
  assert_different_registers(dst, src, mask, tmp1, tmp2);
  // Since the TBL instruction only supports byte table, we need to
  // compute indices in byte type for all types.
  SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
  // tmp1 =  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  dup(tmp1, size, zr);
  // dst  =  0  0  1  1  0  0  1  1  0  0  1  1  0  0  1  1
  negr(dst, size, mask);
  // Calculate vector index for TBL with prefix sum algorithm.
  // Each step shifts the running sums up by "i" bytes (zero-filling from
  // tmp1) and accumulates, doubling the covered span every iteration.
  // dst  =  8  8  8  7  6  6  6  5  4  4  4  3  2  2  2  1
  for (int i = 1; i < vector_length_in_bytes; i <<= 1) {
    ext(tmp2, size, tmp1, dst, vector_length_in_bytes - i);
    addv(dst, size, tmp2, dst);
  }
  // tmp2 =  0  0 -1 -1  0  0 -1 -1  0  0 -1 -1  0  0 -1 -1
  orr(tmp2, size, mask, mask);
  // Keep the prefix sums only in active lanes; inactive lanes take 0 from tmp1.
  // tmp2 =  0  0  8  7  0  0  6  5  0  0  4  3  0  0  2  1
  bsl(tmp2, size, dst, tmp1);
  // tmp1 =  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  movi(tmp1, size, 1);
  // Convert the 1-based sums to 0-based TBL indices; inactive lanes become -1,
  // which is out of range for TBL and therefore produces 0 in that lane.
  // dst  = -1 -1  7  6 -1 -1  5  4 -1 -1  3  2 -1 -1  1  0
  subv(dst, size, tmp2, tmp1);
  // dst  =  0  0  8  7  0  0  6  5  0  0  4  3  0  0  2  1
  tbl(dst, size, src, 1, dst);
}
2859 
// Vector expand implementation for SVE.
//
// Elements of "src" are placed, in order, into the dst lanes that are active
// in the predicate "pg"; inactive lanes are zeroed. "tmp1" and "tmp2" are
// clobbered.
//
// An example of 128-bit Short vector:
//   Data direction: high <== low
//   Input:
//         src   = gf ed cb a9 87 65 43 21
//         pg    = 00 01 00 01 00 01 00 01
//   Expected result:
//         dst   = 00 87 00 65 00 43 00 21
void C2_MacroAssembler::vector_expand_sve(FloatRegister dst, FloatRegister src, PRegister pg,
                                          FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
                                          int vector_length_in_bytes) {
  assert(UseSVE > 0, "expand implementation only for SVE");
  assert_different_registers(dst, src, tmp1, tmp2);
  SIMD_RegVariant size = elemType_to_regVariant(bt);

  // tmp1 = 00 00 00 00 00 00 00 00
  sve_dup(tmp1, size, 0);
  sve_movprfx(tmp2, tmp1);
  // tmp2 = 00 01 00 01 00 01 00 01   (1 in active lanes, merged over zeros)
  sve_cpy(tmp2, size, pg, 1, true);
  // Calculate vector index for TBL with prefix sum algorithm.
  // Each step shifts the running sums up by "i" bytes (zero-filling from
  // tmp1 via the movprfx'd dst) and accumulates.
  // tmp2 = 04 04 03 03 02 02 01 01
  for (int i = type2aelembytes(bt); i < vector_length_in_bytes; i <<= 1) {
    sve_movprfx(dst, tmp1);
    // The EXT instruction operates on the full-width sve register. The correct
    // index calculation method is:
    // vector_length_in_bytes - i + MaxVectorSize - vector_length_in_bytes =>
    // MaxVectorSize - i.
    sve_ext(dst, tmp2, MaxVectorSize - i);
    sve_add(tmp2, size, dst, tmp2);
  }
  // Keep the prefix sums only in active lanes; inactive lanes take 0 from tmp1.
  // dst  = 00 04 00 03 00 02 00 01
  sve_sel(dst, size, pg, tmp2, tmp1);
  // Convert the 1-based sums to 0-based TBL indices; inactive lanes become -1,
  // which is out of range for TBL and therefore produces 0 in that lane.
  // dst  = -1 03 -1 02 -1 01 -1 00
  sve_sub(dst, size, 1);
  // dst  = 00 87 00 65 00 43 00 21
  sve_tbl(dst, size, src, dst);
}
2899 
2900 // Optimized SVE cpy (imm, zeroing) instruction.
2901 //
2902 // `movi; cpy(imm, merging)` and `cpy(imm, zeroing)` have the same
2903 // functionality, but test results show that `movi; cpy(imm, merging)` has
2904 // higher throughput on some microarchitectures. This would depend on
2905 // microarchitecture and so may vary between implementations.
2906 void C2_MacroAssembler::sve_cpy(FloatRegister dst, SIMD_RegVariant T,
2907                                 PRegister pg, int imm8, bool isMerge) {
2908   if (VM_Version::prefer_sve_merging_mode_cpy() && !isMerge) {
2909     // Generates a NEON instruction `movi V<dst>.2d, #0`.
2910     // On AArch64, Z and V registers alias in the low 128 bits, so V<dst> is
2911     // the low 128 bits of Z<dst>. A write to V<dst> also clears all bits of
2912     // Z<dst> above 128, so this `movi` instruction effectively zeroes the
2913     // entire Z<dst> register. According to the Arm Software Optimization
2914     // Guide, `movi` is zero latency.
2915     movi(dst, T2D, 0);
2916     isMerge = true;
2917   }
2918   Assembler::sve_cpy(dst, T, pg, imm8, isMerge);
2919 }