1 /*
   2  * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright 2026 Arm Limited and/or its affiliates.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "opto/c2_MacroAssembler.hpp"
  29 #include "opto/compile.hpp"
  30 #include "opto/intrinsicnode.hpp"
  31 #include "opto/matcher.hpp"
  32 #include "opto/output.hpp"
  33 #include "opto/subnode.hpp"
  34 #include "runtime/objectMonitorTable.hpp"
  35 #include "runtime/stubRoutines.hpp"
  36 #include "runtime/synchronizer.hpp"
  37 #include "utilities/globalDefinitions.hpp"
  38 #include "utilities/powerOfTwo.hpp"
  39 
  40 #ifdef PRODUCT
  41 #define BLOCK_COMMENT(str) /* nothing */
  42 #define STOP(error) stop(error)
  43 #else
  44 #define BLOCK_COMMENT(str) block_comment(str)
  45 #define STOP(error) block_comment(error); stop(error)
  46 #endif
  47 
  48 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  49 
  50 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
  51 
  52 void C2_MacroAssembler::entry_barrier() {
  53   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  54   // Dummy labels for just measuring the code size
  55   Label dummy_slow_path;
  56   Label dummy_continuation;
  57   Label dummy_guard;
  58   Label* slow_path = &dummy_slow_path;
  59   Label* continuation = &dummy_continuation;
  60   Label* guard = &dummy_guard;
  61   if (!Compile::current()->output()->in_scratch_emit_size()) {
  62     // Use real labels from actual stub when not emitting code for the purpose of measuring its size
  63     C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
  64     Compile::current()->output()->add_stub(stub);
  65     slow_path = &stub->entry();
  66     continuation = &stub->continuation();
  67     guard = &stub->guard();
  68   }
  69   // In the C2 code, we move the non-hot part of nmethod entry barriers out-of-line to a stub.
  70   bs->nmethod_entry_barrier(this, slow_path, continuation, guard);
  71 }
  72 
// jdk.internal.util.ArraysSupport.vectorizedHashCode
//
// Computes the Java polynomial hash of the cnt elements starting at ary,
// folding each element into result as result = result * 31 + element.
// result is expected to hold the initial hash on entry (it is not cleared
// here). Counts at or above a small per-type threshold are delegated to the
// SIMD stub (large_arrays_hashcode); the remainder is handled by an unrolled
// scalar loop entered via a computed branch.
//
//   ary    - base address of the first element (post-incremented by the loop)
//   cnt    - element count (32-bit)
//   result - in/out hash accumulator
//   vdata0..vdata3, vmul0..vmul3, vpow, vpowm - SIMD&FP registers, presumably
//            consumed by the stub (see ARRAYS_HASHCODE_REGISTERS) — not used
//            by the scalar path here
//   eltype - T_BOOLEAN, T_BYTE, T_CHAR, T_SHORT or T_INT
//
// Returns pc() on success, or nullptr if the trampoline call to the stub
// could not be emitted (code cache full).
// Clobbers: rscratch1, rscratch2, rflags.
address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
                                           FloatRegister vdata0, FloatRegister vdata1,
                                           FloatRegister vdata2, FloatRegister vdata3,
                                           FloatRegister vmul0, FloatRegister vmul1,
                                           FloatRegister vmul2, FloatRegister vmul3,
                                           FloatRegister vpow, FloatRegister vpowm,
                                           BasicType eltype) {
  ARRAYS_HASHCODE_REGISTERS;

  Register tmp1 = rscratch1, tmp2 = rscratch2;

  // Note: STUB_SWITCH and STUB_SWITCH_OUT are declared but not bound in the
  // code visible here.
  Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE;

  // Vectorization factor. Number of array elements loaded to one SIMD&FP registers by the stubs. We
  // use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible to
  // use 4H for chars and shorts instead, but using 8H gives better performance.
  const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8
                    : eltype == T_CHAR || eltype == T_SHORT ? 8
                    : eltype == T_INT                       ? 4
                                                            : 0;
  guarantee(vf, "unsupported eltype");

  // Unroll factor for the scalar loop below. The value is chosen based on performance analysis.
  const size_t unroll_factor = 4;

  switch (eltype) {
  case T_BOOLEAN:
    BLOCK_COMMENT("arrays_hashcode(unsigned byte) {");
    break;
  case T_CHAR:
    BLOCK_COMMENT("arrays_hashcode(char) {");
    break;
  case T_BYTE:
    BLOCK_COMMENT("arrays_hashcode(byte) {");
    break;
  case T_SHORT:
    BLOCK_COMMENT("arrays_hashcode(short) {");
    break;
  case T_INT:
    BLOCK_COMMENT("arrays_hashcode(int) {");
    break;
  default:
    ShouldNotReachHere();
  }

  // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop
  // implemented by the stub executes just once. Call the stub only if at least two iterations will
  // be executed.
  const size_t large_threshold = eltype == T_INT ? vf * 2 : vf;
  cmpw(cnt, large_threshold);
  br(Assembler::HS, LARGE);

  bind(TAIL);

  // The andr performs cnt % uf where uf = unroll_factor. The subtract shifted by 3 offsets past
  // uf - (cnt % uf) pairs of load + madd insns i.e. it only executes cnt % uf load + madd pairs.
  // Iteration eats up the remainder, uf elements at a time.
  assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC");
  andr(tmp2, cnt, unroll_factor - 1);
  adr(tmp1, BR_BASE);
  // For Cortex-A53 offset is 4 because 2 nops are generated.
  sub(tmp1, tmp1, tmp2, ext::sxtw, VM_Version::supports_a53mac() ? 4 : 3);
  movw(tmp2, 0x1f); // 31, the hash multiplier
  br(tmp1);         // computed jump into the middle of the unrolled loop below

  bind(LOOP);
  for (size_t i = 0; i < unroll_factor; ++i) {
    load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype);
    maddw(result, result, tmp2, tmp1); // result = result * 31 + element
    // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
    // Generate 2nd nop to have 4 instructions per iteration.
    if (VM_Version::supports_a53mac()) {
      nop();
    }
  }
  bind(BR_BASE);
  subsw(cnt, cnt, unroll_factor);
  br(Assembler::HS, LOOP); // loop while at least uf elements remained

  b(DONE);

  bind(LARGE);

  // Hash the bulk of the array in the per-type SIMD stub.
  RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype));
  assert(stub.target() != nullptr, "array_hashcode stub has not been generated");
  address tpc = trampoline_call(stub);
  if (tpc == nullptr) {
    // Trampoline emission failed (code cache full); bail out.
    DEBUG_ONLY(reset_labels(TAIL, BR_BASE));
    postcond(pc() == badAddress);
    return nullptr;
  }

  bind(DONE);

  BLOCK_COMMENT("} // arrays_hashcode");

  postcond(pc() != badAddress);
  return pc();
}
 173 
// Emits the C2 fast path for monitorenter.
//
// Tries lightweight locking via the per-thread lock-stack first, then falls
// back to the inflated ObjectMonitor path (with a per-thread cache probe and
// a hash-table lookup when UseObjectMonitorTable is set) before giving up to
// the runtime.
//
//   obj        - object to lock
//   box        - BasicLock box; caches the ObjectMonitor* when
//                UseObjectMonitorTable is enabled
//   t1, t2, t3 - temporaries; rscratch2 is clobbered as well
//
// On exit, C2 reads the outcome from the condition flags:
//   EQ - lock acquired, continue in compiled code
//   NE - acquisition failed, call the runtime slow path
void C2_MacroAssembler::fast_lock(Register obj, Register box, Register t1,
                                  Register t2, Register t3) {
  assert_different_registers(obj, box, t1, t2, t3, rscratch2);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. MUST reach this label with flags == EQ.
  Label locked;
  // Finish fast lock unsuccessfully. MUST reach this label with flags == NE.
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Divert synchronization on value-based classes to the runtime for diagnosis.
    load_klass(t1, obj);
    ldrb(t1, Address(t1, Klass::misc_flags_offset()));
    tst(t1, KlassFlags::_misc_is_value_based_class);
    br(Assembler::NE, slow_path);
  }

  const Register t1_mark = t1;
  const Register t3_t = t3;

  { // Fast locking

    // Push lock to the lock stack and finish successfully. MUST reach with flags == EQ.
    Label push;

    const Register t2_top = t2;

    // Check if lock-stack is full.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
    br(Assembler::GT, slow_path);

    // Check if recursive: is obj the most recently pushed lock-stack entry?
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, push);

    // Relaxed normal load to check for monitor. Optimization for monitor case.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);

    // Not inflated
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");

    // Try to lock. Transition lock-bits 0b01 => 0b00
    // expected = mark with the unlocked bit set, new = same mark with it cleared.
    orr(t1_mark, t1_mark, markWord::unlocked_value);
    eor(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
    br(Assembler::NE, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    str(obj, Address(rthread, t2_top));
    addw(t2_top, t2_top, oopSize);
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register t1_monitor = t1;

    if (!UseObjectMonitorTable) {
      // Without the table, the (tagged) mark word is the monitor pointer.
      assert(t1_monitor == t1_mark, "should be the same here");
    } else {
      const Register t1_hash = t1;
      Label monitor_found;

      // Save the mark, we might need it to extract the hash.
      mov(t3, t1_mark);

      // Look for the monitor in the om_cache.

      // Unrolled probe of the per-thread cache: each entry pairs an oop with
      // its monitor at a fixed distance (oop_to_monitor_difference).
      ByteSize cache_offset   = JavaThread::om_cache_oops_offset();
      ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
      const int num_unrolled  = OMCache::CAPACITY;
      for (int i = 0; i < num_unrolled; i++) {
        ldr(t1_monitor, Address(rthread, cache_offset + monitor_offset));
        ldr(t2, Address(rthread, cache_offset));
        cmp(obj, t2);
        br(Assembler::EQ, monitor_found);
        cache_offset = cache_offset + OMCache::oop_to_oop_difference();
      }

      // Look for the monitor in the table.

      // Get the hash code.
      ubfx(t1_hash, t3, markWord::hash_shift, markWord::hash_bits);

      // Get the table and calculate the bucket's address
      lea(t3, ExternalAddress(ObjectMonitorTable::current_table_address()));
      ldr(t3, Address(t3));
      ldr(t2, Address(t3, ObjectMonitorTable::table_capacity_mask_offset()));
      ands(t1_hash, t1_hash, t2); // bucket index = hash & capacity_mask
      ldr(t3, Address(t3, ObjectMonitorTable::table_buckets_offset()));

      // Read the monitor from the bucket.
      ldr(t1_monitor, Address(t3, t1_hash, Address::lsl(LogBytesPerWord)));

      // Check if the monitor in the bucket is special (empty, tombstone or removed).
      cmp(t1_monitor, (unsigned char)ObjectMonitorTable::SpecialPointerValues::below_is_special);
      br(Assembler::LO, slow_path);

      // Check if object matches.
      ldr(t3, Address(t1_monitor, ObjectMonitor::object_offset()));
      BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
      bs_asm->try_resolve_weak_handle_in_c2(this, t3, t2, slow_path);
      cmp(t3, obj);
      br(Assembler::NE, slow_path);

      bind(monitor_found);
    }

    const Register t2_owner_addr = t2;
    const Register t3_owner = t3;
    // Monitor fields are addressed through the tagged mark word unless the
    // pointer came from the cache/table (already untagged).
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
    const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);

    Label monitor_locked;

    // Compute owner address.
    lea(t2_owner_addr, owner_address);

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
    cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, /*acquire*/ true,
            /*release*/ false, /*weak*/ false, t3_owner);
    br(Assembler::EQ, monitor_locked);

    // Check if recursive: the CAS failed, but we may already own the monitor.
    cmp(t3_owner, rscratch2);
    br(Assembler::NE, slow_path);

    // Recursive.
    increment(recursions_address, 1);

    bind(monitor_locked);
    if (UseObjectMonitorTable) {
      // Cache the monitor in the box for a fast reload in fast_unlock.
      str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
    }
  }

  bind(locked);

#ifdef ASSERT
  // Check that locked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Lock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Lock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}
 344 
// Emits the C2 fast path for monitorexit.
//
// Undoes lightweight locking via the per-thread lock-stack when possible,
// otherwise releases the inflated ObjectMonitor; contended or ambiguous
// cases are deferred to the runtime.
//
//   obj        - object to unlock
//   box        - BasicLock box; holds the cached ObjectMonitor* when
//                UseObjectMonitorTable is enabled
//   t1, t2, t3 - temporaries; rscratch1 is clobbered as well
//
// On exit, C2 reads the outcome from the condition flags:
//   EQ - unlocked, continue in compiled code
//   NE - call the runtime slow path
void C2_MacroAssembler::fast_unlock(Register obj, Register box, Register t1,
                                    Register t2, Register t3) {
  assert_different_registers(obj, box, t1, t2, t3);

  // Handle inflated monitor.
  Label inflated, inflated_load_mark;
  // Finish fast unlock successfully. MUST reach this label with flags == EQ.
  Label unlocked;
  // Finish fast unlock unsuccessfully. MUST reach this label with flags == NE.
  Label slow_path;

  const Register t1_mark = t1;
  const Register t2_top = t2;
  const Register t3_t = t3;

  { // Fast unlock

    Label push_and_slow_path;

    // Check if obj is top of lock-stack.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    subw(t2_top, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    // Top of lock stack was not obj. Must be monitor.
    br(Assembler::NE, inflated_load_mark);

    // Pop lock-stack.
    DEBUG_ONLY(str(zr, Address(rthread, t2_top));) // poison the vacated slot in debug builds
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));

    // Check if recursive: obj appears again just below the popped entry.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, unlocked);

    // Not recursive.
    // Load Mark.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Check header for monitor (0b10).
    // Because we got here by popping (meaning we pushed in locked)
    // there will be no monitor in the box. So we need to push back the obj
    // so that the runtime can fix any potential anonymous owner.
    tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);

    // Try to unlock. Transition lock bits 0b00 => 0b01
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
    orr(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
    br(Assembler::EQ, unlocked);

    bind(push_and_slow_path);
    // Compare and exchange failed.
    // Restore lock-stack and handle the unlock in runtime.
    DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
    addw(t2_top, t2_top, oopSize);
    str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(slow_path);
  }


  { // Handle inflated monitor.
    bind(inflated_load_mark);
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
#ifdef ASSERT
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

#ifdef ASSERT
    // Verify obj appears nowhere on the lock-stack when unlocking an
    // inflated monitor (scan downwards, re-entering at 'inflated').
    Label check_done;
    subw(t2_top, t2_top, oopSize);
    cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
    br(Assembler::LT, check_done);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    br(Assembler::NE, inflated);
    stop("Fast Unlock lock on stack");
    bind(check_done);
#endif

    const Register t1_monitor = t1;

    if (!UseObjectMonitorTable) {
      assert(t1_monitor == t1_mark, "should be the same here");

      // Untag the monitor.
      add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
    } else {
      // Reload the monitor pointer cached in the box by fast_lock.
      ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
      cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
      br(Assembler::LO, slow_path);
    }

    const Register t2_recursions = t2;
    Label not_recursive;

    // Check if recursive.
    ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
    cbz(t2_recursions, not_recursive);

    // Recursive unlock: just decrement the recursion count.
    sub(t2_recursions, t2_recursions, 1u);
    str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
    // Set flag == EQ
    cmp(t2_recursions, t2_recursions);
    b(unlocked);

    bind(not_recursive);

    const Register t2_owner_addr = t2;

    // Compute owner address.
    lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));

    // Set owner to null.
    // Release to satisfy the JMM
    stlr(zr, t2_owner_addr);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry_list is empty.
    ldr(rscratch1, Address(t1_monitor, ObjectMonitor::entry_list_offset()));
    cmp(rscratch1, zr);
    br(Assembler::EQ, unlocked);  // If so we are done.

    // Check if there is a successor.
    ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
    cmp(rscratch1, zr);
    br(Assembler::NE, unlocked);  // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));

    cmp(zr, rthread); // Set Flag to NE => slow path (rthread is never zero)
    b(slow_path);
  }

  bind(unlocked);
  cmp(zr, zr); // Set Flags to EQ => fast path

#ifdef ASSERT
  // Check that unlocked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Unlock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Unlock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}
 510 
 511 // Search for str1 in str2 and return index or -1
 512 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
 513 void C2_MacroAssembler::string_indexof(Register str2, Register str1,
 514                                        Register cnt2, Register cnt1,
 515                                        Register tmp1, Register tmp2,
 516                                        Register tmp3, Register tmp4,
 517                                        Register tmp5, Register tmp6,
 518                                        int icnt1, Register result, int ae) {
 519   // NOTE: tmp5, tmp6 can be zr depending on specific method version
 520   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
 521 
 522   Register ch1 = rscratch1;
 523   Register ch2 = rscratch2;
 524   Register cnt1tmp = tmp1;
 525   Register cnt2tmp = tmp2;
 526   Register cnt1_neg = cnt1;
 527   Register cnt2_neg = cnt2;
 528   Register result_tmp = tmp4;
 529 
 530   bool isL = ae == StrIntrinsicNode::LL;
 531 
 532   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
 533   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
 534   int str1_chr_shift = str1_isL ? 0:1;
 535   int str2_chr_shift = str2_isL ? 0:1;
 536   int str1_chr_size = str1_isL ? 1:2;
 537   int str2_chr_size = str2_isL ? 1:2;
 538   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
 539                                       (chr_insn)&MacroAssembler::ldrh;
 540   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
 541                                       (chr_insn)&MacroAssembler::ldrh;
 542   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
 543   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
 544 
 545   // Note, inline_string_indexOf() generates checks:
 546   // if (substr.count > string.count) return -1;
 547   // if (substr.count == 0) return 0;
 548 
 549   // We have two strings, a source string in str2, cnt2 and a pattern string
 550   // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
 551 
 552   // For larger pattern and source we use a simplified Boyer Moore algorithm.
 553   // With a small pattern and source we use linear scan.
 554 
 555   if (icnt1 == -1) {
 556     sub(result_tmp, cnt2, cnt1);
 557     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
 558     br(LT, LINEARSEARCH);
 559     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
 560     subs(zr, cnt1, 256);
 561     lsr(tmp1, cnt2, 2);
 562     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
 563     br(GE, LINEARSTUB);
 564   }
 565 
// The Boyer-Moore algorithm is based on the description here:-
 567 //
 568 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
 569 //
// This describes an algorithm with 2 shift rules. The 'Bad Character' rule
 571 // and the 'Good Suffix' rule.
 572 //
 573 // These rules are essentially heuristics for how far we can shift the
 574 // pattern along the search string.
 575 //
 576 // The implementation here uses the 'Bad Character' rule only because of the
 577 // complexity of initialisation for the 'Good Suffix' rule.
 578 //
 579 // This is also known as the Boyer-Moore-Horspool algorithm:-
 580 //
 581 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
 582 //
 583 // This particular implementation has few java-specific optimizations.
 584 //
 585 // #define ASIZE 256
 586 //
 587 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
 588 //       int i, j;
 589 //       unsigned c;
 590 //       unsigned char bc[ASIZE];
 591 //
 592 //       /* Preprocessing */
 593 //       for (i = 0; i < ASIZE; ++i)
 594 //          bc[i] = m;
 595 //       for (i = 0; i < m - 1; ) {
 596 //          c = x[i];
 597 //          ++i;
 598 //          // c < 256 for Latin1 string, so, no need for branch
 599 //          #ifdef PATTERN_STRING_IS_LATIN1
 600 //          bc[c] = m - i;
 601 //          #else
 602 //          if (c < ASIZE) bc[c] = m - i;
 603 //          #endif
 604 //       }
 605 //
 606 //       /* Searching */
 607 //       j = 0;
 608 //       while (j <= n - m) {
 609 //          c = y[i+j];
 610 //          if (x[m-1] == c)
 611 //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
 612 //          if (i < 0) return j;
 613 //          // c < 256 for Latin1 string, so, no need for branch
 614 //          #ifdef SOURCE_STRING_IS_LATIN1
 615 //          // LL case: (c< 256) always true. Remove branch
 616 //          j += bc[y[j+m-1]];
 617 //          #endif
 618 //          #ifndef PATTERN_STRING_IS_UTF
 619 //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
 620 //          if (c < ASIZE)
 621 //            j += bc[y[j+m-1]];
 622 //          else
 623 //            j += 1
 624 //          #endif
 625 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
 626 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
 627 //          if (c < ASIZE)
 628 //            j += bc[y[j+m-1]];
 629 //          else
 630 //            j += m
 631 //          #endif
 632 //       }
 633 //    }
 634 
 635   if (icnt1 == -1) {
 636     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
 637         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
 638     Register cnt1end = tmp2;
 639     Register str2end = cnt2;
 640     Register skipch = tmp2;
 641 
 642     // str1 length is >=8, so, we can read at least 1 register for cases when
 643     // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
 644     // UL case. We'll re-read last character in inner pre-loop code to have
 645     // single outer pre-loop load
 646     const int firstStep = isL ? 7 : 3;
 647 
 648     const int ASIZE = 256;
 649     const int STORED_BYTES = 32; // amount of bytes stored per instruction
 650     sub(sp, sp, ASIZE);
 651     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
 652     mov(ch1, sp);
 653     BIND(BM_INIT_LOOP);
 654       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
 655       subs(tmp5, tmp5, 1);
 656       br(GT, BM_INIT_LOOP);
 657 
 658       sub(cnt1tmp, cnt1, 1);
 659       mov(tmp5, str2);
 660       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
 661       sub(ch2, cnt1, 1);
 662       mov(tmp3, str1);
 663     BIND(BCLOOP);
 664       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
 665       if (!str1_isL) {
 666         subs(zr, ch1, ASIZE);
 667         br(HS, BCSKIP);
 668       }
 669       strb(ch2, Address(sp, ch1));
 670     BIND(BCSKIP);
 671       subs(ch2, ch2, 1);
 672       br(GT, BCLOOP);
 673 
 674       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
 675       if (str1_isL == str2_isL) {
 676         // load last 8 bytes (8LL/4UU symbols)
 677         ldr(tmp6, Address(tmp6, -wordSize));
 678       } else {
 679         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
 680         // convert Latin1 to UTF. We'll have to wait until load completed, but
 681         // it's still faster than per-character loads+checks
 682         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
 683         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
 684         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
 685         andr(tmp6, tmp6, 0xFF); // str1[N-4]
 686         orr(ch2, ch1, ch2, LSL, 16);
 687         orr(tmp6, tmp6, tmp3, LSL, 48);
 688         orr(tmp6, tmp6, ch2, LSL, 16);
 689       }
 690     BIND(BMLOOPSTR2);
 691       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 692       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
 693       if (str1_isL == str2_isL) {
 694         // re-init tmp3. It's for free because it's executed in parallel with
 695         // load above. Alternative is to initialize it before loop, but it'll
 696         // affect performance on in-order systems with 2 or more ld/st pipelines
 697         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
 698       }
 699       if (!isL) { // UU/UL case
 700         lsl(ch2, cnt1tmp, 1); // offset in bytes
 701       }
 702       cmp(tmp3, skipch);
 703       br(NE, BMSKIP);
 704       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
 705       mov(ch1, tmp6);
 706       if (isL) {
 707         b(BMLOOPSTR1_AFTER_LOAD);
 708       } else {
 709         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
 710         b(BMLOOPSTR1_CMP);
 711       }
 712     BIND(BMLOOPSTR1);
 713       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
 714       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 715     BIND(BMLOOPSTR1_AFTER_LOAD);
 716       subs(cnt1tmp, cnt1tmp, 1);
 717       br(LT, BMLOOPSTR1_LASTCMP);
 718     BIND(BMLOOPSTR1_CMP);
 719       cmp(ch1, ch2);
 720       br(EQ, BMLOOPSTR1);
 721     BIND(BMSKIP);
 722       if (!isL) {
 723         // if we've met UTF symbol while searching Latin1 pattern, then we can
 724         // skip cnt1 symbols
 725         if (str1_isL != str2_isL) {
 726           mov(result_tmp, cnt1);
 727         } else {
 728           mov(result_tmp, 1);
 729         }
 730         subs(zr, skipch, ASIZE);
 731         br(HS, BMADV);
 732       }
 733       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
 734     BIND(BMADV);
 735       sub(cnt1tmp, cnt1, 1);
 736       add(str2, str2, result_tmp, LSL, str2_chr_shift);
 737       cmp(str2, str2end);
 738       br(LE, BMLOOPSTR2);
 739       add(sp, sp, ASIZE);
 740       b(NOMATCH);
 741     BIND(BMLOOPSTR1_LASTCMP);
 742       cmp(ch1, ch2);
 743       br(NE, BMSKIP);
 744     BIND(BMMATCH);
 745       sub(result, str2, tmp5);
 746       if (!str2_isL) lsr(result, result, 1);
 747       add(sp, sp, ASIZE);
 748       b(DONE);
 749 
 750     BIND(LINEARSTUB);
 751     cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
 752     br(LT, LINEAR_MEDIUM);
 753     mov(result, zr);
 754     RuntimeAddress stub = nullptr;
 755     if (isL) {
 756       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
 757       assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
 758     } else if (str1_isL) {
 759       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
 760        assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
 761     } else {
 762       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
 763       assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
 764     }
 765     address call = trampoline_call(stub);
 766     if (call == nullptr) {
 767       DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
 768       ciEnv::current()->record_failure("CodeCache is full");
 769       return;
 770     }
 771     b(DONE);
 772   }
 773 
 774   BIND(LINEARSEARCH);
 775   {
 776     Label DO1, DO2, DO3;
 777 
 778     Register str2tmp = tmp2;
 779     Register first = tmp3;
 780 
 781     if (icnt1 == -1)
 782     {
 783         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
 784 
 785         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
 786         br(LT, DOSHORT);
 787       BIND(LINEAR_MEDIUM);
 788         (this->*str1_load_1chr)(first, Address(str1));
 789         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
 790         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
 791         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 792         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 793 
 794       BIND(FIRST_LOOP);
 795         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 796         cmp(first, ch2);
 797         br(EQ, STR1_LOOP);
 798       BIND(STR2_NEXT);
 799         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 800         br(LE, FIRST_LOOP);
 801         b(NOMATCH);
 802 
 803       BIND(STR1_LOOP);
 804         adds(cnt1tmp, cnt1_neg, str1_chr_size);
 805         add(cnt2tmp, cnt2_neg, str2_chr_size);
 806         br(GE, MATCH);
 807 
 808       BIND(STR1_NEXT);
 809         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
 810         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 811         cmp(ch1, ch2);
 812         br(NE, STR2_NEXT);
 813         adds(cnt1tmp, cnt1tmp, str1_chr_size);
 814         add(cnt2tmp, cnt2tmp, str2_chr_size);
 815         br(LT, STR1_NEXT);
 816         b(MATCH);
 817 
 818       BIND(DOSHORT);
 819       if (str1_isL == str2_isL) {
 820         cmp(cnt1, (u1)2);
 821         br(LT, DO1);
 822         br(GT, DO3);
 823       }
 824     }
 825 
 826     if (icnt1 == 4) {
 827       Label CH1_LOOP;
 828 
 829         (this->*load_4chr)(ch1, str1);
 830         sub(result_tmp, cnt2, 4);
 831         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 832         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 833 
 834       BIND(CH1_LOOP);
 835         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
 836         cmp(ch1, ch2);
 837         br(EQ, MATCH);
 838         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 839         br(LE, CH1_LOOP);
 840         b(NOMATCH);
 841       }
 842 
 843     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
 844       Label CH1_LOOP;
 845 
 846       BIND(DO2);
 847         (this->*load_2chr)(ch1, str1);
 848         if (icnt1 == 2) {
 849           sub(result_tmp, cnt2, 2);
 850         }
 851         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 852         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 853       BIND(CH1_LOOP);
 854         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 855         cmp(ch1, ch2);
 856         br(EQ, MATCH);
 857         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 858         br(LE, CH1_LOOP);
 859         b(NOMATCH);
 860     }
 861 
 862     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
 863       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
 864 
 865       BIND(DO3);
 866         (this->*load_2chr)(first, str1);
 867         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
 868         if (icnt1 == 3) {
 869           sub(result_tmp, cnt2, 3);
 870         }
 871         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 872         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 873       BIND(FIRST_LOOP);
 874         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 875         cmpw(first, ch2);
 876         br(EQ, STR1_LOOP);
 877       BIND(STR2_NEXT);
 878         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 879         br(LE, FIRST_LOOP);
 880         b(NOMATCH);
 881 
 882       BIND(STR1_LOOP);
 883         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
 884         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 885         cmp(ch1, ch2);
 886         br(NE, STR2_NEXT);
 887         b(MATCH);
 888     }
 889 
 890     if (icnt1 == -1 || icnt1 == 1) {
 891       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
 892 
 893       BIND(DO1);
 894         (this->*str1_load_1chr)(ch1, str1);
 895         cmp(cnt2, (u1)8);
 896         br(LT, DO1_SHORT);
 897 
 898         sub(result_tmp, cnt2, 8/str2_chr_size);
 899         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 900         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 901         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 902 
 903         if (str2_isL) {
 904           orr(ch1, ch1, ch1, LSL, 8);
 905         }
 906         orr(ch1, ch1, ch1, LSL, 16);
 907         orr(ch1, ch1, ch1, LSL, 32);
 908       BIND(CH1_LOOP);
 909         ldr(ch2, Address(str2, cnt2_neg));
 910         eor(ch2, ch1, ch2);
 911         sub(tmp1, ch2, tmp3);
 912         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 913         bics(tmp1, tmp1, tmp2);
 914         br(NE, HAS_ZERO);
 915         adds(cnt2_neg, cnt2_neg, 8);
 916         br(LT, CH1_LOOP);
 917 
 918         cmp(cnt2_neg, (u1)8);
 919         mov(cnt2_neg, 0);
 920         br(LT, CH1_LOOP);
 921         b(NOMATCH);
 922 
 923       BIND(HAS_ZERO);
 924         rev(tmp1, tmp1);
 925         clz(tmp1, tmp1);
 926         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
 927         b(MATCH);
 928 
 929       BIND(DO1_SHORT);
 930         mov(result_tmp, cnt2);
 931         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
 932         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
 933       BIND(DO1_LOOP);
 934         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 935         cmpw(ch1, ch2);
 936         br(EQ, MATCH);
 937         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 938         br(LT, DO1_LOOP);
 939     }
 940   }
 941   BIND(NOMATCH);
 942     mov(result, -1);
 943     b(DONE);
 944   BIND(MATCH);
 945     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
 946   BIND(DONE);
 947 }
 948 
// Member-function-pointer types used by the string intrinsics below to pick
// the size-appropriate load (ldrb vs. ldrh) and zero-extend (uxtbw vs. uxthw)
// instruction for Latin-1 vs. UTF-16 string elements.
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
 951 
// Find the first occurrence of the 16-bit (UTF-16) character "ch" in the
// string "str1" holding "cnt1" characters. On success "result" is set to the
// character index of the match, otherwise to -1.
//
// Clobbers: str1, cnt1, ch, tmp1, tmp2, tmp3, rscratch1, rscratch2, rflags
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;       // negative byte offset from the string end
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  // An empty string never matches.
  cbz(cnt1, NOMATCH);

  // Fewer than 4 chars (one 8-byte word): use the scalar loop.
  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  // Replicate "ch" into all four 16-bit lanes of a 64-bit register.
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  // Point str1 past the chars so the word loop can index with a negative
  // offset that counts up towards zero.
  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  // SWAR constant: one low bit set per 16-bit lane, for zero-lane detection.
  mov(tmp3, 0x0001000100010001);

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    // XOR turns any lane equal to "ch" into zero, then apply the classic
    // "word contains a zero lane" test: (x - 0x0001…) & ~x & 0x8000…,
    // with ~x & 0x8000… computed as ~(x | 0x7fff…).
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7fff7fff7fff7fff);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);               // non-zero result => some lane matched
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    // Main loop done. If the word at offset 0 (the last 4 chars, possibly
    // overlapping chars already tested) has not been examined yet, run the
    // loop body once more for it. Note: mov does not disturb the flags.
    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    // Find the lowest-addressed matching lane: byte-reverse so clz counts
    // from the low end, then convert the bit count to a byte offset (LSR 3).
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    // Scalar fallback for strings of 1-3 characters.
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1, Address::uxtw(1)));
    sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
    (void)0; // (comment) compare one char per iteration
    ldrh(ch1, Address(str1, cnt1_neg));
    cmpw(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 2);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // Convert the negative byte offset into a character index (ASR 1)
    // relative to the saved base count.
    add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}
1014 
1015 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
1016                                                 Register ch, Register result,
1017                                                 FloatRegister ztmp1,
1018                                                 FloatRegister ztmp2,
1019                                                 PRegister tmp_pg,
1020                                                 PRegister tmp_pdn, bool isL)
1021 {
1022   // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
1023   assert(tmp_pg->is_governing(),
1024          "this register has to be a governing predicate register");
1025 
1026   Label LOOP, MATCH, DONE, NOMATCH;
1027   Register vec_len = rscratch1;
1028   Register idx = rscratch2;
1029 
1030   SIMD_RegVariant T = (isL == true) ? B : H;
1031 
1032   cbz(cnt1, NOMATCH);
1033 
1034   // Assign the particular char throughout the vector.
1035   sve_dup(ztmp2, T, ch);
1036   if (isL) {
1037     sve_cntb(vec_len);
1038   } else {
1039     sve_cnth(vec_len);
1040   }
1041   mov(idx, 0);
1042 
1043   // Generate a predicate to control the reading of input string.
1044   sve_whilelt(tmp_pg, T, idx, cnt1);
1045 
1046   BIND(LOOP);
1047     // Read a vector of 8- or 16-bit data depending on the string type. Note
1048     // that inactive elements indicated by the predicate register won't cause
1049     // a data read from memory to the destination vector.
1050     if (isL) {
1051       sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
1052     } else {
1053       sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
1054     }
1055     add(idx, idx, vec_len);
1056 
1057     // Perform the comparison. An element of the destination predicate is set
1058     // to active if the particular char is matched.
1059     sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);
1060 
1061     // Branch if the particular char is found.
1062     br(NE, MATCH);
1063 
1064     sve_whilelt(tmp_pg, T, idx, cnt1);
1065 
1066     // Loop back if the particular char not found.
1067     br(MI, LOOP);
1068 
1069   BIND(NOMATCH);
1070     mov(result, -1);
1071     b(DONE);
1072 
1073   BIND(MATCH);
1074     // Undo the index increment.
1075     sub(idx, idx, vec_len);
1076 
1077     // Crop the vector to find its location.
1078     sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
1079     add(result, idx, -1);
1080     sve_incp(result, T, tmp_pdn);
1081   BIND(DONE);
1082 }
1083 
// Find the first occurrence of the byte (Latin-1) character "ch" in the
// string "str1" holding "cnt1" characters. On success "result" is set to the
// character index of the match, otherwise to -1. Byte-lane counterpart of
// string_indexof_char above.
//
// Clobbers: str1, cnt1, ch, tmp1, tmp2, tmp3, rscratch1, rscratch2, rflags
void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;       // negative byte offset from the string end
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  // An empty string never matches.
  cbz(cnt1, NOMATCH);

  // Fewer than 8 chars (one 8-byte word): use the scalar loop.
  cmp(cnt1, (u1)8);
  br(LT, DO1_SHORT);

  // Replicate "ch" into all eight byte lanes of a 64-bit register.
  orr(ch, ch, ch, LSL, 8);
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  // Point str1 past the chars so the word loop can index with a negative
  // offset that counts up towards zero.
  sub(cnt1, cnt1, 8);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);

  // SWAR constant: one low bit set per byte lane, for zero-byte detection.
  mov(tmp3, 0x0101010101010101);

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    // XOR turns any byte equal to "ch" into zero, then apply the classic
    // "word contains a zero byte" test: (x - 0x01…) & ~x & 0x80…,
    // with ~x & 0x80… computed as ~(x | 0x7f…).
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);               // non-zero result => some byte matched
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    // Main loop done. If the word at offset 0 (the last 8 chars, possibly
    // overlapping chars already tested) has not been examined yet, run the
    // loop body once more for it. Note: mov does not disturb the flags.
    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    // Find the lowest-addressed matching byte: byte-reverse so clz counts
    // from the low end, then convert the bit count to a byte offset (LSR 3).
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    // Scalar fallback for strings of 1-7 characters.
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1));
    sub(cnt1_neg, zr, cnt1);
  BIND(DO1_LOOP);
    ldrb(ch1, Address(str1, cnt1_neg));
    cmp(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 1);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // Byte offset equals character index for Latin-1; no shift needed.
    add(result, result_tmp, cnt1_neg);
  BIND(DONE);
}
1147 
// Compare strings.
//
// Lexicographically compares str1 against str2 and sets "result" negative,
// zero or positive (String.compareTo semantics: first differing character,
// or the length difference when one string is a prefix of the other).
// "ae" encodes the two argument encodings (StrIntrinsicNode::LL/UU/LU/UL,
// Latin-1 vs. UTF-16). Strings at or above "stub_threshold" characters are
// delegated to the compare_long_string_* stubs.
//
// Clobbers: str1, str2, cnt1, cnt2, tmp1, tmp2, vtmp1, vtmp2,
//           rscratch1, rscratch2, rflags
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  // Characters per 8-byte word on the narrower side: 8 for LL, 4 otherwise.
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      // Same array: equal up to min length, so the length difference
      // already in "result" is the answer.
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      // Point both strings at their last word; iterate with negative offsets.
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      // Latin-1 side is read 4 bytes at a time and inflated to UTF-16 by
      // zipping with a zeroed vector (vtmpZ).
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      ldr(tmp1, Address(str1));
      ldrs(vtmp, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    // Step is 4 bytes for the compressed side in UL, otherwise 8.
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      // 8 bytes from each string per iteration.
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      // 4 Latin-1 chars inflated to 8 bytes vs. 8 bytes of UTF-16.
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFF);
    bind(TAIL);
    // Compare the word loaded by the last loop iteration first.
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFF);
    // rev + clz gives the bit position of the lowest-addressed differing
    // byte; andr rounds it down to a character boundary (8 or 16 bits).
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
    // Long strings: tail-call the matching pre-generated stub.
    RuntimeAddress stub = nullptr;
    switch(ae) {
      case StrIntrinsicNode::LL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
        break;
      case StrIntrinsicNode::UU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
        break;
      case StrIntrinsicNode::LU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
        break;
      case StrIntrinsicNode::UL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
        break;
      default:
        ShouldNotReachHere();
     }
    assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
    address call = trampoline_call(stub);
    if (call == nullptr) {
      // Trampoline allocation failed; bail out of compilation.
      DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // arrange code to do most branches while loading and loading next characters
  // while comparing previous
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  // Two characters in flight per iteration: (tmp1, cnt1) and
  // (tmp2, rscratch1) alternate as the compared pair.
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}
1383 
1384 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
1385                                      FloatRegister src2, Condition cond, bool isQ) {
1386   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1387   FloatRegister zn = src1, zm = src2;
1388   bool needs_negation = false;
1389   switch (cond) {
1390     case LT: cond = GT; zn = src2; zm = src1; break;
1391     case LE: cond = GE; zn = src2; zm = src1; break;
1392     case LO: cond = HI; zn = src2; zm = src1; break;
1393     case LS: cond = HS; zn = src2; zm = src1; break;
1394     case NE: cond = EQ; needs_negation = true; break;
1395     default:
1396       break;
1397   }
1398 
1399   if (is_floating_point_type(bt)) {
1400     fcm(cond, dst, size, zn, zm);
1401   } else {
1402     cm(cond, dst, size, zn, zm);
1403   }
1404 
1405   if (needs_negation) {
1406     notr(dst, isQ ? T16B : T8B, dst);
1407   }
1408 }
1409 
1410 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1411                                           Condition cond, bool isQ) {
1412   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1413   if (bt == T_FLOAT || bt == T_DOUBLE) {
1414     if (cond == Assembler::NE) {
1415       fcm(Assembler::EQ, dst, size, src);
1416       notr(dst, isQ ? T16B : T8B, dst);
1417     } else {
1418       fcm(cond, dst, size, src);
1419     }
1420   } else {
1421     if (cond == Assembler::NE) {
1422       cm(Assembler::EQ, dst, size, src);
1423       notr(dst, isQ ? T16B : T8B, dst);
1424     } else {
1425       cm(cond, dst, size, src);
1426     }
1427   }
1428 }
1429 
// Compress the least significant bit of each byte to the rightmost and clear
// the higher garbage bits.
//
// Each ORR folds the per-byte flag bits closer together; the shift amounts
// (7, 14, 28) gather all eight single-bit flags into the low byte, and the
// final AND discards everything else. Clobbers only "dst" itself.
void C2_MacroAssembler::bytemask_compress(Register dst) {
  // Example input, dst = 0x01 00 00 00 01 01 00 01
  // The "??" bytes are garbage.
  orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
  orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
  orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
  andr(dst, dst, 0xff);                   // dst = 0x8D
}
1440 
// Pack the value of each mask element in "src" into a long value in "dst", at most
// the first 64 lane elements. The input "src" is a vector of boolean represented as
// bytes with 0x00/0x01 as element values. Each lane value from "src" is packed into
// one bit in "dst".
//
// Example:   src = 0x0001010000010001 0100000001010001, lane_cnt = 16
// Expected:  dst = 0x658D
//
// Clobbers: rscratch1
void C2_MacroAssembler::sve_vmask_tolong(Register dst, FloatRegister src,
                                         FloatRegister vtmp, int lane_cnt) {
  assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
  assert_different_registers(dst, rscratch1);
  assert_different_registers(src, vtmp);
  assert(UseSVE > 0, "must be");

  // Compress the lowest 8 bytes.
  // fmovd moves the low 64 bits (first 8 byte-lanes) of "src" into "dst".
  fmovd(dst, src);
  bytemask_compress(dst);
  if (lane_cnt <= 8) return;

  // Repeat on higher bytes and join the results.
  // Compress 8 bytes in each iteration.
  for (int idx = 1; idx < (lane_cnt / 8); idx++) {
    // Extract the idx-th 64-bit chunk of "src", compress its byte flags,
    // and OR the resulting 8 bits into place at bit offset idx*8.
    sve_extract_integral(rscratch1, T_LONG, src, idx, vtmp);
    bytemask_compress(rscratch1);
    orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
  }
}
1470 
// The function is same as above "sve_vmask_tolong", but it uses SVE2's BEXT
// instruction which requires the FEAT_BITPERM feature.
//
// Clobbers: vtmp1, vtmp2
void C2_MacroAssembler::sve2_vmask_tolong(Register dst, FloatRegister src,
                                          FloatRegister vtmp1, FloatRegister vtmp2,
                                          int lane_cnt) {
  assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
  assert_different_registers(src, vtmp1, vtmp2);
  assert(UseSVE > 1 && VM_Version::supports_svebitperm(), "must be");

  // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
  // is to compress each significant bit of the byte in a cross-lane way. Due
  // to the lack of a cross-lane bit-compress instruction, we use BEXT
  // (bit-compress in each lane) with the biggest lane size (T = D) then
  // concatenate the results.

  // The second source input of BEXT, initialized with 0x01 in each byte.
  // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
  sve_dup(vtmp2, B, 1);

  // BEXT vtmp1.D, src.D, vtmp2.D
  // src   = 0x0001010000010001 | 0x0100000001010001
  // vtmp2 = 0x0101010101010101 | 0x0101010101010101
  //         ---------------------------------------
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  sve_bext(vtmp1, D, src, vtmp2);

  // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the
  // result to dst.
  // vtmp1 = 0x0000000000000000 | 0x000000000000658D
  // dst   = 0x658D
  if (lane_cnt <= 8) {
    // No need to concatenate.
    umov(dst, vtmp1, B, 0);
  } else if (lane_cnt <= 16) {
    // Move the second chunk's 8 bits next to the first, then read 16 bits.
    ins(vtmp1, B, vtmp1, 1, 8);
    umov(dst, vtmp1, H, 0);
  } else {
    // As the lane count is 64 at most, the final expected value must be in
    // the lowest 64 bits after narrowing vtmp1 from D to B.
    sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
    umov(dst, vtmp1, D, 0);
  }
}
1514 
// Unpack the mask, a long value in "src", into a vector register of boolean
// represented as bytes with 0x00/0x01 as element values in "dst".  Each bit in
// "src" is unpacked into one byte lane in "dst". Note that "dst" can support at
// most 64 lanes.
//
// Below example gives the expected dst vector register, with a valid src(0x658D)
// on a 128-bit vector size machine.
// dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
void C2_MacroAssembler::sve_vmask_fromlong(FloatRegister dst, Register src,
                                           FloatRegister vtmp, int lane_cnt) {
  assert_different_registers(dst, vtmp);
  assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
         lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");

  // Example:   src = 0x658D, lane_cnt = 16
  // Expected:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01

  // Put long value from general purpose register into the first lane of vector.
  // vtmp = 0x0000000000000000 | 0x000000000000658D
  sve_dup(vtmp, B, 0);
  mov(vtmp, D, 0, src);

  // Transform the value in the first lane which is mask in bit now to the mask in
  // byte, which can be done by SVE2's BDEP instruction.

  // The first source input of BDEP instruction. Deposit one mask byte into
  // each 8-byte (D) lane.
  // vtmp = 0x0000000000000065 | 0x000000000000008D
  if (lane_cnt <= 8) {
    // Nothing to do, as only one mask byte exists.
  } else if (lane_cnt <= 16) {
    ins(vtmp, B, vtmp, 8, 1);
  } else {
    sve_vector_extend(vtmp, D, vtmp, B);
  }

  // The second source input of BDEP instruction, initialized with 0x01 for each byte.
  // dst = 0x01010101 0x01010101 0x01010101 0x01010101
  sve_dup(dst, B, 1);

  // BDEP dst.D, vtmp.D, dst.D
  // vtmp = 0x0000000000000065 | 0x000000000000008D
  // dst  = 0x0101010101010101 | 0x0101010101010101
  //        ---------------------------------------
  // dst  = 0x0001010000010001 | 0x0100000001010001
  sve_bdep(dst, D, vtmp, dst);
}
1561 
1562 // Clobbers: rflags
1563 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1564                                     FloatRegister zn, FloatRegister zm, Condition cond) {
1565   assert(pg->is_governing(), "This register has to be a governing predicate register");
1566   FloatRegister z1 = zn, z2 = zm;
1567   switch (cond) {
1568     case LE: z1 = zm; z2 = zn; cond = GE; break;
1569     case LT: z1 = zm; z2 = zn; cond = GT; break;
1570     case LO: z1 = zm; z2 = zn; cond = HI; break;
1571     case LS: z1 = zm; z2 = zn; cond = HS; break;
1572     default:
1573       break;
1574   }
1575 
1576   SIMD_RegVariant size = elemType_to_regVariant(bt);
1577   if (is_floating_point_type(bt)) {
1578     sve_fcm(cond, pd, size, pg, z1, z2);
1579   } else {
1580     assert(is_integral_type(bt), "unsupported element type");
1581     sve_cmp(cond, pd, size, pg, z1, z2);
1582   }
1583 }
1584 
// Get index of the last mask lane that is set, i.e. the highest-numbered
// active element of the predicate "src".
//
// Clobbers: rscratch1, ptmp
void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
  SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Reverse the predicate so the last set lane becomes the first.
  sve_rev(ptmp, size, src);
  // "Break before": activate all lanes strictly before the first set lane.
  sve_brkb(ptmp, ptrue, ptmp, false);
  // Count those lanes - the distance of the last set lane from the end.
  sve_cntp(dst, size, ptrue, ptmp);
  // Index from the front = (lane_count - 1) - distance_from_end.
  movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
  subw(dst, rscratch1, dst);
}
1594 
// Extend integer vector src to dst with the same lane count
// but larger element size, e.g. 4B -> 4I
//
// "is_unsigned" selects zero- vs. sign-extension (the _xshll helper chooses
// the unsigned/signed shift-left-long variant; a shift amount of 0 makes it
// a pure widening move). Widening by two element-size steps (B->I, S->L) is
// emitted as two chained _xshll instructions.
void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
                                           FloatRegister src, BasicType src_bt, bool is_unsigned) {
  if (src_bt == T_BYTE) {
    // 4B to 4S/4I, 8B to 8S
    assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_SHORT || dst_bt == T_INT, "unsupported");
    _xshll(is_unsigned, dst, T8H, src, T8B, 0);
    if (dst_bt == T_INT) {
      // Second widening step: S -> I.
      _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
    }
  } else if (src_bt == T_SHORT) {
    // 2S to 2I/2L, 4S to 4I
    assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_INT || dst_bt == T_LONG, "unsupported");
    _xshll(is_unsigned, dst, T4S, src, T4H, 0);
    if (dst_bt == T_LONG) {
      // Second widening step: I -> L.
      _xshll(is_unsigned, dst, T2D, dst, T2S, 0);
    }
  } else if (src_bt == T_INT) {
    // 2I to 2L
    assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
    _xshll(is_unsigned, dst, T2D, src, T2S, 0);
  } else {
    ShouldNotReachHere();
  }
}
1623 
// Narrow integer vector src down to dst with the same lane count
// but smaller element size, e.g. 4I -> 4B.
// Uses xtn (extract-narrow), which truncates each element to half width;
// two-step narrows (e.g. I -> S -> B) chain xtn through dst.
void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
                                           FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
  if (src_bt == T_SHORT) {
    // 4S/8S to 4B/8B
    assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_BYTE, "unsupported");
    xtn(dst, T8B, src, T8H);
  } else if (src_bt == T_INT) {
    // 2I to 2S, 4I to 4B/4S
    assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
    xtn(dst, T4H, src, T4S);
    if (dst_bt == T_BYTE) {
      // Narrow a second time: I -> S -> B.
      xtn(dst, T8B, dst, T8H);
    }
  } else if (src_bt == T_LONG) {
    // 2L to 2S/2I
    assert(src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_INT || dst_bt == T_SHORT, "unsupported");
    xtn(dst, T2S, src, T2D);
    if (dst_bt == T_SHORT) {
      // Narrow a second time: L -> I -> S.
      xtn(dst, T4H, dst, T4S);
    }
  } else {
    ShouldNotReachHere();
  }
}
1653 
// Extend an SVE vector from src_size elements to dst_size elements by
// repeatedly unpacking the low half. "_sve_xunpk" selects unsigned
// (uunpklo) vs signed (sunpklo) extension via "is_unsigned".
void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
                                          FloatRegister src, SIMD_RegVariant src_size,
                                          bool is_unsigned) {
  assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");

  if (src_size == B) {
    switch (dst_size) {
    case H:
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
      break;
    case S:
      // Two unpack steps: B -> H -> S.
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
      break;
    case D:
      // Three unpack steps: B -> H -> S -> D.
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
      break;
    default:
      ShouldNotReachHere();
    }
  } else if (src_size == H) {
    if (dst_size == S) {
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
    } else { // D
      // Two unpack steps: H -> S -> D.
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
    }
  } else if (src_size == S) {
    _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
  }
}
1687 
// Vector narrow from src to dst with specified element sizes.
// High part of dst vector will be filled with zero.
// Each sve_uzp1 step concatenates the even-numbered elements of src with
// those of tmp; since tmp is zeroed, the upper half of the result is zero.
// Clobbers: tmp (and dst for the multi-step cases, hence the
// assert_different_registers checks below).
void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
                                          FloatRegister src, SIMD_RegVariant src_size,
                                          FloatRegister tmp) {
  assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
  assert_different_registers(src, tmp);
  // Zero tmp so uzp1 fills the upper part of dst with zeros.
  sve_dup(tmp, src_size, 0);
  if (src_size == D) {
    switch (dst_size) {
    case S:
      sve_uzp1(dst, S, src, tmp);
      break;
    case H:
      // Two narrowing steps: D -> S -> H; dst is an intermediate, so it
      // must not alias tmp.
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, S, src, tmp);
      sve_uzp1(dst, H, dst, tmp);
      break;
    case B:
      // Three narrowing steps: D -> S -> H -> B.
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, S, src, tmp);
      sve_uzp1(dst, H, dst, tmp);
      sve_uzp1(dst, B, dst, tmp);
      break;
    default:
      ShouldNotReachHere();
    }
  } else if (src_size == S) {
    if (dst_size == H) {
      sve_uzp1(dst, H, src, tmp);
    } else { // B
      // Two narrowing steps: S -> H -> B.
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, H, src, tmp);
      sve_uzp1(dst, B, dst, tmp);
    }
  } else if (src_size == H) {
    sve_uzp1(dst, B, src, tmp);
  }
}
1727 
// Extend src predicate to dst predicate with the same lane count but larger
// element size, e.g. 64Byte -> 512Long.
// Each sve_punpklo doubles the per-lane predicate granule width by
// unpacking the low half, so 1/2/3 steps handle 2x/4x/8x extensions.
void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
                                             uint dst_element_length_in_bytes,
                                             uint src_element_length_in_bytes) {
  if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
    sve_punpklo(dst, src);
  } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
    sve_punpklo(dst, src);
    sve_punpklo(dst, dst);
  } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
    sve_punpklo(dst, src);
    sve_punpklo(dst, dst);
    sve_punpklo(dst, dst);
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}
1747 
// Narrow src predicate to dst predicate with the same lane count but
// smaller element size, e.g. 512Long -> 64Byte.
// Clobbers: ptmp. Preserves: src.
void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
                                             uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
  // The insignificant bits in src predicate are expected to be zero.
  // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
  // passed as the second argument. An example narrowing operation with a given mask would be -
  // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I
  // Mask (for 2 Longs) : TF
  // Predicate register for the above mask (16 bits) : 00000001 00000000
  // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
  // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0)
  assert_different_registers(src, ptmp);
  assert_different_registers(dst, ptmp);
  // All-false scratch predicate supplies the zero upper half for each uzp1.
  sve_pfalse(ptmp);
  if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
    sve_uzp1(dst, B, src, ptmp);
  } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
    // Two narrowing steps for a 4x reduction in granule size.
    sve_uzp1(dst, H, src, ptmp);
    sve_uzp1(dst, B, dst, ptmp);
  } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
    // Three narrowing steps for an 8x reduction in granule size.
    sve_uzp1(dst, S, src, ptmp);
    sve_uzp1(dst, H, dst, ptmp);
    sve_uzp1(dst, B, dst, ptmp);
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}
1777 
// Vector reduction add for integral type with ASIMD instructions.
// Computes dst = isrc + sum(vsrc lanes), sign-extended to the element type's
// natural width for sub-int types.
// Clobbers: vtmp
void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
                                                 Register isrc, FloatRegister vsrc,
                                                 unsigned vector_length_in_bytes,
                                                 FloatRegister vtmp) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_add_integral {");
    switch(bt) {
      case T_BYTE:
        // addv sums all lanes into lane 0; smov sign-extends it to dst.
        addv(vtmp, isQ ? T16B : T8B, vsrc);
        smov(dst, vtmp, B, 0);
        // Add isrc with its low byte sign-extended.
        addw(dst, dst, isrc, ext::sxtb);
        break;
      case T_SHORT:
        addv(vtmp, isQ ? T8H : T4H, vsrc);
        smov(dst, vtmp, H, 0);
        addw(dst, dst, isrc, ext::sxth);
        break;
      case T_INT:
        // 2S has no addv arrangement, so use a pairwise add instead.
        isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
        umov(dst, vtmp, S, 0);
        addw(dst, dst, isrc);
        break;
      case T_LONG:
        assert(isQ, "unsupported");
        // Pairwise add of the two D lanes.
        addpd(vtmp, vsrc);
        umov(dst, vtmp, D, 0);
        add(dst, dst, isrc);
        break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  BLOCK_COMMENT("} neon_reduce_add_integral");
}
1816 
// Vector reduction multiply for integral type with ASIMD instructions.
// Computes dst = isrc * product(vsrc lanes) by iteratively multiplying the
// high half of the vector into the low half until two lanes remain, then
// finishing in general registers (truncating to the element width for
// sub-int types via sxtb/sxth).
// Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
// Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
                                                 Register isrc, FloatRegister vsrc,
                                                 unsigned vector_length_in_bytes,
                                                 FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_mul_integral {");
    switch(bt) {
      case T_BYTE:
        if (isQ) {
          // Multiply the lower half and higher half of vector iteratively.
          // vtmp1 = vsrc[8:15]
          ins(vtmp1, D, vsrc, 0, 1);
          // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
          mulv(vtmp1, T8B, vtmp1, vsrc);
          // vtmp2 = vtmp1[4:7]
          ins(vtmp2, S, vtmp1, 0, 1);
          // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
          mulv(vtmp1, T8B, vtmp2, vtmp1);
        } else {
          ins(vtmp1, S, vsrc, 0, 1);
          mulv(vtmp1, T8B, vtmp1, vsrc);
        }
        // vtmp2 = vtmp1[2:3]
        ins(vtmp2, H, vtmp1, 0, 1);
        // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
        mulv(vtmp2, T8B, vtmp2, vtmp1);
        // dst = vtmp2[0] * isrc * vtmp2[1], truncated to byte after each step.
        umov(rscratch1, vtmp2, B, 0);
        mulw(dst, rscratch1, isrc);
        sxtb(dst, dst);
        umov(rscratch1, vtmp2, B, 1);
        mulw(dst, rscratch1, dst);
        sxtb(dst, dst);
        break;
      case T_SHORT:
        if (isQ) {
          // Same halving strategy as T_BYTE, on H-sized lanes.
          ins(vtmp2, D, vsrc, 0, 1);
          mulv(vtmp2, T4H, vtmp2, vsrc);
          ins(vtmp1, S, vtmp2, 0, 1);
          mulv(vtmp1, T4H, vtmp1, vtmp2);
        } else {
          ins(vtmp1, S, vsrc, 0, 1);
          mulv(vtmp1, T4H, vtmp1, vsrc);
        }
        umov(rscratch1, vtmp1, H, 0);
        mulw(dst, rscratch1, isrc);
        sxth(dst, dst);
        umov(rscratch1, vtmp1, H, 1);
        mulw(dst, rscratch1, dst);
        sxth(dst, dst);
        break;
      case T_INT:
        if (isQ) {
          // Fold the upper two S lanes onto the lower two.
          ins(vtmp1, D, vsrc, 0, 1);
          mulv(vtmp1, T2S, vtmp1, vsrc);
        } else {
          // Only two lanes already; read them straight from vsrc.
          vtmp1 = vsrc;
        }
        umov(rscratch1, vtmp1, S, 0);
        mul(dst, rscratch1, isrc);
        umov(rscratch1, vtmp1, S, 1);
        mul(dst, rscratch1, dst);
        break;
      case T_LONG:
        // Two D lanes: multiply both into dst via general registers.
        umov(rscratch1, vsrc, D, 0);
        mul(dst, isrc, rscratch1);
        umov(rscratch1, vsrc, D, 1);
        mul(dst, dst, rscratch1);
        break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  BLOCK_COMMENT("} neon_reduce_mul_integral");
}
1897 
// Vector reduction multiply for floating-point type with ASIMD instructions.
// Computes dst = fsrc * product(vsrc lanes). Lanes are multiplied strictly
// in order (scalar then lane 0, 1, 2, ...), which matters because FP
// multiplication is not associative.
// Clobbers: vtmp (unused for the T_DOUBLE D-lane extract-only path)
void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
                                           FloatRegister fsrc, FloatRegister vsrc,
                                           unsigned vector_length_in_bytes,
                                           FloatRegister vtmp) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_mul_fp {");
    switch(bt) {
      // The T_SHORT type below is for Float16 type which also uses floating-point
      // instructions.
      case T_SHORT:
        fmulh(dst, fsrc, vsrc);
        // ext rotates the vector by N bytes so lane N/2 lands in lane 0,
        // where the scalar fmulh reads it.
        ext(vtmp, T8B, vsrc, vsrc, 2);
        fmulh(dst, dst, vtmp);
        ext(vtmp, T8B, vsrc, vsrc, 4);
        fmulh(dst, dst, vtmp);
        ext(vtmp, T8B, vsrc, vsrc, 6);
        fmulh(dst, dst, vtmp);
        if (isQ) {
          // Fold in lanes 4..7 of the 128-bit vector.
          ext(vtmp, T16B, vsrc, vsrc, 8);
          fmulh(dst, dst, vtmp);
          ext(vtmp, T16B, vsrc, vsrc, 10);
          fmulh(dst, dst, vtmp);
          ext(vtmp, T16B, vsrc, vsrc, 12);
          fmulh(dst, dst, vtmp);
          ext(vtmp, T16B, vsrc, vsrc, 14);
          fmulh(dst, dst, vtmp);
        }
        break;
      case T_FLOAT:
        fmuls(dst, fsrc, vsrc);
        // Bring each remaining S lane down to lane 0 and multiply it in.
        ins(vtmp, S, vsrc, 0, 1);
        fmuls(dst, dst, vtmp);
        if (isQ) {
          ins(vtmp, S, vsrc, 0, 2);
          fmuls(dst, dst, vtmp);
          ins(vtmp, S, vsrc, 0, 3);
          fmuls(dst, dst, vtmp);
         }
        break;
      case T_DOUBLE:
        assert(isQ, "unsupported");
        fmuld(dst, fsrc, vsrc);
        ins(vtmp, D, vsrc, 0, 1);
        fmuld(dst, dst, vtmp);
        break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  BLOCK_COMMENT("} neon_reduce_mul_fp");
}
1952 
// Vector reduction add for half float type with ASIMD instructions.
// Computes dst = fsrc + sum(vsrc lanes), adding the lanes strictly in
// order since FP addition is not associative.
// Clobbers: vtmp
void C2_MacroAssembler::neon_reduce_add_fp16(FloatRegister dst, FloatRegister fsrc, FloatRegister vsrc,
                                             unsigned vector_length_in_bytes, FloatRegister vtmp) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_add_fp16 {");
    faddh(dst, fsrc, vsrc);
    // ext rotates the vector by N bytes so lane N/2 lands in lane 0,
    // where the scalar faddh reads it.
    ext(vtmp, T8B, vsrc, vsrc, 2);
    faddh(dst, dst, vtmp);
    ext(vtmp, T8B, vsrc, vsrc, 4);
    faddh(dst, dst, vtmp);
    ext(vtmp, T8B, vsrc, vsrc, 6);
    faddh(dst, dst, vtmp);
    if (isQ) {
      // Fold in lanes 4..7 of the 128-bit vector.
      ext(vtmp, T16B, vsrc, vsrc, 8);
      faddh(dst, dst, vtmp);
      ext(vtmp, T16B, vsrc, vsrc, 10);
      faddh(dst, dst, vtmp);
      ext(vtmp, T16B, vsrc, vsrc, 12);
      faddh(dst, dst, vtmp);
      ext(vtmp, T16B, vsrc, vsrc, 14);
      faddh(dst, dst, vtmp);
    }
  BLOCK_COMMENT("} neon_reduce_add_fp16");
}
1979 
1980 // Helper to select logical instruction
1981 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
1982                                                    Register Rn, Register Rm,
1983                                                    enum shift_kind kind, unsigned shift) {
1984   switch(opc) {
1985     case Op_AndReductionV:
1986       is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
1987       break;
1988     case Op_OrReductionV:
1989       is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
1990       break;
1991     case Op_XorReductionV:
1992       is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
1993       break;
1994     default:
1995       assert(false, "unsupported");
1996       ShouldNotReachHere();
1997   }
1998 }
1999 
// Vector reduction logical operations And, Or, Xor
// Computes dst = isrc <op> fold(vsrc lanes), sign-extending sub-int results.
// The vector is first collapsed into one 64-bit general register, then
// folded down by shifted logical ops at ever-smaller widths.
// Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
                                            Register isrc, FloatRegister vsrc,
                                            unsigned vector_length_in_bytes) {
  assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
         "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_logical {");
    // Combine the two vector halves (or lanes) into a single 64-bit value.
    umov(rscratch1, vsrc, isQ ? D : S, 0);
    umov(dst, vsrc, isQ ? D : S, 1);
    neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
    switch(bt) {
      case T_BYTE:
        if (isQ) {
          neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
        }
        // Fold 32 -> 16 -> 8 bits, then combine with isrc and sign-extend.
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
        sxtb(dst, dst);
        break;
      case T_SHORT:
        if (isQ) {
          neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
        }
        // Fold 32 -> 16 bits, then combine with isrc and sign-extend.
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
        sxth(dst, dst);
        break;
      case T_INT:
        if (isQ) {
          // Fold the upper 32 bits onto the lower 32.
          neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
        }
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
        break;
      case T_LONG:
        assert(isQ, "unsupported");
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
        break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  BLOCK_COMMENT("} neon_reduce_logical");
}
2049 
2050 // Helper function to decode min/max reduction operation properties
2051 void C2_MacroAssembler::decode_minmax_reduction_opc(int opc, bool* is_min,
2052                                                     bool* is_unsigned,
2053                                                     Condition* cond) {
2054   switch(opc) {
2055     case Op_MinReductionV:
2056       *is_min = true;  *is_unsigned = false; *cond = LT; break;
2057     case Op_MaxReductionV:
2058       *is_min = false; *is_unsigned = false; *cond = GT; break;
2059     case Op_UMinReductionV:
2060       *is_min = true;  *is_unsigned = true;  *cond = LO; break;
2061     case Op_UMaxReductionV:
2062       *is_min = false; *is_unsigned = true;  *cond = HI; break;
2063     default:
2064       ShouldNotReachHere();
2065   }
2066 }
2067 
// Vector reduction min/max/umin/umax for integral type with ASIMD instructions.
// Computes dst = min/max(isrc, min/max(vsrc lanes)).
// Note: vtmp is not used and expected to be fnoreg for T_LONG case.
// Clobbers: rscratch1, rflags
void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
                                                    Register isrc, FloatRegister vsrc,
                                                    unsigned vector_length_in_bytes,
                                                    FloatRegister vtmp) {
  assert(opc == Op_MinReductionV || opc == Op_MaxReductionV ||
         opc == Op_UMinReductionV || opc == Op_UMaxReductionV, "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;
  bool is_min;
  bool is_unsigned;
  Condition cond;
  decode_minmax_reduction_opc(opc, &is_min, &is_unsigned, &cond);
  BLOCK_COMMENT("neon_reduce_minmax_integral {");
    if (bt == T_LONG) {
      // No 64-bit lane reduction instruction: compare/select the two D
      // lanes and isrc entirely in general registers.
      assert(vtmp == fnoreg, "should be");
      assert(isQ, "should be");
      umov(rscratch1, vsrc, D, 0);
      cmp(isrc, rscratch1);
      csel(dst, isrc, rscratch1, cond);
      umov(rscratch1, vsrc, D, 1);
      cmp(dst, rscratch1);
      csel(dst, dst, rscratch1, cond);
    } else {
      SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
      if (size == T2S) {
        // For T2S (2x32-bit elements), use pairwise instructions because
        // uminv/umaxv/sminv/smaxv don't support arrangement 2S.
        neon_minmaxp(is_unsigned, is_min, vtmp, size, vsrc, vsrc);
      } else {
        // For other sizes, use reduction to scalar instructions.
        neon_minmaxv(is_unsigned, is_min, vtmp, size, vsrc);
      }
      // Extract the reduced lane: zero-extend for T_INT and unsigned ops,
      // sign-extend for signed sub-int types.
      if (bt == T_INT) {
        umov(dst, vtmp, S, 0);
      } else if (is_unsigned) {
        umov(dst, vtmp, elemType_to_regVariant(bt), 0);
      } else {
        smov(dst, vtmp, elemType_to_regVariant(bt), 0);
      }
      // Fold in the scalar input.
      cmpw(dst, isrc);
      cselw(dst, dst, isrc, cond);
    }
  BLOCK_COMMENT("} neon_reduce_minmax_integral");
}
2117 
// Vector reduction for integral type with SVE instruction.
// Supported operations are Add, And, Or, Xor, Max, Min, UMax, UMin.
// Computes dst = src1 <op> reduce(src2 active lanes under pg).
// rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
                                            FloatRegister src2, PRegister pg, FloatRegister tmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  assert(pg->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(src1, dst);
  // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  switch (opc) {
    case Op_AddReductionVI: {
      // Sum the active lanes; sign-extend sub-int results while folding in src1.
      sve_uaddv(tmp, size, pg, src2);
      if (bt == T_BYTE) {
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxtb);
      } else if (bt == T_SHORT) {
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxth);
      } else {
        umov(dst, tmp, size, 0);
        addw(dst, dst, src1);
      }
      break;
    }
    case Op_AddReductionVL: {
      sve_uaddv(tmp, size, pg, src2);
      umov(dst, tmp, size, 0);
      add(dst, dst, src1);
      break;
    }
    case Op_AndReductionV: {
      sve_andv(tmp, size, pg, src2);
      // Extraction: sign-extend sub-int lanes, zero-extend int/long lanes.
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        andr(dst, dst, src1);
      } else {
        andw(dst, dst, src1);
      }
      break;
    }
    case Op_OrReductionV: {
      sve_orv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        orr(dst, dst, src1);
      } else {
        orrw(dst, dst, src1);
      }
      break;
    }
    case Op_XorReductionV: {
      sve_eorv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        eor(dst, dst, src1);
      } else {
        eorw(dst, dst, src1);
      }
      break;
    }
    case Op_MaxReductionV:
    case Op_MinReductionV:
    case Op_UMaxReductionV:
    case Op_UMinReductionV: {
      bool is_min;
      bool is_unsigned;
      Condition cond;
      decode_minmax_reduction_opc(opc, &is_min, &is_unsigned, &cond);
      sve_minmaxv(is_unsigned, is_min, tmp, size, pg, src2);
      // Move result from vector to general register
      if (is_unsigned || bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      // Fold in the scalar input with a compare/select.
      if (bt == T_LONG) {
        cmp(dst, src1);
        csel(dst, dst, src1, cond);
      } else {
        cmpw(dst, src1);
        cselw(dst, dst, src1, cond);
      }
      break;
    }
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }

  // The logical ops above combine at 32/64-bit width; truncate the result
  // back to the element type for sub-int types.
  if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
    if (bt == T_BYTE) {
      sxtb(dst, dst);
    } else if (bt == T_SHORT) {
      sxth(dst, dst);
    }
  }
}
2228 
// Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
// to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
// max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
// Strategy: prefer a single "ptrue" with an immediate pattern where one
// exists for lane_cnt; otherwise fall back to "whileltw".
void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
  uint32_t max_vector_length = Matcher::max_vector_size(bt);
  assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");

  // Set all elements to false if the input "lane_cnt" is zero.
  if (lane_cnt == 0) {
    sve_pfalse(dst);
    return;
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  assert(size != Q, "invalid size");

  // Set all true if "lane_cnt" equals to the max lane count.
  if (lane_cnt == max_vector_length) {
    sve_ptrue(dst, size, /* ALL */ 0b11111);
    return;
  }

  // Fixed numbers for "ptrue".
  // VL1..VL8 encode directly as the lane count.
  switch(lane_cnt) {
  case 1: /* VL1 */
  case 2: /* VL2 */
  case 3: /* VL3 */
  case 4: /* VL4 */
  case 5: /* VL5 */
  case 6: /* VL6 */
  case 7: /* VL7 */
  case 8: /* VL8 */
    sve_ptrue(dst, size, lane_cnt);
    return;
  case 16:
    sve_ptrue(dst, size, /* VL16 */ 0b01001);
    return;
  case 32:
    sve_ptrue(dst, size, /* VL32 */ 0b01010);
    return;
  case 64:
    sve_ptrue(dst, size, /* VL64 */ 0b01011);
    return;
  case 128:
    sve_ptrue(dst, size, /* VL128 */ 0b01100);
    return;
  case 256:
    sve_ptrue(dst, size, /* VL256 */ 0b01101);
    return;
  default:
    break;
  }

  // Special patterns for "ptrue": POW2/MUL4/MUL3 activate the largest
  // power of two / multiple of 4 / multiple of 3 lanes respectively.
  if (lane_cnt == round_down_power_of_2(max_vector_length)) {
    sve_ptrue(dst, size, /* POW2 */ 0b00000);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
    sve_ptrue(dst, size, /* MUL4 */ 0b11101);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
    sve_ptrue(dst, size, /* MUL3 */ 0b11110);
  } else {
    // Encode to "whileltw" for the remaining cases.
    // Generates true for lanes where (0 + lane) < lane_cnt.
    mov(rscratch1, lane_cnt);
    sve_whileltw(dst, size, zr, rscratch1);
  }
}
2295 
// Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
// Any remaining elements of dst will be filled with zero.
// The SHORT type has no native "compact", so each half is widened to INT,
// compacted at INT granularity, narrowed back, and the halves spliced.
// Clobbers: rscratch1, dst, vtmp, pgtmp
// Preserves: mask, vzr
void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
                                           FloatRegister vzr, FloatRegister vtmp,
                                           PRegister pgtmp, unsigned vector_length_in_bytes) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  // When called by sve_compress_byte, src and vtmp may be the same register.
  assert_different_registers(dst, src, vzr);
  assert_different_registers(dst, vtmp, vzr);
  assert_different_registers(mask, pgtmp);
  // high <-- low
  // Example input:   src   = hh gg ff ee dd cc bb aa, one character is 8 bits.
  //                  mask  = 01 00 00 01 01 00 01 01, one character is 1 bit.
  // Expected result: dst   = 00 00 00 hh ee dd bb aa

  // Extend lowest half to type INT.
  // dst   =  00dd  00cc  00bb  00aa
  sve_uunpklo(dst, S, src);
  // pgtmp =  0001  0000  0001  0001
  sve_punpklo(pgtmp, mask);
  // Pack the active elements in size of type INT to the right,
  // and fill the remainings with zero.
  // dst   =  0000  00dd  00bb  00aa
  sve_compact(dst, S, dst, pgtmp);
  // Narrow the result back to type SHORT.
  // dst   = 00 00 00 00 00 dd bb aa
  sve_uzp1(dst, H, dst, vzr);

  // Return if the vector length is no more than MaxVectorSize/2, since the
  // highest half is invalid.
  if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
    return;
  }

  // Count the active elements of lowest half.
  // rscratch1 = 3
  sve_cntp(rscratch1, S, ptrue, pgtmp);

  // Repeat to the highest half.
  // pgtmp =  0001  0000  0000  0001
  sve_punpkhi(pgtmp, mask);
  // vtmp  =  00hh  00gg  00ff  00ee
  sve_uunpkhi(vtmp, S, src);
  // vtmp  =  0000  0000  00hh  00ee
  sve_compact(vtmp, S, vtmp, pgtmp);
  // vtmp  = 00 00 00 00 00 00 hh ee
  sve_uzp1(vtmp, H, vtmp, vzr);

  // Select the insertion point for the compressed high half: the first
  // rscratch1 lanes (those already holding the compressed low half).
  // pgtmp = 00 00 00 00 00 01 01 01
  sve_whilelt(pgtmp, H, zr, rscratch1);
  // Compressed low:  dst  = 00 00 00 00 00 dd bb aa
  // Compressed high: vtmp = 00 00 00 00 00 00 hh ee
  // Combine the compressed low with the compressed high:
  //                  dst  = 00 00 00 hh ee dd bb aa
  sve_splice(dst, H, pgtmp, vtmp);
}
2354 
// Pack active elements of src, under the control of mask, into the
// lowest-numbered elements of dst; the rest of dst is zero-filled.
// Each half is widened to SHORT and delegated to sve_compress_short,
// then the two compressed halves are spliced together.
// Clobbers: rscratch1, rscratch2, dst, vtmp1, vtmp2, vtmp3, ptmp, pgtmp
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
                                          FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
                                          PRegister ptmp, PRegister pgtmp, unsigned vector_length_in_bytes) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3);
  assert_different_registers(mask, ptmp, pgtmp);
  // high <-- low
  // Example input:   src   = q p n m l k j i h g f e d c b a, one character is 8 bits.
  //                  mask  = 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 1, one character is 1 bit.
  // Expected result: dst   = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
  FloatRegister vzr = vtmp3;
  sve_dup(vzr, B, 0);

  // Extend lowest half to type SHORT.
  // vtmp1 =  0h  0g  0f  0e  0d  0c  0b  0a
  sve_uunpklo(vtmp1, H, src);
  // ptmp  =  00  01  00  00  00  01  00  01
  sve_punpklo(ptmp, mask);
  // Pack the active elements in size of type SHORT to the right,
  // and fill the remainings with zero.
  // dst   =  00  00  00  00  00  0g  0c  0a
  // The widened data is twice as long; cap at MaxVectorSize for the low half.
  unsigned extended_size = vector_length_in_bytes << 1;
  sve_compress_short(dst, vtmp1, ptmp, vzr, vtmp2, pgtmp, extended_size > MaxVectorSize ? MaxVectorSize : extended_size);
  // Narrow the result back to type BYTE.
  // dst   = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
  sve_uzp1(dst, B, dst, vzr);

  // Return if the vector length is no more than MaxVectorSize/2, since the
  // highest half is invalid.
  if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
    return;
  }
  // Count the active elements of lowest half.
  // rscratch2 = 3
  sve_cntp(rscratch2, H, ptrue, ptmp);

  // Repeat to the highest half.
  // ptmp  =  00  01  00  00  00  00  00  01
  sve_punpkhi(ptmp, mask);
  // vtmp2 =  0q  0p  0n  0m  0l  0k  0j  0i
  sve_uunpkhi(vtmp2, H, src);
  // vtmp1 =  00  00  00  00  00  00  0p  0i
  sve_compress_short(vtmp1, vtmp2, ptmp, vzr, vtmp2, pgtmp, extended_size - MaxVectorSize);
  // vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
  sve_uzp1(vtmp1, B, vtmp1, vzr);

  // Select the insertion point for the compressed high half: the first
  // rscratch2 lanes (those already holding the compressed low half).
  // ptmp  = 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
  sve_whilelt(ptmp, B, zr, rscratch2);
  // Compressed low:  dst   = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
  // Compressed high: vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
  // Combine the compressed low with the compressed high:
  //                  dst   = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
  sve_splice(dst, B, ptmp, vtmp1);
}
2411 
2412 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2413   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2414   SIMD_Arrangement size = isQ ? T16B : T8B;
2415   if (bt == T_BYTE) {
2416     rbit(dst, size, src);
2417   } else {
2418     neon_reverse_bytes(dst, src, bt, isQ);
2419     rbit(dst, size, dst);
2420   }
2421 }
2422 
2423 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2424   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2425   SIMD_Arrangement size = isQ ? T16B : T8B;
2426   switch (bt) {
2427     case T_BYTE:
2428       if (dst != src) {
2429         orr(dst, size, src, src);
2430       }
2431       break;
2432     case T_SHORT:
2433       rev16(dst, size, src);
2434       break;
2435     case T_INT:
2436       rev32(dst, size, src);
2437       break;
2438     case T_LONG:
2439       rev64(dst, size, src);
2440       break;
2441     default:
2442       assert(false, "unsupported");
2443       ShouldNotReachHere();
2444   }
2445 }
2446 
2447 // VectorRearrange implementation for short/int/float/long/double types with NEON
2448 // instructions. For VectorRearrange short/int/float, we use NEON tbl instruction.
2449 // But since it supports bytes table only, we need to lookup 2/4 bytes as a group.
2450 // For VectorRearrange long/double, we compare the shuffle input with iota indices,
2451 // and use bsl to implement the operation.
2452 void C2_MacroAssembler::neon_rearrange_hsd(FloatRegister dst, FloatRegister src,
2453                                            FloatRegister shuffle, FloatRegister tmp,
2454                                            BasicType bt, bool isQ) {
2455   assert_different_registers(dst, src, shuffle, tmp);
2456   SIMD_Arrangement size1 = isQ ? T16B : T8B;
2457   SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2458 
2459   // Here is an example that rearranges a NEON vector with 4 ints:
2460   // Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1]
2461   //   1. We assume the shuffle input is Vi int[2, 3, 0, 1].
2462   //   2. Multiply Vi int[2, 3, 0, 1] with constant int vector
2463   //      [0x04040404, 0x04040404, 0x04040404, 0x04040404], and get
2464   //      tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404].
2465   //   3. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100],
2466   //      and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504]
2467   //   4. Use Vm as index register, and use V1 as table register.
2468   //      Then get V2 as the result by tbl NEON instructions.
2469   switch (bt) {
2470     case T_SHORT:
2471       mov(tmp, size1, 0x02);
2472       mulv(dst, size2, shuffle, tmp);
2473       mov(tmp, size2, 0x0100);
2474       addv(dst, size1, dst, tmp);
2475       tbl(dst, size1, src, 1, dst);
2476       break;
2477     case T_INT:
2478     case T_FLOAT:
2479       mov(tmp, size1, 0x04);
2480       mulv(dst, size2, shuffle, tmp);
2481       mov(tmp, size2, 0x03020100);
2482       addv(dst, size1, dst, tmp);
2483       tbl(dst, size1, src, 1, dst);
2484       break;
2485     case T_LONG:
2486     case T_DOUBLE:
2487       {
2488         int idx = vector_iota_entry_index(T_LONG);
2489         lea(rscratch1,
2490             ExternalAddress(StubRoutines::aarch64::vector_iota_indices(idx)));
2491         ldrq(tmp, rscratch1);
2492         // Check whether the input "shuffle" is the same with iota indices.
2493         // Return "src" if true, otherwise swap the two elements of "src".
2494         cm(EQ, dst, size2, shuffle, tmp);
2495         ext(tmp, size1, src, src, 8);
2496         bsl(dst, size1, src, tmp);
2497       }
2498       break;
2499     default:
2500       assert(false, "unsupported element type");
2501       ShouldNotReachHere();
2502   }
2503 }
2504 
2505 // Extract a scalar element from an sve vector at position 'idx'.
2506 // The input elements in src are expected to be of integral type.
2507 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2508                                              int idx, FloatRegister vtmp) {
2509   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2510   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2511   if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
2512     if (bt == T_INT || bt == T_LONG) {
2513       umov(dst, src, size, idx);
2514     } else {
2515       smov(dst, src, size, idx);
2516     }
2517   } else {
2518     sve_orr(vtmp, src, src);
2519     sve_ext(vtmp, vtmp, idx << size);
2520     if (bt == T_INT || bt == T_LONG) {
2521       umov(dst, vtmp, size, 0);
2522     } else {
2523       smov(dst, vtmp, size, 0);
2524     }
2525   }
2526 }
2527 
2528 // java.lang.Math::round intrinsics
2529 
// Clobbers: rscratch1, rflags
// Vector implementation of java.lang.Math::round for float (T2S/T4S) and
// double (T2D) lanes. Two candidate results are computed per lane —
// floor(src + 0.5) and round-ties-to-away — and blended under a mask
// derived from the input's bit pattern.
void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                          FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
  assert_different_registers(tmp1, tmp2, tmp3, src, dst);
  switch (T) {
    case T2S:
    case T4S:
      // tmp1 = 0.5f per lane; rscratch1 = bit pattern of 2^23 (floats of
      // magnitude >= 2^23 are already integral).
      fmovs(tmp1, T, 0.5f);
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case T2D:
      // Same constants for double lanes: 0.5 and 2^52.
      fmovd(tmp1, T, 0.5);
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
  }
  fadd(tmp1, T, tmp1, src);
  fcvtms(tmp1, T, tmp1);
  // tmp1 = floor(src + 0.5, ties to even)

  fcvtas(dst, T, src);
  // dst = round(src), ties to away

  // Build the per-lane selection mask: compare the bit pattern of -src,
  // interpreted as an unsigned integer, against the bit pattern of
  // 2^23 / 2^52 broadcast into tmp2.
  fneg(tmp3, T, src);
  dup(tmp2, T, rscratch1);
  cm(HS, tmp3, T, tmp3, tmp2);
  // tmp3 is now a set of flags

  // Blend the two candidate results under the tmp3 mask.
  bif(dst, T16B, tmp1, tmp3);
  // result in dst
}
2562 
// Clobbers: rscratch1, rflags
// SVE implementation of java.lang.Math::round for float (S) and double (D)
// lanes; the rounded value is finally converted to an integral vector with
// fcvtzs. Lanes that need the floor(src + 0.5) treatment are selected with
// a predicated compare; the rest keep the round-ties-to-away result.
void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                         FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(tmp1, tmp2, src, dst);

  switch (T) {
    case S:
      // rscratch1 = bit pattern of 2^23 (floats of magnitude >= 2^23 are
      // already integral).
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case D:
      // rscratch1 = bit pattern of 2^52, the double equivalent.
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == S || T == D, "invalid register variant");
  }

  sve_frinta(dst, T, ptrue, src);
  // dst = round(src), ties to away

  Label none;

  // Select the lanes that take the floor path: compare the broadcast
  // 2^23 / 2^52 bit pattern against the bit pattern of -src (unsigned).
  sve_fneg(tmp1, T, ptrue, src);
  sve_dup(tmp2, T, rscratch1);
  sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
  // Skip the floor computation entirely when no lane was selected.
  br(EQ, none);
  {
    // Predicated on pgtmp: recompute the selected lanes as
    // floor(src + 0.5) and merge them into dst.
    sve_cpy(tmp1, T, pgtmp, 0.5);
    sve_fadd(tmp1, T, pgtmp, src);
    sve_frintm(dst, T, pgtmp, tmp1);
    // dst = floor(src + 0.5, ties to even)
  }
  bind(none);

  sve_fcvtzs(dst, T, ptrue, dst, T);
  // result in dst
}
2600 
2601 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2602                                            FloatRegister one, SIMD_Arrangement T) {
2603   assert_different_registers(dst, src, zero, one);
2604   assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2605 
2606   facgt(dst, T, src, zero);
2607   ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
2608   bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2609 }
2610 
2611 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2612                                           FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
2613     assert_different_registers(dst, src, zero, one, vtmp);
2614     assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2615 
2616     sve_orr(vtmp, src, src);
2617     sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pmtp=0 for +-0.0 and NaN. 0x1 otherwise
2618     switch (T) {
2619     case S:
2620       sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
2621       sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
2622                                         // on the sign of the float value
2623       break;
2624     case D:
2625       sve_and(vtmp, T, min_jlong);
2626       sve_orr(vtmp, T, jlong_cast(1.0));
2627       break;
2628     default:
2629       assert(false, "unsupported");
2630       ShouldNotReachHere();
2631     }
2632     sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
2633                                        // Result in dst
2634 }
2635 
2636 bool C2_MacroAssembler::in_scratch_emit_size() {
2637   if (ciEnv::current()->task() != nullptr) {
2638     PhaseOutput* phase_output = Compile::current()->output();
2639     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2640       return true;
2641     }
2642   }
2643   return MacroAssembler::in_scratch_emit_size();
2644 }
2645 
// Runtime abort routine invoked by verify_int_in_range when a CastII value
// falls outside its declared [lo, hi] range; never returns.
static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
  fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
}
2649 
// Emits a runtime check that rval lies within [t->_lo, t->_hi]; on
// violation the VM aborts via abort_verify_int_in_range.
// Clobbers: rtmp, rflags, and c_rarg0-c_rarg3 on the (fatal) failure path.
void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register rval, Register rtmp) {
  assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
  if (t == TypeInt::INT) {
    // The unconstrained int type cannot fail the check; emit nothing.
    return;
  }

  BLOCK_COMMENT("verify_int_in_range {");
  Label L_success, L_failure;

  jint lo = t->_lo;
  jint hi = t->_hi;

  // Only emit a test for a bound that actually constrains the value.
  if (lo != min_jint) {
    subsw(rtmp, rval, lo);
    br(Assembler::LT, L_failure); // rval < lo
  }
  if (hi != max_jint) {
    subsw(rtmp, rval, hi);
    br(Assembler::GT, L_failure); // rval > hi
  }
  b(L_success);

  bind(L_failure);
  // Marshal (idx, rval, lo, hi) into the C calling convention and abort.
  movw(c_rarg0, idx);
  mov(c_rarg1, rval);
  movw(c_rarg2, lo);
  movw(c_rarg3, hi);
  reconstruct_frame_pointer(rtmp);
  rt_call(CAST_FROM_FN_PTR(address, abort_verify_int_in_range), rtmp);
  hlt(0); // Not reached: the call above aborts the VM.

  bind(L_success);
  BLOCK_COMMENT("} verify_int_in_range");
}
2684 
// Runtime abort routine invoked by verify_long_in_range when a CastLL value
// falls outside its declared [lo, hi] range; never returns.
static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
  fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
}
2688 
// Emits a runtime check that rval lies within [t->_lo, t->_hi]; on
// violation the VM aborts via abort_verify_long_in_range.
// Clobbers: rtmp, rflags, and c_rarg0-c_rarg3 on the (fatal) failure path.
void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register rval, Register rtmp) {
  assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
  if (t == TypeLong::LONG) {
    // The unconstrained long type cannot fail the check; emit nothing.
    return;
  }

  BLOCK_COMMENT("verify_long_in_range {");
  Label L_success, L_failure;

  jlong lo = t->_lo;
  jlong hi = t->_hi;

  // Only emit a test for a bound that actually constrains the value.
  if (lo != min_jlong) {
    subs(rtmp, rval, lo);
    br(Assembler::LT, L_failure); // rval < lo
  }
  if (hi != max_jlong) {
    subs(rtmp, rval, hi);
    br(Assembler::GT, L_failure); // rval > hi
  }
  b(L_success);

  bind(L_failure);
  // Marshal (idx, rval, lo, hi) into the C calling convention and abort.
  movw(c_rarg0, idx);
  mov(c_rarg1, rval);
  mov(c_rarg2, lo);
  mov(c_rarg3, hi);
  reconstruct_frame_pointer(rtmp);
  rt_call(CAST_FROM_FN_PTR(address, abort_verify_long_in_range), rtmp);
  hlt(0); // Not reached: the call above aborts the VM.

  bind(L_success);
  BLOCK_COMMENT("} verify_long_in_range");
}
2723 
2724 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
2725   const int framesize = Compile::current()->output()->frame_size_in_bytes();
2726   if (PreserveFramePointer) {
2727     // frame pointer is valid
2728 #ifdef ASSERT
2729     // Verify frame pointer value in rfp.
2730     add(rtmp, sp, framesize - 2 * wordSize);
2731     Label L_success;
2732     cmp(rfp, rtmp);
2733     br(Assembler::EQ, L_success);
2734     stop("frame pointer mismatch");
2735     bind(L_success);
2736 #endif // ASSERT
2737   } else {
2738     add(rfp, sp, framesize - 2 * wordSize);
2739   }
2740 }
2741 
2742 // Selects elements from two source vectors (src1, src2) based on index values in the index register
2743 // using Neon instructions and places it in the destination vector element corresponding to the
2744 // index vector element. Each index in the index register must be in the range - [0, 2 * NUM_ELEM),
2745 // where NUM_ELEM is the number of BasicType elements per vector.
2746 // If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
2747 // Otherwise, selects src2[idx – NUM_ELEM]
2748 void C2_MacroAssembler::select_from_two_vectors_neon(FloatRegister dst, FloatRegister src1,
2749                                                      FloatRegister src2, FloatRegister index,
2750                                                      FloatRegister tmp, unsigned vector_length_in_bytes) {
2751   assert_different_registers(dst, src1, src2, tmp);
2752   SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
2753 
2754   if (vector_length_in_bytes == 16) {
2755     assert(UseSVE <= 1, "sve must be <= 1");
2756     assert(src1->successor() == src2, "Source registers must be ordered");
2757     // If the vector length is 16B, then use the Neon "tbl" instruction with two vector table
2758     tbl(dst, size, src1, 2, index);
2759   } else { // vector length == 8
2760     assert(UseSVE == 0, "must be Neon only");
2761     // We need to fit both the source vectors (src1, src2) in a 128-bit register because the
2762     // Neon "tbl" instruction supports only looking up 16B vectors. We then use the Neon "tbl"
2763     // instruction with one vector lookup
2764     ins(tmp, D, src1, 0, 0);
2765     ins(tmp, D, src2, 1, 0);
2766     tbl(dst, size, tmp, 1, index);
2767   }
2768 }
2769 
2770 // Selects elements from two source vectors (src1, src2) based on index values in the index register
2771 // using SVE/SVE2 instructions and places it in the destination vector element corresponding to the
2772 // index vector element. Each index in the index register must be in the range - [0, 2 * NUM_ELEM),
2773 // where NUM_ELEM is the number of BasicType elements per vector.
2774 // If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
2775 // Otherwise, selects src2[idx – NUM_ELEM]
2776 void C2_MacroAssembler::select_from_two_vectors_sve(FloatRegister dst, FloatRegister src1,
2777                                                     FloatRegister src2, FloatRegister index,
2778                                                     FloatRegister tmp, SIMD_RegVariant T,
2779                                                     unsigned vector_length_in_bytes) {
2780   assert_different_registers(dst, src1, src2, index, tmp);
2781 
2782   if (vector_length_in_bytes == 8) {
2783     // We need to fit both the source vectors (src1, src2) in a single vector register because the
2784     // SVE "tbl" instruction is unpredicated and works on the entire vector which can lead to
2785     // incorrect results if each source vector is only partially filled. We then use the SVE "tbl"
2786     // instruction with one vector lookup
2787     assert(UseSVE >= 1, "sve must be >= 1");
2788     ins(tmp, D, src1, 0, 0);
2789     ins(tmp, D, src2, 1, 0);
2790     sve_tbl(dst, T, tmp, index);
2791   } else {  // UseSVE == 2 and vector_length_in_bytes > 8
2792     // If the vector length is > 8, then use the SVE2 "tbl" instruction with the two vector table.
2793     // The assertion - vector_length_in_bytes == MaxVectorSize ensures that this operation
2794     // is not executed on machines where vector_length_in_bytes < MaxVectorSize
2795     // with the only exception of 8B vector length.
2796     assert(UseSVE == 2 && vector_length_in_bytes == MaxVectorSize, "must be");
2797     assert(src1->successor() == src2, "Source registers must be ordered");
2798     sve_tbl(dst, T, src1, src2, index);
2799   }
2800 }
2801 
2802 void C2_MacroAssembler::select_from_two_vectors(FloatRegister dst, FloatRegister src1,
2803                                                 FloatRegister src2, FloatRegister index,
2804                                                 FloatRegister tmp, BasicType bt,
2805                                                 unsigned vector_length_in_bytes) {
2806 
2807   assert_different_registers(dst, src1, src2, index, tmp);
2808 
2809   // The cases that can reach this method are -
2810   // - UseSVE = 0/1, vector_length_in_bytes = 8 or 16, excluding double and long types
2811   // - UseSVE = 2, vector_length_in_bytes >= 8, for all types
2812   //
2813   // SVE/SVE2 tbl instructions are generated when UseSVE = 1 with vector_length_in_bytes = 8
2814   // and UseSVE = 2 with vector_length_in_bytes >= 8
2815   //
2816   // Neon instructions are generated when UseSVE = 0 with vector_length_in_bytes = 8 or 16 and
2817   // UseSVE = 1 with vector_length_in_bytes = 16
2818 
2819   if ((UseSVE == 1 && vector_length_in_bytes == 8) || UseSVE == 2) {
2820     SIMD_RegVariant T = elemType_to_regVariant(bt);
2821     select_from_two_vectors_sve(dst, src1, src2, index, tmp, T, vector_length_in_bytes);
2822     return;
2823   }
2824 
2825   // The only BasicTypes that can reach here are T_SHORT, T_BYTE, T_INT and T_FLOAT
2826   assert(bt != T_DOUBLE && bt != T_LONG, "unsupported basic type");
2827   assert(vector_length_in_bytes <= 16, "length_in_bytes must be <= 16");
2828 
2829   bool isQ = vector_length_in_bytes == 16;
2830 
2831   SIMD_Arrangement size1 = isQ ? T16B : T8B;
2832   SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2833 
2834   // Neon "tbl" instruction only supports byte tables, so we need to look at chunks of
2835   // 2B for selecting shorts or chunks of 4B for selecting ints/floats from the table.
2836   // The index values in "index" register are in the range of [0, 2 * NUM_ELEM) where NUM_ELEM
2837   // is the number of elements that can fit in a vector. For ex. for T_SHORT with 64-bit vector length,
2838   // the indices can range from [0, 8).
2839   // As an example with 64-bit vector length and T_SHORT type - let index = [2, 5, 1, 0]
2840   // Move a constant 0x02 in every byte of tmp - tmp = [0x0202, 0x0202, 0x0202, 0x0202]
2841   // Multiply index vector with tmp to yield - dst = [0x0404, 0x0a0a, 0x0202, 0x0000]
2842   // Move a constant 0x0100 in every 2B of tmp - tmp = [0x0100, 0x0100, 0x0100, 0x0100]
2843   // Add the multiplied result to the vector in tmp to obtain the byte level
2844   // offsets - dst = [0x0504, 0x0b0a, 0x0302, 0x0100]
2845   // Use these offsets in the "tbl" instruction to select chunks of 2B.
2846 
2847   if (bt == T_BYTE) {
2848     select_from_two_vectors_neon(dst, src1, src2, index, tmp, vector_length_in_bytes);
2849   } else {
2850     int elem_size = (bt == T_SHORT) ? 2 : 4;
2851     uint64_t tbl_offset = (bt == T_SHORT) ? 0x0100u : 0x03020100u;
2852 
2853     mov(tmp, size1, elem_size);
2854     mulv(dst, size2, index, tmp);
2855     mov(tmp, size2, tbl_offset);
2856     addv(dst, size1, dst, tmp); // "dst" now contains the processed index elements
2857                                 // to select a set of 2B/4B
2858     select_from_two_vectors_neon(dst, src1, src2, dst, tmp, vector_length_in_bytes);
2859   }
2860 }
2861 
2862 // Vector expand implementation. Elements from the src vector are expanded into
2863 // the dst vector under the control of the vector mask.
2864 // Since there are no native instructions directly corresponding to expand before
2865 // SVE2p2, the following implementations mainly leverages the TBL instruction to
2866 // implement expand. To compute the index input for TBL, the prefix sum algorithm
2867 // (https://en.wikipedia.org/wiki/Prefix_sum) is used. The same algorithm is used
2868 // for NEON and SVE, but with different instructions where appropriate.
2869 
// Vector expand implementation for NEON.
// Clobbers: tmp1, tmp2
// The mask is expected to hold 0 (inactive) or -1 (active) in every element
// byte, as shown in the example below.
//
// An example of 128-bit Byte vector:
//   Data direction: high <== low
//   Input:
//         src   = g  f  e  d  c  b  a  9  8  7  6  5  4  3  2  1
//         mask  = 0  0 -1 -1  0  0 -1 -1  0  0 -1 -1  0  0 -1 -1
//   Expected result:
//         dst   = 0  0  8  7  0  0  6  5  0  0  4  3  0  0  2  1
void C2_MacroAssembler::vector_expand_neon(FloatRegister dst, FloatRegister src, FloatRegister mask,
                                           FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
                                           int vector_length_in_bytes) {
  assert(vector_length_in_bytes <= 16, "the vector length in bytes for NEON must be <= 16");
  assert_different_registers(dst, src, mask, tmp1, tmp2);
  // Since the TBL instruction only supports byte table, we need to
  // compute indices in byte type for all types.
  SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
  // tmp1 =  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  dup(tmp1, size, zr);
  // Turn the 0/-1 mask into 0/1 counters.
  // dst  =  0  0  1  1  0  0  1  1  0  0  1  1  0  0  1  1
  negr(dst, size, mask);
  // Calculate vector index for TBL with prefix sum algorithm.
  // Each iteration shifts the running sums up by i bytes (ext pulls zeros
  // in from tmp1) and accumulates, doubling the summed window each time —
  // a log-time prefix sum.
  // dst  =  8  8  8  7  6  6  6  5  4  4  4  3  2  2  2  1
  for (int i = 1; i < vector_length_in_bytes; i <<= 1) {
    ext(tmp2, size, tmp1, dst, vector_length_in_bytes - i);
    addv(dst, size, tmp2, dst);
  }
  // tmp2 =  0  0 -1 -1  0  0 -1 -1  0  0 -1 -1  0  0 -1 -1
  orr(tmp2, size, mask, mask);
  // Keep the prefix sums only in active lanes; inactive lanes become 0.
  // tmp2 =  0  0  8  7  0  0  6  5  0  0  4  3  0  0  2  1
  bsl(tmp2, size, dst, tmp1);
  // tmp1 =  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  movi(tmp1, size, 1);
  // Convert 1-based prefix sums into 0-based TBL indices; inactive lanes
  // become -1, which TBL turns into a zero byte.
  // dst  = -1 -1  7  6 -1 -1  5  4 -1 -1  3  2 -1 -1  1  0
  subv(dst, size, tmp2, tmp1);
  // dst  =  0  0  8  7  0  0  6  5  0  0  4  3  0  0  2  1
  tbl(dst, size, src, 1, dst);
}
2908 
// Vector expand implementation for SVE.
// Clobbers: tmp1, tmp2
// Elements of src are distributed into the dst lanes selected by the
// predicate pg; unselected lanes are zeroed.
//
// An example of 128-bit Short vector:
//   Data direction: high <== low
//   Input:
//         src   = gf ed cb a9 87 65 43 21
//         pg    = 00 01 00 01 00 01 00 01
//   Expected result:
//         dst   = 00 87 00 65 00 43 00 21
void C2_MacroAssembler::vector_expand_sve(FloatRegister dst, FloatRegister src, PRegister pg,
                                          FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
                                          int vector_length_in_bytes) {
  assert(UseSVE > 0, "expand implementation only for SVE");
  assert_different_registers(dst, src, tmp1, tmp2);
  SIMD_RegVariant size = elemType_to_regVariant(bt);

  // tmp1 = 00 00 00 00 00 00 00 00
  sve_dup(tmp1, size, 0);
  sve_movprfx(tmp2, tmp1);
  // Seed the counters: 1 in active lanes, 0 elsewhere.
  // tmp2 = 00 01 00 01 00 01 00 01
  sve_cpy(tmp2, size, pg, 1, true);
  // Calculate vector index for TBL with prefix sum algorithm.
  // Each iteration shifts the running sums up by i bytes and accumulates,
  // doubling the summed window each time — a log-time prefix sum.
  // tmp2 = 04 04 03 03 02 02 01 01
  for (int i = type2aelembytes(bt); i < vector_length_in_bytes; i <<= 1) {
    sve_movprfx(dst, tmp1);
    // The EXT instruction operates on the full-width sve register. The correct
    // index calculation method is:
    // vector_length_in_bytes - i + MaxVectorSize - vector_length_in_bytes =>
    // MaxVectorSize - i.
    sve_ext(dst, tmp2, MaxVectorSize - i);
    sve_add(tmp2, size, dst, tmp2);
  }
  // Keep prefix sums only in active lanes; inactive lanes become 0.
  // dst  = 00 04 00 03 00 02 00 01
  sve_sel(dst, size, pg, tmp2, tmp1);
  // Convert 1-based prefix sums to 0-based TBL indices; inactive lanes
  // become -1, which TBL turns into a zero element.
  // dst  = -1 03 -1 02 -1 01 -1 00
  sve_sub(dst, size, 1);
  // dst  = 00 87 00 65 00 43 00 21
  sve_tbl(dst, size, src, dst);
}
2948 
2949 // Optimized SVE cpy (imm, zeroing) instruction.
2950 //
2951 // `movi; cpy(imm, merging)` and `cpy(imm, zeroing)` have the same
2952 // functionality, but test results show that `movi; cpy(imm, merging)` has
2953 // higher throughput on some microarchitectures. This would depend on
2954 // microarchitecture and so may vary between implementations.
2955 void C2_MacroAssembler::sve_cpy(FloatRegister dst, SIMD_RegVariant T,
2956                                 PRegister pg, int imm8, bool isMerge) {
2957   if (VM_Version::prefer_sve_merging_mode_cpy() && !isMerge) {
2958     // Generates a NEON instruction `movi V<dst>.2d, #0`.
2959     // On AArch64, Z and V registers alias in the low 128 bits, so V<dst> is
2960     // the low 128 bits of Z<dst>. A write to V<dst> also clears all bits of
2961     // Z<dst> above 128, so this `movi` instruction effectively zeroes the
2962     // entire Z<dst> register. According to the Arm Software Optimization
2963     // Guide, `movi` is zero latency.
2964     movi(dst, T2D, 0);
2965     isMerge = true;
2966   }
2967   Assembler::sve_cpy(dst, T, pg, imm8, isMerge);
2968 }
2969 
2970 int C2_MacroAssembler::vector_iota_entry_index(BasicType bt) {
2971   // The vector iota entries array is ordered by type B/S/I/L/F/D, and
2972   // the offset between two types is 16.
2973   switch(bt) {
2974   case T_BYTE:
2975     return 0;
2976   case T_SHORT:
2977     return 1;
2978   case T_INT:
2979     return 2;
2980   case T_LONG:
2981     return 3;
2982   case T_FLOAT:
2983     return 4;
2984   case T_DOUBLE:
2985     return 5;
2986   default:
2987     ShouldNotReachHere();
2988   }
2989 }