1 /*
   2  * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright 2026 Arm Limited and/or its affiliates.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "opto/c2_MacroAssembler.hpp"
  29 #include "opto/compile.hpp"
  30 #include "opto/intrinsicnode.hpp"
  31 #include "opto/matcher.hpp"
  32 #include "opto/output.hpp"
  33 #include "opto/subnode.hpp"
  34 #include "runtime/objectMonitorTable.hpp"
  35 #include "runtime/stubRoutines.hpp"
  36 #include "runtime/synchronizer.hpp"
  37 #include "utilities/globalDefinitions.hpp"
  38 #include "utilities/powerOfTwo.hpp"
  39 
// Helper macros for annotating the generated code.
//
// In PRODUCT builds BLOCK_COMMENT expands to nothing and STOP only emits
// stop(error). In all other builds BLOCK_COMMENT forwards to block_comment()
// and STOP additionally emits the error text as a block comment before stop().
//
// NOTE: the non-PRODUCT STOP expands to two statements, so it must not be used
// as the sole body of an unbraced if/else/loop.
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// Binds a label and, via BLOCK_COMMENT, annotates the bind site with the
// label's name followed by ':'. Like STOP, this expands to more than one
// statement.
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Pointer to a MacroAssembler member function taking a destination register
// and an address. Used below to select between ldrb/ldrh/ldrw/ldr depending on
// the string encoding.
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
  51 
  52 void C2_MacroAssembler::entry_barrier() {
  53   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  54   // Dummy labels for just measuring the code size
  55   Label dummy_slow_path;
  56   Label dummy_continuation;
  57   Label dummy_guard;
  58   Label* slow_path = &dummy_slow_path;
  59   Label* continuation = &dummy_continuation;
  60   Label* guard = &dummy_guard;
  61   if (!Compile::current()->output()->in_scratch_emit_size()) {
  62     // Use real labels from actual stub when not emitting code for the purpose of measuring its size
  63     C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
  64     Compile::current()->output()->add_stub(stub);
  65     slow_path = &stub->entry();
  66     continuation = &stub->continuation();
  67     guard = &stub->guard();
  68   }
  69   // In the C2 code, we move the non-hot part of nmethod entry barriers out-of-line to a stub.
  70   bs->nmethod_entry_barrier(this, slow_path, continuation, guard);
  71 }
  72 
  73 // jdk.internal.util.ArraysSupport.vectorizedHashCode
  74 address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
  75                                            FloatRegister vdata0, FloatRegister vdata1,
  76                                            FloatRegister vdata2, FloatRegister vdata3,
  77                                            FloatRegister vmul0, FloatRegister vmul1,
  78                                            FloatRegister vmul2, FloatRegister vmul3,
  79                                            FloatRegister vpow, FloatRegister vpowm,
  80                                            BasicType eltype) {
  81   ARRAYS_HASHCODE_REGISTERS;
  82 
  83   Register tmp1 = rscratch1, tmp2 = rscratch2;
  84 
  85   Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE;
  86 
  87   // Vectorization factor. Number of array elements loaded to one SIMD&FP registers by the stubs. We
  88   // use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible to
  89   // use 4H for chars and shorts instead, but using 8H gives better performance.
  90   const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8
  91                     : eltype == T_CHAR || eltype == T_SHORT ? 8
  92                     : eltype == T_INT                       ? 4
  93                                                             : 0;
  94   guarantee(vf, "unsupported eltype");
  95 
  96   // Unroll factor for the scalar loop below. The value is chosen based on performance analysis.
  97   const size_t unroll_factor = 4;
  98 
  99   switch (eltype) {
 100   case T_BOOLEAN:
 101     BLOCK_COMMENT("arrays_hashcode(unsigned byte) {");
 102     break;
 103   case T_CHAR:
 104     BLOCK_COMMENT("arrays_hashcode(char) {");
 105     break;
 106   case T_BYTE:
 107     BLOCK_COMMENT("arrays_hashcode(byte) {");
 108     break;
 109   case T_SHORT:
 110     BLOCK_COMMENT("arrays_hashcode(short) {");
 111     break;
 112   case T_INT:
 113     BLOCK_COMMENT("arrays_hashcode(int) {");
 114     break;
 115   default:
 116     ShouldNotReachHere();
 117   }
 118 
 119   // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop
 120   // implemented by the stub executes just once. Call the stub only if at least two iterations will
 121   // be executed.
 122   const size_t large_threshold = eltype == T_INT ? vf * 2 : vf;
 123   cmpw(cnt, large_threshold);
 124   br(Assembler::HS, LARGE);
 125 
 126   bind(TAIL);
 127 
 128   // The andr performs cnt % uf where uf = unroll_factor. The subtract shifted by 3 offsets past
 129   // uf - (cnt % uf) pairs of load + madd insns i.e. it only executes cnt % uf load + madd pairs.
 130   // Iteration eats up the remainder, uf elements at a time.
 131   assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC");
 132   andr(tmp2, cnt, unroll_factor - 1);
 133   adr(tmp1, BR_BASE);
 134   // For Cortex-A53 offset is 4 because 2 nops are generated.
 135   sub(tmp1, tmp1, tmp2, ext::sxtw, VM_Version::supports_a53mac() ? 4 : 3);
 136   movw(tmp2, 0x1f);
 137   br(tmp1);
 138 
 139   bind(LOOP);
 140   for (size_t i = 0; i < unroll_factor; ++i) {
 141     load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype);
 142     maddw(result, result, tmp2, tmp1);
 143     // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
 144     // Generate 2nd nop to have 4 instructions per iteration.
 145     if (VM_Version::supports_a53mac()) {
 146       nop();
 147     }
 148   }
 149   bind(BR_BASE);
 150   subsw(cnt, cnt, unroll_factor);
 151   br(Assembler::HS, LOOP);
 152 
 153   b(DONE);
 154 
 155   bind(LARGE);
 156 
 157   RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype));
 158   assert(stub.target() != nullptr, "array_hashcode stub has not been generated");
 159   address tpc = trampoline_call(stub);
 160   if (tpc == nullptr) {
 161     DEBUG_ONLY(reset_labels(TAIL, BR_BASE));
 162     postcond(pc() == badAddress);
 163     return nullptr;
 164   }
 165 
 166   bind(DONE);
 167 
 168   BLOCK_COMMENT("} // arrays_hashcode");
 169 
 170   postcond(pc() != badAddress);
 171   return pc();
 172 }
 173 
 174 void C2_MacroAssembler::fast_lock(Register obj, Register box, Register t1,
 175                                   Register t2, Register t3) {
 176   assert_different_registers(obj, box, t1, t2, t3, rscratch2);
 177 
 178   // Handle inflated monitor.
 179   Label inflated;
 180   // Finish fast lock successfully. MUST branch to with flag == EQ
 181   Label locked;
 182   // Finish fast lock unsuccessfully. MUST branch to with flag == NE
 183   Label slow_path;
 184 
 185   if (UseObjectMonitorTable) {
 186     // Clear cache in case fast locking succeeds or we need to take the slow-path.
 187     str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 188   }
 189 
 190   if (DiagnoseSyncOnValueBasedClasses != 0) {
 191     load_klass(t1, obj);
 192     ldrb(t1, Address(t1, Klass::misc_flags_offset()));
 193     tst(t1, KlassFlags::_misc_is_value_based_class);
 194     br(Assembler::NE, slow_path);
 195   }
 196 
 197   const Register t1_mark = t1;
 198   const Register t3_t = t3;
 199 
 200   { // Fast locking
 201 
 202     // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ
 203     Label push;
 204 
 205     const Register t2_top = t2;
 206 
 207     // Check if lock-stack is full.
 208     ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 209     cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
 210     br(Assembler::GT, slow_path);
 211 
 212     // Check if recursive.
 213     subw(t3_t, t2_top, oopSize);
 214     ldr(t3_t, Address(rthread, t3_t));
 215     cmp(obj, t3_t);
 216     br(Assembler::EQ, push);
 217 
 218     // Relaxed normal load to check for monitor. Optimization for monitor case.
 219     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 220     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 221 
 222     // Not inflated
 223     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");
 224 
 225     // Try to lock. Transition lock-bits 0b01 => 0b00
 226     orr(t1_mark, t1_mark, markWord::unlocked_value);
 227     eor(t3_t, t1_mark, markWord::unlocked_value);
 228     cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
 229             /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
 230     br(Assembler::NE, slow_path);
 231 
 232     bind(push);
 233     // After successful lock, push object on lock-stack.
 234     str(obj, Address(rthread, t2_top));
 235     addw(t2_top, t2_top, oopSize);
 236     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 237     b(locked);
 238   }
 239 
 240   { // Handle inflated monitor.
 241     bind(inflated);
 242 
 243     const Register t1_monitor = t1;
 244 
 245     if (!UseObjectMonitorTable) {
 246       assert(t1_monitor == t1_mark, "should be the same here");
 247     } else {
 248       const Register t1_hash = t1;
 249       Label monitor_found;
 250 
 251       // Save the mark, we might need it to extract the hash.
 252       mov(t3, t1_mark);
 253 
 254       // Look for the monitor in the om_cache.
 255 
 256       ByteSize cache_offset   = JavaThread::om_cache_oops_offset();
 257       ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
 258       const int num_unrolled  = OMCache::CAPACITY;
 259       for (int i = 0; i < num_unrolled; i++) {
 260         ldr(t1_monitor, Address(rthread, cache_offset + monitor_offset));
 261         ldr(t2, Address(rthread, cache_offset));
 262         cmp(obj, t2);
 263         br(Assembler::EQ, monitor_found);
 264         cache_offset = cache_offset + OMCache::oop_to_oop_difference();
 265       }
 266 
 267       // Look for the monitor in the table.
 268 
 269       // Get the hash code.
 270       ubfx(t1_hash, t3, markWord::hash_shift, markWord::hash_bits);
 271 
 272       // Get the table and calculate the bucket's address
 273       lea(t3, ExternalAddress(ObjectMonitorTable::current_table_address()));
 274       ldr(t3, Address(t3));
 275       ldr(t2, Address(t3, ObjectMonitorTable::table_capacity_mask_offset()));
 276       ands(t1_hash, t1_hash, t2);
 277       ldr(t3, Address(t3, ObjectMonitorTable::table_buckets_offset()));
 278 
 279       // Read the monitor from the bucket.
 280       ldr(t1_monitor, Address(t3, t1_hash, Address::lsl(LogBytesPerWord)));
 281 
 282       // Check if the monitor in the bucket is special (empty, tombstone or removed).
 283       cmp(t1_monitor, (unsigned char)ObjectMonitorTable::SpecialPointerValues::below_is_special);
 284       br(Assembler::LO, slow_path);
 285 
 286       // Check if object matches.
 287       ldr(t3, Address(t1_monitor, ObjectMonitor::object_offset()));
 288       BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 289       bs_asm->try_peek_weak_handle_in_nmethod(this, t3, t3, t2, slow_path);
 290       cmp(t3, obj);
 291       br(Assembler::NE, slow_path);
 292 
 293       bind(monitor_found);
 294     }
 295 
 296     const Register t2_owner_addr = t2;
 297     const Register t3_owner = t3;
 298     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 299     const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
 300     const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);
 301 
 302     Label monitor_locked;
 303 
 304     // Compute owner address.
 305     lea(t2_owner_addr, owner_address);
 306 
 307     // Try to CAS owner (no owner => current thread's _monitor_owner_id).
 308     ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
 309     cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, /*acquire*/ true,
 310             /*release*/ false, /*weak*/ false, t3_owner);
 311     br(Assembler::EQ, monitor_locked);
 312 
 313     // Check if recursive.
 314     cmp(t3_owner, rscratch2);
 315     br(Assembler::NE, slow_path);
 316 
 317     // Recursive.
 318     increment(recursions_address, 1);
 319 
 320     bind(monitor_locked);
 321     if (UseObjectMonitorTable) {
 322       str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 323     }
 324   }
 325 
 326   bind(locked);
 327 
 328 #ifdef ASSERT
 329   // Check that locked label is reached with Flags == EQ.
 330   Label flag_correct;
 331   br(Assembler::EQ, flag_correct);
 332   stop("Fast Lock Flag != EQ");
 333 #endif
 334 
 335   bind(slow_path);
 336 #ifdef ASSERT
 337   // Check that slow_path label is reached with Flags == NE.
 338   br(Assembler::NE, flag_correct);
 339   stop("Fast Lock Flag != NE");
 340   bind(flag_correct);
 341 #endif
 342   // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
 343 }
 344 
 345 void C2_MacroAssembler::fast_unlock(Register obj, Register box, Register t1,
 346                                     Register t2, Register t3) {
 347   assert_different_registers(obj, box, t1, t2, t3);
 348 
 349   // Handle inflated monitor.
 350   Label inflated, inflated_load_mark;
 351   // Finish fast unlock successfully. MUST branch to with flag == EQ
 352   Label unlocked;
 353   // Finish fast unlock unsuccessfully. MUST branch to with flag == NE
 354   Label slow_path;
 355 
 356   const Register t1_mark = t1;
 357   const Register t2_top = t2;
 358   const Register t3_t = t3;
 359 
 360   { // Fast unlock
 361 
 362     Label push_and_slow_path;
 363 
 364     // Check if obj is top of lock-stack.
 365     ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 366     subw(t2_top, t2_top, oopSize);
 367     ldr(t3_t, Address(rthread, t2_top));
 368     cmp(obj, t3_t);
 369     // Top of lock stack was not obj. Must be monitor.
 370     br(Assembler::NE, inflated_load_mark);
 371 
 372     // Pop lock-stack.
 373     DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
 374     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 375 
 376     // Check if recursive.
 377     subw(t3_t, t2_top, oopSize);
 378     ldr(t3_t, Address(rthread, t3_t));
 379     cmp(obj, t3_t);
 380     br(Assembler::EQ, unlocked);
 381 
 382     // Not recursive.
 383     // Load Mark.
 384     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 385 
 386     // Check header for monitor (0b10).
 387     // Because we got here by popping (meaning we pushed in locked)
 388     // there will be no monitor in the box. So we need to push back the obj
 389     // so that the runtime can fix any potential anonymous owner.
 390     tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);
 391 
 392     // Try to unlock. Transition lock bits 0b00 => 0b01
 393     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
 394     orr(t3_t, t1_mark, markWord::unlocked_value);
 395     cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
 396             /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
 397     br(Assembler::EQ, unlocked);
 398 
 399     bind(push_and_slow_path);
 400     // Compare and exchange failed.
 401     // Restore lock-stack and handle the unlock in runtime.
 402     DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
 403     addw(t2_top, t2_top, oopSize);
 404     str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 405     b(slow_path);
 406   }
 407 
 408 
 409   { // Handle inflated monitor.
 410     bind(inflated_load_mark);
 411     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 412 #ifdef ASSERT
 413     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 414     stop("Fast Unlock not monitor");
 415 #endif
 416 
 417     bind(inflated);
 418 
 419 #ifdef ASSERT
 420     Label check_done;
 421     subw(t2_top, t2_top, oopSize);
 422     cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
 423     br(Assembler::LT, check_done);
 424     ldr(t3_t, Address(rthread, t2_top));
 425     cmp(obj, t3_t);
 426     br(Assembler::NE, inflated);
 427     stop("Fast Unlock lock on stack");
 428     bind(check_done);
 429 #endif
 430 
 431     const Register t1_monitor = t1;
 432 
 433     if (!UseObjectMonitorTable) {
 434       assert(t1_monitor == t1_mark, "should be the same here");
 435 
 436       // Untag the monitor.
 437       add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
 438     } else {
 439       ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 440       // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
 441       cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
 442       br(Assembler::LO, slow_path);
 443     }
 444 
 445     const Register t2_recursions = t2;
 446     Label not_recursive;
 447 
 448     // Check if recursive.
 449     ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
 450     cbz(t2_recursions, not_recursive);
 451 
 452     // Recursive unlock.
 453     sub(t2_recursions, t2_recursions, 1u);
 454     str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
 455     // Set flag == EQ
 456     cmp(t2_recursions, t2_recursions);
 457     b(unlocked);
 458 
 459     bind(not_recursive);
 460 
 461     const Register t2_owner_addr = t2;
 462 
 463     // Compute owner address.
 464     lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));
 465 
 466     // Set owner to null.
 467     // Release to satisfy the JMM
 468     stlr(zr, t2_owner_addr);
 469     // We need a full fence after clearing owner to avoid stranding.
 470     // StoreLoad achieves this.
 471     membar(StoreLoad);
 472 
 473     // Check if the entry_list is empty.
 474     ldr(rscratch1, Address(t1_monitor, ObjectMonitor::entry_list_offset()));
 475     cmp(rscratch1, zr);
 476     br(Assembler::EQ, unlocked);  // If so we are done.
 477 
 478     // Check if there is a successor.
 479     ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
 480     cmp(rscratch1, zr);
 481     br(Assembler::NE, unlocked);  // If so we are done.
 482 
 483     // Save the monitor pointer in the current thread, so we can try to
 484     // reacquire the lock in SharedRuntime::monitor_exit_helper().
 485     str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));
 486 
 487     cmp(zr, rthread); // Set Flag to NE => slow path
 488     b(slow_path);
 489   }
 490 
 491   bind(unlocked);
 492   cmp(zr, zr); // Set Flags to EQ => fast path
 493 
 494 #ifdef ASSERT
 495   // Check that unlocked label is reached with Flags == EQ.
 496   Label flag_correct;
 497   br(Assembler::EQ, flag_correct);
 498   stop("Fast Unlock Flag != EQ");
 499 #endif
 500 
 501   bind(slow_path);
 502 #ifdef ASSERT
 503   // Check that slow_path label is reached with Flags == NE.
 504   br(Assembler::NE, flag_correct);
 505   stop("Fast Unlock Flag != NE");
 506   bind(flag_correct);
 507 #endif
 508   // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
 509 }
 510 
 511 // Search for str1 in str2 and return index or -1
 512 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
 513 void C2_MacroAssembler::string_indexof(Register str2, Register str1,
 514                                        Register cnt2, Register cnt1,
 515                                        Register tmp1, Register tmp2,
 516                                        Register tmp3, Register tmp4,
 517                                        Register tmp5, Register tmp6,
 518                                        int icnt1, Register result, int ae) {
 519   // NOTE: tmp5, tmp6 can be zr depending on specific method version
 520   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
 521 
 522   Register ch1 = rscratch1;
 523   Register ch2 = rscratch2;
 524   Register cnt1tmp = tmp1;
 525   Register cnt2tmp = tmp2;
 526   Register cnt1_neg = cnt1;
 527   Register cnt2_neg = cnt2;
 528   Register result_tmp = tmp4;
 529 
 530   bool isL = ae == StrIntrinsicNode::LL;
 531 
 532   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
 533   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
 534   int str1_chr_shift = str1_isL ? 0:1;
 535   int str2_chr_shift = str2_isL ? 0:1;
 536   int str1_chr_size = str1_isL ? 1:2;
 537   int str2_chr_size = str2_isL ? 1:2;
 538   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
 539                                       (chr_insn)&MacroAssembler::ldrh;
 540   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
 541                                       (chr_insn)&MacroAssembler::ldrh;
 542   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
 543   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
 544 
 545   // Note, inline_string_indexOf() generates checks:
 546   // if (substr.count > string.count) return -1;
 547   // if (substr.count == 0) return 0;
 548 
 549   // We have two strings, a source string in str2, cnt2 and a pattern string
 550   // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
 551 
 552   // For larger pattern and source we use a simplified Boyer Moore algorithm.
 553   // With a small pattern and source we use linear scan.
 554 
 555   if (icnt1 == -1) {
 556     sub(result_tmp, cnt2, cnt1);
 557     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
 558     br(LT, LINEARSEARCH);
 559     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
 560     subs(zr, cnt1, 256);
 561     lsr(tmp1, cnt2, 2);
 562     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
 563     br(GE, LINEARSTUB);
 564   }
 565 
// The Boyer Moore algorithm is based on the description here:-
 567 //
 568 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
 569 //
// This describes an algorithm with 2 shift rules. The 'Bad Character' rule
 571 // and the 'Good Suffix' rule.
 572 //
 573 // These rules are essentially heuristics for how far we can shift the
 574 // pattern along the search string.
 575 //
 576 // The implementation here uses the 'Bad Character' rule only because of the
 577 // complexity of initialisation for the 'Good Suffix' rule.
 578 //
 579 // This is also known as the Boyer-Moore-Horspool algorithm:-
 580 //
 581 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
 582 //
 583 // This particular implementation has few java-specific optimizations.
 584 //
 585 // #define ASIZE 256
 586 //
 587 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
 588 //       int i, j;
 589 //       unsigned c;
 590 //       unsigned char bc[ASIZE];
 591 //
 592 //       /* Preprocessing */
 593 //       for (i = 0; i < ASIZE; ++i)
 594 //          bc[i] = m;
 595 //       for (i = 0; i < m - 1; ) {
 596 //          c = x[i];
 597 //          ++i;
 598 //          // c < 256 for Latin1 string, so, no need for branch
 599 //          #ifdef PATTERN_STRING_IS_LATIN1
 600 //          bc[c] = m - i;
 601 //          #else
 602 //          if (c < ASIZE) bc[c] = m - i;
 603 //          #endif
 604 //       }
 605 //
 606 //       /* Searching */
 607 //       j = 0;
 608 //       while (j <= n - m) {
 609 //          c = y[i+j];
 610 //          if (x[m-1] == c)
 611 //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
 612 //          if (i < 0) return j;
 613 //          // c < 256 for Latin1 string, so, no need for branch
 614 //          #ifdef SOURCE_STRING_IS_LATIN1
 615 //          // LL case: (c< 256) always true. Remove branch
 616 //          j += bc[y[j+m-1]];
 617 //          #endif
 618 //          #ifndef PATTERN_STRING_IS_UTF
 619 //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
 620 //          if (c < ASIZE)
 621 //            j += bc[y[j+m-1]];
 622 //          else
 623 //            j += 1
 624 //          #endif
 625 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
 626 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
 627 //          if (c < ASIZE)
 628 //            j += bc[y[j+m-1]];
 629 //          else
 630 //            j += m
 631 //          #endif
 632 //       }
 633 //    }
 634 
 635   if (icnt1 == -1) {
 636     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
 637         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
 638     Register cnt1end = tmp2;
 639     Register str2end = cnt2;
 640     Register skipch = tmp2;
 641 
 642     // str1 length is >=8, so, we can read at least 1 register for cases when
 643     // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
 644     // UL case. We'll re-read last character in inner pre-loop code to have
 645     // single outer pre-loop load
 646     const int firstStep = isL ? 7 : 3;
 647 
 648     const int ASIZE = 256;
 649     const int STORED_BYTES = 32; // amount of bytes stored per instruction
 650     sub(sp, sp, ASIZE);
 651     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
 652     mov(ch1, sp);
 653     BIND(BM_INIT_LOOP);
 654       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
 655       subs(tmp5, tmp5, 1);
 656       br(GT, BM_INIT_LOOP);
 657 
 658       sub(cnt1tmp, cnt1, 1);
 659       mov(tmp5, str2);
 660       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
 661       sub(ch2, cnt1, 1);
 662       mov(tmp3, str1);
 663     BIND(BCLOOP);
 664       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
 665       if (!str1_isL) {
 666         subs(zr, ch1, ASIZE);
 667         br(HS, BCSKIP);
 668       }
 669       strb(ch2, Address(sp, ch1));
 670     BIND(BCSKIP);
 671       subs(ch2, ch2, 1);
 672       br(GT, BCLOOP);
 673 
 674       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
 675       if (str1_isL == str2_isL) {
 676         // load last 8 bytes (8LL/4UU symbols)
 677         ldr(tmp6, Address(tmp6, -wordSize));
 678       } else {
 679         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
 680         // convert Latin1 to UTF. We'll have to wait until load completed, but
 681         // it's still faster than per-character loads+checks
 682         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
 683         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
 684         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
 685         andr(tmp6, tmp6, 0xFF); // str1[N-4]
 686         orr(ch2, ch1, ch2, LSL, 16);
 687         orr(tmp6, tmp6, tmp3, LSL, 48);
 688         orr(tmp6, tmp6, ch2, LSL, 16);
 689       }
 690     BIND(BMLOOPSTR2);
 691       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 692       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
 693       if (str1_isL == str2_isL) {
 694         // re-init tmp3. It's for free because it's executed in parallel with
 695         // load above. Alternative is to initialize it before loop, but it'll
 696         // affect performance on in-order systems with 2 or more ld/st pipelines
 697         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
 698       }
 699       if (!isL) { // UU/UL case
 700         lsl(ch2, cnt1tmp, 1); // offset in bytes
 701       }
 702       cmp(tmp3, skipch);
 703       br(NE, BMSKIP);
 704       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
 705       mov(ch1, tmp6);
 706       if (isL) {
 707         b(BMLOOPSTR1_AFTER_LOAD);
 708       } else {
 709         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
 710         b(BMLOOPSTR1_CMP);
 711       }
 712     BIND(BMLOOPSTR1);
 713       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
 714       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 715     BIND(BMLOOPSTR1_AFTER_LOAD);
 716       subs(cnt1tmp, cnt1tmp, 1);
 717       br(LT, BMLOOPSTR1_LASTCMP);
 718     BIND(BMLOOPSTR1_CMP);
 719       cmp(ch1, ch2);
 720       br(EQ, BMLOOPSTR1);
 721     BIND(BMSKIP);
 722       if (!isL) {
 723         // if we've met UTF symbol while searching Latin1 pattern, then we can
 724         // skip cnt1 symbols
 725         if (str1_isL != str2_isL) {
 726           mov(result_tmp, cnt1);
 727         } else {
 728           mov(result_tmp, 1);
 729         }
 730         subs(zr, skipch, ASIZE);
 731         br(HS, BMADV);
 732       }
 733       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
 734     BIND(BMADV);
 735       sub(cnt1tmp, cnt1, 1);
 736       add(str2, str2, result_tmp, LSL, str2_chr_shift);
 737       cmp(str2, str2end);
 738       br(LE, BMLOOPSTR2);
 739       add(sp, sp, ASIZE);
 740       b(NOMATCH);
 741     BIND(BMLOOPSTR1_LASTCMP);
 742       cmp(ch1, ch2);
 743       br(NE, BMSKIP);
 744     BIND(BMMATCH);
 745       sub(result, str2, tmp5);
 746       if (!str2_isL) lsr(result, result, 1);
 747       add(sp, sp, ASIZE);
 748       b(DONE);
 749 
 750     BIND(LINEARSTUB);
 751     cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
 752     br(LT, LINEAR_MEDIUM);
 753     mov(result, zr);
 754     RuntimeAddress stub = nullptr;
 755     if (isL) {
 756       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
 757       assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
 758     } else if (str1_isL) {
 759       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
 760        assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
 761     } else {
 762       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
 763       assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
 764     }
 765     address call = trampoline_call(stub);
 766     if (call == nullptr) {
 767       DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
 768       ciEnv::current()->record_failure("CodeCache is full");
 769       return;
 770     }
 771     b(DONE);
 772   }
 773 
 774   BIND(LINEARSEARCH);
 775   {
 776     Label DO1, DO2, DO3;
 777 
 778     Register str2tmp = tmp2;
 779     Register first = tmp3;
 780 
 781     if (icnt1 == -1)
 782     {
 783         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
 784 
 785         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
 786         br(LT, DOSHORT);
 787       BIND(LINEAR_MEDIUM);
 788         (this->*str1_load_1chr)(first, Address(str1));
 789         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
 790         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
 791         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 792         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 793 
 794       BIND(FIRST_LOOP);
 795         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 796         cmp(first, ch2);
 797         br(EQ, STR1_LOOP);
 798       BIND(STR2_NEXT);
 799         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 800         br(LE, FIRST_LOOP);
 801         b(NOMATCH);
 802 
 803       BIND(STR1_LOOP);
 804         adds(cnt1tmp, cnt1_neg, str1_chr_size);
 805         add(cnt2tmp, cnt2_neg, str2_chr_size);
 806         br(GE, MATCH);
 807 
 808       BIND(STR1_NEXT);
 809         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
 810         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 811         cmp(ch1, ch2);
 812         br(NE, STR2_NEXT);
 813         adds(cnt1tmp, cnt1tmp, str1_chr_size);
 814         add(cnt2tmp, cnt2tmp, str2_chr_size);
 815         br(LT, STR1_NEXT);
 816         b(MATCH);
 817 
 818       BIND(DOSHORT);
 819       if (str1_isL == str2_isL) {
 820         cmp(cnt1, (u1)2);
 821         br(LT, DO1);
 822         br(GT, DO3);
 823       }
 824     }
 825 
 826     if (icnt1 == 4) {
 827       Label CH1_LOOP;
 828 
 829         (this->*load_4chr)(ch1, str1);
 830         sub(result_tmp, cnt2, 4);
 831         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 832         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 833 
 834       BIND(CH1_LOOP);
 835         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
 836         cmp(ch1, ch2);
 837         br(EQ, MATCH);
 838         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 839         br(LE, CH1_LOOP);
 840         b(NOMATCH);
 841       }
 842 
 843     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
 844       Label CH1_LOOP;
 845 
 846       BIND(DO2);
 847         (this->*load_2chr)(ch1, str1);
 848         if (icnt1 == 2) {
 849           sub(result_tmp, cnt2, 2);
 850         }
 851         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 852         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 853       BIND(CH1_LOOP);
 854         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 855         cmp(ch1, ch2);
 856         br(EQ, MATCH);
 857         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 858         br(LE, CH1_LOOP);
 859         b(NOMATCH);
 860     }
 861 
 862     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
 863       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
 864 
 865       BIND(DO3);
 866         (this->*load_2chr)(first, str1);
 867         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
 868         if (icnt1 == 3) {
 869           sub(result_tmp, cnt2, 3);
 870         }
 871         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 872         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 873       BIND(FIRST_LOOP);
 874         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 875         cmpw(first, ch2);
 876         br(EQ, STR1_LOOP);
 877       BIND(STR2_NEXT);
 878         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 879         br(LE, FIRST_LOOP);
 880         b(NOMATCH);
 881 
 882       BIND(STR1_LOOP);
 883         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
 884         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 885         cmp(ch1, ch2);
 886         br(NE, STR2_NEXT);
 887         b(MATCH);
 888     }
 889 
 890     if (icnt1 == -1 || icnt1 == 1) {
 891       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
 892 
 893       BIND(DO1);
 894         (this->*str1_load_1chr)(ch1, str1);
 895         cmp(cnt2, (u1)8);
 896         br(LT, DO1_SHORT);
 897 
 898         sub(result_tmp, cnt2, 8/str2_chr_size);
 899         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 900         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 901         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 902 
 903         if (str2_isL) {
 904           orr(ch1, ch1, ch1, LSL, 8);
 905         }
 906         orr(ch1, ch1, ch1, LSL, 16);
 907         orr(ch1, ch1, ch1, LSL, 32);
 908       BIND(CH1_LOOP);
 909         ldr(ch2, Address(str2, cnt2_neg));
 910         eor(ch2, ch1, ch2);
 911         sub(tmp1, ch2, tmp3);
 912         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 913         bics(tmp1, tmp1, tmp2);
 914         br(NE, HAS_ZERO);
 915         adds(cnt2_neg, cnt2_neg, 8);
 916         br(LT, CH1_LOOP);
 917 
 918         cmp(cnt2_neg, (u1)8);
 919         mov(cnt2_neg, 0);
 920         br(LT, CH1_LOOP);
 921         b(NOMATCH);
 922 
 923       BIND(HAS_ZERO);
 924         rev(tmp1, tmp1);
 925         clz(tmp1, tmp1);
 926         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
 927         b(MATCH);
 928 
 929       BIND(DO1_SHORT);
 930         mov(result_tmp, cnt2);
 931         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
 932         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
 933       BIND(DO1_LOOP);
 934         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 935         cmpw(ch1, ch2);
 936         br(EQ, MATCH);
 937         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 938         br(LT, DO1_LOOP);
 939     }
 940   }
 941   BIND(NOMATCH);
 942     mov(result, -1);
 943     b(DONE);
 944   BIND(MATCH);
 945     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
 946   BIND(DONE);
 947 }
 948 
 949 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
     // chr_insn (above): pointer to a MacroAssembler load emitter; string_compare
     // binds it to ldrb or ldrh depending on the string's character width.
     // uxt_insn (below): pointer to a MacroAssembler register-extend emitter;
     // string_compare binds it to uxtbw or uxthw.
 950 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
 951 
 952 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
 953                                             Register ch, Register result,
 954                                             Register tmp1, Register tmp2, Register tmp3)
 955 {
       // Emits code that searches the cnt1 16-bit characters at str1 for ch and
       // leaves the index (in characters) of the first match in result, or -1 if
       // there is no match. Inputs of four or more characters are scanned one
       // 64-bit word (four characters) at a time; shorter inputs are scanned one
       // character at a time.
       //
       // Clobbers: str1, cnt1, ch (word-at-a-time path only), tmp1, tmp2, tmp3,
       //           rscratch1, rscratch2, rflags.
 956   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
       // cnt1 is recycled as a negative byte offset once the count has been used.
 957   Register cnt1_neg = cnt1;
 958   Register ch1 = rscratch1;
 959   Register result_tmp = rscratch2;
 960
       // An empty string cannot contain ch.
 961   cbz(cnt1, NOMATCH);
 962
 963   cmp(cnt1, (u1)4);
 964   br(LT, DO1_SHORT);
 965
       // Replicate ch into all four 16-bit lanes of the register
       // (assumes ch arrives zero-extended from 16 bits -- TODO confirm).
 966   orr(ch, ch, ch, LSL, 16);
 967   orr(ch, ch, ch, LSL, 32);
 968
       // Point str1 at the last full 8-byte word and address everything before it
       // with a negative byte offset, so the loop can terminate on the offset's
       // sign. result_tmp remembers the character index str1 now points at.
 969   sub(cnt1, cnt1, 4);
 970   mov(result_tmp, cnt1);
 971   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
 972   sub(cnt1_neg, zr, cnt1, LSL, 1);
 973
       // A 1 in every 16-bit lane, used by the zero-lane test below.
 974   mov(tmp3, 0x0001000100010001);
 975
 976   BIND(CH1_LOOP);
 977     ldr(ch1, Address(str1, cnt1_neg));
         // After the eor a lane is zero exactly where the loaded character equals
         // ch. (x - 0x0001..) & ~(x | 0x7fff..) is then non-zero iff x has a zero
         // lane; bics performs the and-not and sets the flags.
 978     eor(ch1, ch, ch1);
 979     sub(tmp1, ch1, tmp3);
 980     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
 981     bics(tmp1, tmp1, tmp2);
 982     br(NE, HAS_ZERO);
 983     adds(cnt1_neg, cnt1_neg, 8);
 984     br(LT, CH1_LOOP);
 985
         // The offset is now >= 0. If it is still below 8, the word at offset 0
         // (the last four characters) has not been tested yet: test it once,
         // possibly re-reading characters that were already checked. The branch
         // consumes the flags produced by cmp, with the mov sitting in between.
 986     cmp(cnt1_neg, (u1)8);
 987     mov(cnt1_neg, 0);
 988     br(LT, CH1_LOOP);
 989     b(NOMATCH);
 990
 991   BIND(HAS_ZERO);
         // Find the first flagged lane in memory order: byte-reverse, count the
         // leading zero bits, then turn the bit count into a byte offset (LSR 3).
         // Any odd low bit in that offset is dropped by the ASR at MATCH.
 992     rev(tmp1, tmp1);
 993     clz(tmp1, tmp1);
 994     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
 995     b(MATCH);
 996
 997   BIND(DO1_SHORT);
         // Fewer than four characters: same negative-offset scheme, measured from
         // the end of the string, one character per iteration.
 998     mov(result_tmp, cnt1);
 999     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
1000     sub(cnt1_neg, zr, cnt1, LSL, 1);
1001   BIND(DO1_LOOP);
1002     ldrh(ch1, Address(str1, cnt1_neg));
1003     cmpw(ch, ch1);
1004     br(EQ, MATCH);
1005     adds(cnt1_neg, cnt1_neg, 2);
1006     br(LT, DO1_LOOP);
1007   BIND(NOMATCH);
1008     mov(result, -1);
1009     b(DONE);
1010   BIND(MATCH);
         // result_tmp is the character index str1 was advanced to; adding the
         // (non-positive) byte offset scaled to characters yields the match index.
1011     add(result, result_tmp, cnt1_neg, ASR, 1);
1012   BIND(DONE);
1013 }
1014 
1015 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
1016                                                 Register ch, Register result,
1017                                                 FloatRegister ztmp1,
1018                                                 FloatRegister ztmp2,
1019                                                 PRegister tmp_pg,
1020                                                 PRegister tmp_pdn, bool isL)
1021 {
       // Emits an SVE loop that searches the cnt1 characters at str1 for ch and
       // leaves the index of the first match in result, or -1 if there is none.
       // Characters are 8-bit when isL is true and 16-bit otherwise.
       //
       // Clobbers: ztmp1, ztmp2, tmp_pg, tmp_pdn, rscratch1, rscratch2, rflags.
       // str1, cnt1 and ch are only read.
       //
1022   // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
1023   assert(tmp_pg->is_governing(),
1024          "this register has to be a governing predicate register");
1025
1026   Label LOOP, MATCH, DONE, NOMATCH;
       // vec_len: number of T-sized lanes in one vector (set by sve_cntb/sve_cnth).
       // idx:     index of the first character covered by the current vector.
1027   Register vec_len = rscratch1;
1028   Register idx = rscratch2;
1029
1030   SIMD_RegVariant T = (isL == true) ? B : H;
1031
       // An empty string cannot contain ch.
1032   cbz(cnt1, NOMATCH);
1033
1034   // Assign the particular char throughout the vector.
1035   sve_dup(ztmp2, T, ch);
1036   if (isL) {
1037     sve_cntb(vec_len);
1038   } else {
1039     sve_cnth(vec_len);
1040   }
1041   mov(idx, 0);
1042
1043   // Generate a predicate to control the reading of input string.
1044   sve_whilelt(tmp_pg, T, idx, cnt1);
1045
1046   BIND(LOOP);
1047     // Read a vector of 8- or 16-bit data depending on the string type. Note
1048     // that inactive elements indicated by the predicate register won't cause
1049     // a data read from memory to the destination vector.
1050     if (isL) {
1051       sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
1052     } else {
1053       sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
1054     }
         // Advance idx before the compare so that the flags set by sve_cmp are
         // still intact when the branch below tests them.
1055     add(idx, idx, vec_len);
1056
1057     // Perform the comparison. An element of the destination predicate is set
1058     // to active if the particular char is matched.
1059     sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);
1060
1061     // Branch if the particular char is found.
1062     br(NE, MATCH);
1063
         // Build the predicate for the next vector; the flags it sets tell
         // whether any character is left to examine.
1064     sve_whilelt(tmp_pg, T, idx, cnt1);
1065
1066     // Loop back if the particular char not found.
1067     br(MI, LOOP);
1068
1069   BIND(NOMATCH);
1070     mov(result, -1);
1071     b(DONE);
1072
1073   BIND(MATCH);
1074     // Undo the index increment.
1075     sub(idx, idx, vec_len);
1076
1077     // Crop the vector to find its location.
         // brka keeps the lanes up to and including the first match, so
         // result = idx - 1 + (number of kept lanes) is the index of that match.
1078     sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
1079     add(result, idx, -1);
1080     sve_incp(result, T, tmp_pdn);
1081   BIND(DONE);
1082 }
1083 
1084 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
1085                                             Register ch, Register result,
1086                                             Register tmp1, Register tmp2, Register tmp3)
1087 {
       // Byte-string counterpart of string_indexof_char. Emits code that searches
       // the cnt1 8-bit characters at str1 for ch and leaves the index of the
       // first match in result, or -1 if there is none. Inputs of eight or more
       // characters are scanned one 64-bit word (eight characters) at a time;
       // shorter inputs are scanned one character at a time.
       //
       // Clobbers: str1, cnt1, ch (word-at-a-time path only), tmp1, tmp2, tmp3,
       //           rscratch1, rscratch2, rflags.
1088   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
       // cnt1 is recycled as a negative byte offset once the count has been used.
1089   Register cnt1_neg = cnt1;
1090   Register ch1 = rscratch1;
1091   Register result_tmp = rscratch2;
1092
       // An empty string cannot contain ch.
1093   cbz(cnt1, NOMATCH);
1094
1095   cmp(cnt1, (u1)8);
1096   br(LT, DO1_SHORT);
1097
       // Replicate ch into all eight byte lanes of the register
       // (assumes ch arrives zero-extended from 8 bits -- TODO confirm).
1098   orr(ch, ch, ch, LSL, 8);
1099   orr(ch, ch, ch, LSL, 16);
1100   orr(ch, ch, ch, LSL, 32);
1101
       // Point str1 at the last full 8-byte word and address everything before it
       // with a negative offset. result_tmp remembers the index str1 points at.
1102   sub(cnt1, cnt1, 8);
1103   mov(result_tmp, cnt1);
1104   lea(str1, Address(str1, cnt1));
1105   sub(cnt1_neg, zr, cnt1);
1106
       // A 1 in every byte lane, used by the zero-lane test below.
1107   mov(tmp3, 0x0101010101010101);
1108
1109   BIND(CH1_LOOP);
1110     ldr(ch1, Address(str1, cnt1_neg));
         // After the eor a byte is zero exactly where the loaded character equals
         // ch. (x - 0x01..) & ~(x | 0x7f..) is then non-zero iff x has a zero
         // byte; bics performs the and-not and sets the flags.
1111     eor(ch1, ch, ch1);
1112     sub(tmp1, ch1, tmp3);
1113     orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
1114     bics(tmp1, tmp1, tmp2);
1115     br(NE, HAS_ZERO);
1116     adds(cnt1_neg, cnt1_neg, 8);
1117     br(LT, CH1_LOOP);
1118
         // The offset is now >= 0. If it is still below 8, the word at offset 0
         // (the last eight characters) has not been tested yet: test it once,
         // possibly re-reading characters that were already checked. The branch
         // consumes the flags produced by cmp, with the mov sitting in between.
1119     cmp(cnt1_neg, (u1)8);
1120     mov(cnt1_neg, 0);
1121     br(LT, CH1_LOOP);
1122     b(NOMATCH);
1123
1124   BIND(HAS_ZERO);
         // Find the first flagged byte in memory order: byte-reverse, count the
         // leading zero bits, then turn the bit count into a byte offset (LSR 3).
1125     rev(tmp1, tmp1);
1126     clz(tmp1, tmp1);
1127     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
1128     b(MATCH);
1129
1130   BIND(DO1_SHORT);
         // Fewer than eight characters: same negative-offset scheme, measured from
         // the end of the string, one character per iteration.
1131     mov(result_tmp, cnt1);
1132     lea(str1, Address(str1, cnt1));
1133     sub(cnt1_neg, zr, cnt1);
1134   BIND(DO1_LOOP);
1135     ldrb(ch1, Address(str1, cnt1_neg));
1136     cmp(ch, ch1);
1137     br(EQ, MATCH);
1138     adds(cnt1_neg, cnt1_neg, 1);
1139     br(LT, DO1_LOOP);
1140   BIND(NOMATCH);
1141     mov(result, -1);
1142     b(DONE);
1143   BIND(MATCH);
         // result_tmp is the index str1 was advanced to; adding the (non-positive)
         // offset yields the index of the match.
1144     add(result, result_tmp, cnt1_neg);
1145   BIND(DONE);
1146 }
1147 
1148 // Compare strings.
     //
     // Emits code that compares the strings at str1 and str2 for the encoding pair
     // given by ae (StrIntrinsicNode::LL, UU, LU or UL). result receives the
     // difference of the first pair of characters that differ; if the strings agree
     // over the shorter length (or str1 == str2 on the LL/UU word path), result is
     // the difference of the two lengths in characters.
     //
     // Short inputs are compared character by character, medium inputs one 64-bit
     // word at a time, and inputs at or above stub_threshold are handed to the
     // compare_long_string_* stub for the encoding pair.
     //
     // vtmp3, pgtmp1 and pgtmp2 are not referenced by the code in this function.
     // NOTE(review): presumably they are reserved on behalf of the stubs -- confirm.
     //
     // Clobbers (in this function): str1, str2, cnt1, cnt2, tmp1, tmp2, vtmp1,
     //           vtmp2, rscratch1, rscratch2, rflags.
1149 void C2_MacroAssembler::string_compare(Register str1, Register str2,
1150     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
1151     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
1152     PRegister pgtmp1, PRegister pgtmp2, int ae) {
1153   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
1154       DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
1155       SHORT_LOOP_START, TAIL_CHECK;
1156
1157   bool isLL = ae == StrIntrinsicNode::LL;
1158   bool isLU = ae == StrIntrinsicNode::LU;
1159   bool isUL = ae == StrIntrinsicNode::UL;
1160
1161   // The stub threshold for LL strings is: 72 (64 + 8) chars
1162   // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
1163   // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
1164   const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);
1165
1166   bool str1_isL = isLL || isLU;
1167   bool str2_isL = isLL || isUL;
1168
1169   int str1_chr_shift = str1_isL ? 0 : 1;
1170   int str2_chr_shift = str2_isL ? 0 : 1;
1171   int str1_chr_size = str1_isL ? 1 : 2;
1172   int str2_chr_size = str2_isL ? 1 : 2;
       // Characters per 64-bit word once both sides are in the same width: eight
       // for LL, four whenever a 16-bit string is involved.
1173   int minCharsInWord = isLL ? wordSize : wordSize/2;
1174
       // vtmpZ is zeroed on the LU/UL paths and interleaved with 8-bit characters
       // (zip1) to widen them to 16 bits; vtmp carries the data being widened.
1175   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
1176   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
1177                                       (chr_insn)&MacroAssembler::ldrh;
1178   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
1179                                       (chr_insn)&MacroAssembler::ldrh;
1180   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
1181                             (uxt_insn)&MacroAssembler::uxthw;
1182
1183   BLOCK_COMMENT("string_compare {");
1184
1185   // Bizarrely, the counts are passed in bytes, regardless of whether they
1186   // are L or U strings, however the result is always in characters.
1187   if (!str1_isL) asrw(cnt1, cnt1, 1);
1188   if (!str2_isL) asrw(cnt2, cnt2, 1);
1189
1190   // Compute the minimum of the string lengths and save the difference.
       // result keeps this length difference unless a differing character is found.
1191   subsw(result, cnt1, cnt2);
1192   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
1193
1194   // A very short string
1195   cmpw(cnt2, minCharsInWord);
1196   br(Assembler::LE, SHORT_STRING);
1197
1198   // Compare longwords
1199   // load first parts of strings and finish initialization while loading
1200   {
1201     if (str1_isL == str2_isL) { // LL or UU
1202       ldr(tmp1, Address(str1));
           // Same address: nothing can differ, so the length difference already
           // in result is the answer.
1203       cmp(str1, str2);
1204       br(Assembler::EQ, DONE);
1205       ldr(tmp2, Address(str2));
1206       cmp(cnt2, stub_threshold);
1207       br(GE, STUB);
           // Exactly one word left after the first: only the words already loaded
           // need comparing.
1208       subsw(cnt2, cnt2, minCharsInWord);
1209       br(EQ, TAIL_CHECK);
           // Point both strings at their last full word and turn cnt2 into a
           // negative byte offset that counts up towards zero.
1210       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1211       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1212       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1213     } else if (isLU) {
           // str1 is 8-bit: load four bytes and widen them to four 16-bit
           // characters by interleaving with zero bytes.
1214       ldrs(vtmp, Address(str1));
1215       ldr(tmp2, Address(str2));
1216       cmp(cnt2, stub_threshold);
1217       br(GE, STUB);
1218       subw(cnt2, cnt2, 4);
1219       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1220       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1221       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1222       zip1(vtmp, T8B, vtmp, vtmpZ);
           // The two strings advance at different byte rates, so each gets its own
           // negative byte offset: cnt1 for str1, cnt2 for str2. cnt1 is stepped
           // past the four bytes loaded above.
1223       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1224       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1225       add(cnt1, cnt1, 4);
1226       fmovd(tmp1, vtmp);
1227     } else { // UL case
           // Mirror image of the LU case: str2 is the 8-bit side that is widened.
1228       ldr(tmp1, Address(str1));
1229       ldrs(vtmp, Address(str2));
1230       cmp(cnt2, stub_threshold);
1231       br(GE, STUB);
1232       subw(cnt2, cnt2, 4);
1233       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1234       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1235       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1236       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1237       zip1(vtmp, T8B, vtmp, vtmpZ);
1238       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1239       add(cnt1, cnt1, 8);
1240       fmovd(tmp2, vtmp);
1241     }
         // Step str2's offset past the first word (4 bytes of 8-bit data for UL,
         // 8 bytes otherwise); a non-negative offset means only the tail is left.
1242     adds(cnt2, cnt2, isUL ? 4 : 8);
1243     br(GE, TAIL);
1244     eor(rscratch2, tmp1, tmp2);
1245     cbnz(rscratch2, DIFF);
1246     // main loop
1247     bind(NEXT_WORD);
1248     if (str1_isL == str2_isL) {
1249       ldr(tmp1, Address(str1, cnt2));
1250       ldr(tmp2, Address(str2, cnt2));
1251       adds(cnt2, cnt2, 8);
1252     } else if (isLU) {
1253       ldrs(vtmp, Address(str1, cnt1));
1254       ldr(tmp2, Address(str2, cnt2));
1255       add(cnt1, cnt1, 4);
1256       zip1(vtmp, T8B, vtmp, vtmpZ);
1257       fmovd(tmp1, vtmp);
1258       adds(cnt2, cnt2, 8);
1259     } else { // UL
1260       ldrs(vtmp, Address(str2, cnt2));
1261       ldr(tmp1, Address(str1, cnt1));
1262       zip1(vtmp, T8B, vtmp, vtmpZ);
1263       add(cnt1, cnt1, 8);
1264       fmovd(tmp2, vtmp);
1265       adds(cnt2, cnt2, 4);
1266     }
         // The flags tested here come from the adds on cnt2 in each branch above.
1267     br(GE, TAIL);
1268
1269     eor(rscratch2, tmp1, tmp2);
1270     cbz(rscratch2, NEXT_WORD);
1271     b(DIFF);
1272     bind(TAIL);
         // Check the words loaded last before fetching the final ones.
1273     eor(rscratch2, tmp1, tmp2);
1274     cbnz(rscratch2, DIFF);
1275     // Last longword.  In the case where length == 4 we compare the
1276     // same longword twice, but that's still faster than another
1277     // conditional branch.
1278     if (str1_isL == str2_isL) {
1279       ldr(tmp1, Address(str1));
1280       ldr(tmp2, Address(str2));
1281     } else if (isLU) {
1282       ldrs(vtmp, Address(str1));
1283       ldr(tmp2, Address(str2));
1284       zip1(vtmp, T8B, vtmp, vtmpZ);
1285       fmovd(tmp1, vtmp);
1286     } else { // UL
1287       ldrs(vtmp, Address(str2));
1288       ldr(tmp1, Address(str1));
1289       zip1(vtmp, T8B, vtmp, vtmpZ);
1290       fmovd(tmp2, vtmp);
1291     }
1292     bind(TAIL_CHECK);
         // Equal final words: result still holds the length difference.
1293     eor(rscratch2, tmp1, tmp2);
1294     cbz(rscratch2, DONE);
1295
1296     // Find the first different characters in the longwords and
1297     // compute their difference.
         // On entry rscratch2 = tmp1 ^ tmp2. rev + clz gives the bit distance to
         // the first differing byte in memory order; masking with -8 / -16 rounds
         // it down to a character boundary. Shifting both words right by that
         // amount and zero-extending isolates the two differing characters.
1298     bind(DIFF);
1299     rev(rscratch2, rscratch2);
1300     clz(rscratch2, rscratch2);
1301     andr(rscratch2, rscratch2, isLL ? -8 : -16);
1302     lsrv(tmp1, tmp1, rscratch2);
1303     (this->*ext_chr)(tmp1, tmp1);
1304     lsrv(tmp2, tmp2, rscratch2);
1305     (this->*ext_chr)(tmp2, tmp2);
1306     subw(result, tmp1, tmp2);
1307     b(DONE);
1308   }
1309
       // Long strings: call the out-of-line stub that matches the encoding pair.
1310   bind(STUB);
1311     RuntimeAddress stub = nullptr;
1312     switch(ae) {
1313       case StrIntrinsicNode::LL:
1314         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
1315         break;
1316       case StrIntrinsicNode::UU:
1317         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
1318         break;
1319       case StrIntrinsicNode::LU:
1320         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
1321         break;
1322       case StrIntrinsicNode::UL:
1323         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
1324         break;
1325       default:
1326         ShouldNotReachHere();
1327      }
1328     assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
1329     address call = trampoline_call(stub);
1330     if (call == nullptr) {
           // The trampoline could not be emitted: record the failure and stop
           // generating code. The labels listed are referenced above but not yet
           // bound. NOTE(review): presumably reset_labels keeps their destructors
           // from asserting in debug builds -- confirm.
1331       DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
1332       ciEnv::current()->record_failure("CodeCache is full");
1333       return;
1334     }
1335     b(DONE);
1336
1337   bind(SHORT_STRING);
1338   // Is the minimum length zero?
1339   cbz(cnt2, DONE);
1340   // arrange code to do most branches while loading and loading next characters
1341   // while comparing previous
       // The loop is unrolled twice and alternates between two register pairs:
       // (tmp1, cnt1) and (tmp2, rscratch1). cnt1 is free to serve as a character
       // temp here because the length difference is already held in result.
1342   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1343   subs(cnt2, cnt2, 1);
1344   br(EQ, SHORT_LAST_INIT);
1345   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1346   b(SHORT_LOOP_START);
1347   bind(SHORT_LOOP);
1348   subs(cnt2, cnt2, 1);
1349   br(EQ, SHORT_LAST);
1350   bind(SHORT_LOOP_START);
1351   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
1352   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
1353   cmp(tmp1, cnt1);
1354   br(NE, SHORT_LOOP_TAIL);
1355   subs(cnt2, cnt2, 1);
1356   br(EQ, SHORT_LAST2);
1357   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1358   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1359   cmp(tmp2, rscratch1);
1360   br(EQ, SHORT_LOOP);
1361   sub(result, tmp2, rscratch1);
1362   b(DONE);
1363   bind(SHORT_LOOP_TAIL);
1364   sub(result, tmp1, cnt1);
1365   b(DONE);
1366   bind(SHORT_LAST2);
1367   cmp(tmp2, rscratch1);
1368   br(EQ, DONE);
1369   sub(result, tmp2, rscratch1);
1370
1371   b(DONE);
1372   bind(SHORT_LAST_INIT);
1373   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1374   bind(SHORT_LAST);
1375   cmp(tmp1, cnt1);
1376   br(EQ, DONE);
1377   sub(result, tmp1, cnt1);
1378
1379   bind(DONE);
1380
1381   BLOCK_COMMENT("} string_compare");
1382 }
1383 
1384 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
1385                                      FloatRegister src2, Condition cond, bool isQ) {
       // Emits a NEON element-wise compare of src1 against src2 under cond, writing
       // the outcome to dst. The arrangement is derived from the element size of
       // bt and from isQ. Floating-point element types are compared with fcm, all
       // others with cm.
       //
       // Only a subset of conditions is passed to fcm/cm directly; the rest are
       // rewritten first:
       //   LT/LE/LO/LS -> GT/GE/HI/HS with the two sources swapped
       //   NE          -> EQ, followed by a bitwise NOT of dst
1386   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1387   FloatRegister zn = src1, zm = src2;
1388   bool needs_negation = false;
1389   switch (cond) {
1390     case LT: cond = GT; zn = src2; zm = src1; break;
1391     case LE: cond = GE; zn = src2; zm = src1; break;
1392     case LO: cond = HI; zn = src2; zm = src1; break;
1393     case LS: cond = HS; zn = src2; zm = src1; break;
1394     case NE: cond = EQ; needs_negation = true; break;
1395     default:
1396       break;
1397   }
1398
1399   if (is_floating_point_type(bt)) {
1400     fcm(cond, dst, size, zn, zm);
1401   } else {
1402     cm(cond, dst, size, zn, zm);
1403   }
1404
1405   if (needs_negation) {
         // The NOT is bitwise, so a byte arrangement of the right total width is
         // used regardless of the element size.
1406     notr(dst, isQ ? T16B : T8B, dst);
1407   }
1408 }
1409 
1410 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1411                                           Condition cond, bool isQ) {
       // Emits a NEON element-wise compare of src under cond using the
       // single-source forms of fcm (T_FLOAT/T_DOUBLE) or cm (everything else),
       // writing the outcome to dst. Per the function name the comparison is
       // against zero.
       //
       // NE is emitted as EQ followed by a bitwise NOT of dst; every other
       // condition is passed through unchanged.
1412   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1413   if (bt == T_FLOAT || bt == T_DOUBLE) {
1414     if (cond == Assembler::NE) {
1415       fcm(Assembler::EQ, dst, size, src);
1416       notr(dst, isQ ? T16B : T8B, dst);
1417     } else {
1418       fcm(cond, dst, size, src);
1419     }
1420   } else {
1421     if (cond == Assembler::NE) {
1422       cm(Assembler::EQ, dst, size, src);
1423       notr(dst, isQ ? T16B : T8B, dst);
1424     } else {
1425       cm(cond, dst, size, src);
1426     }
1427   }
1428 }
1429 
1430 // Compress the least significant bit of each byte to the rightmost and clear
1431 // the higher garbage bits.
     //
     // dst is both input and output; no other register is written. Each shifted
     // orr doubles the number of mask bits packed together (1 -> 2 -> 4 -> 8), and
     // the final and keeps only the low byte.
     // NOTE(review): the example assumes every input byte is 0x00 or 0x01 --
     // confirm that callers guarantee this.
1432 void C2_MacroAssembler::bytemask_compress(Register dst) {
1433   // Example input, dst = 0x01 00 00 00 01 01 00 01
1434   // The "??" bytes are garbage.
1435   orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
1436   orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
1437   orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
1438   andr(dst, dst, 0xff);                   // dst = 0x8D
1439 }
1440 
1441 // Pack the value of each mask element in "src" into a long value in "dst", at most
1442 // the first 64 lane elements. The input "src" is a vector of boolean represented as
1443 // bytes with 0x00/0x01 as element values. Each lane value from "src" is packed into
1444 // one bit in "dst".
1445 //
1446 // Example:   src = 0x0001010000010001 0100000001010001, lane_cnt = 16
1447 // Expected:  dst = 0x658D
1448 //
     // "vtmp" is a scratch vector register passed on to sve_extract_integral; it
     // must differ from "src". "lane_cnt" must be a power of two no larger than 64.
     //
1449 // Clobbers: rscratch1
1450 void C2_MacroAssembler::sve_vmask_tolong(Register dst, FloatRegister src,
1451                                          FloatRegister vtmp, int lane_cnt) {
1452   assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
       // rscratch1 holds each partial result while dst accumulates, so the two
       // must not alias.
1453   assert_different_registers(dst, rscratch1);
1454   assert_different_registers(src, vtmp);
1455   assert(UseSVE > 0, "must be");
1456
1457   // Compress the lowest 8 bytes.
1458   fmovd(dst, src);
1459   bytemask_compress(dst);
1460   if (lane_cnt <= 8) return;
1461
1462   // Repeat on higher bytes and join the results.
1463   // Compress 8 bytes in each iteration.
1464   for (int idx = 1; idx < (lane_cnt / 8); idx++) {
1465     sve_extract_integral(rscratch1, T_LONG, src, idx, vtmp);
1466     bytemask_compress(rscratch1);
         // The 8 mask bits of chunk idx land at bit position 8 * idx of dst.
1467     orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
1468   }
1469 }
1470 
1471 // The function is same as above "sve_vmask_tolong", but it uses SVE2's BEXT
1472 // instruction which requires the FEAT_BITPERM feature.
     //
     // "vtmp1" and "vtmp2" are scratch vector registers; both are overwritten and
     // both must differ from "src". "lane_cnt" must be a power of two no larger
     // than 64. No general purpose scratch register is written by the code here.
1473 void C2_MacroAssembler::sve2_vmask_tolong(Register dst, FloatRegister src,
1474                                           FloatRegister vtmp1, FloatRegister vtmp2,
1475                                           int lane_cnt) {
1476   assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1477   assert_different_registers(src, vtmp1, vtmp2);
1478   assert(UseSVE > 1 && VM_Version::supports_svebitperm(), "must be");
1479
1480   // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
1481   // is to compress each significant bit of the byte in a cross-lane way. Due
1482   // to the lack of a cross-lane bit-compress instruction, we use BEXT
1483   // (bit-compress in each lane) with the biggest lane size (T = D) then
1484   // concatenate the results.
1485
1486   // The second source input of BEXT, initialized with 0x01 in each byte.
1487   // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1488   sve_dup(vtmp2, B, 1);
1489
1490   // BEXT vtmp1.D, src.D, vtmp2.D
1491   // src   = 0x0001010000010001 | 0x0100000001010001
1492   // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1493   //         ---------------------------------------
1494   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1495   sve_bext(vtmp1, D, src, vtmp2);
1496
1497   // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the
1498   // result to dst.
1499   // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1500   // dst   = 0x658D
1501   if (lane_cnt <= 8) {
1502     // No need to concatenate.
1503     umov(dst, vtmp1, B, 0);
1504   } else if (lane_cnt <= 16) {
         // Copy byte 8 (the compressed upper 8 lanes) into byte 1, next to the
         // compressed lower 8 lanes in byte 0, then read both as one halfword.
1505     ins(vtmp1, B, vtmp1, 1, 8);
1506     umov(dst, vtmp1, H, 0);
1507   } else {
1508     // As the lane count is 64 at most, the final expected value must be in
1509     // the lowest 64 bits after narrowing vtmp1 from D to B.
1510     sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
1511     umov(dst, vtmp1, D, 0);
1512   }
1513 }
1514 
1515 // Unpack the mask, a long value in "src", into a vector register of boolean
1516 // represented as bytes with 0x00/0x01 as element values in "dst".  Each bit in
1517 // "src" is unpacked into one byte lane in "dst". Note that "dst" can support at
1518 // most 64 lanes.
1519 //
1520 // Below example gives the expected dst vector register, with a valid src(0x658D)
1521 // on a 128-bit vector size machine.
1522 // dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
     //
     // "vtmp" is a scratch vector register; it is overwritten and must differ from
     // "dst". "lane_cnt" must be a power of two no larger than 64.
1523 void C2_MacroAssembler::sve_vmask_fromlong(FloatRegister dst, Register src,
1524                                            FloatRegister vtmp, int lane_cnt) {
1525   assert_different_registers(dst, vtmp);
1526   assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
1527          lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
1528
1529   // Example:   src = 0x658D, lane_cnt = 16
1530   // Expected:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1531
1532   // Put long value from general purpose register into the first lane of vector.
1533   // vtmp = 0x0000000000000000 | 0x000000000000658D
1534   sve_dup(vtmp, B, 0);
1535   mov(vtmp, D, 0, src);
1536
1537   // Transform the value in the first lane which is mask in bit now to the mask in
1538   // byte, which can be done by SVE2's BDEP instruction.
1539
1540   // The first source input of BDEP instruction. Deposit each byte in every 8 bytes.
1541   // vtmp = 0x0000000000000065 | 0x000000000000008D
1542   if (lane_cnt <= 8) {
1543     // Nothing. As only one byte exists.
1544   } else if (lane_cnt <= 16) {
         // Copy byte 1 (mask bits 8..15) into byte 8, the low byte of the second
         // 8-byte lane.
1545     ins(vtmp, B, vtmp, 8, 1);
1546   } else {
1547     sve_vector_extend(vtmp, D, vtmp, B);
1548   }
1549
1550   // The second source input of BDEP instruction, initialized with 0x01 for each byte.
1551   // dst = 0x01010101 0x01010101 0x01010101 0x01010101
1552   sve_dup(dst, B, 1);
1553
1554   // BDEP dst.D, vtmp.D, dst.D
1555   // vtmp = 0x0000000000000065 | 0x000000000000008D
1556   // dst  = 0x0101010101010101 | 0x0101010101010101
1557   //        ---------------------------------------
1558   // dst  = 0x0001010000010001 | 0x0100000001010001
1559   sve_bdep(dst, D, vtmp, dst);
1560 }
1561 
1562 // Clobbers: rflags
     //
     // Emits an SVE compare of "zn" against "zm" under "cond", governed by the
     // predicate "pg", and writes the per-lane outcome to the predicate "pd".
     // The element size comes from "bt"; floating-point element types use sve_fcm
     // and integral ones use sve_cmp.
     //
     // LE/LT/LO/LS are emitted as GE/GT/HI/HS with the two vector operands
     // swapped; every other condition is passed through unchanged.
1563 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1564                                     FloatRegister zn, FloatRegister zm, Condition cond) {
1565   assert(pg->is_governing(), "This register has to be a governing predicate register");
1566   FloatRegister z1 = zn, z2 = zm;
1567   switch (cond) {
1568     case LE: z1 = zm; z2 = zn; cond = GE; break;
1569     case LT: z1 = zm; z2 = zn; cond = GT; break;
1570     case LO: z1 = zm; z2 = zn; cond = HI; break;
1571     case LS: z1 = zm; z2 = zn; cond = HS; break;
1572     default:
1573       break;
1574   }
1575
1576   SIMD_RegVariant size = elemType_to_regVariant(bt);
1577   if (is_floating_point_type(bt)) {
1578     sve_fcm(cond, pd, size, pg, z1, z2);
1579   } else {
1580     assert(is_integral_type(bt), "unsupported element type");
1581     sve_cmp(cond, pd, size, pg, z1, z2);
1582   }
1583 }
1584 
1585 // Get index of the last mask lane that is set
     //
     // "src" is the mask predicate, "ptmp" a scratch predicate that is overwritten,
     // and "dst" receives the lane index as a 32-bit value. The lane count is
     // taken from MaxVectorSize and the element size of "bt".
     //
     // Clobbers: rscratch1
     // NOTE(review): "dst" must not be rscratch1, because rscratch1 is overwritten
     // after "dst" has been computed and before it is read again. This is not
     // asserted here.
1586 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
1587   SIMD_RegVariant size = elemType_to_regVariant(bt);
       // Reverse the lane order so that the last set lane becomes the first one,
       // keep only the lanes in front of it, and count them.
1588   sve_rev(ptmp, size, src);
1589   sve_brkb(ptmp, ptrue, ptmp, false);
1590   sve_cntp(dst, size, ptrue, ptmp);
       // dst = (lane count - 1) - (number of lanes counted above).
1591   movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
1592   subw(dst, rscratch1, dst);
1593 }
1594 
1595 // Extend integer vector src to dst with the same lane count
1596 // but larger element size, e.g. 4B -> 4I
     //
     // Each _xshll call doubles the element size once (shift amount 0), so a
     // four-fold widening (B -> I, S -> L) takes two calls, the second one working
     // on dst in place. "is_unsigned" is forwarded to _xshll unchanged
     // (presumably selecting zero- versus sign-extension -- confirm against
     // _xshll). "dst_vlen_in_bytes" is only used by the asserts.
     // Any src_bt other than T_BYTE, T_SHORT or T_INT hits ShouldNotReachHere().
1597 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1598                                            FloatRegister src, BasicType src_bt, bool is_unsigned) {
1599   if (src_bt == T_BYTE) {
1600     // 4B to 4S/4I, 8B to 8S
1601     assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1602     assert(dst_bt == T_SHORT || dst_bt == T_INT, "unsupported");
1603     _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1604     if (dst_bt == T_INT) {
1605       _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1606     }
1607   } else if (src_bt == T_SHORT) {
1608     // 2S to 2I/2L, 4S to 4I
1609     assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1610     assert(dst_bt == T_INT || dst_bt == T_LONG, "unsupported");
1611     _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1612     if (dst_bt == T_LONG) {
1613       _xshll(is_unsigned, dst, T2D, dst, T2S, 0);
1614     }
1615   } else if (src_bt == T_INT) {
1616     // 2I to 2L
1617     assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1618     _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1619   } else {
1620     ShouldNotReachHere();
1621   }
1622 }
1623 
1624 // Narrow integer vector src down to dst with the same lane count
1625 // but smaller element size, e.g. 4I -> 4B
1626 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1627                                            FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1628   if (src_bt == T_SHORT) {
1629     // 4S/8S to 4B/8B
1630     assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1631     assert(dst_bt == T_BYTE, "unsupported");
1632     xtn(dst, T8B, src, T8H);
1633   } else if (src_bt == T_INT) {
1634     // 2I to 2S, 4I to 4B/4S
1635     assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1636     assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1637     xtn(dst, T4H, src, T4S);
1638     if (dst_bt == T_BYTE) {
1639       xtn(dst, T8B, dst, T8H);
1640     }
1641   } else if (src_bt == T_LONG) {
1642     // 2L to 2S/2I
1643     assert(src_vlen_in_bytes == 16, "unsupported");
1644     assert(dst_bt == T_INT || dst_bt == T_SHORT, "unsupported");
1645     xtn(dst, T2S, src, T2D);
1646     if (dst_bt == T_SHORT) {
1647       xtn(dst, T4H, dst, T4S);
1648     }
1649   } else {
1650     ShouldNotReachHere();
1651   }
1652 }
1653 
1654 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1655                                           FloatRegister src, SIMD_RegVariant src_size,
1656                                           bool is_unsigned) {
1657   assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
1658 
1659   if (src_size == B) {
1660     switch (dst_size) {
1661     case H:
1662       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1663       break;
1664     case S:
1665       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1666       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1667       break;
1668     case D:
1669       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1670       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1671       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1672       break;
1673     default:
1674       ShouldNotReachHere();
1675     }
1676   } else if (src_size == H) {
1677     if (dst_size == S) {
1678       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1679     } else { // D
1680       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1681       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1682     }
1683   } else if (src_size == S) {
1684     _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
1685   }
1686 }
1687 
1688 // Vector narrow from src to dst with specified element sizes.
1689 // High part of dst vector will be filled with zero.
1690 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1691                                           FloatRegister src, SIMD_RegVariant src_size,
1692                                           FloatRegister tmp) {
1693   assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1694   assert_different_registers(src, tmp);
1695   sve_dup(tmp, src_size, 0);
1696   if (src_size == D) {
1697     switch (dst_size) {
1698     case S:
1699       sve_uzp1(dst, S, src, tmp);
1700       break;
1701     case H:
1702       assert_different_registers(dst, tmp);
1703       sve_uzp1(dst, S, src, tmp);
1704       sve_uzp1(dst, H, dst, tmp);
1705       break;
1706     case B:
1707       assert_different_registers(dst, tmp);
1708       sve_uzp1(dst, S, src, tmp);
1709       sve_uzp1(dst, H, dst, tmp);
1710       sve_uzp1(dst, B, dst, tmp);
1711       break;
1712     default:
1713       ShouldNotReachHere();
1714     }
1715   } else if (src_size == S) {
1716     if (dst_size == H) {
1717       sve_uzp1(dst, H, src, tmp);
1718     } else { // B
1719       assert_different_registers(dst, tmp);
1720       sve_uzp1(dst, H, src, tmp);
1721       sve_uzp1(dst, B, dst, tmp);
1722     }
1723   } else if (src_size == H) {
1724     sve_uzp1(dst, B, src, tmp);
1725   }
1726 }
1727 
1728 // Extend src predicate to dst predicate with the same lane count but larger
1729 // element size, e.g. 64Byte -> 512Long
1730 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1731                                              uint dst_element_length_in_bytes,
1732                                              uint src_element_length_in_bytes) {
1733   if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1734     sve_punpklo(dst, src);
1735   } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1736     sve_punpklo(dst, src);
1737     sve_punpklo(dst, dst);
1738   } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1739     sve_punpklo(dst, src);
1740     sve_punpklo(dst, dst);
1741     sve_punpklo(dst, dst);
1742   } else {
1743     assert(false, "unsupported");
1744     ShouldNotReachHere();
1745   }
1746 }
1747 
1748 // Narrow src predicate to dst predicate with the same lane count but
1749 // smaller element size, e.g. 512Long -> 64Byte
1750 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1751                                              uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
1752   // The insignificant bits in src predicate are expected to be zero.
1753   // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
1754   // passed as the second argument. An example narrowing operation with a given mask would be -
1755   // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I
1756   // Mask (for 2 Longs) : TF
1757   // Predicate register for the above mask (16 bits) : 00000001 00000000
1758   // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
1759   // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0)
1760   assert_different_registers(src, ptmp);
1761   assert_different_registers(dst, ptmp);
1762   sve_pfalse(ptmp);
1763   if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1764     sve_uzp1(dst, B, src, ptmp);
1765   } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1766     sve_uzp1(dst, H, src, ptmp);
1767     sve_uzp1(dst, B, dst, ptmp);
1768   } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1769     sve_uzp1(dst, S, src, ptmp);
1770     sve_uzp1(dst, H, dst, ptmp);
1771     sve_uzp1(dst, B, dst, ptmp);
1772   } else {
1773     assert(false, "unsupported");
1774     ShouldNotReachHere();
1775   }
1776 }
1777 
1778 // Vector reduction add for integral type with ASIMD instructions.
1779 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1780                                                  Register isrc, FloatRegister vsrc,
1781                                                  unsigned vector_length_in_bytes,
1782                                                  FloatRegister vtmp) {
1783   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1784   assert_different_registers(dst, isrc);
1785   bool isQ = vector_length_in_bytes == 16;
1786 
1787   BLOCK_COMMENT("neon_reduce_add_integral {");
1788     switch(bt) {
1789       case T_BYTE:
1790         addv(vtmp, isQ ? T16B : T8B, vsrc);
1791         smov(dst, vtmp, B, 0);
1792         addw(dst, dst, isrc, ext::sxtb);
1793         break;
1794       case T_SHORT:
1795         addv(vtmp, isQ ? T8H : T4H, vsrc);
1796         smov(dst, vtmp, H, 0);
1797         addw(dst, dst, isrc, ext::sxth);
1798         break;
1799       case T_INT:
1800         isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1801         umov(dst, vtmp, S, 0);
1802         addw(dst, dst, isrc);
1803         break;
1804       case T_LONG:
1805         assert(isQ, "unsupported");
1806         addpd(vtmp, vsrc);
1807         umov(dst, vtmp, D, 0);
1808         add(dst, dst, isrc);
1809         break;
1810       default:
1811         assert(false, "unsupported");
1812         ShouldNotReachHere();
1813     }
1814   BLOCK_COMMENT("} neon_reduce_add_integral");
1815 }
1816 
// Vector reduction multiply for integral type with ASIMD instructions.
// Computes dst = isrc * (product of all lanes of vsrc) for 8-byte or 16-byte
// vectors. The vector is repeatedly folded in half (upper half multiplied
// into lower half) until two lanes remain; those two lanes are then moved to
// a general register one at a time and multiplied into dst.
// Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
// Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
                                                 Register isrc, FloatRegister vsrc,
                                                 unsigned vector_length_in_bytes,
                                                 FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_mul_integral {");
    switch(bt) {
      case T_BYTE:
        if (isQ) {
          // Multiply the lower half and higher half of vector iteratively.
          // vtmp1 = vsrc[8:15]
          ins(vtmp1, D, vsrc, 0, 1);
          // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
          mulv(vtmp1, T8B, vtmp1, vsrc);
          // vtmp2 = vtmp1[4:7]
          ins(vtmp2, S, vtmp1, 0, 1);
          // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
          mulv(vtmp1, T8B, vtmp2, vtmp1);
        } else {
          // vtmp1 = vsrc[4:7]
          ins(vtmp1, S, vsrc, 0, 1);
          // vtmp1[n] = vsrc[n] * vsrc[n + 4], where n=[0, 3]
          mulv(vtmp1, T8B, vtmp1, vsrc);
        }
        // vtmp2 = vtmp1[2:3]
        ins(vtmp2, H, vtmp1, 0, 1);
        // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
        mulv(vtmp2, T8B, vtmp2, vtmp1);
        // dst = vtmp2[0] * isrc * vtmp2[1]
        // The intermediate product is sign-extended from a byte after each multiply.
        umov(rscratch1, vtmp2, B, 0);
        mulw(dst, rscratch1, isrc);
        sxtb(dst, dst);
        umov(rscratch1, vtmp2, B, 1);
        mulw(dst, rscratch1, dst);
        sxtb(dst, dst);
        break;
      case T_SHORT:
        if (isQ) {
          // vtmp2 = upper 64 bits of vsrc (short lanes 4..7)
          ins(vtmp2, D, vsrc, 0, 1);
          // vtmp2[n] = vsrc[n] * vsrc[n + 4], where n=[0, 3]
          mulv(vtmp2, T4H, vtmp2, vsrc);
          // vtmp1 = short lanes 2..3 of vtmp2
          ins(vtmp1, S, vtmp2, 0, 1);
          // vtmp1[n] = vtmp2[n] * vtmp2[n + 2], where n=[0, 1]
          mulv(vtmp1, T4H, vtmp1, vtmp2);
        } else {
          // vtmp1 = short lanes 2..3 of vsrc
          ins(vtmp1, S, vsrc, 0, 1);
          // vtmp1[n] = vsrc[n] * vsrc[n + 2], where n=[0, 1]
          mulv(vtmp1, T4H, vtmp1, vsrc);
        }
        // dst = vtmp1[0] * isrc * vtmp1[1]
        // The intermediate product is sign-extended from a short after each multiply.
        umov(rscratch1, vtmp1, H, 0);
        mulw(dst, rscratch1, isrc);
        sxth(dst, dst);
        umov(rscratch1, vtmp1, H, 1);
        mulw(dst, rscratch1, dst);
        sxth(dst, dst);
        break;
      case T_INT:
        if (isQ) {
          // vtmp1 = upper 64 bits of vsrc (int lanes 2..3)
          ins(vtmp1, D, vsrc, 0, 1);
          // vtmp1[n] = vsrc[n] * vsrc[n + 2], where n=[0, 1]
          mulv(vtmp1, T2S, vtmp1, vsrc);
        } else {
          // Only two lanes: no folding needed, read the lanes directly from
          // vsrc. This only re-binds the local name; no instruction is emitted.
          vtmp1 = vsrc;
        }
        // dst = vtmp1[0] * isrc * vtmp1[1]
        umov(rscratch1, vtmp1, S, 0);
        mul(dst, rscratch1, isrc);
        umov(rscratch1, vtmp1, S, 1);
        mul(dst, rscratch1, dst);
        break;
      case T_LONG:
        // dst = isrc * vsrc[0] * vsrc[1]; no vector temporaries are used.
        umov(rscratch1, vsrc, D, 0);
        mul(dst, isrc, rscratch1);
        umov(rscratch1, vsrc, D, 1);
        mul(dst, dst, rscratch1);
        break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  BLOCK_COMMENT("} neon_reduce_mul_integral");
}
1897 
1898 // Vector reduction multiply for floating-point type with ASIMD instructions.
1899 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
1900                                            FloatRegister fsrc, FloatRegister vsrc,
1901                                            unsigned vector_length_in_bytes,
1902                                            FloatRegister vtmp) {
1903   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1904   bool isQ = vector_length_in_bytes == 16;
1905 
1906   BLOCK_COMMENT("neon_reduce_mul_fp {");
1907     switch(bt) {
1908       // The T_SHORT type below is for Float16 type which also uses floating-point
1909       // instructions.
1910       case T_SHORT:
1911         fmulh(dst, fsrc, vsrc);
1912         ext(vtmp, T8B, vsrc, vsrc, 2);
1913         fmulh(dst, dst, vtmp);
1914         ext(vtmp, T8B, vsrc, vsrc, 4);
1915         fmulh(dst, dst, vtmp);
1916         ext(vtmp, T8B, vsrc, vsrc, 6);
1917         fmulh(dst, dst, vtmp);
1918         if (isQ) {
1919           ext(vtmp, T16B, vsrc, vsrc, 8);
1920           fmulh(dst, dst, vtmp);
1921           ext(vtmp, T16B, vsrc, vsrc, 10);
1922           fmulh(dst, dst, vtmp);
1923           ext(vtmp, T16B, vsrc, vsrc, 12);
1924           fmulh(dst, dst, vtmp);
1925           ext(vtmp, T16B, vsrc, vsrc, 14);
1926           fmulh(dst, dst, vtmp);
1927         }
1928         break;
1929       case T_FLOAT:
1930         fmuls(dst, fsrc, vsrc);
1931         ins(vtmp, S, vsrc, 0, 1);
1932         fmuls(dst, dst, vtmp);
1933         if (isQ) {
1934           ins(vtmp, S, vsrc, 0, 2);
1935           fmuls(dst, dst, vtmp);
1936           ins(vtmp, S, vsrc, 0, 3);
1937           fmuls(dst, dst, vtmp);
1938          }
1939         break;
1940       case T_DOUBLE:
1941         assert(isQ, "unsupported");
1942         fmuld(dst, fsrc, vsrc);
1943         ins(vtmp, D, vsrc, 0, 1);
1944         fmuld(dst, dst, vtmp);
1945         break;
1946       default:
1947         assert(false, "unsupported");
1948         ShouldNotReachHere();
1949     }
1950   BLOCK_COMMENT("} neon_reduce_mul_fp");
1951 }
1952 
1953 // Vector reduction add for half float type with ASIMD instructions.
1954 void C2_MacroAssembler::neon_reduce_add_fp16(FloatRegister dst, FloatRegister fsrc, FloatRegister vsrc,
1955                                              unsigned vector_length_in_bytes, FloatRegister vtmp) {
1956   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1957   bool isQ = vector_length_in_bytes == 16;
1958 
1959   BLOCK_COMMENT("neon_reduce_add_fp16 {");
1960     faddh(dst, fsrc, vsrc);
1961     ext(vtmp, T8B, vsrc, vsrc, 2);
1962     faddh(dst, dst, vtmp);
1963     ext(vtmp, T8B, vsrc, vsrc, 4);
1964     faddh(dst, dst, vtmp);
1965     ext(vtmp, T8B, vsrc, vsrc, 6);
1966     faddh(dst, dst, vtmp);
1967     if (isQ) {
1968       ext(vtmp, T16B, vsrc, vsrc, 8);
1969       faddh(dst, dst, vtmp);
1970       ext(vtmp, T16B, vsrc, vsrc, 10);
1971       faddh(dst, dst, vtmp);
1972       ext(vtmp, T16B, vsrc, vsrc, 12);
1973       faddh(dst, dst, vtmp);
1974       ext(vtmp, T16B, vsrc, vsrc, 14);
1975       faddh(dst, dst, vtmp);
1976     }
1977   BLOCK_COMMENT("} neon_reduce_add_fp16");
1978 }
1979 
1980 // Helper to select logical instruction
1981 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
1982                                                    Register Rn, Register Rm,
1983                                                    enum shift_kind kind, unsigned shift) {
1984   switch(opc) {
1985     case Op_AndReductionV:
1986       is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
1987       break;
1988     case Op_OrReductionV:
1989       is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
1990       break;
1991     case Op_XorReductionV:
1992       is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
1993       break;
1994     default:
1995       assert(false, "unsupported");
1996       ShouldNotReachHere();
1997   }
1998 }
1999 
2000 // Vector reduction logical operations And, Or, Xor
2001 // Clobbers: rscratch1
2002 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
2003                                             Register isrc, FloatRegister vsrc,
2004                                             unsigned vector_length_in_bytes) {
2005   assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
2006          "unsupported");
2007   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2008   assert_different_registers(dst, isrc);
2009   bool isQ = vector_length_in_bytes == 16;
2010 
2011   BLOCK_COMMENT("neon_reduce_logical {");
2012     umov(rscratch1, vsrc, isQ ? D : S, 0);
2013     umov(dst, vsrc, isQ ? D : S, 1);
2014     neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
2015     switch(bt) {
2016       case T_BYTE:
2017         if (isQ) {
2018           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2019         }
2020         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
2021         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
2022         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2023         sxtb(dst, dst);
2024         break;
2025       case T_SHORT:
2026         if (isQ) {
2027           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2028         }
2029         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
2030         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2031         sxth(dst, dst);
2032         break;
2033       case T_INT:
2034         if (isQ) {
2035           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2036         }
2037         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2038         break;
2039       case T_LONG:
2040         assert(isQ, "unsupported");
2041         neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
2042         break;
2043       default:
2044         assert(false, "unsupported");
2045         ShouldNotReachHere();
2046     }
2047   BLOCK_COMMENT("} neon_reduce_logical");
2048 }
2049 
2050 // Helper function to decode min/max reduction operation properties
2051 void C2_MacroAssembler::decode_minmax_reduction_opc(int opc, bool* is_min,
2052                                                     bool* is_unsigned,
2053                                                     Condition* cond) {
2054   switch(opc) {
2055     case Op_MinReductionV:
2056       *is_min = true;  *is_unsigned = false; *cond = LT; break;
2057     case Op_MaxReductionV:
2058       *is_min = false; *is_unsigned = false; *cond = GT; break;
2059     case Op_UMinReductionV:
2060       *is_min = true;  *is_unsigned = true;  *cond = LO; break;
2061     case Op_UMaxReductionV:
2062       *is_min = false; *is_unsigned = true;  *cond = HI; break;
2063     default:
2064       ShouldNotReachHere();
2065   }
2066 }
2067 
2068 // Vector reduction min/max/umin/umax for integral type with ASIMD instructions.
2069 // Note: vtmp is not used and expected to be fnoreg for T_LONG case.
2070 // Clobbers: rscratch1, rflags
2071 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
2072                                                     Register isrc, FloatRegister vsrc,
2073                                                     unsigned vector_length_in_bytes,
2074                                                     FloatRegister vtmp) {
2075   assert(opc == Op_MinReductionV || opc == Op_MaxReductionV ||
2076          opc == Op_UMinReductionV || opc == Op_UMaxReductionV, "unsupported");
2077   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2078   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
2079   assert_different_registers(dst, isrc);
2080   bool isQ = vector_length_in_bytes == 16;
2081   bool is_min;
2082   bool is_unsigned;
2083   Condition cond;
2084   decode_minmax_reduction_opc(opc, &is_min, &is_unsigned, &cond);
2085   BLOCK_COMMENT("neon_reduce_minmax_integral {");
2086     if (bt == T_LONG) {
2087       assert(vtmp == fnoreg, "should be");
2088       assert(isQ, "should be");
2089       umov(rscratch1, vsrc, D, 0);
2090       cmp(isrc, rscratch1);
2091       csel(dst, isrc, rscratch1, cond);
2092       umov(rscratch1, vsrc, D, 1);
2093       cmp(dst, rscratch1);
2094       csel(dst, dst, rscratch1, cond);
2095     } else {
2096       SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
2097       if (size == T2S) {
2098         // For T2S (2x32-bit elements), use pairwise instructions because
2099         // uminv/umaxv/sminv/smaxv don't support arrangement 2S.
2100         neon_minmaxp(is_unsigned, is_min, vtmp, size, vsrc, vsrc);
2101       } else {
2102         // For other sizes, use reduction to scalar instructions.
2103         neon_minmaxv(is_unsigned, is_min, vtmp, size, vsrc);
2104       }
2105       if (bt == T_INT) {
2106         umov(dst, vtmp, S, 0);
2107       } else if (is_unsigned) {
2108         umov(dst, vtmp, elemType_to_regVariant(bt), 0);
2109       } else {
2110         smov(dst, vtmp, elemType_to_regVariant(bt), 0);
2111       }
2112       cmpw(dst, isrc);
2113       cselw(dst, dst, isrc, cond);
2114     }
2115   BLOCK_COMMENT("} neon_reduce_minmax_integral");
2116 }
2117 
2118 // Vector reduction for integral type with SVE instruction.
2119 // Supported operations are Add, And, Or, Xor, Max, Min, UMax, UMin.
2120 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
2121 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
2122                                             FloatRegister src2, PRegister pg, FloatRegister tmp) {
2123   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2124   assert(pg->is_governing(), "This register has to be a governing predicate register");
2125   assert_different_registers(src1, dst);
2126   // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
2127   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2128   switch (opc) {
2129     case Op_AddReductionVI: {
2130       sve_uaddv(tmp, size, pg, src2);
2131       if (bt == T_BYTE) {
2132         smov(dst, tmp, size, 0);
2133         addw(dst, src1, dst, ext::sxtb);
2134       } else if (bt == T_SHORT) {
2135         smov(dst, tmp, size, 0);
2136         addw(dst, src1, dst, ext::sxth);
2137       } else {
2138         umov(dst, tmp, size, 0);
2139         addw(dst, dst, src1);
2140       }
2141       break;
2142     }
2143     case Op_AddReductionVL: {
2144       sve_uaddv(tmp, size, pg, src2);
2145       umov(dst, tmp, size, 0);
2146       add(dst, dst, src1);
2147       break;
2148     }
2149     case Op_AndReductionV: {
2150       sve_andv(tmp, size, pg, src2);
2151       if (bt == T_INT || bt == T_LONG) {
2152         umov(dst, tmp, size, 0);
2153       } else {
2154         smov(dst, tmp, size, 0);
2155       }
2156       if (bt == T_LONG) {
2157         andr(dst, dst, src1);
2158       } else {
2159         andw(dst, dst, src1);
2160       }
2161       break;
2162     }
2163     case Op_OrReductionV: {
2164       sve_orv(tmp, size, pg, src2);
2165       if (bt == T_INT || bt == T_LONG) {
2166         umov(dst, tmp, size, 0);
2167       } else {
2168         smov(dst, tmp, size, 0);
2169       }
2170       if (bt == T_LONG) {
2171         orr(dst, dst, src1);
2172       } else {
2173         orrw(dst, dst, src1);
2174       }
2175       break;
2176     }
2177     case Op_XorReductionV: {
2178       sve_eorv(tmp, size, pg, src2);
2179       if (bt == T_INT || bt == T_LONG) {
2180         umov(dst, tmp, size, 0);
2181       } else {
2182         smov(dst, tmp, size, 0);
2183       }
2184       if (bt == T_LONG) {
2185         eor(dst, dst, src1);
2186       } else {
2187         eorw(dst, dst, src1);
2188       }
2189       break;
2190     }
2191     case Op_MaxReductionV:
2192     case Op_MinReductionV:
2193     case Op_UMaxReductionV:
2194     case Op_UMinReductionV: {
2195       bool is_min;
2196       bool is_unsigned;
2197       Condition cond;
2198       decode_minmax_reduction_opc(opc, &is_min, &is_unsigned, &cond);
2199       sve_minmaxv(is_unsigned, is_min, tmp, size, pg, src2);
2200       // Move result from vector to general register
2201       if (is_unsigned || bt == T_INT || bt == T_LONG) {
2202         umov(dst, tmp, size, 0);
2203       } else {
2204         smov(dst, tmp, size, 0);
2205       }
2206       if (bt == T_LONG) {
2207         cmp(dst, src1);
2208         csel(dst, dst, src1, cond);
2209       } else {
2210         cmpw(dst, src1);
2211         cselw(dst, dst, src1, cond);
2212       }
2213       break;
2214     }
2215     default:
2216       assert(false, "unsupported");
2217       ShouldNotReachHere();
2218   }
2219 
2220   if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
2221     if (bt == T_BYTE) {
2222       sxtb(dst, dst);
2223     } else if (bt == T_SHORT) {
2224       sxth(dst, dst);
2225     }
2226   }
2227 }
2228 
2229 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
2230 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
2231 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
2232 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
2233   uint32_t max_vector_length = Matcher::max_vector_size(bt);
2234   assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
2235 
2236   // Set all elements to false if the input "lane_cnt" is zero.
2237   if (lane_cnt == 0) {
2238     sve_pfalse(dst);
2239     return;
2240   }
2241 
2242   SIMD_RegVariant size = elemType_to_regVariant(bt);
2243   assert(size != Q, "invalid size");
2244 
2245   // Set all true if "lane_cnt" equals to the max lane count.
2246   if (lane_cnt == max_vector_length) {
2247     sve_ptrue(dst, size, /* ALL */ 0b11111);
2248     return;
2249   }
2250 
2251   // Fixed numbers for "ptrue".
2252   switch(lane_cnt) {
2253   case 1: /* VL1 */
2254   case 2: /* VL2 */
2255   case 3: /* VL3 */
2256   case 4: /* VL4 */
2257   case 5: /* VL5 */
2258   case 6: /* VL6 */
2259   case 7: /* VL7 */
2260   case 8: /* VL8 */
2261     sve_ptrue(dst, size, lane_cnt);
2262     return;
2263   case 16:
2264     sve_ptrue(dst, size, /* VL16 */ 0b01001);
2265     return;
2266   case 32:
2267     sve_ptrue(dst, size, /* VL32 */ 0b01010);
2268     return;
2269   case 64:
2270     sve_ptrue(dst, size, /* VL64 */ 0b01011);
2271     return;
2272   case 128:
2273     sve_ptrue(dst, size, /* VL128 */ 0b01100);
2274     return;
2275   case 256:
2276     sve_ptrue(dst, size, /* VL256 */ 0b01101);
2277     return;
2278   default:
2279     break;
2280   }
2281 
2282   // Special patterns for "ptrue".
2283   if (lane_cnt == round_down_power_of_2(max_vector_length)) {
2284     sve_ptrue(dst, size, /* POW2 */ 0b00000);
2285   } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
2286     sve_ptrue(dst, size, /* MUL4 */ 0b11101);
2287   } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
2288     sve_ptrue(dst, size, /* MUL3 */ 0b11110);
2289   } else {
2290     // Encode to "whileltw" for the remaining cases.
2291     mov(rscratch1, lane_cnt);
2292     sve_whileltw(dst, size, zr, rscratch1);
2293   }
2294 }
2295 
// Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
// Any remaining elements of dst will be filled with zero.
//
// The short elements are processed as two halves. Each half is widened to
// int elements, compacted, and narrowed back to short elements; the two
// compacted halves are then joined with a splice.
//
//   dst   - result vector
//   src   - vector of short elements to compress
//   mask  - predicate selecting the active short elements of src
//   vzr   - vector used as the zero operand of the narrowing uzp1
//           (sve_compress_byte passes a register it has set to all zeros)
//   vtmp  - temporary vector, written only when the highest half is processed
//   pgtmp - temporary governing predicate
//   vector_length_in_bytes - when no more than MaxVectorSize/2, only the
//           lowest half is processed
//
// Clobbers: rscratch1
// Preserves: mask, vzr
void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
                                           FloatRegister vzr, FloatRegister vtmp,
                                           PRegister pgtmp, unsigned vector_length_in_bytes) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  // When called by sve_compress_byte, src and vtmp may be the same register.
  // That is why dst/src/vzr and dst/vtmp/vzr are checked as two separate groups.
  assert_different_registers(dst, src, vzr);
  assert_different_registers(dst, vtmp, vzr);
  assert_different_registers(mask, pgtmp);
  // high <-- low
  // Example input:   src   = hh gg ff ee dd cc bb aa, one character is 8 bits.
  //                  mask  = 01 00 00 01 01 00 01 01, one character is 1 bit.
  // Expected result: dst   = 00 00 00 hh ee dd bb aa

  // Extend lowest half to type INT.
  // dst   =  00dd  00cc  00bb  00aa
  sve_uunpklo(dst, S, src);
  // pgtmp =  0001  0000  0001  0001
  sve_punpklo(pgtmp, mask);
  // Pack the active elements in size of type INT to the right,
  // and fill the remainings with zero.
  // dst   =  0000  00dd  00bb  00aa
  sve_compact(dst, S, dst, pgtmp);
  // Narrow the result back to type SHORT.
  // dst   = 00 00 00 00 00 dd bb aa
  sve_uzp1(dst, H, dst, vzr);

  // Return if the vector length is no more than MaxVectorSize/2, since the
  // highest half is invalid.
  if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
    return;
  }

  // Count the active elements of lowest half.
  // This must happen before pgtmp is overwritten with the highest-half mask.
  // rscratch1 = 3
  sve_cntp(rscratch1, S, ptrue, pgtmp);

  // Repeat to the highest half.
  // pgtmp =  0001  0000  0000  0001
  sve_punpkhi(pgtmp, mask);
  // vtmp  =  00hh  00gg  00ff  00ee
  sve_uunpkhi(vtmp, S, src);
  // vtmp  =  0000  0000  00hh  00ee
  sve_compact(vtmp, S, vtmp, pgtmp);
  // vtmp  = 00 00 00 00 00 00 hh ee
  sve_uzp1(vtmp, H, vtmp, vzr);

  // Build a predicate covering the first rscratch1 short elements, i.e. the
  // elements occupied by the compressed lowest half.
  // pgtmp = 00 00 00 00 00 01 01 01
  sve_whilelt(pgtmp, H, zr, rscratch1);
  // Compressed low:  dst  = 00 00 00 00 00 dd bb aa
  // Compressed high: vtmp = 00 00 00 00 00 00 hh ee
  // Combine the compressed low with the compressed high:
  //                  dst  = 00 00 00 hh ee dd bb aa
  sve_splice(dst, H, pgtmp, vtmp);
}
2354 
// Pack active byte elements of src, under the control of mask, into the
// lowest-numbered elements of dst. Remaining elements of dst are zero (see
// the example below).
//
// The byte elements are processed as two halves. Each half is widened to
// short elements, compressed with sve_compress_short, and narrowed back to
// byte elements; the two compressed halves are then joined with a splice.
//
//   dst          - result vector
//   src          - vector of byte elements to compress
//   mask         - predicate selecting the active byte elements of src
//   vtmp1, vtmp2 - temporary vectors
//   vtmp3        - temporary vector, set to all zeros and used as the zero operand
//   ptmp         - temporary predicate
//   pgtmp        - temporary governing predicate
//   vector_length_in_bytes - when no more than MaxVectorSize/2, only the
//           lowest half is processed
//
// Clobbers: rscratch1, rscratch2
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
                                          FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
                                          PRegister ptmp, PRegister pgtmp, unsigned vector_length_in_bytes) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3);
  assert_different_registers(mask, ptmp, pgtmp);
  // high <-- low
  // Example input:   src   = q p n m l k j i h g f e d c b a, one character is 8 bits.
  //                  mask  = 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 1, one character is 1 bit.
  // Expected result: dst   = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
  // vtmp3 serves as the all-zero vector for the whole sequence.
  FloatRegister vzr = vtmp3;
  sve_dup(vzr, B, 0);

  // Extend lowest half to type SHORT.
  // vtmp1 =  0h  0g  0f  0e  0d  0c  0b  0a
  sve_uunpklo(vtmp1, H, src);
  // ptmp  =  00  01  00  00  00  01  00  01
  sve_punpklo(ptmp, mask);
  // Pack the active elements in size of type SHORT to the right,
  // and fill the remainings with zero.
  // The widened half occupies twice the bytes, capped at MaxVectorSize.
  // dst   =  00  00  00  00  00  0g  0c  0a
  unsigned extended_size = vector_length_in_bytes << 1;
  sve_compress_short(dst, vtmp1, ptmp, vzr, vtmp2, pgtmp, extended_size > MaxVectorSize ? MaxVectorSize : extended_size);
  // Narrow the result back to type BYTE.
  // dst   = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
  sve_uzp1(dst, B, dst, vzr);

  // Return if the vector length is no more than MaxVectorSize/2, since the
  // highest half is invalid.
  if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
    return;
  }
  // Count the active elements of lowest half.
  // This must happen before ptmp is overwritten with the highest-half mask.
  // rscratch2 is used because sve_compress_short clobbers rscratch1.
  // rscratch2 = 3
  sve_cntp(rscratch2, H, ptrue, ptmp);

  // Repeat to the highest half.
  // ptmp  =  00  01  00  00  00  00  00  01
  sve_punpkhi(ptmp, mask);
  // vtmp2 =  0q  0p  0n  0m  0l  0k  0j  0i
  sve_uunpkhi(vtmp2, H, src);
  // vtmp2 is passed both as the source and as the temporary of
  // sve_compress_short; the remaining length is what exceeds MaxVectorSize
  // after widening.
  // vtmp1 =  00  00  00  00  00  00  0p  0i
  sve_compress_short(vtmp1, vtmp2, ptmp, vzr, vtmp2, pgtmp, extended_size - MaxVectorSize);
  // vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
  sve_uzp1(vtmp1, B, vtmp1, vzr);

  // Build a predicate covering the first rscratch2 byte elements, i.e. the
  // elements occupied by the compressed lowest half.
  // ptmp  = 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
  sve_whilelt(ptmp, B, zr, rscratch2);
  // Compressed low:  dst   = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
  // Compressed high: vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
  // Combine the compressed low with the compressed high:
  //                  dst   = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
  sve_splice(dst, B, ptmp, vtmp1);
}
2411 
2412 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2413   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2414   SIMD_Arrangement size = isQ ? T16B : T8B;
2415   if (bt == T_BYTE) {
2416     rbit(dst, size, src);
2417   } else {
2418     neon_reverse_bytes(dst, src, bt, isQ);
2419     rbit(dst, size, dst);
2420   }
2421 }
2422 
2423 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2424   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2425   SIMD_Arrangement size = isQ ? T16B : T8B;
2426   switch (bt) {
2427     case T_BYTE:
2428       if (dst != src) {
2429         orr(dst, size, src, src);
2430       }
2431       break;
2432     case T_SHORT:
2433       rev16(dst, size, src);
2434       break;
2435     case T_INT:
2436       rev32(dst, size, src);
2437       break;
2438     case T_LONG:
2439       rev64(dst, size, src);
2440       break;
2441     default:
2442       assert(false, "unsupported");
2443       ShouldNotReachHere();
2444   }
2445 }
2446 
2447 // VectorRearrange implementation for short/int/float/long/double types with NEON
2448 // instructions. For VectorRearrange short/int/float, we use NEON tbl instruction.
2449 // But since it supports bytes table only, we need to lookup 2/4 bytes as a group.
2450 // For VectorRearrange long/double, we compare the shuffle input with iota indices,
2451 // and use bsl to implement the operation.
2452 void C2_MacroAssembler::neon_rearrange_hsd(FloatRegister dst, FloatRegister src,
2453                                            FloatRegister shuffle, FloatRegister tmp,
2454                                            BasicType bt, bool isQ) {
2455   assert_different_registers(dst, src, shuffle, tmp);
2456   SIMD_Arrangement size1 = isQ ? T16B : T8B;
2457   SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2458 
2459   // Here is an example that rearranges a NEON vector with 4 ints:
2460   // Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1]
2461   //   1. We assume the shuffle input is Vi int[2, 3, 0, 1].
2462   //   2. Multiply Vi int[2, 3, 0, 1] with constant int vector
2463   //      [0x04040404, 0x04040404, 0x04040404, 0x04040404], and get
2464   //      tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404].
2465   //   3. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100],
2466   //      and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504]
2467   //   4. Use Vm as index register, and use V1 as table register.
2468   //      Then get V2 as the result by tbl NEON instructions.
2469   switch (bt) {
2470     case T_SHORT:
2471       mov(tmp, size1, 0x02);
2472       mulv(dst, size2, shuffle, tmp);
2473       mov(tmp, size2, 0x0100);
2474       addv(dst, size1, dst, tmp);
2475       tbl(dst, size1, src, 1, dst);
2476       break;
2477     case T_INT:
2478     case T_FLOAT:
2479       mov(tmp, size1, 0x04);
2480       mulv(dst, size2, shuffle, tmp);
2481       mov(tmp, size2, 0x03020100);
2482       addv(dst, size1, dst, tmp);
2483       tbl(dst, size1, src, 1, dst);
2484       break;
2485     case T_LONG:
2486     case T_DOUBLE:
2487       {
2488         int idx = vector_iota_entry_index(T_LONG);
2489         lea(rscratch1,
2490             ExternalAddress(StubRoutines::aarch64::vector_iota_indices(idx)));
2491         ldrq(tmp, rscratch1);
2492         // Check whether the input "shuffle" is the same with iota indices.
2493         // Return "src" if true, otherwise swap the two elements of "src".
2494         cm(EQ, dst, size2, shuffle, tmp);
2495         ext(tmp, size1, src, src, 8);
2496         bsl(dst, size1, src, tmp);
2497       }
2498       break;
2499     default:
2500       assert(false, "unsupported element type");
2501       ShouldNotReachHere();
2502   }
2503 }
2504 
2505 // Extract a scalar element from an sve vector at position 'idx'.
2506 // The input elements in src are expected to be of integral type.
2507 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2508                                              int idx, FloatRegister vtmp) {
2509   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2510   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2511   if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
2512     if (bt == T_INT || bt == T_LONG) {
2513       umov(dst, src, size, idx);
2514     } else {
2515       smov(dst, src, size, idx);
2516     }
2517   } else {
2518     sve_movprfx(vtmp, src);
2519     // Although vtmp and src hold the same value after movprfx, we must use src
2520     // (not vtmp) as the second source of ext. The movprfx destination register
2521     // must not appear in any source operand of the following instruction except
2522     // as the destructive operand.
2523     sve_ext(vtmp, src, idx << size);
2524     if (bt == T_INT || bt == T_LONG) {
2525       umov(dst, vtmp, size, 0);
2526     } else {
2527       smov(dst, vtmp, size, 0);
2528     }
2529   }
2530 }
2531 
2532 // java.lang.Math::round intrinsics
2533 
2534 // Clobbers: rscratch1, rflags
2535 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2536                                           FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
2537   assert_different_registers(tmp1, tmp2, tmp3, src, dst);
2538   switch (T) {
2539     case T2S:
2540     case T4S:
2541       fmovs(tmp1, T, 0.5f);
2542       mov(rscratch1, jint_cast(0x1.0p23f));
2543       break;
2544     case T2D:
2545       fmovd(tmp1, T, 0.5);
2546       mov(rscratch1, julong_cast(0x1.0p52));
2547       break;
2548     default:
2549       assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2550   }
2551   fadd(tmp1, T, tmp1, src);
2552   fcvtms(tmp1, T, tmp1);
2553   // tmp1 = floor(src + 0.5, ties to even)
2554 
2555   fcvtas(dst, T, src);
2556   // dst = round(src), ties to away
2557 
2558   fneg(tmp3, T, src);
2559   dup(tmp2, T, rscratch1);
2560   cm(HS, tmp3, T, tmp3, tmp2);
2561   // tmp3 is now a set of flags
2562 
2563   bif(dst, T16B, tmp1, tmp3);
2564   // result in dst
2565 }
2566 
2567 // Clobbers: rscratch1, rflags
2568 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2569                                          FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
2570   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2571   assert_different_registers(tmp1, tmp2, src, dst);
2572 
2573   switch (T) {
2574     case S:
2575       mov(rscratch1, jint_cast(0x1.0p23f));
2576       break;
2577     case D:
2578       mov(rscratch1, julong_cast(0x1.0p52));
2579       break;
2580     default:
2581       assert(T == S || T == D, "invalid register variant");
2582   }
2583 
2584   sve_frinta(dst, T, ptrue, src);
2585   // dst = round(src), ties to away
2586 
2587   Label none;
2588 
2589   sve_fneg(tmp1, T, ptrue, src);
2590   sve_dup(tmp2, T, rscratch1);
2591   sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
2592   br(EQ, none);
2593   {
2594     sve_cpy(tmp1, T, pgtmp, 0.5);
2595     sve_fadd(tmp1, T, pgtmp, src);
2596     sve_frintm(dst, T, pgtmp, tmp1);
2597     // dst = floor(src + 0.5, ties to even)
2598   }
2599   bind(none);
2600 
2601   sve_fcvtzs(dst, T, ptrue, dst, T);
2602   // result in dst
2603 }
2604 
2605 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2606                                            FloatRegister one, SIMD_Arrangement T) {
2607   assert_different_registers(dst, src, zero, one);
2608   assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2609 
2610   facgt(dst, T, src, zero);
2611   ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
2612   bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2613 }
2614 
2615 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2616                                           FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
2617     assert_different_registers(dst, src, zero, one, vtmp);
2618     assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2619 
2620     sve_orr(vtmp, src, src);
2621     sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pmtp=0 for +-0.0 and NaN. 0x1 otherwise
2622     switch (T) {
2623     case S:
2624       sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
2625       sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
2626                                         // on the sign of the float value
2627       break;
2628     case D:
2629       sve_and(vtmp, T, min_jlong);
2630       sve_orr(vtmp, T, jlong_cast(1.0));
2631       break;
2632     default:
2633       assert(false, "unsupported");
2634       ShouldNotReachHere();
2635     }
2636     sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
2637                                        // Result in dst
2638 }
2639 
2640 bool C2_MacroAssembler::in_scratch_emit_size() {
2641   if (ciEnv::current()->task() != nullptr) {
2642     PhaseOutput* phase_output = Compile::current()->output();
2643     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2644       return true;
2645     }
2646   }
2647   return MacroAssembler::in_scratch_emit_size();
2648 }
2649 
2650 static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
2651   fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
2652 }
2653 
2654 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register rval, Register rtmp) {
2655   assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
2656   if (t == TypeInt::INT) {
2657     return;
2658   }
2659 
2660   BLOCK_COMMENT("verify_int_in_range {");
2661   Label L_success, L_failure;
2662 
2663   jint lo = t->_lo;
2664   jint hi = t->_hi;
2665 
2666   if (lo != min_jint) {
2667     subsw(rtmp, rval, lo);
2668     br(Assembler::LT, L_failure);
2669   }
2670   if (hi != max_jint) {
2671     subsw(rtmp, rval, hi);
2672     br(Assembler::GT, L_failure);
2673   }
2674   b(L_success);
2675 
2676   bind(L_failure);
2677   movw(c_rarg0, idx);
2678   mov(c_rarg1, rval);
2679   movw(c_rarg2, lo);
2680   movw(c_rarg3, hi);
2681   reconstruct_frame_pointer(rtmp);
2682   rt_call(CAST_FROM_FN_PTR(address, abort_verify_int_in_range), rtmp);
2683   hlt(0);
2684 
2685   bind(L_success);
2686   BLOCK_COMMENT("} verify_int_in_range");
2687 }
2688 
2689 static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
2690   fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
2691 }
2692 
2693 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register rval, Register rtmp) {
2694   assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
2695   if (t == TypeLong::LONG) {
2696     return;
2697   }
2698 
2699   BLOCK_COMMENT("verify_long_in_range {");
2700   Label L_success, L_failure;
2701 
2702   jlong lo = t->_lo;
2703   jlong hi = t->_hi;
2704 
2705   if (lo != min_jlong) {
2706     subs(rtmp, rval, lo);
2707     br(Assembler::LT, L_failure);
2708   }
2709   if (hi != max_jlong) {
2710     subs(rtmp, rval, hi);
2711     br(Assembler::GT, L_failure);
2712   }
2713   b(L_success);
2714 
2715   bind(L_failure);
2716   movw(c_rarg0, idx);
2717   mov(c_rarg1, rval);
2718   mov(c_rarg2, lo);
2719   mov(c_rarg3, hi);
2720   reconstruct_frame_pointer(rtmp);
2721   rt_call(CAST_FROM_FN_PTR(address, abort_verify_long_in_range), rtmp);
2722   hlt(0);
2723 
2724   bind(L_success);
2725   BLOCK_COMMENT("} verify_long_in_range");
2726 }
2727 
2728 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
2729   const int framesize = Compile::current()->output()->frame_size_in_bytes();
2730   if (PreserveFramePointer) {
2731     // frame pointer is valid
2732 #ifdef ASSERT
2733     // Verify frame pointer value in rfp.
2734     add(rtmp, sp, framesize - 2 * wordSize);
2735     Label L_success;
2736     cmp(rfp, rtmp);
2737     br(Assembler::EQ, L_success);
2738     stop("frame pointer mismatch");
2739     bind(L_success);
2740 #endif // ASSERT
2741   } else {
2742     add(rfp, sp, framesize - 2 * wordSize);
2743   }
2744 }
2745 
2746 // Selects elements from two source vectors (src1, src2) based on index values in the index register
2747 // using Neon instructions and places it in the destination vector element corresponding to the
2748 // index vector element. Each index in the index register must be in the range - [0, 2 * NUM_ELEM),
2749 // where NUM_ELEM is the number of BasicType elements per vector.
2750 // If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
2751 // Otherwise, selects src2[idx – NUM_ELEM]
2752 void C2_MacroAssembler::select_from_two_vectors_neon(FloatRegister dst, FloatRegister src1,
2753                                                      FloatRegister src2, FloatRegister index,
2754                                                      FloatRegister tmp, unsigned vector_length_in_bytes) {
2755   assert_different_registers(dst, src1, src2, tmp);
2756   SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
2757 
2758   if (vector_length_in_bytes == 16) {
2759     assert(UseSVE <= 1, "sve must be <= 1");
2760     assert(src1->successor() == src2, "Source registers must be ordered");
2761     // If the vector length is 16B, then use the Neon "tbl" instruction with two vector table
2762     tbl(dst, size, src1, 2, index);
2763   } else { // vector length == 8
2764     assert(UseSVE == 0, "must be Neon only");
2765     // We need to fit both the source vectors (src1, src2) in a 128-bit register because the
2766     // Neon "tbl" instruction supports only looking up 16B vectors. We then use the Neon "tbl"
2767     // instruction with one vector lookup
2768     ins(tmp, D, src1, 0, 0);
2769     ins(tmp, D, src2, 1, 0);
2770     tbl(dst, size, tmp, 1, index);
2771   }
2772 }
2773 
2774 // Selects elements from two source vectors (src1, src2) based on index values in the index register
2775 // using SVE/SVE2 instructions and places it in the destination vector element corresponding to the
2776 // index vector element. Each index in the index register must be in the range - [0, 2 * NUM_ELEM),
2777 // where NUM_ELEM is the number of BasicType elements per vector.
2778 // If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
2779 // Otherwise, selects src2[idx – NUM_ELEM]
2780 void C2_MacroAssembler::select_from_two_vectors_sve(FloatRegister dst, FloatRegister src1,
2781                                                     FloatRegister src2, FloatRegister index,
2782                                                     FloatRegister tmp, SIMD_RegVariant T,
2783                                                     unsigned vector_length_in_bytes) {
2784   assert_different_registers(dst, src1, src2, index, tmp);
2785 
2786   if (vector_length_in_bytes == 8) {
2787     // We need to fit both the source vectors (src1, src2) in a single vector register because the
2788     // SVE "tbl" instruction is unpredicated and works on the entire vector which can lead to
2789     // incorrect results if each source vector is only partially filled. We then use the SVE "tbl"
2790     // instruction with one vector lookup
2791     assert(UseSVE >= 1, "sve must be >= 1");
2792     ins(tmp, D, src1, 0, 0);
2793     ins(tmp, D, src2, 1, 0);
2794     sve_tbl(dst, T, tmp, index);
2795   } else {  // UseSVE == 2 and vector_length_in_bytes > 8
2796     // If the vector length is > 8, then use the SVE2 "tbl" instruction with the two vector table.
2797     // The assertion - vector_length_in_bytes == MaxVectorSize ensures that this operation
2798     // is not executed on machines where vector_length_in_bytes < MaxVectorSize
2799     // with the only exception of 8B vector length.
2800     assert(UseSVE == 2 && vector_length_in_bytes == MaxVectorSize, "must be");
2801     assert(src1->successor() == src2, "Source registers must be ordered");
2802     sve_tbl(dst, T, src1, src2, index);
2803   }
2804 }
2805 
2806 void C2_MacroAssembler::select_from_two_vectors(FloatRegister dst, FloatRegister src1,
2807                                                 FloatRegister src2, FloatRegister index,
2808                                                 FloatRegister tmp, BasicType bt,
2809                                                 unsigned vector_length_in_bytes) {
2810 
2811   assert_different_registers(dst, src1, src2, index, tmp);
2812 
2813   // The cases that can reach this method are -
2814   // - UseSVE = 0/1, vector_length_in_bytes = 8 or 16, excluding double and long types
2815   // - UseSVE = 2, vector_length_in_bytes >= 8, for all types
2816   //
2817   // SVE/SVE2 tbl instructions are generated when UseSVE = 1 with vector_length_in_bytes = 8
2818   // and UseSVE = 2 with vector_length_in_bytes >= 8
2819   //
2820   // Neon instructions are generated when UseSVE = 0 with vector_length_in_bytes = 8 or 16 and
2821   // UseSVE = 1 with vector_length_in_bytes = 16
2822 
2823   if ((UseSVE == 1 && vector_length_in_bytes == 8) || UseSVE == 2) {
2824     SIMD_RegVariant T = elemType_to_regVariant(bt);
2825     select_from_two_vectors_sve(dst, src1, src2, index, tmp, T, vector_length_in_bytes);
2826     return;
2827   }
2828 
2829   // The only BasicTypes that can reach here are T_SHORT, T_BYTE, T_INT and T_FLOAT
2830   assert(bt != T_DOUBLE && bt != T_LONG, "unsupported basic type");
2831   assert(vector_length_in_bytes <= 16, "length_in_bytes must be <= 16");
2832 
2833   bool isQ = vector_length_in_bytes == 16;
2834 
2835   SIMD_Arrangement size1 = isQ ? T16B : T8B;
2836   SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2837 
2838   // Neon "tbl" instruction only supports byte tables, so we need to look at chunks of
2839   // 2B for selecting shorts or chunks of 4B for selecting ints/floats from the table.
2840   // The index values in "index" register are in the range of [0, 2 * NUM_ELEM) where NUM_ELEM
2841   // is the number of elements that can fit in a vector. For ex. for T_SHORT with 64-bit vector length,
2842   // the indices can range from [0, 8).
2843   // As an example with 64-bit vector length and T_SHORT type - let index = [2, 5, 1, 0]
2844   // Move a constant 0x02 in every byte of tmp - tmp = [0x0202, 0x0202, 0x0202, 0x0202]
2845   // Multiply index vector with tmp to yield - dst = [0x0404, 0x0a0a, 0x0202, 0x0000]
2846   // Move a constant 0x0100 in every 2B of tmp - tmp = [0x0100, 0x0100, 0x0100, 0x0100]
2847   // Add the multiplied result to the vector in tmp to obtain the byte level
2848   // offsets - dst = [0x0504, 0x0b0a, 0x0302, 0x0100]
2849   // Use these offsets in the "tbl" instruction to select chunks of 2B.
2850 
2851   if (bt == T_BYTE) {
2852     select_from_two_vectors_neon(dst, src1, src2, index, tmp, vector_length_in_bytes);
2853   } else {
2854     int elem_size = (bt == T_SHORT) ? 2 : 4;
2855     uint64_t tbl_offset = (bt == T_SHORT) ? 0x0100u : 0x03020100u;
2856 
2857     mov(tmp, size1, elem_size);
2858     mulv(dst, size2, index, tmp);
2859     mov(tmp, size2, tbl_offset);
2860     addv(dst, size1, dst, tmp); // "dst" now contains the processed index elements
2861                                 // to select a set of 2B/4B
2862     select_from_two_vectors_neon(dst, src1, src2, dst, tmp, vector_length_in_bytes);
2863   }
2864 }
2865 
// Vector expand implementation. Elements from the src vector are expanded into
// the dst vector under the control of the vector mask.
// Since there are no native instructions directly corresponding to expand before
// SVE2p2, the following implementations mainly leverage the TBL instruction to
// implement expand. To compute the index input for TBL, the prefix sum algorithm
// (https://en.wikipedia.org/wiki/Prefix_sum) is used. The same algorithm is used
// for NEON and SVE, but with different instructions where appropriate.
2873 
2874 // Vector expand implementation for NEON.
2875 //
2876 // An example of 128-bit Byte vector:
2877 //   Data direction: high <== low
2878 //   Input:
2879 //         src   = g  f  e  d  c  b  a  9  8  7  6  5  4  3  2  1
2880 //         mask  = 0  0 -1 -1  0  0 -1 -1  0  0 -1 -1  0  0 -1 -1
2881 //   Expected result:
2882 //         dst   = 0  0  8  7  0  0  6  5  0  0  4  3  0  0  2  1
2883 void C2_MacroAssembler::vector_expand_neon(FloatRegister dst, FloatRegister src, FloatRegister mask,
2884                                            FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
2885                                            int vector_length_in_bytes) {
2886   assert(vector_length_in_bytes <= 16, "the vector length in bytes for NEON must be <= 16");
2887   assert_different_registers(dst, src, mask, tmp1, tmp2);
2888   // Since the TBL instruction only supports byte table, we need to
2889   // compute indices in byte type for all types.
2890   SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
2891   // tmp1 =  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
2892   dup(tmp1, size, zr);
2893   // dst  =  0  0  1  1  0  0  1  1  0  0  1  1  0  0  1  1
2894   negr(dst, size, mask);
2895   // Calculate vector index for TBL with prefix sum algorithm.
2896   // dst  =  8  8  8  7  6  6  6  5  4  4  4  3  2  2  2  1
2897   for (int i = 1; i < vector_length_in_bytes; i <<= 1) {
2898     ext(tmp2, size, tmp1, dst, vector_length_in_bytes - i);
2899     addv(dst, size, tmp2, dst);
2900   }
2901   // tmp2 =  0  0 -1 -1  0  0 -1 -1  0  0 -1 -1  0  0 -1 -1
2902   orr(tmp2, size, mask, mask);
2903   // tmp2 =  0  0  8  7  0  0  6  5  0  0  4  3  0  0  2  1
2904   bsl(tmp2, size, dst, tmp1);
2905   // tmp1 =  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
2906   movi(tmp1, size, 1);
2907   // dst  = -1 -1  7  6 -1 -1  5  4 -1 -1  3  2 -1 -1  1  0
2908   subv(dst, size, tmp2, tmp1);
2909   // dst  =  0  0  8  7  0  0  6  5  0  0  4  3  0  0  2  1
2910   tbl(dst, size, src, 1, dst);
2911 }
2912 
2913 // Vector expand implementation for SVE.
2914 //
2915 // An example of 128-bit Short vector:
2916 //   Data direction: high <== low
2917 //   Input:
2918 //         src   = gf ed cb a9 87 65 43 21
2919 //         pg    = 00 01 00 01 00 01 00 01
2920 //   Expected result:
2921 //         dst   = 00 87 00 65 00 43 00 21
2922 void C2_MacroAssembler::vector_expand_sve(FloatRegister dst, FloatRegister src, PRegister pg,
2923                                           FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
2924                                           int vector_length_in_bytes) {
2925   assert(UseSVE > 0, "expand implementation only for SVE");
2926   assert_different_registers(dst, src, tmp1, tmp2);
2927   SIMD_RegVariant size = elemType_to_regVariant(bt);
2928 
2929   // tmp1 = 00 00 00 00 00 00 00 00
2930   sve_dup(tmp1, size, 0);
2931   sve_movprfx(tmp2, tmp1);
2932   // tmp2 = 00 01 00 01 00 01 00 01
2933   sve_cpy(tmp2, size, pg, 1, true);
2934   // Calculate vector index for TBL with prefix sum algorithm.
2935   // tmp2 = 04 04 03 03 02 02 01 01
2936   for (int i = type2aelembytes(bt); i < vector_length_in_bytes; i <<= 1) {
2937     sve_movprfx(dst, tmp1);
2938     // The EXT instruction operates on the full-width sve register. The correct
2939     // index calculation method is:
2940     // vector_length_in_bytes - i + MaxVectorSize - vector_length_in_bytes =>
2941     // MaxVectorSize - i.
2942     sve_ext(dst, tmp2, MaxVectorSize - i);
2943     sve_add(tmp2, size, dst, tmp2);
2944   }
2945   // dst  = 00 04 00 03 00 02 00 01
2946   sve_sel(dst, size, pg, tmp2, tmp1);
2947   // dst  = -1 03 -1 02 -1 01 -1 00
2948   sve_sub(dst, size, 1);
2949   // dst  = 00 87 00 65 00 43 00 21
2950   sve_tbl(dst, size, src, dst);
2951 }
2952 
2953 // Optimized SVE cpy (imm, zeroing) instruction.
2954 //
2955 // `movi; cpy(imm, merging)` and `cpy(imm, zeroing)` have the same
2956 // functionality, but test results show that `movi; cpy(imm, merging)` has
2957 // higher throughput on some microarchitectures. This would depend on
2958 // microarchitecture and so may vary between implementations.
2959 void C2_MacroAssembler::sve_cpy(FloatRegister dst, SIMD_RegVariant T,
2960                                 PRegister pg, int imm8, bool isMerge) {
2961   if (VM_Version::prefer_sve_merging_mode_cpy() && !isMerge) {
2962     // Generates a NEON instruction `movi V<dst>.2d, #0`.
2963     // On AArch64, Z and V registers alias in the low 128 bits, so V<dst> is
2964     // the low 128 bits of Z<dst>. A write to V<dst> also clears all bits of
2965     // Z<dst> above 128, so this `movi` instruction effectively zeroes the
2966     // entire Z<dst> register. According to the Arm Software Optimization
2967     // Guide, `movi` is zero latency.
2968     movi(dst, T2D, 0);
2969     isMerge = true;
2970   }
2971   Assembler::sve_cpy(dst, T, pg, imm8, isMerge);
2972 }
2973 
2974 int C2_MacroAssembler::vector_iota_entry_index(BasicType bt) {
2975   // The vector iota entries array is ordered by type B/S/I/L/F/D, and
2976   // the offset between two types is 16.
2977   switch(bt) {
2978   case T_BYTE:
2979     return 0;
2980   case T_SHORT:
2981     return 1;
2982   case T_INT:
2983     return 2;
2984   case T_LONG:
2985     return 3;
2986   case T_FLOAT:
2987     return 4;
2988   case T_DOUBLE:
2989     return 5;
2990   default:
2991     ShouldNotReachHere();
2992   }
2993 }