1 /*
   2  * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "asm/assembler.hpp"
  26 #include "asm/assembler.inline.hpp"
  27 #include "opto/c2_MacroAssembler.hpp"
  28 #include "opto/compile.hpp"
  29 #include "opto/intrinsicnode.hpp"
  30 #include "opto/matcher.hpp"
  31 #include "opto/output.hpp"
  32 #include "opto/subnode.hpp"
  33 #include "runtime/objectMonitorTable.hpp"
  34 #include "runtime/stubRoutines.hpp"
  35 #include "runtime/synchronizer.hpp"
  36 #include "utilities/globalDefinitions.hpp"
  37 #include "utilities/powerOfTwo.hpp"
  38 
// Debug-build annotation helpers. BLOCK_COMMENT emits a named comment into the
// generated code's disassembly (compiled out entirely in PRODUCT builds), and
// STOP halts with a message (prefixed by a block comment in debug builds).
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// BIND binds a label at the current position and records its name in the
// disassembly (debug builds only).
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  48 
// Pointer-to-member type for a single-character load instruction
// (e.g. MacroAssembler::ldrb or MacroAssembler::ldrh): loads the element at
// `adr` into `Rt`. Used to parameterize string code over Latin1 vs UTF-16.
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
  50 
  51 // jdk.internal.util.ArraysSupport.vectorizedHashCode
// Emits code for jdk.internal.util.ArraysSupport.vectorizedHashCode.
//
// Computes the polynomial hash of the `cnt` elements at `ary`, accumulating
// into `result`: the scalar loop performs result = result * 31 + element for
// each element. Inputs at or above a size threshold are delegated to a SIMD
// stub (StubRoutines::aarch64::large_arrays_hashcode); the remaining elements
// are handled by a 4x-unrolled scalar loop that is entered at a computed
// offset so only cnt % 4 iterations run on the first pass.
//
//   ary     - address of the first element; advanced by the post-indexed loads
//   cnt     - number of elements (clobbered by the loop countdown)
//   result  - incoming accumulator / final hash value
//   vdata0..vdata3, vmul0..vmul3, vpow, vpowm
//           - SIMD registers; presumably fixed by ARRAYS_HASHCODE_REGISTERS to
//             match the stub's calling convention -- see the stub for details
//   eltype  - element type: T_BOOLEAN, T_BYTE, T_CHAR, T_SHORT or T_INT
//
// Clobbers rscratch1 and rscratch2. Returns pc() on success, or nullptr if
// the stub call could not be emitted (e.g. the code cache is full).
address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
                                           FloatRegister vdata0, FloatRegister vdata1,
                                           FloatRegister vdata2, FloatRegister vdata3,
                                           FloatRegister vmul0, FloatRegister vmul1,
                                           FloatRegister vmul2, FloatRegister vmul3,
                                           FloatRegister vpow, FloatRegister vpowm,
                                           BasicType eltype) {
  ARRAYS_HASHCODE_REGISTERS;

  // Scratch registers; clobbered by this routine.
  Register tmp1 = rscratch1, tmp2 = rscratch2;

  Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE;

  // Vectorization factor. Number of array elements loaded into one SIMD&FP register by the
  // stubs. We use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's
  // possible to use 4H for chars and shorts instead, but using 8H gives better performance.
  const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8
                    : eltype == T_CHAR || eltype == T_SHORT ? 8
                    : eltype == T_INT                       ? 4
                                                            : 0;
  guarantee(vf, "unsupported eltype");

  // Unroll factor for the scalar loop below. The value is chosen based on performance analysis.
  const size_t unroll_factor = 4;

  // Annotate the disassembly with the element type being hashed.
  switch (eltype) {
  case T_BOOLEAN:
    BLOCK_COMMENT("arrays_hashcode(unsigned byte) {");
    break;
  case T_CHAR:
    BLOCK_COMMENT("arrays_hashcode(char) {");
    break;
  case T_BYTE:
    BLOCK_COMMENT("arrays_hashcode(byte) {");
    break;
  case T_SHORT:
    BLOCK_COMMENT("arrays_hashcode(short) {");
    break;
  case T_INT:
    BLOCK_COMMENT("arrays_hashcode(int) {");
    break;
  default:
    ShouldNotReachHere();
  }

  // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop
  // implemented by the stub executes just once. Call the stub only if at least two iterations will
  // be executed.
  const size_t large_threshold = eltype == T_INT ? vf * 2 : vf;
  cmpw(cnt, large_threshold);
  br(Assembler::HS, LARGE);

  bind(TAIL);

  // Scalar tail. The andr performs cnt % uf where uf = unroll_factor. The subtract shifted by 3
  // offsets past uf - (cnt % uf) pairs of load + madd insns i.e. it only executes cnt % uf
  // load + madd pairs on the first pass through the loop below. Iteration eats up the remainder,
  // uf elements at a time.
  assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC");
  andr(tmp2, cnt, unroll_factor - 1);
  adr(tmp1, BR_BASE);
  // For Cortex-A53 offset is 4 because 2 nops are generated.
  sub(tmp1, tmp1, tmp2, ext::sxtw, VM_Version::supports_a53mac() ? 4 : 3);
  movw(tmp2, 0x1f); // 31, the hash multiplier
  br(tmp1);

  bind(LOOP);
  for (size_t i = 0; i < unroll_factor; ++i) {
    load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype);
    maddw(result, result, tmp2, tmp1); // result = result * 31 + element
    // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
    // Generate 2nd nop to have 4 instructions per iteration.
    if (VM_Version::supports_a53mac()) {
      nop();
    }
  }
  bind(BR_BASE);
  subsw(cnt, cnt, unroll_factor);
  br(Assembler::HS, LOOP);

  b(DONE);

  bind(LARGE);

  // Large input: delegate to the SIMD stub for this element type.
  RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype));
  assert(stub.target() != nullptr, "array_hashcode stub has not been generated");
  address tpc = trampoline_call(stub);
  if (tpc == nullptr) {
    // Could not emit the trampoline (e.g. the code cache is full); report
    // failure to the caller.
    DEBUG_ONLY(reset_labels(TAIL, BR_BASE));
    postcond(pc() == badAddress);
    return nullptr;
  }

  bind(DONE);

  BLOCK_COMMENT("} // arrays_hashcode");

  postcond(pc() != badAddress);
  return pc();
}
 151 
// Emits the C2 fast path for monitorenter.
//
// Tries lightweight (lock-stack based) locking first; if the object's mark
// word indicates an inflated monitor, attempts to acquire the ObjectMonitor
// directly. With -XX:+UseObjectMonitorTable the monitor is located via the
// per-thread OMCache or, failing that, by probing the ObjectMonitorTable.
//
//   obj        - the object to lock
//   box        - the on-stack BasicLock slot (caches the ObjectMonitor* when
//                UseObjectMonitorTable is enabled)
//   t1, t2, t3 - temporaries; clobbered, as is rscratch2
//
// Contract with C2: on exit the condition flags encode the outcome,
// EQ => locked (continue on the fast path), NE => call the runtime slow path.
void C2_MacroAssembler::fast_lock(Register obj, Register box, Register t1,
                                  Register t2, Register t3) {
  assert_different_registers(obj, box, t1, t2, t3, rscratch2);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. Branches here MUST arrive with flags == EQ.
  Label locked;
  // Finish fast lock unsuccessfully. Branches here MUST arrive with flags == NE.
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Divert to the runtime when synchronizing on a value-based class so the
    // configured diagnostic can be issued there.
    load_klass(t1, obj);
    ldrb(t1, Address(t1, Klass::misc_flags_offset()));
    tst(t1, KlassFlags::_misc_is_value_based_class);
    br(Assembler::NE, slow_path);
  }

  const Register t1_mark = t1;
  const Register t3_t = t3;

  { // Fast locking

    // Push lock to the lock stack and finish successfully. Branches here MUST
    // arrive with flags == EQ.
    Label push;

    const Register t2_top = t2;

    // Check if lock-stack is full.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
    br(Assembler::GT, slow_path);

    // Check if recursive: if the current top-of-stack entry is already obj,
    // this is a nested lightweight lock -- just push obj again.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, push);

    // Relaxed normal load to check for monitor. Optimization for monitor case.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);

    // Not inflated
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");

    // Try to lock. Transition lock-bits 0b01 => 0b00:
    // expected = mark with the unlocked bit set, new = mark with it cleared.
    orr(t1_mark, t1_mark, markWord::unlocked_value);
    eor(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
    br(Assembler::NE, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    str(obj, Address(rthread, t2_top));
    addw(t2_top, t2_top, oopSize);
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register t1_monitor = t1;

    if (!UseObjectMonitorTable) {
      // The (tagged) ObjectMonitor* is held directly in the mark word.
      assert(t1_monitor == t1_mark, "should be the same here");
    } else {
      const Register t1_hash = t1;
      Label monitor_found;

      // Save the mark, we might need it to extract the hash.
      mov(t3, t1_mark);

      // Look for the monitor in the om_cache: an unrolled linear probe over
      // the OMCache::CAPACITY (oop, monitor) entry pairs.

      ByteSize cache_offset   = JavaThread::om_cache_oops_offset();
      ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
      const int num_unrolled  = OMCache::CAPACITY;
      for (int i = 0; i < num_unrolled; i++) {
        ldr(t1_monitor, Address(rthread, cache_offset + monitor_offset));
        ldr(t2, Address(rthread, cache_offset));
        cmp(obj, t2);
        br(Assembler::EQ, monitor_found);
        cache_offset = cache_offset + OMCache::oop_to_oop_difference();
      }

      // Look for the monitor in the table.

      // Get the hash code bits from the saved mark word.
      ubfx(t1_hash, t3, markWord::hash_shift, markWord::hash_bits);

      // Get the table and calculate the bucket's address
      // (bucket index = hash & capacity_mask).
      lea(t3, ExternalAddress(ObjectMonitorTable::current_table_address()));
      ldr(t3, Address(t3));
      ldr(t2, Address(t3, ObjectMonitorTable::table_capacity_mask_offset()));
      ands(t1_hash, t1_hash, t2);
      ldr(t3, Address(t3, ObjectMonitorTable::table_buckets_offset()));

      // Read the monitor from the bucket.
      ldr(t1_monitor, Address(t3, t1_hash, Address::lsl(LogBytesPerWord)));

      // Check if the monitor in the bucket is special (empty, tombstone or removed).
      cmp(t1_monitor, (unsigned char)ObjectMonitorTable::SpecialPointerValues::below_is_special);
      br(Assembler::LO, slow_path);

      // Check if object matches: resolve the monitor's weak handle (an
      // unresolvable handle diverts to slow_path) and compare against obj.
      ldr(t3, Address(t1_monitor, ObjectMonitor::object_offset()));
      BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
      bs_asm->try_resolve_weak_handle_in_c2(this, t3, t2, slow_path);
      cmp(t3, obj);
      br(Assembler::NE, slow_path);

      bind(monitor_found);
    }

    const Register t2_owner_addr = t2;
    const Register t3_owner = t3;
    // With UseObjectMonitorTable the monitor pointer is untagged; otherwise it
    // still carries the monitor_value tag from the mark word, which the field
    // offsets below compensate for.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
    const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);

    Label monitor_locked;

    // Compute owner address.
    lea(t2_owner_addr, owner_address);

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
    cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, /*acquire*/ true,
            /*release*/ false, /*weak*/ false, t3_owner);
    br(Assembler::EQ, monitor_locked);

    // Check if recursive: the CAS failed, but the observed owner is us.
    cmp(t3_owner, rscratch2);
    br(Assembler::NE, slow_path);

    // Recursive.
    increment(recursions_address, 1);

    bind(monitor_locked);
    if (UseObjectMonitorTable) {
      // Cache the monitor in the BasicLock so fast_unlock can find it without
      // another table lookup.
      str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
    }
  }

  bind(locked);

#ifdef ASSERT
  // Check that locked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Lock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Lock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}
 322 
// Emits the C2 fast path for monitorexit.
//
// First tries lightweight (lock-stack) unlocking: pops obj off the lock stack
// and, unless the lock is recursive, CASes the mark word back to the unlocked
// state. Inflated monitors are released by storing null into the owner field;
// when a waiter might otherwise be stranded, control is handed to the runtime.
//
//   obj        - the object to unlock
//   box        - the on-stack BasicLock (holds the cached ObjectMonitor* when
//                UseObjectMonitorTable is enabled)
//   t1, t2, t3 - temporaries; clobbered, as is rscratch1
//
// Contract with C2: on exit the condition flags encode the outcome,
// EQ => unlocked (fast path), NE => call the runtime slow path.
void C2_MacroAssembler::fast_unlock(Register obj, Register box, Register t1,
                                    Register t2, Register t3) {
  assert_different_registers(obj, box, t1, t2, t3);

  // Handle inflated monitor.
  Label inflated, inflated_load_mark;
  // Finish fast unlock successfully. Branches here MUST arrive with flags == EQ.
  Label unlocked;
  // Finish fast unlock unsuccessfully. Branches here MUST arrive with flags == NE.
  Label slow_path;

  const Register t1_mark = t1;
  const Register t2_top = t2;
  const Register t3_t = t3;

  { // Fast unlock

    Label push_and_slow_path;

    // Check if obj is top of lock-stack.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    subw(t2_top, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    // Top of lock stack was not obj. Must be monitor.
    br(Assembler::NE, inflated_load_mark);

    // Pop lock-stack.
    DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));

    // Check if recursive: if the new top-of-stack entry is also obj, this was
    // a nested lock and popping one entry is all that is needed.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, unlocked);

    // Not recursive.
    // Load Mark.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Check header for monitor (0b10).
    // Because we got here by popping (meaning we pushed in locked)
    // there will be no monitor in the box. So we need to push back the obj
    // so that the runtime can fix any potential anonymous owner.
    tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);

    // Try to unlock. Transition lock bits 0b00 => 0b01
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
    orr(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
    br(Assembler::EQ, unlocked);

    bind(push_and_slow_path);
    // Compare and exchange failed.
    // Restore lock-stack and handle the unlock in runtime.
    DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
    addw(t2_top, t2_top, oopSize);
    // NOTE(review): the other stores to lock_stack_top in this file use strw
    // (the field is read with ldrw above); this 64-bit str also writes the 4
    // bytes that follow the field. Harmless only if those bytes are padding --
    // confirm against the LockStack layout, or use strw for consistency.
    str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(slow_path);
  }


  { // Handle inflated monitor.
    bind(inflated_load_mark);
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
#ifdef ASSERT
    // obj was not on the lock stack, so its mark must refer to a monitor.
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

#ifdef ASSERT
    // Verify that obj does not appear anywhere further down the lock stack.
    Label check_done;
    subw(t2_top, t2_top, oopSize);
    cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
    br(Assembler::LT, check_done);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    br(Assembler::NE, inflated);
    stop("Fast Unlock lock on stack");
    bind(check_done);
#endif

    const Register t1_monitor = t1;

    if (!UseObjectMonitorTable) {
      assert(t1_monitor == t1_mark, "should be the same here");

      // Untag the monitor.
      add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
    } else {
      // Fetch the ObjectMonitor* that fast_lock cached in the BasicLock.
      ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
      cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
      br(Assembler::LO, slow_path);
    }

    const Register t2_recursions = t2;
    Label not_recursive;

    // Check if recursive.
    ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
    cbz(t2_recursions, not_recursive);

    // Recursive unlock: just decrement the recursion count.
    sub(t2_recursions, t2_recursions, 1u);
    str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
    // Set flag == EQ
    cmp(t2_recursions, t2_recursions);
    b(unlocked);

    bind(not_recursive);

    const Register t2_owner_addr = t2;

    // Compute owner address.
    lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));

    // Set owner to null.
    // Release to satisfy the JMM
    stlr(zr, t2_owner_addr);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry_list is empty.
    ldr(rscratch1, Address(t1_monitor, ObjectMonitor::entry_list_offset()));
    cmp(rscratch1, zr);
    br(Assembler::EQ, unlocked);  // If so we are done.

    // Check if there is a successor.
    ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
    cmp(rscratch1, zr);
    br(Assembler::NE, unlocked);  // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));

    cmp(zr, rthread); // Set Flag to NE => slow path
    b(slow_path);
  }

  bind(unlocked);
  cmp(zr, zr); // Set Flags to EQ => fast path

#ifdef ASSERT
  // Check that unlocked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Unlock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Unlock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}
 488 
 489 // Search for str1 in str2 and return index or -1
 490 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
 491 void C2_MacroAssembler::string_indexof(Register str2, Register str1,
 492                                        Register cnt2, Register cnt1,
 493                                        Register tmp1, Register tmp2,
 494                                        Register tmp3, Register tmp4,
 495                                        Register tmp5, Register tmp6,
 496                                        int icnt1, Register result, int ae) {
 497   // NOTE: tmp5, tmp6 can be zr depending on specific method version
 498   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
 499 
 500   Register ch1 = rscratch1;
 501   Register ch2 = rscratch2;
 502   Register cnt1tmp = tmp1;
 503   Register cnt2tmp = tmp2;
 504   Register cnt1_neg = cnt1;
 505   Register cnt2_neg = cnt2;
 506   Register result_tmp = tmp4;
 507 
 508   bool isL = ae == StrIntrinsicNode::LL;
 509 
 510   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
 511   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
 512   int str1_chr_shift = str1_isL ? 0:1;
 513   int str2_chr_shift = str2_isL ? 0:1;
 514   int str1_chr_size = str1_isL ? 1:2;
 515   int str2_chr_size = str2_isL ? 1:2;
 516   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
 517                                       (chr_insn)&MacroAssembler::ldrh;
 518   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
 519                                       (chr_insn)&MacroAssembler::ldrh;
 520   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
 521   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
 522 
 523   // Note, inline_string_indexOf() generates checks:
 524   // if (substr.count > string.count) return -1;
 525   // if (substr.count == 0) return 0;
 526 
 527   // We have two strings, a source string in str2, cnt2 and a pattern string
 528   // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
 529 
 530   // For larger pattern and source we use a simplified Boyer Moore algorithm.
 531   // With a small pattern and source we use linear scan.
 532 
 533   if (icnt1 == -1) {
 534     sub(result_tmp, cnt2, cnt1);
 535     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
 536     br(LT, LINEARSEARCH);
 537     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
 538     subs(zr, cnt1, 256);
 539     lsr(tmp1, cnt2, 2);
 540     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
 541     br(GE, LINEARSTUB);
 542   }
 543 
// The Boyer Moore algorithm is based on the description here:-
 545 //
 546 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
 547 //
// This describes an algorithm with 2 shift rules. The 'Bad Character' rule
 549 // and the 'Good Suffix' rule.
 550 //
 551 // These rules are essentially heuristics for how far we can shift the
 552 // pattern along the search string.
 553 //
 554 // The implementation here uses the 'Bad Character' rule only because of the
 555 // complexity of initialisation for the 'Good Suffix' rule.
 556 //
 557 // This is also known as the Boyer-Moore-Horspool algorithm:-
 558 //
 559 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
 560 //
 561 // This particular implementation has few java-specific optimizations.
 562 //
 563 // #define ASIZE 256
 564 //
 565 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
 566 //       int i, j;
 567 //       unsigned c;
 568 //       unsigned char bc[ASIZE];
 569 //
 570 //       /* Preprocessing */
 571 //       for (i = 0; i < ASIZE; ++i)
 572 //          bc[i] = m;
 573 //       for (i = 0; i < m - 1; ) {
 574 //          c = x[i];
 575 //          ++i;
 576 //          // c < 256 for Latin1 string, so, no need for branch
 577 //          #ifdef PATTERN_STRING_IS_LATIN1
 578 //          bc[c] = m - i;
 579 //          #else
 580 //          if (c < ASIZE) bc[c] = m - i;
 581 //          #endif
 582 //       }
 583 //
 584 //       /* Searching */
 585 //       j = 0;
 586 //       while (j <= n - m) {
 587 //          c = y[i+j];
 588 //          if (x[m-1] == c)
 589 //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
 590 //          if (i < 0) return j;
 591 //          // c < 256 for Latin1 string, so, no need for branch
 592 //          #ifdef SOURCE_STRING_IS_LATIN1
 593 //          // LL case: (c< 256) always true. Remove branch
 594 //          j += bc[y[j+m-1]];
 595 //          #endif
 596 //          #ifndef PATTERN_STRING_IS_UTF
 597 //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
 598 //          if (c < ASIZE)
 599 //            j += bc[y[j+m-1]];
 600 //          else
 601 //            j += 1
 602 //          #endif
 603 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
 604 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
 605 //          if (c < ASIZE)
 606 //            j += bc[y[j+m-1]];
 607 //          else
 608 //            j += m
 609 //          #endif
 610 //       }
 611 //    }
 612 
 613   if (icnt1 == -1) {
 614     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
 615         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
 616     Register cnt1end = tmp2;
 617     Register str2end = cnt2;
 618     Register skipch = tmp2;
 619 
 620     // str1 length is >=8, so, we can read at least 1 register for cases when
 621     // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
 622     // UL case. We'll re-read last character in inner pre-loop code to have
 623     // single outer pre-loop load
 624     const int firstStep = isL ? 7 : 3;
 625 
 626     const int ASIZE = 256;
 627     const int STORED_BYTES = 32; // amount of bytes stored per instruction
 628     sub(sp, sp, ASIZE);
 629     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
 630     mov(ch1, sp);
 631     BIND(BM_INIT_LOOP);
 632       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
 633       subs(tmp5, tmp5, 1);
 634       br(GT, BM_INIT_LOOP);
 635 
 636       sub(cnt1tmp, cnt1, 1);
 637       mov(tmp5, str2);
 638       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
 639       sub(ch2, cnt1, 1);
 640       mov(tmp3, str1);
 641     BIND(BCLOOP);
 642       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
 643       if (!str1_isL) {
 644         subs(zr, ch1, ASIZE);
 645         br(HS, BCSKIP);
 646       }
 647       strb(ch2, Address(sp, ch1));
 648     BIND(BCSKIP);
 649       subs(ch2, ch2, 1);
 650       br(GT, BCLOOP);
 651 
 652       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
 653       if (str1_isL == str2_isL) {
 654         // load last 8 bytes (8LL/4UU symbols)
 655         ldr(tmp6, Address(tmp6, -wordSize));
 656       } else {
 657         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
 658         // convert Latin1 to UTF. We'll have to wait until load completed, but
 659         // it's still faster than per-character loads+checks
 660         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
 661         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
 662         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
 663         andr(tmp6, tmp6, 0xFF); // str1[N-4]
 664         orr(ch2, ch1, ch2, LSL, 16);
 665         orr(tmp6, tmp6, tmp3, LSL, 48);
 666         orr(tmp6, tmp6, ch2, LSL, 16);
 667       }
 668     BIND(BMLOOPSTR2);
 669       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 670       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
 671       if (str1_isL == str2_isL) {
 672         // re-init tmp3. It's for free because it's executed in parallel with
 673         // load above. Alternative is to initialize it before loop, but it'll
 674         // affect performance on in-order systems with 2 or more ld/st pipelines
 675         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
 676       }
 677       if (!isL) { // UU/UL case
 678         lsl(ch2, cnt1tmp, 1); // offset in bytes
 679       }
 680       cmp(tmp3, skipch);
 681       br(NE, BMSKIP);
 682       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
 683       mov(ch1, tmp6);
 684       if (isL) {
 685         b(BMLOOPSTR1_AFTER_LOAD);
 686       } else {
 687         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
 688         b(BMLOOPSTR1_CMP);
 689       }
 690     BIND(BMLOOPSTR1);
 691       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
 692       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 693     BIND(BMLOOPSTR1_AFTER_LOAD);
 694       subs(cnt1tmp, cnt1tmp, 1);
 695       br(LT, BMLOOPSTR1_LASTCMP);
 696     BIND(BMLOOPSTR1_CMP);
 697       cmp(ch1, ch2);
 698       br(EQ, BMLOOPSTR1);
 699     BIND(BMSKIP);
 700       if (!isL) {
 701         // if we've met UTF symbol while searching Latin1 pattern, then we can
 702         // skip cnt1 symbols
 703         if (str1_isL != str2_isL) {
 704           mov(result_tmp, cnt1);
 705         } else {
 706           mov(result_tmp, 1);
 707         }
 708         subs(zr, skipch, ASIZE);
 709         br(HS, BMADV);
 710       }
 711       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
 712     BIND(BMADV);
 713       sub(cnt1tmp, cnt1, 1);
 714       add(str2, str2, result_tmp, LSL, str2_chr_shift);
 715       cmp(str2, str2end);
 716       br(LE, BMLOOPSTR2);
 717       add(sp, sp, ASIZE);
 718       b(NOMATCH);
 719     BIND(BMLOOPSTR1_LASTCMP);
 720       cmp(ch1, ch2);
 721       br(NE, BMSKIP);
 722     BIND(BMMATCH);
 723       sub(result, str2, tmp5);
 724       if (!str2_isL) lsr(result, result, 1);
 725       add(sp, sp, ASIZE);
 726       b(DONE);
 727 
 728     BIND(LINEARSTUB);
 729     cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
 730     br(LT, LINEAR_MEDIUM);
 731     mov(result, zr);
 732     RuntimeAddress stub = nullptr;
 733     if (isL) {
 734       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
 735       assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
 736     } else if (str1_isL) {
 737       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
 738        assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
 739     } else {
 740       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
 741       assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
 742     }
 743     address call = trampoline_call(stub);
 744     if (call == nullptr) {
 745       DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
 746       ciEnv::current()->record_failure("CodeCache is full");
 747       return;
 748     }
 749     b(DONE);
 750   }
 751 
 752   BIND(LINEARSEARCH);
 753   {
 754     Label DO1, DO2, DO3;
 755 
 756     Register str2tmp = tmp2;
 757     Register first = tmp3;
 758 
 759     if (icnt1 == -1)
 760     {
 761         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
 762 
 763         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
 764         br(LT, DOSHORT);
 765       BIND(LINEAR_MEDIUM);
 766         (this->*str1_load_1chr)(first, Address(str1));
 767         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
 768         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
 769         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 770         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 771 
 772       BIND(FIRST_LOOP);
 773         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 774         cmp(first, ch2);
 775         br(EQ, STR1_LOOP);
 776       BIND(STR2_NEXT);
 777         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 778         br(LE, FIRST_LOOP);
 779         b(NOMATCH);
 780 
 781       BIND(STR1_LOOP);
 782         adds(cnt1tmp, cnt1_neg, str1_chr_size);
 783         add(cnt2tmp, cnt2_neg, str2_chr_size);
 784         br(GE, MATCH);
 785 
 786       BIND(STR1_NEXT);
 787         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
 788         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 789         cmp(ch1, ch2);
 790         br(NE, STR2_NEXT);
 791         adds(cnt1tmp, cnt1tmp, str1_chr_size);
 792         add(cnt2tmp, cnt2tmp, str2_chr_size);
 793         br(LT, STR1_NEXT);
 794         b(MATCH);
 795 
 796       BIND(DOSHORT);
 797       if (str1_isL == str2_isL) {
 798         cmp(cnt1, (u1)2);
 799         br(LT, DO1);
 800         br(GT, DO3);
 801       }
 802     }
 803 
 804     if (icnt1 == 4) {
 805       Label CH1_LOOP;
 806 
 807         (this->*load_4chr)(ch1, str1);
 808         sub(result_tmp, cnt2, 4);
 809         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 810         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 811 
 812       BIND(CH1_LOOP);
 813         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
 814         cmp(ch1, ch2);
 815         br(EQ, MATCH);
 816         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 817         br(LE, CH1_LOOP);
 818         b(NOMATCH);
 819       }
 820 
 821     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
 822       Label CH1_LOOP;
 823 
 824       BIND(DO2);
 825         (this->*load_2chr)(ch1, str1);
 826         if (icnt1 == 2) {
 827           sub(result_tmp, cnt2, 2);
 828         }
 829         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 830         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 831       BIND(CH1_LOOP);
 832         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 833         cmp(ch1, ch2);
 834         br(EQ, MATCH);
 835         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 836         br(LE, CH1_LOOP);
 837         b(NOMATCH);
 838     }
 839 
 840     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
 841       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
 842 
 843       BIND(DO3);
 844         (this->*load_2chr)(first, str1);
 845         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
 846         if (icnt1 == 3) {
 847           sub(result_tmp, cnt2, 3);
 848         }
 849         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 850         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 851       BIND(FIRST_LOOP);
 852         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 853         cmpw(first, ch2);
 854         br(EQ, STR1_LOOP);
 855       BIND(STR2_NEXT);
 856         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 857         br(LE, FIRST_LOOP);
 858         b(NOMATCH);
 859 
 860       BIND(STR1_LOOP);
 861         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
 862         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 863         cmp(ch1, ch2);
 864         br(NE, STR2_NEXT);
 865         b(MATCH);
 866     }
 867 
 868     if (icnt1 == -1 || icnt1 == 1) {
 869       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
 870 
 871       BIND(DO1);
 872         (this->*str1_load_1chr)(ch1, str1);
 873         cmp(cnt2, (u1)8);
 874         br(LT, DO1_SHORT);
 875 
 876         sub(result_tmp, cnt2, 8/str2_chr_size);
 877         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 878         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 879         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 880 
 881         if (str2_isL) {
 882           orr(ch1, ch1, ch1, LSL, 8);
 883         }
 884         orr(ch1, ch1, ch1, LSL, 16);
 885         orr(ch1, ch1, ch1, LSL, 32);
 886       BIND(CH1_LOOP);
 887         ldr(ch2, Address(str2, cnt2_neg));
 888         eor(ch2, ch1, ch2);
 889         sub(tmp1, ch2, tmp3);
 890         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 891         bics(tmp1, tmp1, tmp2);
 892         br(NE, HAS_ZERO);
 893         adds(cnt2_neg, cnt2_neg, 8);
 894         br(LT, CH1_LOOP);
 895 
 896         cmp(cnt2_neg, (u1)8);
 897         mov(cnt2_neg, 0);
 898         br(LT, CH1_LOOP);
 899         b(NOMATCH);
 900 
 901       BIND(HAS_ZERO);
 902         rev(tmp1, tmp1);
 903         clz(tmp1, tmp1);
 904         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
 905         b(MATCH);
 906 
 907       BIND(DO1_SHORT);
 908         mov(result_tmp, cnt2);
 909         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
 910         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
 911       BIND(DO1_LOOP);
 912         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 913         cmpw(ch1, ch2);
 914         br(EQ, MATCH);
 915         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 916         br(LT, DO1_LOOP);
 917     }
 918   }
 919   BIND(NOMATCH);
 920     mov(result, -1);
 921     b(DONE);
 922   BIND(MATCH);
 923     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
 924   BIND(DONE);
 925 }
 926 
// Member-function-pointer types used to select, at code-emission time, the
// character load (ldrb vs ldrh) and the matching zero-extension for a string
// depending on whether it is Latin-1 or UTF-16 encoded.
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
 929 
// Find the first occurrence of the UTF-16 char 'ch' in the UTF-16 string
// [str1, str1 + 2*cnt1) and set 'result' to its char index, or -1 if absent.
// Once at least 4 chars remain, searches 8 bytes (4 chars) at a time with a
// SWAR (SIMD-within-a-register) zero-halfword test; otherwise falls back to a
// char-at-a-time loop.
// Clobbers: str1, cnt1, ch, tmp1, tmp2, tmp3, rscratch1, rscratch2, rflags.
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;   // reused as negative byte offset from the string end
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  // An empty string never matches.
  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  // Replicate the 16-bit char into all four halfwords of 'ch'.
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  // Point str1 past the main-loop portion and index backwards from there.
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    // Zero-halfword detection: after the XOR a matching char becomes 0x0000;
    // (x - 0x0001...) & ~(x | 0x7fff...) has bits set only in halfwords that
    // were zero.
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7fff7fff7fff7fff);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    // Handle the final (possibly overlapping) word at offset 0.
    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    // Convert the match flags into the byte offset of the first matching
    // halfword: byte-reverse, count leading zeros, shift bit index to bytes.
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    // Fewer than 4 chars: simple char-at-a-time loop.
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1, Address::uxtw(1)));
    sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
    (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
    cmpw(ch1, ch2);
    br(EQ, MATCH);
    adds(cnt2_neg, cnt2_neg, str2_chr_size);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // result = chars before the indexed portion + (byte offset / 2).
    add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}
 992 
 993 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
 994                                                 Register ch, Register result,
 995                                                 FloatRegister ztmp1,
 996                                                 FloatRegister ztmp2,
 997                                                 PRegister tmp_pg,
 998                                                 PRegister tmp_pdn, bool isL)
 999 {
1000   // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
1001   assert(tmp_pg->is_governing(),
1002          "this register has to be a governing predicate register");
1003 
1004   Label LOOP, MATCH, DONE, NOMATCH;
1005   Register vec_len = rscratch1;
1006   Register idx = rscratch2;
1007 
1008   SIMD_RegVariant T = (isL == true) ? B : H;
1009 
1010   cbz(cnt1, NOMATCH);
1011 
1012   // Assign the particular char throughout the vector.
1013   sve_dup(ztmp2, T, ch);
1014   if (isL) {
1015     sve_cntb(vec_len);
1016   } else {
1017     sve_cnth(vec_len);
1018   }
1019   mov(idx, 0);
1020 
1021   // Generate a predicate to control the reading of input string.
1022   sve_whilelt(tmp_pg, T, idx, cnt1);
1023 
1024   BIND(LOOP);
1025     // Read a vector of 8- or 16-bit data depending on the string type. Note
1026     // that inactive elements indicated by the predicate register won't cause
1027     // a data read from memory to the destination vector.
1028     if (isL) {
1029       sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
1030     } else {
1031       sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
1032     }
1033     add(idx, idx, vec_len);
1034 
1035     // Perform the comparison. An element of the destination predicate is set
1036     // to active if the particular char is matched.
1037     sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);
1038 
1039     // Branch if the particular char is found.
1040     br(NE, MATCH);
1041 
1042     sve_whilelt(tmp_pg, T, idx, cnt1);
1043 
1044     // Loop back if the particular char not found.
1045     br(MI, LOOP);
1046 
1047   BIND(NOMATCH);
1048     mov(result, -1);
1049     b(DONE);
1050 
1051   BIND(MATCH);
1052     // Undo the index increment.
1053     sub(idx, idx, vec_len);
1054 
1055     // Crop the vector to find its location.
1056     sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
1057     add(result, idx, -1);
1058     sve_incp(result, T, tmp_pdn);
1059   BIND(DONE);
1060 }
1061 
// Find the first occurrence of the Latin-1 char 'ch' in the byte string
// [str1, str1 + cnt1) and set 'result' to its index, or -1 if absent.
// Byte-string counterpart of string_indexof_char above: once at least 8
// chars remain, searches 8 bytes at a time with a SWAR zero-byte test.
// Clobbers: str1, cnt1, ch, tmp1, tmp2, tmp3, rscratch1, rscratch2, rflags.
void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;   // reused as negative byte offset from the string end
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  // An empty string never matches.
  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)8);
  br(LT, DO1_SHORT);

  // Replicate the byte into all eight bytes of 'ch'.
  orr(ch, ch, ch, LSL, 8);
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 8);
  mov(result_tmp, cnt1);
  // Point str1 past the main-loop portion and index backwards from there.
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);

  mov(tmp3, 0x0101010101010101);

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    // Zero-byte detection: after the XOR a matching byte becomes 0x00;
    // (x - 0x01...) & ~(x | 0x7f...) flags exactly the zero bytes.
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    // Handle the final (possibly overlapping) word at offset 0.
    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    // Convert the match flags into the byte offset of the first match:
    // byte-reverse, count leading zeros, shift bit index down to bytes.
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    // Fewer than 8 chars: simple byte-at-a-time loop.
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1));
    sub(cnt1_neg, zr, cnt1);
  BIND(DO1_LOOP);
    ldrb(ch1, Address(str1, cnt1_neg));
    cmp(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 1);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // result = bytes before the indexed portion + byte offset of the match.
    add(result, result_tmp, cnt1_neg);
  BIND(DONE);
}
1125 
// Compare strings.
// 'ae' (StrIntrinsicNode::LL/LU/UL/UU) encodes the two operand encodings:
// L = Latin-1 bytes, U = UTF-16 halfwords. The counts arrive in bytes (see
// note below) while 'result' is a signed value in characters: the difference
// of the first differing characters, or the length difference when one
// string is a prefix of the other (String.compareTo semantics).
// Inputs longer than stub_threshold are delegated to the
// compare_long_string_* stubs.
// Clobbers: rscratch1, rscratch2, rflags.
// NOTE(review): vtmp3, pgtmp1 and pgtmp2 are not referenced in this body —
// presumably reserved for an SVE variant; confirm against the call sites.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  // Per-string character loads: ldrb for Latin-1, ldrh for UTF-16.
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      // Same array: the common prefix is trivially equal, so the length
      // difference already in 'result' is the answer.
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      // Point both strings at their ends; cnt2 becomes a negative byte index.
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      // The Latin-1 side is widened to UTF-16 in vtmp by interleaving with a
      // zero vector (zip1), so both sides compare as UTF-16 longwords.
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      ldr(tmp1, Address(str1));
      ldrs(vtmp, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFF);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFF);
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    // Round the differing bit index down to a character boundary.
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
    RuntimeAddress stub = nullptr;
    switch(ae) {
      case StrIntrinsicNode::LL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
        break;
      case StrIntrinsicNode::UU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
        break;
      case StrIntrinsicNode::LU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
        break;
      case StrIntrinsicNode::UL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
        break;
      default:
        ShouldNotReachHere();
     }
    assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // arrange code to do most branches while loading and loading next characters
  // while comparing previous
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}
1361 
1362 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
1363                                      FloatRegister src2, Condition cond, bool isQ) {
1364   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1365   FloatRegister zn = src1, zm = src2;
1366   bool needs_negation = false;
1367   switch (cond) {
1368     case LT: cond = GT; zn = src2; zm = src1; break;
1369     case LE: cond = GE; zn = src2; zm = src1; break;
1370     case LO: cond = HI; zn = src2; zm = src1; break;
1371     case LS: cond = HS; zn = src2; zm = src1; break;
1372     case NE: cond = EQ; needs_negation = true; break;
1373     default:
1374       break;
1375   }
1376 
1377   if (is_floating_point_type(bt)) {
1378     fcm(cond, dst, size, zn, zm);
1379   } else {
1380     cm(cond, dst, size, zn, zm);
1381   }
1382 
1383   if (needs_negation) {
1384     notr(dst, isQ ? T16B : T8B, dst);
1385   }
1386 }
1387 
1388 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1389                                           Condition cond, bool isQ) {
1390   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1391   if (bt == T_FLOAT || bt == T_DOUBLE) {
1392     if (cond == Assembler::NE) {
1393       fcm(Assembler::EQ, dst, size, src);
1394       notr(dst, isQ ? T16B : T8B, dst);
1395     } else {
1396       fcm(cond, dst, size, src);
1397     }
1398   } else {
1399     if (cond == Assembler::NE) {
1400       cm(Assembler::EQ, dst, size, src);
1401       notr(dst, isQ ? T16B : T8B, dst);
1402     } else {
1403       cm(cond, dst, size, src);
1404     }
1405   }
1406 }
1407 
// Compress the least significant bit of each byte to the rightmost and clear
// the higher garbage bits.
// Each `orr` merges neighbouring partial results, doubling the number of
// collected bits per group (1 -> 2 -> 4 -> 8); the final `and` discards the
// garbage that accumulated in the upper bytes.
void C2_MacroAssembler::bytemask_compress(Register dst) {
  // Example input, dst = 0x01 00 00 00 01 01 00 01
  // The "??" bytes are garbage.
  orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
  orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
  orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
  andr(dst, dst, 0xff);                   // dst = 0x8D
}
1418 
1419 // Pack the value of each mask element in "src" into a long value in "dst", at most
1420 // the first 64 lane elements. The input "src" is a vector of boolean represented as
1421 // bytes with 0x00/0x01 as element values. Each lane value from "src" is packed into
1422 // one bit in "dst".
1423 //
1424 // Example:   src = 0x0001010000010001 0100000001010001, lane_cnt = 16
1425 // Expected:  dst = 0x658D
1426 //
1427 // Clobbers: rscratch1
1428 void C2_MacroAssembler::sve_vmask_tolong(Register dst, FloatRegister src,
1429                                          FloatRegister vtmp, int lane_cnt) {
1430   assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1431   assert_different_registers(dst, rscratch1);
1432   assert_different_registers(src, vtmp);
1433   assert(UseSVE > 0, "must be");
1434 
1435   // Compress the lowest 8 bytes.
1436   fmovd(dst, src);
1437   bytemask_compress(dst);
1438   if (lane_cnt <= 8) return;
1439 
1440   // Repeat on higher bytes and join the results.
1441   // Compress 8 bytes in each iteration.
1442   for (int idx = 1; idx < (lane_cnt / 8); idx++) {
1443     sve_extract_integral(rscratch1, T_LONG, src, idx, vtmp);
1444     bytemask_compress(rscratch1);
1445     orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
1446   }
1447 }
1448 
// The function is same as above "sve_vmask_tolong", but it uses SVE2's BEXT
// instruction which requires the FEAT_BITPERM feature.
// Clobbers: vtmp1, vtmp2.
void C2_MacroAssembler::sve2_vmask_tolong(Register dst, FloatRegister src,
                                          FloatRegister vtmp1, FloatRegister vtmp2,
                                          int lane_cnt) {
  assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
  assert_different_registers(src, vtmp1, vtmp2);
  assert(UseSVE > 1 && VM_Version::supports_svebitperm(), "must be");

  // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
  // is to compress each significant bit of the byte in a cross-lane way. Due
  // to the lack of a cross-lane bit-compress instruction, we use BEXT
  // (bit-compress in each lane) with the biggest lane size (T = D) then
  // concatenate the results.

  // The second source input of BEXT, initialized with 0x01 in each byte.
  // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
  sve_dup(vtmp2, B, 1);

  // BEXT vtmp1.D, src.D, vtmp2.D
  // src   = 0x0001010000010001 | 0x0100000001010001
  // vtmp2 = 0x0101010101010101 | 0x0101010101010101
  //         ---------------------------------------
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  sve_bext(vtmp1, D, src, vtmp2);

  // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the
  // result to dst.
  // vtmp1 = 0x0000000000000000 | 0x000000000000658D
  // dst   = 0x658D
  if (lane_cnt <= 8) {
    // No need to concatenate.
    umov(dst, vtmp1, B, 0);
  } else if (lane_cnt <= 16) {
    // Move the second group's compressed bits (byte 8) next to the first
    // group's (byte 0), then read the low halfword.
    ins(vtmp1, B, vtmp1, 1, 8);
    umov(dst, vtmp1, H, 0);
  } else {
    // As the lane count is 64 at most, the final expected value must be in
    // the lowest 64 bits after narrowing vtmp1 from D to B.
    sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
    umov(dst, vtmp1, D, 0);
  }
}
1492 
// Unpack the mask, a long value in "src", into a vector register of boolean
// represented as bytes with 0x00/0x01 as element values in "dst".  Each bit in
// "src" is unpacked into one byte lane in "dst". Note that "dst" can support at
// most 64 lanes.
//
// Below example gives the expected dst vector register, with a valid src(0x658D)
// on a 128-bit vector size machine.
// dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
//
// Clobbers: vtmp.
void C2_MacroAssembler::sve_vmask_fromlong(FloatRegister dst, Register src,
                                           FloatRegister vtmp, int lane_cnt) {
  assert_different_registers(dst, vtmp);
  assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
         lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");

  // Example:   src = 0x658D, lane_cnt = 16
  // Expected:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01

  // Put long value from general purpose register into the first lane of vector.
  // vtmp = 0x0000000000000000 | 0x000000000000658D
  sve_dup(vtmp, B, 0);
  mov(vtmp, D, 0, src);

  // Transform the value in the first lane which is mask in bit now to the mask in
  // byte, which can be done by SVE2's BDEP instruction.

  // The first source input of the BDEP instruction. Deposit each byte group of
  // the mask into a separate 8-byte lane.
  // vtmp = 0x0000000000000065 | 0x000000000000008D
  if (lane_cnt <= 8) {
    // Nothing to do, as only one byte exists.
  } else if (lane_cnt <= 16) {
    ins(vtmp, B, vtmp, 8, 1);
  } else {
    sve_vector_extend(vtmp, D, vtmp, B);
  }

  // The second source input of BDEP instruction, initialized with 0x01 for each byte.
  // dst = 0x01010101 0x01010101 0x01010101 0x01010101
  sve_dup(dst, B, 1);

  // BDEP dst.D, vtmp.D, dst.D
  // vtmp = 0x0000000000000065 | 0x000000000000008D
  // dst  = 0x0101010101010101 | 0x0101010101010101
  //        ---------------------------------------
  // dst  = 0x0001010000010001 | 0x0100000001010001
  sve_bdep(dst, D, vtmp, dst);
}
1539 
1540 // Clobbers: rflags
1541 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1542                                     FloatRegister zn, FloatRegister zm, Condition cond) {
1543   assert(pg->is_governing(), "This register has to be a governing predicate register");
1544   FloatRegister z1 = zn, z2 = zm;
1545   switch (cond) {
1546     case LE: z1 = zm; z2 = zn; cond = GE; break;
1547     case LT: z1 = zm; z2 = zn; cond = GT; break;
1548     case LO: z1 = zm; z2 = zn; cond = HI; break;
1549     case LS: z1 = zm; z2 = zn; cond = HS; break;
1550     default:
1551       break;
1552   }
1553 
1554   SIMD_RegVariant size = elemType_to_regVariant(bt);
1555   if (is_floating_point_type(bt)) {
1556     sve_fcm(cond, pd, size, pg, z1, z2);
1557   } else {
1558     assert(is_integral_type(bt), "unsupported element type");
1559     sve_cmp(cond, pd, size, pg, z1, z2);
1560   }
1561 }
1562 
// Get index of the last mask lane that is set
// Clobbers: rscratch1.
void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
  SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Reverse the predicate so the last set lane becomes the first one.
  sve_rev(ptmp, size, src);
  // Break before the first set lane: ptmp keeps only the lanes preceding it.
  sve_brkb(ptmp, ptrue, ptmp, false);
  // Count those lanes; this is the first-set index in the reversed mask,
  // i.e. (lane_cnt - 1 - last_set_index).
  sve_cntp(dst, size, ptrue, ptmp);
  // dst = (lane_cnt - 1) - dst, recovering the index of the last set lane.
  movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
  subw(dst, rscratch1, dst);
}
1572 
1573 // Extend integer vector src to dst with the same lane count
1574 // but larger element size, e.g. 4B -> 4I
1575 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1576                                            FloatRegister src, BasicType src_bt, bool is_unsigned) {
1577   if (src_bt == T_BYTE) {
1578     // 4B to 4S/4I, 8B to 8S
1579     assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1580     assert(dst_bt == T_SHORT || dst_bt == T_INT, "unsupported");
1581     _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1582     if (dst_bt == T_INT) {
1583       _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1584     }
1585   } else if (src_bt == T_SHORT) {
1586     // 2S to 2I/2L, 4S to 4I
1587     assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1588     assert(dst_bt == T_INT || dst_bt == T_LONG, "unsupported");
1589     _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1590     if (dst_bt == T_LONG) {
1591       _xshll(is_unsigned, dst, T2D, dst, T2S, 0);
1592     }
1593   } else if (src_bt == T_INT) {
1594     // 2I to 2L
1595     assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1596     _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1597   } else {
1598     ShouldNotReachHere();
1599   }
1600 }
1601 
1602 // Narrow integer vector src down to dst with the same lane count
1603 // but smaller element size, e.g. 4I -> 4B
1604 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1605                                            FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1606   if (src_bt == T_SHORT) {
1607     // 4S/8S to 4B/8B
1608     assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1609     assert(dst_bt == T_BYTE, "unsupported");
1610     xtn(dst, T8B, src, T8H);
1611   } else if (src_bt == T_INT) {
1612     // 2I to 2S, 4I to 4B/4S
1613     assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1614     assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1615     xtn(dst, T4H, src, T4S);
1616     if (dst_bt == T_BYTE) {
1617       xtn(dst, T8B, dst, T8H);
1618     }
1619   } else if (src_bt == T_LONG) {
1620     // 2L to 2S/2I
1621     assert(src_vlen_in_bytes == 16, "unsupported");
1622     assert(dst_bt == T_INT || dst_bt == T_SHORT, "unsupported");
1623     xtn(dst, T2S, src, T2D);
1624     if (dst_bt == T_SHORT) {
1625       xtn(dst, T4H, dst, T4S);
1626     }
1627   } else {
1628     ShouldNotReachHere();
1629   }
1630 }
1631 
1632 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1633                                           FloatRegister src, SIMD_RegVariant src_size,
1634                                           bool is_unsigned) {
1635   assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
1636 
1637   if (src_size == B) {
1638     switch (dst_size) {
1639     case H:
1640       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1641       break;
1642     case S:
1643       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1644       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1645       break;
1646     case D:
1647       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1648       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1649       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1650       break;
1651     default:
1652       ShouldNotReachHere();
1653     }
1654   } else if (src_size == H) {
1655     if (dst_size == S) {
1656       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1657     } else { // D
1658       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1659       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1660     }
1661   } else if (src_size == S) {
1662     _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
1663   }
1664 }
1665 
1666 // Vector narrow from src to dst with specified element sizes.
1667 // High part of dst vector will be filled with zero.
1668 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1669                                           FloatRegister src, SIMD_RegVariant src_size,
1670                                           FloatRegister tmp) {
1671   assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1672   assert_different_registers(src, tmp);
1673   sve_dup(tmp, src_size, 0);
1674   if (src_size == D) {
1675     switch (dst_size) {
1676     case S:
1677       sve_uzp1(dst, S, src, tmp);
1678       break;
1679     case H:
1680       assert_different_registers(dst, tmp);
1681       sve_uzp1(dst, S, src, tmp);
1682       sve_uzp1(dst, H, dst, tmp);
1683       break;
1684     case B:
1685       assert_different_registers(dst, tmp);
1686       sve_uzp1(dst, S, src, tmp);
1687       sve_uzp1(dst, H, dst, tmp);
1688       sve_uzp1(dst, B, dst, tmp);
1689       break;
1690     default:
1691       ShouldNotReachHere();
1692     }
1693   } else if (src_size == S) {
1694     if (dst_size == H) {
1695       sve_uzp1(dst, H, src, tmp);
1696     } else { // B
1697       assert_different_registers(dst, tmp);
1698       sve_uzp1(dst, H, src, tmp);
1699       sve_uzp1(dst, B, dst, tmp);
1700     }
1701   } else if (src_size == H) {
1702     sve_uzp1(dst, B, src, tmp);
1703   }
1704 }
1705 
1706 // Extend src predicate to dst predicate with the same lane count but larger
1707 // element size, e.g. 64Byte -> 512Long
1708 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1709                                              uint dst_element_length_in_bytes,
1710                                              uint src_element_length_in_bytes) {
1711   if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1712     sve_punpklo(dst, src);
1713   } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1714     sve_punpklo(dst, src);
1715     sve_punpklo(dst, dst);
1716   } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1717     sve_punpklo(dst, src);
1718     sve_punpklo(dst, dst);
1719     sve_punpklo(dst, dst);
1720   } else {
1721     assert(false, "unsupported");
1722     ShouldNotReachHere();
1723   }
1724 }
1725 
1726 // Narrow src predicate to dst predicate with the same lane count but
1727 // smaller element size, e.g. 512Long -> 64Byte
1728 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1729                                              uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
1730   // The insignificant bits in src predicate are expected to be zero.
1731   // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
1732   // passed as the second argument. An example narrowing operation with a given mask would be -
1733   // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I
1734   // Mask (for 2 Longs) : TF
1735   // Predicate register for the above mask (16 bits) : 00000001 00000000
1736   // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
1737   // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0)
1738   assert_different_registers(src, ptmp);
1739   assert_different_registers(dst, ptmp);
1740   sve_pfalse(ptmp);
1741   if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1742     sve_uzp1(dst, B, src, ptmp);
1743   } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1744     sve_uzp1(dst, H, src, ptmp);
1745     sve_uzp1(dst, B, dst, ptmp);
1746   } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1747     sve_uzp1(dst, S, src, ptmp);
1748     sve_uzp1(dst, H, dst, ptmp);
1749     sve_uzp1(dst, B, dst, ptmp);
1750   } else {
1751     assert(false, "unsupported");
1752     ShouldNotReachHere();
1753   }
1754 }
1755 
1756 // Vector reduction add for integral type with ASIMD instructions.
1757 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1758                                                  Register isrc, FloatRegister vsrc,
1759                                                  unsigned vector_length_in_bytes,
1760                                                  FloatRegister vtmp) {
1761   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1762   assert_different_registers(dst, isrc);
1763   bool isQ = vector_length_in_bytes == 16;
1764 
1765   BLOCK_COMMENT("neon_reduce_add_integral {");
1766     switch(bt) {
1767       case T_BYTE:
1768         addv(vtmp, isQ ? T16B : T8B, vsrc);
1769         smov(dst, vtmp, B, 0);
1770         addw(dst, dst, isrc, ext::sxtb);
1771         break;
1772       case T_SHORT:
1773         addv(vtmp, isQ ? T8H : T4H, vsrc);
1774         smov(dst, vtmp, H, 0);
1775         addw(dst, dst, isrc, ext::sxth);
1776         break;
1777       case T_INT:
1778         isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1779         umov(dst, vtmp, S, 0);
1780         addw(dst, dst, isrc);
1781         break;
1782       case T_LONG:
1783         assert(isQ, "unsupported");
1784         addpd(vtmp, vsrc);
1785         umov(dst, vtmp, D, 0);
1786         add(dst, dst, isrc);
1787         break;
1788       default:
1789         assert(false, "unsupported");
1790         ShouldNotReachHere();
1791     }
1792   BLOCK_COMMENT("} neon_reduce_add_integral");
1793 }
1794 
// Vector reduction multiply for integral type with ASIMD instructions.
// Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
// Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
                                                 Register isrc, FloatRegister vsrc,
                                                 unsigned vector_length_in_bytes,
                                                 FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_mul_integral {");
    switch(bt) {
      case T_BYTE:
        if (isQ) {
          // Multiply the lower half and higher half of vector iteratively.
          // vtmp1 = vsrc[8:15]
          ins(vtmp1, D, vsrc, 0, 1);
          // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
          mulv(vtmp1, T8B, vtmp1, vsrc);
          // vtmp2 = vtmp1[4:7]
          ins(vtmp2, S, vtmp1, 0, 1);
          // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
          mulv(vtmp1, T8B, vtmp2, vtmp1);
        } else {
          // vtmp1 = vsrc[4:7]
          ins(vtmp1, S, vsrc, 0, 1);
          // vtmp1[n] = vsrc[n] * vsrc[n + 4], where n=[0, 3]
          mulv(vtmp1, T8B, vtmp1, vsrc);
        }
        // vtmp2 = vtmp1[2:3]
        ins(vtmp2, H, vtmp1, 0, 1);
        // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
        mulv(vtmp2, T8B, vtmp2, vtmp1);
        // dst = vtmp2[0] * isrc * vtmp2[1], re-narrowed to a signed byte
        // value after each scalar multiply.
        umov(rscratch1, vtmp2, B, 0);
        mulw(dst, rscratch1, isrc);
        sxtb(dst, dst);
        umov(rscratch1, vtmp2, B, 1);
        mulw(dst, rscratch1, dst);
        sxtb(dst, dst);
        break;
      case T_SHORT:
        // Same iterative halving strategy as T_BYTE, on short lanes.
        if (isQ) {
          // vtmp2 = vsrc[4:7]
          ins(vtmp2, D, vsrc, 0, 1);
          // vtmp2[n] = vsrc[n] * vsrc[n + 4], where n=[0, 3]
          mulv(vtmp2, T4H, vtmp2, vsrc);
          // vtmp1 = vtmp2[2:3]
          ins(vtmp1, S, vtmp2, 0, 1);
          // vtmp1[n] = vtmp2[n] * vtmp2[n + 2], where n=[0, 1]
          mulv(vtmp1, T4H, vtmp1, vtmp2);
        } else {
          // vtmp1 = vsrc[2:3]
          ins(vtmp1, S, vsrc, 0, 1);
          // vtmp1[n] = vsrc[n] * vsrc[n + 2], where n=[0, 1]
          mulv(vtmp1, T4H, vtmp1, vsrc);
        }
        // dst = vtmp1[0] * isrc * vtmp1[1], re-narrowed to a signed short
        // value after each scalar multiply.
        umov(rscratch1, vtmp1, H, 0);
        mulw(dst, rscratch1, isrc);
        sxth(dst, dst);
        umov(rscratch1, vtmp1, H, 1);
        mulw(dst, rscratch1, dst);
        sxth(dst, dst);
        break;
      case T_INT:
        if (isQ) {
          // vtmp1 = vsrc[2:3]
          ins(vtmp1, D, vsrc, 0, 1);
          // vtmp1[n] = vsrc[n] * vsrc[n + 2], where n=[0, 1]
          mulv(vtmp1, T2S, vtmp1, vsrc);
        } else {
          // Only two lanes remain; read them straight from vsrc below.
          vtmp1 = vsrc;
        }
        // dst = vtmp1[0] * isrc * vtmp1[1]
        umov(rscratch1, vtmp1, S, 0);
        mul(dst, rscratch1, isrc);
        umov(rscratch1, vtmp1, S, 1);
        mul(dst, rscratch1, dst);
        break;
      case T_LONG:
        // Two 64-bit lanes: multiply through general registers.
        // dst = isrc * vsrc[0] * vsrc[1]
        umov(rscratch1, vsrc, D, 0);
        mul(dst, isrc, rscratch1);
        umov(rscratch1, vsrc, D, 1);
        mul(dst, dst, rscratch1);
        break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  BLOCK_COMMENT("} neon_reduce_mul_integral");
}
1875 
1876 // Vector reduction multiply for floating-point type with ASIMD instructions.
1877 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
1878                                            FloatRegister fsrc, FloatRegister vsrc,
1879                                            unsigned vector_length_in_bytes,
1880                                            FloatRegister vtmp) {
1881   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1882   bool isQ = vector_length_in_bytes == 16;
1883 
1884   BLOCK_COMMENT("neon_reduce_mul_fp {");
1885     switch(bt) {
1886       case T_FLOAT:
1887         fmuls(dst, fsrc, vsrc);
1888         ins(vtmp, S, vsrc, 0, 1);
1889         fmuls(dst, dst, vtmp);
1890         if (isQ) {
1891           ins(vtmp, S, vsrc, 0, 2);
1892           fmuls(dst, dst, vtmp);
1893           ins(vtmp, S, vsrc, 0, 3);
1894           fmuls(dst, dst, vtmp);
1895          }
1896         break;
1897       case T_DOUBLE:
1898         assert(isQ, "unsupported");
1899         fmuld(dst, fsrc, vsrc);
1900         ins(vtmp, D, vsrc, 0, 1);
1901         fmuld(dst, dst, vtmp);
1902         break;
1903       default:
1904         assert(false, "unsupported");
1905         ShouldNotReachHere();
1906     }
1907   BLOCK_COMMENT("} neon_reduce_mul_fp");
1908 }
1909 
1910 // Helper to select logical instruction
1911 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
1912                                                    Register Rn, Register Rm,
1913                                                    enum shift_kind kind, unsigned shift) {
1914   switch(opc) {
1915     case Op_AndReductionV:
1916       is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
1917       break;
1918     case Op_OrReductionV:
1919       is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
1920       break;
1921     case Op_XorReductionV:
1922       is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
1923       break;
1924     default:
1925       assert(false, "unsupported");
1926       ShouldNotReachHere();
1927   }
1928 }
1929 
1930 // Vector reduction logical operations And, Or, Xor
1931 // Clobbers: rscratch1
1932 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
1933                                             Register isrc, FloatRegister vsrc,
1934                                             unsigned vector_length_in_bytes) {
1935   assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
1936          "unsupported");
1937   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1938   assert_different_registers(dst, isrc);
1939   bool isQ = vector_length_in_bytes == 16;
1940 
1941   BLOCK_COMMENT("neon_reduce_logical {");
1942     umov(rscratch1, vsrc, isQ ? D : S, 0);
1943     umov(dst, vsrc, isQ ? D : S, 1);
1944     neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
1945     switch(bt) {
1946       case T_BYTE:
1947         if (isQ) {
1948           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1949         }
1950         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1951         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
1952         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1953         sxtb(dst, dst);
1954         break;
1955       case T_SHORT:
1956         if (isQ) {
1957           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1958         }
1959         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1960         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1961         sxth(dst, dst);
1962         break;
1963       case T_INT:
1964         if (isQ) {
1965           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1966         }
1967         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1968         break;
1969       case T_LONG:
1970         assert(isQ, "unsupported");
1971         neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
1972         break;
1973       default:
1974         assert(false, "unsupported");
1975         ShouldNotReachHere();
1976     }
1977   BLOCK_COMMENT("} neon_reduce_logical");
1978 }
1979 
1980 // Helper function to decode min/max reduction operation properties
1981 void C2_MacroAssembler::decode_minmax_reduction_opc(int opc, bool* is_min,
1982                                                     bool* is_unsigned,
1983                                                     Condition* cond) {
1984   switch(opc) {
1985     case Op_MinReductionV:
1986       *is_min = true;  *is_unsigned = false; *cond = LT; break;
1987     case Op_MaxReductionV:
1988       *is_min = false; *is_unsigned = false; *cond = GT; break;
1989     case Op_UMinReductionV:
1990       *is_min = true;  *is_unsigned = true;  *cond = LO; break;
1991     case Op_UMaxReductionV:
1992       *is_min = false; *is_unsigned = true;  *cond = HI; break;
1993     default:
1994       ShouldNotReachHere();
1995   }
1996 }
1997 
// Vector reduction min/max/umin/umax for integral type with ASIMD instructions.
// Note: vtmp is not used and expected to be fnoreg for T_LONG case.
// Clobbers: rscratch1, rflags
void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
                                                    Register isrc, FloatRegister vsrc,
                                                    unsigned vector_length_in_bytes,
                                                    FloatRegister vtmp) {
  assert(opc == Op_MinReductionV || opc == Op_MaxReductionV ||
         opc == Op_UMinReductionV || opc == Op_UMaxReductionV, "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;
  bool is_min;
  bool is_unsigned;
  Condition cond;
  decode_minmax_reduction_opc(opc, &is_min, &is_unsigned, &cond);
  BLOCK_COMMENT("neon_reduce_minmax_integral {");
    if (bt == T_LONG) {
      // The two 64-bit lanes are reduced through general registers with
      // compare-and-select, folding in the scalar input along the way.
      assert(vtmp == fnoreg, "should be");
      assert(isQ, "should be");
      umov(rscratch1, vsrc, D, 0);
      cmp(isrc, rscratch1);
      csel(dst, isrc, rscratch1, cond);
      umov(rscratch1, vsrc, D, 1);
      cmp(dst, rscratch1);
      csel(dst, dst, rscratch1, cond);
    } else {
      SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
      if (size == T2S) {
        // For T2S (2x32-bit elements), use pairwise instructions because
        // uminv/umaxv/sminv/smaxv don't support arrangement 2S.
        neon_minmaxp(is_unsigned, is_min, vtmp, size, vsrc, vsrc);
      } else {
        // For other sizes, use reduction to scalar instructions.
        neon_minmaxv(is_unsigned, is_min, vtmp, size, vsrc);
      }
      // Move the reduced lane into a general register. Signed sub-int types
      // are sign-extended (smov) so the 32-bit compare below is correct;
      // unsigned sub-int types and T_INT take a plain move (umov).
      if (bt == T_INT) {
        umov(dst, vtmp, S, 0);
      } else if (is_unsigned) {
        umov(dst, vtmp, elemType_to_regVariant(bt), 0);
      } else {
        smov(dst, vtmp, elemType_to_regVariant(bt), 0);
      }
      // Fold in the scalar input with compare-and-select.
      cmpw(dst, isrc);
      cselw(dst, dst, isrc, cond);
    }
  BLOCK_COMMENT("} neon_reduce_minmax_integral");
}
2047 
// Vector reduction for integral type with SVE instruction.
// Supported operations are Add, And, Or, Xor, Max, Min, UMax, UMin.
// rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
                                            FloatRegister src2, PRegister pg, FloatRegister tmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  assert(pg->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(src1, dst);
  // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  switch (opc) {
    case Op_AddReductionVI: {
      // Add across the predicated lanes, then fold in the scalar input.
      // Sub-int element types are sign-extended into the 32-bit add.
      sve_uaddv(tmp, size, pg, src2);
      if (bt == T_BYTE) {
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxtb);
      } else if (bt == T_SHORT) {
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxth);
      } else {
        umov(dst, tmp, size, 0);
        addw(dst, dst, src1);
      }
      break;
    }
    case Op_AddReductionVL: {
      sve_uaddv(tmp, size, pg, src2);
      umov(dst, tmp, size, 0);
      add(dst, dst, src1);
      break;
    }
    // For the three bitwise reductions below: reduce across the predicated
    // lanes, move the result to a general register (sign-extending signed
    // sub-int types), and combine with the scalar input at the matching
    // width.
    case Op_AndReductionV: {
      sve_andv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        andr(dst, dst, src1);
      } else {
        andw(dst, dst, src1);
      }
      break;
    }
    case Op_OrReductionV: {
      sve_orv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        orr(dst, dst, src1);
      } else {
        orrw(dst, dst, src1);
      }
      break;
    }
    case Op_XorReductionV: {
      sve_eorv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        eor(dst, dst, src1);
      } else {
        eorw(dst, dst, src1);
      }
      break;
    }
    case Op_MaxReductionV:
    case Op_MinReductionV:
    case Op_UMaxReductionV:
    case Op_UMinReductionV: {
      bool is_min;
      bool is_unsigned;
      Condition cond;
      decode_minmax_reduction_opc(opc, &is_min, &is_unsigned, &cond);
      sve_minmaxv(is_unsigned, is_min, tmp, size, pg, src2);
      // Move result from vector to general register
      if (is_unsigned || bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      // Fold in the scalar input with compare-and-select.
      if (bt == T_LONG) {
        cmp(dst, src1);
        csel(dst, dst, src1, cond);
      } else {
        cmpw(dst, src1);
        cselw(dst, dst, src1, cond);
      }
      break;
    }
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }

  // Normalize sub-int bitwise-reduction results to sign-extended 32-bit values.
  if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
    if (bt == T_BYTE) {
      sxtb(dst, dst);
    } else if (bt == T_SHORT) {
      sxth(dst, dst);
    }
  }
}
2158 
2159 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
2160 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
2161 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
2162 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
2163   uint32_t max_vector_length = Matcher::max_vector_size(bt);
2164   assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
2165 
2166   // Set all elements to false if the input "lane_cnt" is zero.
2167   if (lane_cnt == 0) {
2168     sve_pfalse(dst);
2169     return;
2170   }
2171 
2172   SIMD_RegVariant size = elemType_to_regVariant(bt);
2173   assert(size != Q, "invalid size");
2174 
2175   // Set all true if "lane_cnt" equals to the max lane count.
2176   if (lane_cnt == max_vector_length) {
2177     sve_ptrue(dst, size, /* ALL */ 0b11111);
2178     return;
2179   }
2180 
2181   // Fixed numbers for "ptrue".
2182   switch(lane_cnt) {
2183   case 1: /* VL1 */
2184   case 2: /* VL2 */
2185   case 3: /* VL3 */
2186   case 4: /* VL4 */
2187   case 5: /* VL5 */
2188   case 6: /* VL6 */
2189   case 7: /* VL7 */
2190   case 8: /* VL8 */
2191     sve_ptrue(dst, size, lane_cnt);
2192     return;
2193   case 16:
2194     sve_ptrue(dst, size, /* VL16 */ 0b01001);
2195     return;
2196   case 32:
2197     sve_ptrue(dst, size, /* VL32 */ 0b01010);
2198     return;
2199   case 64:
2200     sve_ptrue(dst, size, /* VL64 */ 0b01011);
2201     return;
2202   case 128:
2203     sve_ptrue(dst, size, /* VL128 */ 0b01100);
2204     return;
2205   case 256:
2206     sve_ptrue(dst, size, /* VL256 */ 0b01101);
2207     return;
2208   default:
2209     break;
2210   }
2211 
2212   // Special patterns for "ptrue".
2213   if (lane_cnt == round_down_power_of_2(max_vector_length)) {
2214     sve_ptrue(dst, size, /* POW2 */ 0b00000);
2215   } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
2216     sve_ptrue(dst, size, /* MUL4 */ 0b11101);
2217   } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
2218     sve_ptrue(dst, size, /* MUL3 */ 0b11110);
2219   } else {
2220     // Encode to "whileltw" for the remaining cases.
2221     mov(rscratch1, lane_cnt);
2222     sve_whileltw(dst, size, zr, rscratch1);
2223   }
2224 }
2225 
// Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
// Any remaining elements of dst will be filled with zero.
// Clobbers: rscratch1
// Preserves: mask, vzr
void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
                                           FloatRegister vzr, FloatRegister vtmp,
                                           PRegister pgtmp, unsigned vector_length_in_bytes) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  // When called by sve_compress_byte, src and vtmp may be the same register.
  assert_different_registers(dst, src, vzr);
  assert_different_registers(dst, vtmp, vzr);
  assert_different_registers(mask, pgtmp);
  // high <-- low
  // Example input:   src   = hh gg ff ee dd cc bb aa, one character is 8 bits.
  //                  mask  = 01 00 00 01 01 00 01 01, one character is 1 bit.
  // Expected result: dst   = 00 00 00 hh ee dd bb aa

  // Extend lowest half to type INT.
  // dst   =  00dd  00cc  00bb  00aa
  sve_uunpklo(dst, S, src);
  // pgtmp =  0001  0000  0001  0001
  sve_punpklo(pgtmp, mask);
  // Pack the active elements in size of type INT to the right,
  // and fill the remainings with zero.
  // dst   =  0000  00dd  00bb  00aa
  sve_compact(dst, S, dst, pgtmp);
  // Narrow the result back to type SHORT.
  // dst   = 00 00 00 00 00 dd bb aa
  sve_uzp1(dst, H, dst, vzr);

  // Return if the vector length is no more than MaxVectorSize/2, since the
  // highest half is invalid.
  if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
    return;
  }

  // Count the active elements of lowest half.
  // rscratch1 = 3
  // (Used below to position the compressed high half immediately after the
  // compressed low half.)
  sve_cntp(rscratch1, S, ptrue, pgtmp);

  // Repeat to the highest half.
  // pgtmp =  0001  0000  0000  0001
  sve_punpkhi(pgtmp, mask);
  // vtmp  =  00hh  00gg  00ff  00ee
  sve_uunpkhi(vtmp, S, src);
  // vtmp  =  0000  0000  00hh  00ee
  sve_compact(vtmp, S, vtmp, pgtmp);
  // vtmp  = 00 00 00 00 00 00 hh ee
  sve_uzp1(vtmp, H, vtmp, vzr);

  // pgtmp = 00 00 00 00 00 01 01 01
  sve_whilelt(pgtmp, H, zr, rscratch1);
  // Compressed low:  dst  = 00 00 00 00 00 dd bb aa
  // Compressed high: vtmp = 00 00 00 00 00 00 hh ee
  // Combine the compressed low with the compressed high:
  //                  dst  = 00 00 00 hh ee dd bb aa
  sve_splice(dst, H, pgtmp, vtmp);
}
2284 
// Clobbers: rscratch1, rscratch2
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
                                          FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
                                          PRegister ptmp, PRegister pgtmp, unsigned vector_length_in_bytes) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3);
  assert_different_registers(mask, ptmp, pgtmp);
  // high <-- low
  // Example input:   src   = q p n m l k j i h g f e d c b a, one character is 8 bits.
  //                  mask  = 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 1, one character is 1 bit.
  // Expected result: dst   = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
  FloatRegister vzr = vtmp3;
  sve_dup(vzr, B, 0);

  // Extend lowest half to type SHORT.
  // vtmp1 =  0h  0g  0f  0e  0d  0c  0b  0a
  sve_uunpklo(vtmp1, H, src);
  // ptmp  =  00  01  00  00  00  01  00  01
  sve_punpklo(ptmp, mask);
  // Pack the active elements in size of type SHORT to the right,
  // and fill the remainings with zero.
  // dst   =  00  00  00  00  00  0g  0c  0a
  // Widening to SHORT doubles the effective length; clamp the low-half
  // compress to MaxVectorSize.
  unsigned extended_size = vector_length_in_bytes << 1;
  sve_compress_short(dst, vtmp1, ptmp, vzr, vtmp2, pgtmp, extended_size > MaxVectorSize ? MaxVectorSize : extended_size);
  // Narrow the result back to type BYTE.
  // dst   = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
  sve_uzp1(dst, B, dst, vzr);

  // Return if the vector length is no more than MaxVectorSize/2, since the
  // highest half is invalid.
  if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
    return;
  }
  // Count the active elements of lowest half.
  // rscratch2 = 3
  sve_cntp(rscratch2, H, ptrue, ptmp);

  // Repeat to the highest half.
  // ptmp  =  00  01  00  00  00  00  00  01
  sve_punpkhi(ptmp, mask);
  // vtmp2 =  0q  0p  0n  0m  0l  0k  0j  0i
  sve_uunpkhi(vtmp2, H, src);
  // vtmp1 =  00  00  00  00  00  00  0p  0i
  // The high half covers the remaining extended bytes beyond MaxVectorSize.
  sve_compress_short(vtmp1, vtmp2, ptmp, vzr, vtmp2, pgtmp, extended_size - MaxVectorSize);
  // vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
  sve_uzp1(vtmp1, B, vtmp1, vzr);

  // ptmp  = 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
  sve_whilelt(ptmp, B, zr, rscratch2);
  // Compressed low:  dst   = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
  // Compressed high: vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
  // Combine the compressed low with the compressed high:
  //                  dst   = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
  sve_splice(dst, B, ptmp, vtmp1);
}
2341 
2342 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2343   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2344   SIMD_Arrangement size = isQ ? T16B : T8B;
2345   if (bt == T_BYTE) {
2346     rbit(dst, size, src);
2347   } else {
2348     neon_reverse_bytes(dst, src, bt, isQ);
2349     rbit(dst, size, dst);
2350   }
2351 }
2352 
2353 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2354   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2355   SIMD_Arrangement size = isQ ? T16B : T8B;
2356   switch (bt) {
2357     case T_BYTE:
2358       if (dst != src) {
2359         orr(dst, size, src, src);
2360       }
2361       break;
2362     case T_SHORT:
2363       rev16(dst, size, src);
2364       break;
2365     case T_INT:
2366       rev32(dst, size, src);
2367       break;
2368     case T_LONG:
2369       rev64(dst, size, src);
2370       break;
2371     default:
2372       assert(false, "unsupported");
2373       ShouldNotReachHere();
2374   }
2375 }
2376 
2377 // VectorRearrange implementation for short/int/float/long/double types with NEON
2378 // instructions. For VectorRearrange short/int/float, we use NEON tbl instruction.
2379 // But since it supports bytes table only, we need to lookup 2/4 bytes as a group.
2380 // For VectorRearrange long/double, we compare the shuffle input with iota indices,
2381 // and use bsl to implement the operation.
2382 void C2_MacroAssembler::neon_rearrange_hsd(FloatRegister dst, FloatRegister src,
2383                                            FloatRegister shuffle, FloatRegister tmp,
2384                                            BasicType bt, bool isQ) {
2385   assert_different_registers(dst, src, shuffle, tmp);
2386   SIMD_Arrangement size1 = isQ ? T16B : T8B;
2387   SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2388 
2389   // Here is an example that rearranges a NEON vector with 4 ints:
2390   // Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1]
2391   //   1. We assume the shuffle input is Vi int[2, 3, 0, 1].
2392   //   2. Multiply Vi int[2, 3, 0, 1] with constant int vector
2393   //      [0x04040404, 0x04040404, 0x04040404, 0x04040404], and get
2394   //      tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404].
2395   //   3. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100],
2396   //      and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504]
2397   //   4. Use Vm as index register, and use V1 as table register.
2398   //      Then get V2 as the result by tbl NEON instructions.
2399   switch (bt) {
2400     case T_SHORT:
2401       mov(tmp, size1, 0x02);
2402       mulv(dst, size2, shuffle, tmp);
2403       mov(tmp, size2, 0x0100);
2404       addv(dst, size1, dst, tmp);
2405       tbl(dst, size1, src, 1, dst);
2406       break;
2407     case T_INT:
2408     case T_FLOAT:
2409       mov(tmp, size1, 0x04);
2410       mulv(dst, size2, shuffle, tmp);
2411       mov(tmp, size2, 0x03020100);
2412       addv(dst, size1, dst, tmp);
2413       tbl(dst, size1, src, 1, dst);
2414       break;
2415     case T_LONG:
2416     case T_DOUBLE:
2417       // Load the iota indices for Long type. The indices are ordered by
2418       // type B/S/I/L/F/D, and the offset between two types is 16; Hence
2419       // the offset for L is 48.
2420       lea(rscratch1,
2421           ExternalAddress(StubRoutines::aarch64::vector_iota_indices() + 48));
2422       ldrq(tmp, rscratch1);
2423       // Check whether the input "shuffle" is the same with iota indices.
2424       // Return "src" if true, otherwise swap the two elements of "src".
2425       cm(EQ, dst, size2, shuffle, tmp);
2426       ext(tmp, size1, src, src, 8);
2427       bsl(dst, size1, src, tmp);
2428       break;
2429     default:
2430       assert(false, "unsupported element type");
2431       ShouldNotReachHere();
2432   }
2433 }
2434 
2435 // Extract a scalar element from an sve vector at position 'idx'.
2436 // The input elements in src are expected to be of integral type.
2437 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2438                                              int idx, FloatRegister vtmp) {
2439   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2440   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2441   if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
2442     if (bt == T_INT || bt == T_LONG) {
2443       umov(dst, src, size, idx);
2444     } else {
2445       smov(dst, src, size, idx);
2446     }
2447   } else {
2448     sve_orr(vtmp, src, src);
2449     sve_ext(vtmp, vtmp, idx << size);
2450     if (bt == T_INT || bt == T_LONG) {
2451       umov(dst, vtmp, size, 0);
2452     } else {
2453       smov(dst, vtmp, size, 0);
2454     }
2455   }
2456 }
2457 
2458 // java.lang.Math::round intrinsics
2459 
2460 // Clobbers: rscratch1, rflags
2461 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2462                                           FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
2463   assert_different_registers(tmp1, tmp2, tmp3, src, dst);
2464   switch (T) {
2465     case T2S:
2466     case T4S:
2467       fmovs(tmp1, T, 0.5f);
2468       mov(rscratch1, jint_cast(0x1.0p23f));
2469       break;
2470     case T2D:
2471       fmovd(tmp1, T, 0.5);
2472       mov(rscratch1, julong_cast(0x1.0p52));
2473       break;
2474     default:
2475       assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2476   }
2477   fadd(tmp1, T, tmp1, src);
2478   fcvtms(tmp1, T, tmp1);
2479   // tmp1 = floor(src + 0.5, ties to even)
2480 
2481   fcvtas(dst, T, src);
2482   // dst = round(src), ties to away
2483 
2484   fneg(tmp3, T, src);
2485   dup(tmp2, T, rscratch1);
2486   cm(HS, tmp3, T, tmp3, tmp2);
2487   // tmp3 is now a set of flags
2488 
2489   bif(dst, T16B, tmp1, tmp3);
2490   // result in dst
2491 }
2492 
2493 // Clobbers: rscratch1, rflags
2494 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2495                                          FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
2496   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2497   assert_different_registers(tmp1, tmp2, src, dst);
2498 
2499   switch (T) {
2500     case S:
2501       mov(rscratch1, jint_cast(0x1.0p23f));
2502       break;
2503     case D:
2504       mov(rscratch1, julong_cast(0x1.0p52));
2505       break;
2506     default:
2507       assert(T == S || T == D, "invalid register variant");
2508   }
2509 
2510   sve_frinta(dst, T, ptrue, src);
2511   // dst = round(src), ties to away
2512 
2513   Label none;
2514 
2515   sve_fneg(tmp1, T, ptrue, src);
2516   sve_dup(tmp2, T, rscratch1);
2517   sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
2518   br(EQ, none);
2519   {
2520     sve_cpy(tmp1, T, pgtmp, 0.5);
2521     sve_fadd(tmp1, T, pgtmp, src);
2522     sve_frintm(dst, T, pgtmp, tmp1);
2523     // dst = floor(src + 0.5, ties to even)
2524   }
2525   bind(none);
2526 
2527   sve_fcvtzs(dst, T, ptrue, dst, T);
2528   // result in dst
2529 }
2530 
2531 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2532                                            FloatRegister one, SIMD_Arrangement T) {
2533   assert_different_registers(dst, src, zero, one);
2534   assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2535 
2536   facgt(dst, T, src, zero);
2537   ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
2538   bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2539 }
2540 
// Vectorized signum for SVE: per lane, produce +/-1.0 for non-zero finite
// inputs and pass +/-0.0 and NaN through unchanged.
void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                          FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
    assert_different_registers(dst, src, zero, one, vtmp);
    assert(pgtmp->is_governing(), "This register has to be a governing predicate register");

    // Copy src into vtmp; vtmp is turned into the +/-1.0 candidate below.
    sve_orr(vtmp, src, src);
    sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise
    switch (T) {
    case S:
      sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
      sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
                                        // on the sign of the float value
      break;
    case D:
      // Same as above, but with the 64-bit sign-bit mask and double 1.0.
      sve_and(vtmp, T, min_jlong);
      sve_orr(vtmp, T, jlong_cast(1.0));
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
    }
    sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
                                       // Result in dst
}
2565 
2566 bool C2_MacroAssembler::in_scratch_emit_size() {
2567   if (ciEnv::current()->task() != nullptr) {
2568     PhaseOutput* phase_output = Compile::current()->output();
2569     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2570       return true;
2571     }
2572   }
2573   return MacroAssembler::in_scratch_emit_size();
2574 }
2575 
// Out-of-line failure handler for verify_int_in_range: invoked from generated
// code when a CastII value falls outside its type range; aborts the VM.
static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
  fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
}
2579 
// Emit a runtime check that the int in rval lies within [t->_lo, t->_hi].
// On failure, calls abort_verify_int_in_range, which terminates the VM.
// Clobbers: rtmp, rflags; the failure path also clobbers c_rarg0..c_rarg3.
void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register rval, Register rtmp) {
  assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
  if (t == TypeInt::INT) {
    // The unrestricted int type cannot fail the check; emit nothing.
    return;
  }

  BLOCK_COMMENT("verify_int_in_range {");
  Label L_success, L_failure;

  jint lo = t->_lo;
  jint hi = t->_hi;

  if (lo != min_jint) {
    // Fail if rval < lo (signed).
    subsw(rtmp, rval, lo);
    br(Assembler::LT, L_failure);
  }
  if (hi != max_jint) {
    // Fail if rval > hi (signed).
    subsw(rtmp, rval, hi);
    br(Assembler::GT, L_failure);
  }
  b(L_success);

  bind(L_failure);
  // Marshal (idx, val, lo, hi) into the C calling convention registers.
  movw(c_rarg0, idx);
  mov(c_rarg1, rval);
  movw(c_rarg2, lo);
  movw(c_rarg3, hi);
  // rfp is rebuilt when not preserved, so the aborting call can walk the stack.
  reconstruct_frame_pointer(rtmp);
  rt_call(CAST_FROM_FN_PTR(address, abort_verify_int_in_range), rtmp);
  // The call does not return; trap if it somehow does.
  hlt(0);

  bind(L_success);
  BLOCK_COMMENT("} verify_int_in_range");
}
2614 
// Out-of-line failure handler for verify_long_in_range: invoked from generated
// code when a CastLL value falls outside its type range; aborts the VM.
static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
  fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
}
2618 
// Emit a runtime check that the long in rval lies within [t->_lo, t->_hi].
// On failure, calls abort_verify_long_in_range, which terminates the VM.
// Clobbers: rtmp, rflags; the failure path also clobbers c_rarg0..c_rarg3.
void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register rval, Register rtmp) {
  assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
  if (t == TypeLong::LONG) {
    // The unrestricted long type cannot fail the check; emit nothing.
    return;
  }

  BLOCK_COMMENT("verify_long_in_range {");
  Label L_success, L_failure;

  jlong lo = t->_lo;
  jlong hi = t->_hi;

  if (lo != min_jlong) {
    // Fail if rval < lo (signed).
    subs(rtmp, rval, lo);
    br(Assembler::LT, L_failure);
  }
  if (hi != max_jlong) {
    // Fail if rval > hi (signed).
    subs(rtmp, rval, hi);
    br(Assembler::GT, L_failure);
  }
  b(L_success);

  bind(L_failure);
  // Marshal (idx, val, lo, hi) into the C calling convention registers.
  movw(c_rarg0, idx);
  mov(c_rarg1, rval);
  mov(c_rarg2, lo);
  mov(c_rarg3, hi);
  // rfp is rebuilt when not preserved, so the aborting call can walk the stack.
  reconstruct_frame_pointer(rtmp);
  rt_call(CAST_FROM_FN_PTR(address, abort_verify_long_in_range), rtmp);
  // The call does not return; trap if it somehow does.
  hlt(0);

  bind(L_success);
  BLOCK_COMMENT("} verify_long_in_range");
}
2653 
2654 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
2655   const int framesize = Compile::current()->output()->frame_size_in_bytes();
2656   if (PreserveFramePointer) {
2657     // frame pointer is valid
2658 #ifdef ASSERT
2659     // Verify frame pointer value in rfp.
2660     add(rtmp, sp, framesize - 2 * wordSize);
2661     Label L_success;
2662     cmp(rfp, rtmp);
2663     br(Assembler::EQ, L_success);
2664     stop("frame pointer mismatch");
2665     bind(L_success);
2666 #endif // ASSERT
2667   } else {
2668     add(rfp, sp, framesize - 2 * wordSize);
2669   }
2670 }
2671 
2672 // Selects elements from two source vectors (src1, src2) based on index values in the index register
2673 // using Neon instructions and places it in the destination vector element corresponding to the
2674 // index vector element. Each index in the index register must be in the range - [0, 2 * NUM_ELEM),
2675 // where NUM_ELEM is the number of BasicType elements per vector.
2676 // If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
// Otherwise, selects src2[idx - NUM_ELEM]
2678 void C2_MacroAssembler::select_from_two_vectors_neon(FloatRegister dst, FloatRegister src1,
2679                                                      FloatRegister src2, FloatRegister index,
2680                                                      FloatRegister tmp, unsigned vector_length_in_bytes) {
2681   assert_different_registers(dst, src1, src2, tmp);
2682   SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
2683 
2684   if (vector_length_in_bytes == 16) {
2685     assert(UseSVE <= 1, "sve must be <= 1");
2686     assert(src1->successor() == src2, "Source registers must be ordered");
2687     // If the vector length is 16B, then use the Neon "tbl" instruction with two vector table
2688     tbl(dst, size, src1, 2, index);
2689   } else { // vector length == 8
2690     assert(UseSVE == 0, "must be Neon only");
2691     // We need to fit both the source vectors (src1, src2) in a 128-bit register because the
2692     // Neon "tbl" instruction supports only looking up 16B vectors. We then use the Neon "tbl"
2693     // instruction with one vector lookup
2694     ins(tmp, D, src1, 0, 0);
2695     ins(tmp, D, src2, 1, 0);
2696     tbl(dst, size, tmp, 1, index);
2697   }
2698 }
2699 
2700 // Selects elements from two source vectors (src1, src2) based on index values in the index register
2701 // using SVE/SVE2 instructions and places it in the destination vector element corresponding to the
2702 // index vector element. Each index in the index register must be in the range - [0, 2 * NUM_ELEM),
2703 // where NUM_ELEM is the number of BasicType elements per vector.
2704 // If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
// Otherwise, selects src2[idx - NUM_ELEM]
2706 void C2_MacroAssembler::select_from_two_vectors_sve(FloatRegister dst, FloatRegister src1,
2707                                                     FloatRegister src2, FloatRegister index,
2708                                                     FloatRegister tmp, SIMD_RegVariant T,
2709                                                     unsigned vector_length_in_bytes) {
2710   assert_different_registers(dst, src1, src2, index, tmp);
2711 
2712   if (vector_length_in_bytes == 8) {
2713     // We need to fit both the source vectors (src1, src2) in a single vector register because the
2714     // SVE "tbl" instruction is unpredicated and works on the entire vector which can lead to
2715     // incorrect results if each source vector is only partially filled. We then use the SVE "tbl"
2716     // instruction with one vector lookup
2717     assert(UseSVE >= 1, "sve must be >= 1");
2718     ins(tmp, D, src1, 0, 0);
2719     ins(tmp, D, src2, 1, 0);
2720     sve_tbl(dst, T, tmp, index);
2721   } else {  // UseSVE == 2 and vector_length_in_bytes > 8
2722     // If the vector length is > 8, then use the SVE2 "tbl" instruction with the two vector table.
2723     // The assertion - vector_length_in_bytes == MaxVectorSize ensures that this operation
2724     // is not executed on machines where vector_length_in_bytes < MaxVectorSize
2725     // with the only exception of 8B vector length.
2726     assert(UseSVE == 2 && vector_length_in_bytes == MaxVectorSize, "must be");
2727     assert(src1->successor() == src2, "Source registers must be ordered");
2728     sve_tbl(dst, T, src1, src2, index);
2729   }
2730 }
2731 
2732 void C2_MacroAssembler::select_from_two_vectors(FloatRegister dst, FloatRegister src1,
2733                                                 FloatRegister src2, FloatRegister index,
2734                                                 FloatRegister tmp, BasicType bt,
2735                                                 unsigned vector_length_in_bytes) {
2736 
2737   assert_different_registers(dst, src1, src2, index, tmp);
2738 
2739   // The cases that can reach this method are -
2740   // - UseSVE = 0/1, vector_length_in_bytes = 8 or 16, excluding double and long types
2741   // - UseSVE = 2, vector_length_in_bytes >= 8, for all types
2742   //
2743   // SVE/SVE2 tbl instructions are generated when UseSVE = 1 with vector_length_in_bytes = 8
2744   // and UseSVE = 2 with vector_length_in_bytes >= 8
2745   //
2746   // Neon instructions are generated when UseSVE = 0 with vector_length_in_bytes = 8 or 16 and
2747   // UseSVE = 1 with vector_length_in_bytes = 16
2748 
2749   if ((UseSVE == 1 && vector_length_in_bytes == 8) || UseSVE == 2) {
2750     SIMD_RegVariant T = elemType_to_regVariant(bt);
2751     select_from_two_vectors_sve(dst, src1, src2, index, tmp, T, vector_length_in_bytes);
2752     return;
2753   }
2754 
2755   // The only BasicTypes that can reach here are T_SHORT, T_BYTE, T_INT and T_FLOAT
2756   assert(bt != T_DOUBLE && bt != T_LONG, "unsupported basic type");
2757   assert(vector_length_in_bytes <= 16, "length_in_bytes must be <= 16");
2758 
2759   bool isQ = vector_length_in_bytes == 16;
2760 
2761   SIMD_Arrangement size1 = isQ ? T16B : T8B;
2762   SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2763 
2764   // Neon "tbl" instruction only supports byte tables, so we need to look at chunks of
2765   // 2B for selecting shorts or chunks of 4B for selecting ints/floats from the table.
2766   // The index values in "index" register are in the range of [0, 2 * NUM_ELEM) where NUM_ELEM
2767   // is the number of elements that can fit in a vector. For ex. for T_SHORT with 64-bit vector length,
2768   // the indices can range from [0, 8).
2769   // As an example with 64-bit vector length and T_SHORT type - let index = [2, 5, 1, 0]
2770   // Move a constant 0x02 in every byte of tmp - tmp = [0x0202, 0x0202, 0x0202, 0x0202]
2771   // Multiply index vector with tmp to yield - dst = [0x0404, 0x0a0a, 0x0202, 0x0000]
2772   // Move a constant 0x0100 in every 2B of tmp - tmp = [0x0100, 0x0100, 0x0100, 0x0100]
2773   // Add the multiplied result to the vector in tmp to obtain the byte level
2774   // offsets - dst = [0x0504, 0x0b0a, 0x0302, 0x0100]
2775   // Use these offsets in the "tbl" instruction to select chunks of 2B.
2776 
2777   if (bt == T_BYTE) {
2778     select_from_two_vectors_neon(dst, src1, src2, index, tmp, vector_length_in_bytes);
2779   } else {
2780     int elem_size = (bt == T_SHORT) ? 2 : 4;
2781     uint64_t tbl_offset = (bt == T_SHORT) ? 0x0100u : 0x03020100u;
2782 
2783     mov(tmp, size1, elem_size);
2784     mulv(dst, size2, index, tmp);
2785     mov(tmp, size2, tbl_offset);
2786     addv(dst, size1, dst, tmp); // "dst" now contains the processed index elements
2787                                 // to select a set of 2B/4B
2788     select_from_two_vectors_neon(dst, src1, src2, dst, tmp, vector_length_in_bytes);
2789   }
2790 }
2791 
2792 // Vector expand implementation. Elements from the src vector are expanded into
2793 // the dst vector under the control of the vector mask.
2794 // Since there are no native instructions directly corresponding to expand before
2795 // SVE2p2, the following implementations mainly leverages the TBL instruction to
2796 // implement expand. To compute the index input for TBL, the prefix sum algorithm
2797 // (https://en.wikipedia.org/wiki/Prefix_sum) is used. The same algorithm is used
2798 // for NEON and SVE, but with different instructions where appropriate.
2799 
2800 // Vector expand implementation for NEON.
2801 //
2802 // An example of 128-bit Byte vector:
2803 //   Data direction: high <== low
2804 //   Input:
2805 //         src   = g  f  e  d  c  b  a  9  8  7  6  5  4  3  2  1
2806 //         mask  = 0  0 -1 -1  0  0 -1 -1  0  0 -1 -1  0  0 -1 -1
2807 //   Expected result:
2808 //         dst   = 0  0  8  7  0  0  6  5  0  0  4  3  0  0  2  1
// Parameters: dst receives the expanded vector, src holds the packed source
// elements, mask has all bits set in lanes that receive an element, tmp1/tmp2
// are scratch. Clobbers tmp1 and tmp2.
void C2_MacroAssembler::vector_expand_neon(FloatRegister dst, FloatRegister src, FloatRegister mask,
                                           FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
                                           int vector_length_in_bytes) {
  assert(vector_length_in_bytes <= 16, "the vector length in bytes for NEON must be <= 16");
  assert_different_registers(dst, src, mask, tmp1, tmp2);
  // Since the TBL instruction only supports byte table, we need to
  // compute indices in byte type for all types.
  SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
  // tmp1 =  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  dup(tmp1, size, zr);
  // Negate turns the -1 (all-ones) mask lanes into +1 counters.
  // dst  =  0  0  1  1  0  0  1  1  0  0  1  1  0  0  1  1
  negr(dst, size, mask);
  // Calculate vector index for TBL with prefix sum algorithm.
  // Each round shifts the partial sums up by i bytes (zero-filling from
  // tmp1) and accumulates, doubling the summed span per iteration.
  // dst  =  8  8  8  7  6  6  6  5  4  4  4  3  2  2  2  1
  for (int i = 1; i < vector_length_in_bytes; i <<= 1) {
    ext(tmp2, size, tmp1, dst, vector_length_in_bytes - i);
    addv(dst, size, tmp2, dst);
  }
  // tmp2 =  0  0 -1 -1  0  0 -1 -1  0  0 -1 -1  0  0 -1 -1
  orr(tmp2, size, mask, mask);
  // Keep the prefix sums only in masked lanes; unmasked lanes become 0.
  // tmp2 =  0  0  8  7  0  0  6  5  0  0  4  3  0  0  2  1
  bsl(tmp2, size, dst, tmp1);
  // tmp1 =  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  movi(tmp1, size, 1);
  // Convert 1-based prefix sums to 0-based TBL indices; unmasked lanes
  // become -1 (0xff), which is out of range for TBL and thus yields zero.
  // dst  = -1 -1  7  6 -1 -1  5  4 -1 -1  3  2 -1 -1  1  0
  subv(dst, size, tmp2, tmp1);
  // dst  =  0  0  8  7  0  0  6  5  0  0  4  3  0  0  2  1
  tbl(dst, size, src, 1, dst);
}
2838 
2839 // Vector expand implementation for SVE.
2840 //
2841 // An example of 128-bit Short vector:
2842 //   Data direction: high <== low
2843 //   Input:
2844 //         src   = gf ed cb a9 87 65 43 21
2845 //         pg    = 00 01 00 01 00 01 00 01
2846 //   Expected result:
2847 //         dst   = 00 87 00 65 00 43 00 21
// Parameters: dst receives the expanded vector, src holds the packed source
// elements, pg is the predicate selecting lanes that receive an element,
// tmp1/tmp2 are scratch. Clobbers tmp1 and tmp2.
void C2_MacroAssembler::vector_expand_sve(FloatRegister dst, FloatRegister src, PRegister pg,
                                          FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
                                          int vector_length_in_bytes) {
  assert(UseSVE > 0, "expand implementation only for SVE");
  assert_different_registers(dst, src, tmp1, tmp2);
  SIMD_RegVariant size = elemType_to_regVariant(bt);

  // tmp1 = 00 00 00 00 00 00 00 00
  sve_dup(tmp1, size, 0);
  sve_movprfx(tmp2, tmp1);
  // Merging cpy: write 1 into active lanes over the zeroed tmp2.
  // tmp2 = 00 01 00 01 00 01 00 01
  sve_cpy(tmp2, size, pg, 1, true);
  // Calculate vector index for TBL with prefix sum algorithm.
  // Each round shifts the partial sums up by i bytes (zero-filling from the
  // zeroed register via movprfx) and accumulates, doubling the summed span.
  // tmp2 = 04 04 03 03 02 02 01 01
  for (int i = type2aelembytes(bt); i < vector_length_in_bytes; i <<= 1) {
    sve_movprfx(dst, tmp1);
    // The EXT instruction operates on the full-width sve register. The correct
    // index calculation method is:
    // vector_length_in_bytes - i + MaxVectorSize - vector_length_in_bytes =>
    // MaxVectorSize - i.
    sve_ext(dst, tmp2, MaxVectorSize - i);
    sve_add(tmp2, size, dst, tmp2);
  }
  // Keep the prefix sums only in active lanes; inactive lanes become 0.
  // dst  = 00 04 00 03 00 02 00 01
  sve_sel(dst, size, pg, tmp2, tmp1);
  // Convert 1-based prefix sums to 0-based TBL indices; inactive lanes
  // become -1, which is out of range for TBL and thus yields zero.
  // dst  = -1 03 -1 02 -1 01 -1 00
  sve_sub(dst, size, 1);
  // dst  = 00 87 00 65 00 43 00 21
  sve_tbl(dst, size, src, dst);
}
2878 
2879 // Optimized SVE cpy (imm, zeroing) instruction.
2880 //
2881 // `movi; cpy(imm, merging)` and `cpy(imm, zeroing)` have the same
2882 // functionality, but test results show that `movi; cpy(imm, merging)` has
2883 // higher throughput on some microarchitectures. This would depend on
2884 // microarchitecture and so may vary between implementations.
2885 void C2_MacroAssembler::sve_cpy(FloatRegister dst, SIMD_RegVariant T,
2886                                 PRegister pg, int imm8, bool isMerge) {
2887   if (VM_Version::prefer_sve_merging_mode_cpy() && !isMerge) {
2888     // Generates a NEON instruction `movi V<dst>.2d, #0`.
2889     // On AArch64, Z and V registers alias in the low 128 bits, so V<dst> is
2890     // the low 128 bits of Z<dst>. A write to V<dst> also clears all bits of
2891     // Z<dst> above 128, so this `movi` instruction effectively zeroes the
2892     // entire Z<dst> register. According to the Arm Software Optimization
2893     // Guide, `movi` is zero latency.
2894     movi(dst, T2D, 0);
2895     isMerge = true;
2896   }
2897   Assembler::sve_cpy(dst, T, pg, imm8, isMerge);
2898 }