1 /*
   2  * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright 2026 Arm Limited and/or its affiliates.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "opto/c2_MacroAssembler.hpp"
  29 #include "opto/compile.hpp"
  30 #include "opto/intrinsicnode.hpp"
  31 #include "opto/matcher.hpp"
  32 #include "opto/output.hpp"
  33 #include "opto/subnode.hpp"
  34 #include "runtime/objectMonitorTable.hpp"
  35 #include "runtime/stubRoutines.hpp"
  36 #include "runtime/synchronizer.hpp"
  37 #include "utilities/globalDefinitions.hpp"
  38 #include "utilities/powerOfTwo.hpp"
  39 
  40 #ifdef PRODUCT
     // PRODUCT builds: BLOCK_COMMENT expands to nothing and STOP is a plain stop().
  41 #define BLOCK_COMMENT(str) /* nothing */
  42 #define STOP(error) stop(error)
  43 #else
     // Non-PRODUCT builds: BLOCK_COMMENT forwards to block_comment(), and STOP records the
     // message with block_comment() before calling stop().
     // NOTE: this STOP expands to two statements, so it must not be used as the sole body of
     // an unbraced if/else.
  44 #define BLOCK_COMMENT(str) block_comment(str)
  45 #define STOP(error) block_comment(error); stop(error)
  46 #endif
  47 
     // Binds `label` at the current position and passes the stringized label name followed by
     // ":" to BLOCK_COMMENT. Like the non-PRODUCT STOP, this expands to two statements.
  48 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  49 
     // Pointer to a MacroAssembler member function taking (Register, const Address&). Used by
     // string_indexof to pick a load instruction (ldrb/ldrh/ldrw/ldr) per string encoding.
  50 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
  51 
  52 // jdk.internal.util.ArraysSupport.vectorizedHashCode
     //
     // Emits code that folds `cnt` elements of type `eltype`, read from `ary`, into `result`
     // as result = result * 31 + element. `result` is read before it is written, so it must
     // hold the initial hash value on entry.
     //
     // Arrays with cnt >= large_threshold (unsigned compare) are handed to the
     // StubRoutines::aarch64::large_arrays_hashcode(eltype) stub through a trampoline call.
     // Shorter arrays use the inline scalar loop, which is unrolled unroll_factor times and is
     // entered through a computed branch so that the first pass handles cnt % unroll_factor
     // elements.
     //
     // The inline path clobbers rscratch1 and rscratch2, post-increments `ary` and decrements
     // `cnt`.
     // NOTE(review): the vdata*/vmul*/vpow/vpowm registers are not referenced by the code
     // emitted here; presumably they are consumed by ARRAYS_HASHCODE_REGISTERS and/or describe
     // what the stub uses - confirm.
     //
     // Returns pc() after the emitted code, or nullptr if trampoline_call() returned nullptr.
  53 address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
  54                                            FloatRegister vdata0, FloatRegister vdata1,
  55                                            FloatRegister vdata2, FloatRegister vdata3,
  56                                            FloatRegister vmul0, FloatRegister vmul1,
  57                                            FloatRegister vmul2, FloatRegister vmul3,
  58                                            FloatRegister vpow, FloatRegister vpowm,
  59                                            BasicType eltype) {
  60   ARRAYS_HASHCODE_REGISTERS;
  61 
  62   Register tmp1 = rscratch1, tmp2 = rscratch2;
  63 
  64   Label TAIL, LOOP, BR_BASE, LARGE, DONE;
  65 
  66   // Vectorization factor. Number of array elements loaded to one SIMD&FP register by the stubs. We
  67   // use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible to
  68   // use 4H for chars and shorts instead, but using 8H gives better performance.
  69   const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8
  70                     : eltype == T_CHAR || eltype == T_SHORT ? 8
  71                     : eltype == T_INT                       ? 4
  72                                                             : 0;
  73   guarantee(vf, "unsupported eltype");
  74 
  75   // Unroll factor for the scalar loop below. The value is chosen based on performance analysis.
  76   const size_t unroll_factor = 4;
  77 
  78   switch (eltype) {
  79   case T_BOOLEAN:
  80     BLOCK_COMMENT("arrays_hashcode(unsigned byte) {");
  81     break;
  82   case T_CHAR:
  83     BLOCK_COMMENT("arrays_hashcode(char) {");
  84     break;
  85   case T_BYTE:
  86     BLOCK_COMMENT("arrays_hashcode(byte) {");
  87     break;
  88   case T_SHORT:
  89     BLOCK_COMMENT("arrays_hashcode(short) {");
  90     break;
  91   case T_INT:
  92     BLOCK_COMMENT("arrays_hashcode(int) {");
  93     break;
  94   default:
  95     ShouldNotReachHere();
  96   }
  97 
  98   // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop
  99   // implemented by the stub executes just once. Call the stub only if at least two iterations will
 100   // be executed.
 101   const size_t large_threshold = eltype == T_INT ? vf * 2 : vf;
 102   cmpw(cnt, large_threshold);
 103   br(Assembler::HS, LARGE);
 104 
 105   bind(TAIL);
 106 
 107   // The andr performs cnt % uf where uf = unroll_factor. The subtract shifted by 3 offsets past
 108   // uf - (cnt % uf) pairs of load + madd insns i.e. it only executes cnt % uf load + madd pairs.
 109   // Iteration eats up the remainder, uf elements at a time.
 110   assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC");
 111   andr(tmp2, cnt, unroll_factor - 1);
 112   adr(tmp1, BR_BASE);
 113   // For Cortex-A53 offset is 4 because 2 nops are generated.
 114   sub(tmp1, tmp1, tmp2, ext::sxtw, VM_Version::supports_a53mac() ? 4 : 3);
       // tmp2 now becomes the multiplier (31) used by every maddw below.
 115   movw(tmp2, 0x1f);
 116   br(tmp1);
 117 
 118   bind(LOOP);
 119   for (size_t i = 0; i < unroll_factor; ++i) {
 120     load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype);
 121     maddw(result, result, tmp2, tmp1);
 122     // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
 123     // Generate 2nd nop to have 4 instructions per iteration.
 124     if (VM_Version::supports_a53mac()) {
 125       nop();
 126     }
 127   }
 128   bind(BR_BASE);
       // Keep looping while at least unroll_factor elements remained before the subtraction.
 129   subsw(cnt, cnt, unroll_factor);
 130   br(Assembler::HS, LOOP);
 131 
 132   b(DONE);
 133 
 134   bind(LARGE);
 135 
 136   RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype));
 137   assert(stub.target() != nullptr, "array_hashcode stub has not been generated");
 138   address tpc = trampoline_call(stub);
 139   if (tpc == nullptr) {
         // DONE has been branched to above but is not bound yet, so it has to be reset before
         // the labels go out of scope, matching what string_indexof does on its failure path.
 140     DEBUG_ONLY(reset_labels(TAIL, BR_BASE, DONE));
 141     postcond(pc() == badAddress);
 142     return nullptr;
 143   }
 144 
 145   bind(DONE);
 146 
 147   BLOCK_COMMENT("} // arrays_hashcode");
 148 
 149   postcond(pc() != badAddress);
 150   return pc();
 151 }
 152 
 153 void C2_MacroAssembler::fast_lock(Register obj, Register box, Register t1,
 154                                   Register t2, Register t3) {
       // Emits the inline fast path for locking `obj`.
       //   obj        - object to lock
       //   box        - BasicLock; only accessed when UseObjectMonitorTable is set, where its
       //                monitor-cache slot is cleared up front and filled in after an inflated
       //                monitor has been acquired
       //   t1, t2, t3 - temporaries, clobbered; rscratch2 is used as a further temporary
       // The outcome is reported in the condition flags: EQ = locked, NE = take the slow path.
 155   assert_different_registers(obj, box, t1, t2, t3, rscratch2);
 156 
 157   // Handle inflated monitor.
 158   Label inflated;
 159   // Finish fast lock successfully. MUST branch to with flag == EQ
 160   Label locked;
 161   // Finish fast lock unsuccessfully. MUST branch to with flag == NE
 162   Label slow_path;
 163 
 164   if (UseObjectMonitorTable) {
 165     // Clear cache in case fast locking succeeds or we need to take the slow-path.
 166     str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 167   }
 168 
 169   if (DiagnoseSyncOnValueBasedClasses != 0) {
         // Objects whose klass has the value-based flag set always go to the slow path.
 170     load_klass(t1, obj);
 171     ldrb(t1, Address(t1, Klass::misc_flags_offset()));
 172     tst(t1, KlassFlags::_misc_is_value_based_class);
 173     br(Assembler::NE, slow_path);
 174   }
 175 
 176   const Register t1_mark = t1;
 177   const Register t3_t = t3;
 178 
 179   { // Fast locking
 180 
 181     // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ
 182     Label push;
 183 
 184     const Register t2_top = t2;
 185 
 186     // Check if lock-stack is full.
 187     ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 188     cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
 189     br(Assembler::GT, slow_path);
 190 
 191     // Check if recursive.
         // The entry just below top is compared with obj; on a match only a push is needed.
 192     subw(t3_t, t2_top, oopSize);
 193     ldr(t3_t, Address(rthread, t3_t));
 194     cmp(obj, t3_t);
 195     br(Assembler::EQ, push);
 196 
 197     // Relaxed normal load to check for monitor. Optimization for monitor case.
 198     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 199     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 200 
 201     // Not inflated
 202     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");
 203 
 204     // Try to lock. Transition lock-bits 0b01 => 0b00
         // Expected value: the mark with the unlocked bit set. New value: same mark, bit cleared.
 205     orr(t1_mark, t1_mark, markWord::unlocked_value);
 206     eor(t3_t, t1_mark, markWord::unlocked_value);
 207     cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword, memory_order_acquire);
 208     br(Assembler::NE, slow_path);
 209 
 210     bind(push);
 211     // After successful lock, push object on lock-stack.
 212     str(obj, Address(rthread, t2_top));
 213     addw(t2_top, t2_top, oopSize);
 214     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 215     b(locked);
 216   }
 217 
 218   { // Handle inflated monitor.
 219     bind(inflated);
 220 
 221     const Register t1_monitor = t1;
 222 
 223     if (!UseObjectMonitorTable) {
           // t1 still holds the mark word; the tag is compensated for via monitor_tag below.
 224       assert(t1_monitor == t1_mark, "should be the same here");
 225     } else {
 226       const Register t1_hash = t1;
 227       Label monitor_found;
 228 
 229       // Save the mark, we might need it to extract the hash.
 230       mov(t3, t1_mark);
 231 
 232       // Look for the monitor in the om_cache.
 233 
           // Unrolled at code-generation time: one load/load/compare/branch group per cache entry.
 234       ByteSize cache_offset   = JavaThread::om_cache_oops_offset();
 235       ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
 236       const int num_unrolled  = OMCache::CAPACITY;
 237       for (int i = 0; i < num_unrolled; i++) {
 238         ldr(t1_monitor, Address(rthread, cache_offset + monitor_offset));
 239         ldr(t2, Address(rthread, cache_offset));
 240         cmp(obj, t2);
 241         br(Assembler::EQ, monitor_found);
 242         cache_offset = cache_offset + OMCache::oop_to_oop_difference();
 243       }
 244 
 245       // Look for the monitor in the table.
 246 
 247       // Get the hash code.
 248       ubfx(t1_hash, t3, markWord::hash_shift, markWord::hash_bits);
 249 
 250       // Get the table and calculate the bucket's address
           // Bucket index = hash & capacity mask.
 251       lea(t3, ExternalAddress(ObjectMonitorTable::current_table_address()));
 252       ldr(t3, Address(t3));
 253       ldr(t2, Address(t3, ObjectMonitorTable::table_capacity_mask_offset()));
 254       ands(t1_hash, t1_hash, t2);
 255       ldr(t3, Address(t3, ObjectMonitorTable::table_buckets_offset()));
 256 
 257       // Read the monitor from the bucket.
 258       ldr(t1_monitor, Address(t3, t1_hash, Address::lsl(LogBytesPerWord)));
 259 
 260       // Check if the monitor in the bucket is special (empty, tombstone or removed).
           // LO implies the operands differ, so slow_path is reached with flags == NE.
 261       cmp(t1_monitor, (unsigned char)ObjectMonitorTable::SpecialPointerValues::below_is_special);
 262       br(Assembler::LO, slow_path);
 263 
 264       // Check if object matches.
 265       ldr(t3, Address(t1_monitor, ObjectMonitor::object_offset()));
 266       BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 267       bs_asm->try_peek_weak_handle_in_nmethod(this, t3, t3, t2, slow_path);
 268       cmp(t3, obj);
 269       br(Assembler::NE, slow_path);
 270 
 271       bind(monitor_found);
 272     }
 273 
 274     const Register t2_owner_addr = t2;
 275     const Register t3_owner = t3;
         // monitor_tag is markWord::monitor_value when t1_monitor is the raw mark word and 0 when
         // it came from the cache/table; it is subtracted from the field offsets below.
 276     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 277     const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
 278     const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);
 279 
 280     Label monitor_locked;
 281 
 282     // Compute owner address.
 283     lea(t2_owner_addr, owner_address);
 284 
 285     // Try to CAS owner (no owner => current thread's _monitor_owner_id).
 286     ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
 287     cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, memory_order_acquire, t3_owner);
 288     br(Assembler::EQ, monitor_locked);
 289 
 290     // Check if recursive.
         // t3_owner is the register passed to cmpxchg above; compare it with our own owner id.
 291     cmp(t3_owner, rscratch2);
 292     br(Assembler::NE, slow_path);
 293 
 294     // Recursive.
 295     increment(recursions_address, 1);
 296 
 297     bind(monitor_locked);
 298     if (UseObjectMonitorTable) {
 299       str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 300     }
 301   }
 302 
       // Without ASSERT no instruction is emitted between `locked` and `slow_path`: both labels
       // end up at the same position and only the flags tell the two outcomes apart.
 303   bind(locked);
 304 
 305 #ifdef ASSERT
 306   // Check that locked label is reached with Flags == EQ.
 307   Label flag_correct;
 308   br(Assembler::EQ, flag_correct);
 309   stop("Fast Lock Flag != EQ");
 310 #endif
 311 
 312   bind(slow_path);
 313 #ifdef ASSERT
 314   // Check that slow_path label is reached with Flags == NE.
 315   br(Assembler::NE, flag_correct);
 316   stop("Fast Lock Flag != NE");
 317   bind(flag_correct);
 318 #endif
 319   // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
 320 }
 321 
 322 void C2_MacroAssembler::fast_unlock(Register obj, Register box, Register t1,
 323                                     Register t2, Register t3) {
       // Emits the inline fast path for unlocking `obj`.
       //   obj        - object to unlock
       //   box        - BasicLock; only read when UseObjectMonitorTable is set, to fetch the
       //                cached ObjectMonitor pointer
       //   t1, t2, t3 - temporaries, clobbered; rscratch1 is used as a further temporary in the
       //                inflated path
       // The outcome is reported in the condition flags: EQ = unlocked, NE = take the slow path.
 324   assert_different_registers(obj, box, t1, t2, t3);
 325 
 326   // Handle inflated monitor.
 327   Label inflated, inflated_load_mark;
 328   // Finish fast unlock successfully. MUST branch to with flag == EQ
 329   Label unlocked;
 330   // Finish fast unlock unsuccessfully. MUST branch to with flag == NE
 331   Label slow_path;
 332 
 333   const Register t1_mark = t1;
 334   const Register t2_top = t2;
 335   const Register t3_t = t3;
 336 
 337   { // Fast unlock
 338 
 339     Label push_and_slow_path;
 340 
 341     // Check if obj is top of lock-stack.
 342     ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 343     subw(t2_top, t2_top, oopSize);
 344     ldr(t3_t, Address(rthread, t2_top));
 345     cmp(obj, t3_t);
 346     // Top of lock stack was not obj. Must be monitor.
 347     br(Assembler::NE, inflated_load_mark);
 348 
 349     // Pop lock-stack.
         // Debug builds additionally zero the popped slot.
 350     DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
 351     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 352 
 353     // Check if recursive.
 354     subw(t3_t, t2_top, oopSize);
 355     ldr(t3_t, Address(rthread, t3_t));
 356     cmp(obj, t3_t);
 357     br(Assembler::EQ, unlocked);
 358 
 359     // Not recursive.
 360     // Load Mark.
 361     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 362 
 363     // Check header for monitor (0b10).
 364     // Because we got here by popping (meaning we pushed in locked)
 365     // there will be no monitor in the box. So we need to push back the obj
 366     // so that the runtime can fix any potential anonymous owner.
 367     tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);
 368 
 369     // Try to unlock. Transition lock bits 0b00 => 0b01
 370     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
 371     orr(t3_t, t1_mark, markWord::unlocked_value);
 372     cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword, memory_order_release);
 373     br(Assembler::EQ, unlocked);
 374 
 375     bind(push_and_slow_path);
 376     // Compare and exchange failed.
         // (Also reached from the tbnz above when UseObjectMonitorTable is set.)
 377     // Restore lock-stack and handle the unlock in runtime.
 378     DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
 379     addw(t2_top, t2_top, oopSize);
         // The lock-stack top is accessed as a 32-bit value everywhere else in this file
         // (ldrw/strw), so write back only 32 bits here as well.
 380     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 381     b(slow_path);
 382   }
 383 
 384 
 385   { // Handle inflated monitor.
 386     bind(inflated_load_mark);
 387     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 388 #ifdef ASSERT
 389     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 390     stop("Fast Unlock not monitor");
 391 #endif
 392 
 393     bind(inflated);
 394 
 395 #ifdef ASSERT
         // Debug-only check: step t2_top down one entry at a time (looping back to `inflated`)
         // until it drops below the lock-stack base, and stop if any visited entry equals obj.
 396     Label check_done;
 397     subw(t2_top, t2_top, oopSize);
 398     cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
 399     br(Assembler::LT, check_done);
 400     ldr(t3_t, Address(rthread, t2_top));
 401     cmp(obj, t3_t);
 402     br(Assembler::NE, inflated);
 403     stop("Fast Unlock lock on stack");
 404     bind(check_done);
 405 #endif
 406 
 407     const Register t1_monitor = t1;
 408 
 409     if (!UseObjectMonitorTable) {
 410       assert(t1_monitor == t1_mark, "should be the same here");
 411 
 412       // Untag the monitor.
 413       add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
 414     } else {
 415       ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 416       // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
 417       cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
 418       br(Assembler::LO, slow_path);
 419     }
 420 
 421     const Register t2_recursions = t2;
 422     Label not_recursive;
 423 
 424     // Check if recursive.
 425     ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
 426     cbz(t2_recursions, not_recursive);
 427 
 428     // Recursive unlock.
 429     sub(t2_recursions, t2_recursions, 1u);
 430     str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
 431     // Set flag == EQ
 432     cmp(t2_recursions, t2_recursions);
 433     b(unlocked);
 434 
 435     bind(not_recursive);
 436 
 437     const Register t2_owner_addr = t2;
 438 
 439     // Compute owner address.
 440     lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));
 441 
 442     // Set owner to null.
 443     // Release to satisfy the JMM
 444     stlr(zr, t2_owner_addr);
 445     // We need a full fence after clearing owner to avoid stranding.
 446     // StoreLoad achieves this.
 447     membar(StoreLoad);
 448 
 449     // Check if the entry_list is empty.
 450     ldr(rscratch1, Address(t1_monitor, ObjectMonitor::entry_list_offset()));
 451     cmp(rscratch1, zr);
 452     br(Assembler::EQ, unlocked);  // If so we are done.
 453 
 454     // Check if there is a successor.
 455     ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
 456     cmp(rscratch1, zr);
 457     br(Assembler::NE, unlocked);  // If so we are done.
 458 
 459     // Save the monitor pointer in the current thread, so we can try to
 460     // reacquire the lock in SharedRuntime::monitor_exit_helper().
 461     str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));
 462 
 463     cmp(zr, rthread); // Set Flag to NE => slow path
 464     b(slow_path);
 465   }
 466 
 467   bind(unlocked);
       // `unlocked` is also reached with flags == NE (successor check above), so EQ is
       // re-established here unconditionally.
 468   cmp(zr, zr); // Set Flags to EQ => fast path
 469 
 470 #ifdef ASSERT
 471   // Check that unlocked label is reached with Flags == EQ.
 472   Label flag_correct;
 473   br(Assembler::EQ, flag_correct);
 474   stop("Fast Unlock Flag != EQ");
 475 #endif
 476 
 477   bind(slow_path);
 478 #ifdef ASSERT
 479   // Check that slow_path label is reached with Flags == NE.
 480   br(Assembler::NE, flag_correct);
 481   stop("Fast Unlock Flag != NE");
 482   bind(flag_correct);
 483 #endif
 484   // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
 485 }
 486 
 487 // Search for str1 in str2 and return index or -1
 488 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
 489 void C2_MacroAssembler::string_indexof(Register str2, Register str1,
 490                                        Register cnt2, Register cnt1,
 491                                        Register tmp1, Register tmp2,
 492                                        Register tmp3, Register tmp4,
 493                                        Register tmp5, Register tmp6,
 494                                        int icnt1, Register result, int ae) {
 495   // NOTE: tmp5, tmp6 can be zr depending on specific method version
 496   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
 497 
 498   Register ch1 = rscratch1;
 499   Register ch2 = rscratch2;
 500   Register cnt1tmp = tmp1;
 501   Register cnt2tmp = tmp2;
 502   Register cnt1_neg = cnt1;
 503   Register cnt2_neg = cnt2;
 504   Register result_tmp = tmp4;
 505 
 506   bool isL = ae == StrIntrinsicNode::LL;
 507 
 508   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
 509   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
 510   int str1_chr_shift = str1_isL ? 0:1;
 511   int str2_chr_shift = str2_isL ? 0:1;
 512   int str1_chr_size = str1_isL ? 1:2;
 513   int str2_chr_size = str2_isL ? 1:2;
 514   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
 515                                       (chr_insn)&MacroAssembler::ldrh;
 516   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
 517                                       (chr_insn)&MacroAssembler::ldrh;
 518   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
 519   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
 520 
 521   // Note, inline_string_indexOf() generates checks:
 522   // if (substr.count > string.count) return -1;
 523   // if (substr.count == 0) return 0;
 524 
 525   // We have two strings, a source string in str2, cnt2 and a pattern string
 526   // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
 527 
 528   // For larger pattern and source we use a simplified Boyer Moore algorithm.
 529   // With a small pattern and source we use linear scan.
 530 
 531   if (icnt1 == -1) {
 532     sub(result_tmp, cnt2, cnt1);
 533     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
 534     br(LT, LINEARSEARCH);
 535     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
 536     subs(zr, cnt1, 256);
 537     lsr(tmp1, cnt2, 2);
 538     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
 539     br(GE, LINEARSTUB);
 540   }
 541 
 542 // The Boyer Moore algorithm is based on the description here:-
 543 //
 544 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
 545 //
 546 // This describes an algorithm with 2 shift rules. The 'Bad Character' rule
 547 // and the 'Good Suffix' rule.
 548 //
 549 // These rules are essentially heuristics for how far we can shift the
 550 // pattern along the search string.
 551 //
 552 // The implementation here uses the 'Bad Character' rule only because of the
 553 // complexity of initialisation for the 'Good Suffix' rule.
 554 //
 555 // This is also known as the Boyer-Moore-Horspool algorithm:-
 556 //
 557 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
 558 //
 559 // This particular implementation has a few Java-specific optimizations.
 560 //
 561 // #define ASIZE 256
 562 //
 563 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
 564 //       int i, j;
 565 //       unsigned c;
 566 //       unsigned char bc[ASIZE];
 567 //
 568 //       /* Preprocessing */
 569 //       for (i = 0; i < ASIZE; ++i)
 570 //          bc[i] = m;
 571 //       for (i = 0; i < m - 1; ) {
 572 //          c = x[i];
 573 //          ++i;
 574 //          // c < 256 for Latin1 string, so, no need for branch
 575 //          #ifdef PATTERN_STRING_IS_LATIN1
 576 //          bc[c] = m - i;
 577 //          #else
 578 //          if (c < ASIZE) bc[c] = m - i;
 579 //          #endif
 580 //       }
 581 //
 582 //       /* Searching */
 583 //       j = 0;
 584 //       while (j <= n - m) {
 585 //          c = y[i+j];
 586 //          if (x[m-1] == c)
 587 //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
 588 //          if (i < 0) return j;
 589 //          // c < 256 for Latin1 string, so, no need for branch
 590 //          #ifdef SOURCE_STRING_IS_LATIN1
 591 //          // LL case: (c< 256) always true. Remove branch
 592 //          j += bc[y[j+m-1]];
 593 //          #endif
 594 //          #ifndef PATTERN_STRING_IS_UTF
 595 //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
 596 //          if (c < ASIZE)
 597 //            j += bc[y[j+m-1]];
 598 //          else
 599 //            j += 1
 600 //          #endif
 601 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
 602 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
 603 //          if (c < ASIZE)
 604 //            j += bc[y[j+m-1]];
 605 //          else
 606 //            j += m
 607 //          #endif
 608 //       }
 609 //    }
 610 
 611   if (icnt1 == -1) {
 612     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
 613         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
 614     Register cnt1end = tmp2;
 615     Register str2end = cnt2;
 616     Register skipch = tmp2;
 617 
 618     // str1 length is >=8, so, we can read at least 1 register for cases when
 619     // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
 620     // UL case. We'll re-read last character in inner pre-loop code to have
 621     // single outer pre-loop load
 622     const int firstStep = isL ? 7 : 3;
 623 
 624     const int ASIZE = 256;
 625     const int STORED_BYTES = 32; // amount of bytes stored per instruction
 626     sub(sp, sp, ASIZE);
 627     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
 628     mov(ch1, sp);
 629     BIND(BM_INIT_LOOP);
 630       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
 631       subs(tmp5, tmp5, 1);
 632       br(GT, BM_INIT_LOOP);
 633 
 634       sub(cnt1tmp, cnt1, 1);
 635       mov(tmp5, str2);
 636       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
 637       sub(ch2, cnt1, 1);
 638       mov(tmp3, str1);
 639     BIND(BCLOOP);
 640       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
 641       if (!str1_isL) {
 642         subs(zr, ch1, ASIZE);
 643         br(HS, BCSKIP);
 644       }
 645       strb(ch2, Address(sp, ch1));
 646     BIND(BCSKIP);
 647       subs(ch2, ch2, 1);
 648       br(GT, BCLOOP);
 649 
 650       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
 651       if (str1_isL == str2_isL) {
 652         // load last 8 bytes (8LL/4UU symbols)
 653         ldr(tmp6, Address(tmp6, -wordSize));
 654       } else {
 655         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
 656         // convert Latin1 to UTF. We'll have to wait until load completed, but
 657         // it's still faster than per-character loads+checks
 658         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
 659         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
 660         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
 661         andr(tmp6, tmp6, 0xFF); // str1[N-4]
 662         orr(ch2, ch1, ch2, LSL, 16);
 663         orr(tmp6, tmp6, tmp3, LSL, 48);
 664         orr(tmp6, tmp6, ch2, LSL, 16);
 665       }
 666     BIND(BMLOOPSTR2);
 667       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 668       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
 669       if (str1_isL == str2_isL) {
 670         // re-init tmp3. It's for free because it's executed in parallel with
 671         // load above. Alternative is to initialize it before loop, but it'll
 672         // affect performance on in-order systems with 2 or more ld/st pipelines
 673         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
 674       }
 675       if (!isL) { // UU/UL case
 676         lsl(ch2, cnt1tmp, 1); // offset in bytes
 677       }
 678       cmp(tmp3, skipch);
 679       br(NE, BMSKIP);
 680       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
 681       mov(ch1, tmp6);
 682       if (isL) {
 683         b(BMLOOPSTR1_AFTER_LOAD);
 684       } else {
 685         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
 686         b(BMLOOPSTR1_CMP);
 687       }
 688     BIND(BMLOOPSTR1);
 689       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
 690       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 691     BIND(BMLOOPSTR1_AFTER_LOAD);
 692       subs(cnt1tmp, cnt1tmp, 1);
 693       br(LT, BMLOOPSTR1_LASTCMP);
 694     BIND(BMLOOPSTR1_CMP);
 695       cmp(ch1, ch2);
 696       br(EQ, BMLOOPSTR1);
 697     BIND(BMSKIP);
 698       if (!isL) {
 699         // if we've met UTF symbol while searching Latin1 pattern, then we can
 700         // skip cnt1 symbols
 701         if (str1_isL != str2_isL) {
 702           mov(result_tmp, cnt1);
 703         } else {
 704           mov(result_tmp, 1);
 705         }
 706         subs(zr, skipch, ASIZE);
 707         br(HS, BMADV);
 708       }
 709       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
 710     BIND(BMADV);
 711       sub(cnt1tmp, cnt1, 1);
 712       add(str2, str2, result_tmp, LSL, str2_chr_shift);
 713       cmp(str2, str2end);
 714       br(LE, BMLOOPSTR2);
 715       add(sp, sp, ASIZE);
 716       b(NOMATCH);
 717     BIND(BMLOOPSTR1_LASTCMP);
 718       cmp(ch1, ch2);
 719       br(NE, BMSKIP);
 720     BIND(BMMATCH);
 721       sub(result, str2, tmp5);
 722       if (!str2_isL) lsr(result, result, 1);
 723       add(sp, sp, ASIZE);
 724       b(DONE);
 725 
 726     BIND(LINEARSTUB);
 727     cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
 728     br(LT, LINEAR_MEDIUM);
 729     mov(result, zr);
 730     RuntimeAddress stub = nullptr;
 731     if (isL) {
 732       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
 733       assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
 734     } else if (str1_isL) {
 735       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
 736        assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
 737     } else {
 738       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
 739       assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
 740     }
 741     address call = trampoline_call(stub);
 742     if (call == nullptr) {
 743       DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
 744       ciEnv::current()->record_failure("CodeCache is full");
 745       return;
 746     }
 747     b(DONE);
 748   }
 749 
 750   BIND(LINEARSEARCH);
 751   {
 752     Label DO1, DO2, DO3;
 753 
 754     Register str2tmp = tmp2;
 755     Register first = tmp3;
 756 
 757     if (icnt1 == -1)
 758     {
 759         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
 760 
 761         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
 762         br(LT, DOSHORT);
 763       BIND(LINEAR_MEDIUM);
 764         (this->*str1_load_1chr)(first, Address(str1));
 765         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
 766         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
 767         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 768         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 769 
 770       BIND(FIRST_LOOP);
 771         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 772         cmp(first, ch2);
 773         br(EQ, STR1_LOOP);
 774       BIND(STR2_NEXT);
 775         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 776         br(LE, FIRST_LOOP);
 777         b(NOMATCH);
 778 
 779       BIND(STR1_LOOP);
 780         adds(cnt1tmp, cnt1_neg, str1_chr_size);
 781         add(cnt2tmp, cnt2_neg, str2_chr_size);
 782         br(GE, MATCH);
 783 
 784       BIND(STR1_NEXT);
 785         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
 786         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 787         cmp(ch1, ch2);
 788         br(NE, STR2_NEXT);
 789         adds(cnt1tmp, cnt1tmp, str1_chr_size);
 790         add(cnt2tmp, cnt2tmp, str2_chr_size);
 791         br(LT, STR1_NEXT);
 792         b(MATCH);
 793 
 794       BIND(DOSHORT);
 795       if (str1_isL == str2_isL) {
 796         cmp(cnt1, (u1)2);
 797         br(LT, DO1);
 798         br(GT, DO3);
 799       }
 800     }
 801 
 802     if (icnt1 == 4) {
 803       Label CH1_LOOP;
 804 
 805         (this->*load_4chr)(ch1, str1);
 806         sub(result_tmp, cnt2, 4);
 807         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 808         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 809 
 810       BIND(CH1_LOOP);
 811         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
 812         cmp(ch1, ch2);
 813         br(EQ, MATCH);
 814         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 815         br(LE, CH1_LOOP);
 816         b(NOMATCH);
 817       }
 818 
 819     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
 820       Label CH1_LOOP;
 821 
 822       BIND(DO2);
 823         (this->*load_2chr)(ch1, str1);
 824         if (icnt1 == 2) {
 825           sub(result_tmp, cnt2, 2);
 826         }
 827         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 828         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 829       BIND(CH1_LOOP);
 830         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 831         cmp(ch1, ch2);
 832         br(EQ, MATCH);
 833         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 834         br(LE, CH1_LOOP);
 835         b(NOMATCH);
 836     }
 837 
 838     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
 839       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
 840 
 841       BIND(DO3);
 842         (this->*load_2chr)(first, str1);
 843         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
 844         if (icnt1 == 3) {
 845           sub(result_tmp, cnt2, 3);
 846         }
 847         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 848         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 849       BIND(FIRST_LOOP);
 850         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 851         cmpw(first, ch2);
 852         br(EQ, STR1_LOOP);
 853       BIND(STR2_NEXT);
 854         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 855         br(LE, FIRST_LOOP);
 856         b(NOMATCH);
 857 
 858       BIND(STR1_LOOP);
 859         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
 860         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 861         cmp(ch1, ch2);
 862         br(NE, STR2_NEXT);
 863         b(MATCH);
 864     }
 865 
 866     if (icnt1 == -1 || icnt1 == 1) {
 867       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
 868 
 869       BIND(DO1);
 870         (this->*str1_load_1chr)(ch1, str1);
 871         cmp(cnt2, (u1)8);
 872         br(LT, DO1_SHORT);
 873 
 874         sub(result_tmp, cnt2, 8/str2_chr_size);
 875         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 876         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 877         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 878 
 879         if (str2_isL) {
 880           orr(ch1, ch1, ch1, LSL, 8);
 881         }
 882         orr(ch1, ch1, ch1, LSL, 16);
 883         orr(ch1, ch1, ch1, LSL, 32);
 884       BIND(CH1_LOOP);
 885         ldr(ch2, Address(str2, cnt2_neg));
 886         eor(ch2, ch1, ch2);
 887         sub(tmp1, ch2, tmp3);
 888         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 889         bics(tmp1, tmp1, tmp2);
 890         br(NE, HAS_ZERO);
 891         adds(cnt2_neg, cnt2_neg, 8);
 892         br(LT, CH1_LOOP);
 893 
 894         cmp(cnt2_neg, (u1)8);
 895         mov(cnt2_neg, 0);
 896         br(LT, CH1_LOOP);
 897         b(NOMATCH);
 898 
 899       BIND(HAS_ZERO);
 900         rev(tmp1, tmp1);
 901         clz(tmp1, tmp1);
 902         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
 903         b(MATCH);
 904 
 905       BIND(DO1_SHORT);
 906         mov(result_tmp, cnt2);
 907         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
 908         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
 909       BIND(DO1_LOOP);
 910         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 911         cmpw(ch1, ch2);
 912         br(EQ, MATCH);
 913         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 914         br(LT, DO1_LOOP);
 915     }
 916   }
 917   BIND(NOMATCH);
 918     mov(result, -1);
 919     b(DONE);
 920   BIND(MATCH);
 921     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
 922   BIND(DONE);
 923 }
 924 
// Pointer to a MacroAssembler member that emits a load into Rt from adr.
// string_compare uses it to pick ldrb or ldrh per string encoding.
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
// Pointer to a MacroAssembler member that emits a register-to-register
// extend. string_compare uses it to pick uxtbw or uxthw.
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
 927 
// Emits code that searches a sequence of 16-bit elements for one value.
//
//   str1             - address of the first element; overwritten
//   cnt1             - number of elements; overwritten (the same register is
//                      reused as the negative byte offset cnt1_neg)
//   ch               - value to look for; overwritten on the 8-byte path
//   result           - receives the element index of the match, or -1 when
//                      there is none
//   tmp1, tmp2, tmp3 - scratch registers
//
// Also writes rscratch1, rscratch2 and the condition flags.
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  // cnt1_neg aliases cnt1: once the count has been consumed the register
  // holds a negative byte offset that is stepped up towards zero.
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  // Empty input: nothing can match.
  cbz(cnt1, NOMATCH);

  // Fewer than 4 elements do not fill one 8-byte load; use the
  // element-at-a-time loop instead.
  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  // Replicate ch into all four 16-bit lanes of the register.
  // NOTE(review): this assumes the bits of ch above bit 15 are zero on
  // entry - confirm against the callers.
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  // Point str1 at the last 4 elements and index from there with a negative
  // byte offset. result_tmp keeps the element index that offset 0 maps to.
  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  // The value 1 in every 16-bit lane, used by the zero-lane test below.
  mov(tmp3, 0x0001000100010001);

  BIND(CH1_LOOP);
    // Load 8 bytes and XOR with the replicated ch: a lane equal to ch
    // becomes zero. (x - 0x0001..) & ~(x | 0x7fff..) is then non-zero exactly
    // when some lane of x is zero; bics also sets the flags for the branch.
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7fff7fff7fff7fff);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    // The offset is now >= 0. If it is still below 8, the 8 bytes at offset
    // 0 have not been examined yet: run the loop body once more at offset 0
    // (this may re-check bytes that were already covered). The branch uses
    // the flags from cmp; the mov in between is expected to leave them
    // untouched.
    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    // Convert the marker bits in tmp1 into a byte offset: byte-reverse,
    // count leading zero bits, scale bits to bytes (LSR 3) and add that to
    // the current offset.
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    // Short input: point str1 just past the last element and compare one
    // element per iteration while the negative offset climbs to zero.
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1, Address::uxtw(1)));
    sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
    ldrh(ch1, Address(str1, cnt1_neg));
    cmpw(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 2);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // Byte offset -> element offset (ASR 1), relative to result_tmp.
    add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}
 990 
// Emits an SVE loop that searches a string for one character, one vector of
// elements per iteration.
//
//   str1         - address of the first element (not written here)
//   cnt1         - number of elements (not written here)
//   ch           - character to look for
//   result       - receives the element index of the match, or -1 when there
//                  is none
//   ztmp1, ztmp2 - vector scratch (loaded data / broadcast character)
//   tmp_pg       - governing predicate scratch
//   tmp_pdn      - predicate scratch receiving the comparison result
//   isL          - true selects 8-bit elements, false 16-bit elements
//
// Also writes rscratch1, rscratch2 and the condition flags.
void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
                                                Register ch, Register result,
                                                FloatRegister ztmp1,
                                                FloatRegister ztmp2,
                                                PRegister tmp_pg,
                                                PRegister tmp_pdn, bool isL)
{
  // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
  assert(tmp_pg->is_governing(),
         "this register has to be a governing predicate register");

  Label LOOP, MATCH, DONE, NOMATCH;
  // Number of elements per vector, and the element index of the current
  // vector's first element.
  Register vec_len = rscratch1;
  Register idx = rscratch2;

  // Element size: byte lanes for isL, halfword lanes otherwise.
  SIMD_RegVariant T = (isL == true) ? B : H;

  // Empty input: nothing can match.
  cbz(cnt1, NOMATCH);

  // Assign the particular char throughout the vector.
  sve_dup(ztmp2, T, ch);
  if (isL) {
    sve_cntb(vec_len);
  } else {
    sve_cnth(vec_len);
  }
  mov(idx, 0);

  // Generate a predicate to control the reading of input string.
  sve_whilelt(tmp_pg, T, idx, cnt1);

  BIND(LOOP);
    // Read a vector of 8- or 16-bit data depending on the string type. Note
    // that inactive elements indicated by the predicate register won't cause
    // a data read from memory to the destination vector.
    if (isL) {
      sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
    } else {
      sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
    }
    // Advance to the next vector now; the MATCH path undoes this.
    add(idx, idx, vec_len);

    // Perform the comparison. An element of the destination predicate is set
    // to active if the particular char is matched.
    sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);

    // Branch if the particular char is found.
    br(NE, MATCH);

    // Predicate for the next vector; the branch below tests the flags this
    // instruction produces.
    sve_whilelt(tmp_pg, T, idx, cnt1);

    // Loop back if the particular char not found.
    br(MI, LOOP);

  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);

  BIND(MATCH);
    // Undo the index increment.
    sub(idx, idx, vec_len);

    // Crop the vector to find its location.
    // NOTE(review): relies on brka leaving active the elements up to and
    // including the first match, so that (idx - 1) plus the active-element
    // count added by incp is the match index - confirm against the SVE
    // instruction descriptions.
    sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
    add(result, idx, -1);
    sve_incp(result, T, tmp_pdn);
  BIND(DONE);
}
1059 
// Emits code that searches a sequence of 8-bit elements for one value.
//
//   str1             - address of the first element; overwritten
//   cnt1             - number of elements; overwritten (the same register is
//                      reused as the negative byte offset cnt1_neg)
//   ch               - value to look for; overwritten on the 8-byte path
//   result           - receives the element index of the match, or -1 when
//                      there is none
//   tmp1, tmp2, tmp3 - scratch registers
//
// Also writes rscratch1, rscratch2 and the condition flags.
void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  // cnt1_neg aliases cnt1: once the count has been consumed the register
  // holds a negative byte offset that is stepped up towards zero.
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  // Empty input: nothing can match.
  cbz(cnt1, NOMATCH);

  // Fewer than 8 elements do not fill one 8-byte load; use the
  // byte-at-a-time loop instead.
  cmp(cnt1, (u1)8);
  br(LT, DO1_SHORT);

  // Replicate ch into all eight byte lanes of the register.
  // NOTE(review): this assumes the bits of ch above bit 7 are zero on
  // entry - confirm against the callers.
  orr(ch, ch, ch, LSL, 8);
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  // Point str1 at the last 8 elements and index from there with a negative
  // byte offset. result_tmp keeps the element index that offset 0 maps to.
  sub(cnt1, cnt1, 8);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);

  // The value 1 in every byte lane, used by the zero-lane test below.
  mov(tmp3, 0x0101010101010101);

  BIND(CH1_LOOP);
    // Load 8 bytes and XOR with the replicated ch: a byte equal to ch
    // becomes zero. (x - 0x01..) & ~(x | 0x7f..) is then non-zero exactly
    // when some byte of x is zero; bics also sets the flags for the branch.
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    // The offset is now >= 0. If it is still below 8, the 8 bytes at offset
    // 0 have not been examined yet: run the loop body once more at offset 0
    // (this may re-check bytes that were already covered). The branch uses
    // the flags from cmp; the mov in between is expected to leave them
    // untouched.
    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    // Convert the marker bits in tmp1 into a byte offset: byte-reverse,
    // count leading zero bits, scale bits to bytes (LSR 3) and add that to
    // the current offset.
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    // Short input: point str1 just past the last element and compare one
    // byte per iteration while the negative offset climbs to zero.
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1));
    sub(cnt1_neg, zr, cnt1);
  BIND(DO1_LOOP);
    ldrb(ch1, Address(str1, cnt1_neg));
    cmp(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 1);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // Elements are bytes, so the byte offset is already an element offset.
    add(result, result_tmp, cnt1_neg);
  BIND(DONE);
}
1123 
// Compare strings.
//
// Emits code that compares two strings whose encodings are selected by `ae`
// (one of StrIntrinsicNode::LL, UU, LU, UL).
//
//   str1, str2   - addresses of the string data; both are modified
//   cnt1, cnt2   - lengths in bytes on entry (see the note in the body); both
//                  are modified, and cnt1 is reused as a scratch register
//   result       - set to the length difference (cnt1 - cnt2 in characters)
//                  and replaced by the difference of the first pair of
//                  differing characters when one is found inline
//   tmp1, tmp2   - scratch registers
//   vtmp1, vtmp2 - vector scratch for the mixed-encoding (LU/UL) paths
//   vtmp3, pgtmp1, pgtmp2 - not referenced in this function body
//                  (presumably reserved for the out-of-line stubs - TODO
//                  confirm)
//
// Also writes rscratch1, rscratch2 and the condition flags. Strings whose
// shorter length reaches stub_threshold are handed to a compare_long_string
// stub through a trampoline call. If that call cannot be emitted the
// compilation is failed with "CodeCache is full" and code generation stops.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  // Per-string encoding: L strings use 1-byte characters, U strings 2-byte.
  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  // Number of characters compared per 8-byte word.
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  // vtmpZ is zeroed and zipped with loaded bytes to widen them to 16 bits.
  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      // Identical addresses: leave with the length difference in result.
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      // Point both strings at their last word and turn cnt2 into a negative
      // byte offset from there.
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      // str1 is Latin1: load 4 bytes and widen them to 4 halfwords by
      // zipping with zero. cnt1 becomes the negative byte offset into str1.
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      // Mirror image of LU: str2 is Latin1 and gets widened; cnt1 is the
      // negative byte offset into str1, cnt2 the one into str2.
      ldr(tmp1, Address(str1));
      ldrs(vtmp, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    // Step past the word that was loaded above; a non-negative offset means
    // only the final word remains.
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFF);
    bind(TAIL);
    // Check the word loaded by the last iteration before the final word.
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    // Equal final words: result still holds the length difference.
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFF);
    // rev + clz give the bit position of the lowest differing bit; the mask
    // rounds it down to a character boundary (8 bits for LL, else 16).
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  // Long strings: call the out-of-line stub that matches the encoding pair.
  bind(STUB);
    RuntimeAddress stub = nullptr;
    switch(ae) {
      case StrIntrinsicNode::LL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
        break;
      case StrIntrinsicNode::UU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
        break;
      case StrIntrinsicNode::LU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
        break;
      case StrIntrinsicNode::UL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
        break;
      default:
        ShouldNotReachHere();
     }
    assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
    address call = trampoline_call(stub);
    if (call == nullptr) {
      // The trampoline could not be emitted. The labels listed here are still
      // unbound at this point, so they are reset in debug builds before
      // bailing out of the compilation.
      DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // arrange code to do most branches while loading and loading next characters
  // while comparing previous
  // The loop is unrolled twice: one character pair lives in (tmp1, cnt1),
  // the other in (tmp2, rscratch1). cnt1 is free for this because the length
  // difference is already in result.
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  // The pair in (tmp1, cnt1) differs.
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  // Last pair is in (tmp2, rscratch1).
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  // Single-character case: the str2 character has not been loaded yet.
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  // Last pair is in (tmp1, cnt1).
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}
1359 
1360 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
1361                                      FloatRegister src2, Condition cond, bool isQ) {
1362   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1363   FloatRegister zn = src1, zm = src2;
1364   bool needs_negation = false;
1365   switch (cond) {
1366     case LT: cond = GT; zn = src2; zm = src1; break;
1367     case LE: cond = GE; zn = src2; zm = src1; break;
1368     case LO: cond = HI; zn = src2; zm = src1; break;
1369     case LS: cond = HS; zn = src2; zm = src1; break;
1370     case NE: cond = EQ; needs_negation = true; break;
1371     default:
1372       break;
1373   }
1374 
1375   if (is_floating_point_type(bt)) {
1376     fcm(cond, dst, size, zn, zm);
1377   } else {
1378     cm(cond, dst, size, zn, zm);
1379   }
1380 
1381   if (needs_negation) {
1382     notr(dst, isQ ? T16B : T8B, dst);
1383   }
1384 }
1385 
1386 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1387                                           Condition cond, bool isQ) {
1388   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1389   if (bt == T_FLOAT || bt == T_DOUBLE) {
1390     if (cond == Assembler::NE) {
1391       fcm(Assembler::EQ, dst, size, src);
1392       notr(dst, isQ ? T16B : T8B, dst);
1393     } else {
1394       fcm(cond, dst, size, src);
1395     }
1396   } else {
1397     if (cond == Assembler::NE) {
1398       cm(Assembler::EQ, dst, size, src);
1399       notr(dst, isQ ? T16B : T8B, dst);
1400     } else {
1401       cm(cond, dst, size, src);
1402     }
1403   }
1404 }
1405 
1406 // Compress the least significant bit of each byte to the rightmost and clear
1407 // the higher garbage bits.
1408 void C2_MacroAssembler::bytemask_compress(Register dst) {
1409   // Example input, dst = 0x01 00 00 00 01 01 00 01
1410   // The "??" bytes are garbage.
1411   orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
1412   orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
1413   orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
1414   andr(dst, dst, 0xff);                   // dst = 0x8D
1415 }
1416 
1417 // Pack the value of each mask element in "src" into a long value in "dst", at most
1418 // the first 64 lane elements. The input "src" is a vector of boolean represented as
1419 // bytes with 0x00/0x01 as element values. Each lane value from "src" is packed into
1420 // one bit in "dst".
1421 //
1422 // Example:   src = 0x0001010000010001 0100000001010001, lane_cnt = 16
1423 // Expected:  dst = 0x658D
1424 //
1425 // Clobbers: rscratch1
1426 void C2_MacroAssembler::sve_vmask_tolong(Register dst, FloatRegister src,
1427                                          FloatRegister vtmp, int lane_cnt) {
1428   assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1429   assert_different_registers(dst, rscratch1);
1430   assert_different_registers(src, vtmp);
1431   assert(UseSVE > 0, "must be");
1432 
1433   // Compress the lowest 8 bytes.
1434   fmovd(dst, src);
1435   bytemask_compress(dst);
1436   if (lane_cnt <= 8) return;
1437 
1438   // Repeat on higher bytes and join the results.
1439   // Compress 8 bytes in each iteration.
1440   for (int idx = 1; idx < (lane_cnt / 8); idx++) {
1441     sve_extract_integral(rscratch1, T_LONG, src, idx, vtmp);
1442     bytemask_compress(rscratch1);
1443     orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
1444   }
1445 }
1446 
1447 // The function is same as above "sve_vmask_tolong", but it uses SVE2's BEXT
1448 // instruction which requires the FEAT_BITPERM feature.
1449 void C2_MacroAssembler::sve2_vmask_tolong(Register dst, FloatRegister src,
1450                                           FloatRegister vtmp1, FloatRegister vtmp2,
1451                                           int lane_cnt) {
1452   assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1453   assert_different_registers(src, vtmp1, vtmp2);
1454   assert(UseSVE > 1 && VM_Version::supports_svebitperm(), "must be");
1455 
1456   // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
1457   // is to compress each significant bit of the byte in a cross-lane way. Due
1458   // to the lack of a cross-lane bit-compress instruction, we use BEXT
1459   // (bit-compress in each lane) with the biggest lane size (T = D) then
1460   // concatenate the results.
1461 
1462   // The second source input of BEXT, initialized with 0x01 in each byte.
1463   // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1464   sve_dup(vtmp2, B, 1);
1465 
1466   // BEXT vtmp1.D, src.D, vtmp2.D
1467   // src   = 0x0001010000010001 | 0x0100000001010001
1468   // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1469   //         ---------------------------------------
1470   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1471   sve_bext(vtmp1, D, src, vtmp2);
1472 
1473   // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the
1474   // result to dst.
1475   // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1476   // dst   = 0x658D
1477   if (lane_cnt <= 8) {
1478     // No need to concatenate.
1479     umov(dst, vtmp1, B, 0);
1480   } else if (lane_cnt <= 16) {
1481     ins(vtmp1, B, vtmp1, 1, 8);
1482     umov(dst, vtmp1, H, 0);
1483   } else {
1484     // As the lane count is 64 at most, the final expected value must be in
1485     // the lowest 64 bits after narrowing vtmp1 from D to B.
1486     sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
1487     umov(dst, vtmp1, D, 0);
1488   }
1489 }
1490 
1491 // Unpack the mask, a long value in "src", into a vector register of boolean
1492 // represented as bytes with 0x00/0x01 as element values in "dst".  Each bit in
1493 // "src" is unpacked into one byte lane in "dst". Note that "dst" can support at
1494 // most 64 lanes.
1495 //
1496 // Below example gives the expected dst vector register, with a valid src(0x658D)
1497 // on a 128-bit vector size machine.
1498 // dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1499 void C2_MacroAssembler::sve_vmask_fromlong(FloatRegister dst, Register src,
1500                                            FloatRegister vtmp, int lane_cnt) {
1501   assert_different_registers(dst, vtmp);
1502   assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
1503          lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
1504 
1505   // Example:   src = 0x658D, lane_cnt = 16
1506   // Expected:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1507 
1508   // Put long value from general purpose register into the first lane of vector.
1509   // vtmp = 0x0000000000000000 | 0x000000000000658D
1510   sve_dup(vtmp, B, 0);
1511   mov(vtmp, D, 0, src);
1512 
1513   // Transform the value in the first lane which is mask in bit now to the mask in
1514   // byte, which can be done by SVE2's BDEP instruction.
1515 
1516   // The first source input of BDEP instruction. Deposite each byte in every 8 bytes.
1517   // vtmp = 0x0000000000000065 | 0x000000000000008D
1518   if (lane_cnt <= 8) {
1519     // Nothing. As only one byte exsits.
1520   } else if (lane_cnt <= 16) {
1521     ins(vtmp, B, vtmp, 8, 1);
1522   } else {
1523     sve_vector_extend(vtmp, D, vtmp, B);
1524   }
1525 
1526   // The second source input of BDEP instruction, initialized with 0x01 for each byte.
1527   // dst = 0x01010101 0x01010101 0x01010101 0x01010101
1528   sve_dup(dst, B, 1);
1529 
1530   // BDEP dst.D, vtmp.D, dst.D
1531   // vtmp = 0x0000000000000065 | 0x000000000000008D
1532   // dst  = 0x0101010101010101 | 0x0101010101010101
1533   //        ---------------------------------------
1534   // dst  = 0x0001010000010001 | 0x0100000001010001
1535   sve_bdep(dst, D, vtmp, dst);
1536 }
1537 
1538 // Clobbers: rflags
1539 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1540                                     FloatRegister zn, FloatRegister zm, Condition cond) {
1541   assert(pg->is_governing(), "This register has to be a governing predicate register");
1542   FloatRegister z1 = zn, z2 = zm;
1543   switch (cond) {
1544     case LE: z1 = zm; z2 = zn; cond = GE; break;
1545     case LT: z1 = zm; z2 = zn; cond = GT; break;
1546     case LO: z1 = zm; z2 = zn; cond = HI; break;
1547     case LS: z1 = zm; z2 = zn; cond = HS; break;
1548     default:
1549       break;
1550   }
1551 
1552   SIMD_RegVariant size = elemType_to_regVariant(bt);
1553   if (is_floating_point_type(bt)) {
1554     sve_fcm(cond, pd, size, pg, z1, z2);
1555   } else {
1556     assert(is_integral_type(bt), "unsupported element type");
1557     sve_cmp(cond, pd, size, pg, z1, z2);
1558   }
1559 }
1560 
1561 // Get index of the last mask lane that is set
1562 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
1563   SIMD_RegVariant size = elemType_to_regVariant(bt);
1564   sve_rev(ptmp, size, src);
1565   sve_brkb(ptmp, ptrue, ptmp, false);
1566   sve_cntp(dst, size, ptrue, ptmp);
1567   movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
1568   subw(dst, rscratch1, dst);
1569 }
1570 
1571 // Extend integer vector src to dst with the same lane count
1572 // but larger element size, e.g. 4B -> 4I
1573 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1574                                            FloatRegister src, BasicType src_bt, bool is_unsigned) {
1575   if (src_bt == T_BYTE) {
1576     // 4B to 4S/4I, 8B to 8S
1577     assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1578     assert(dst_bt == T_SHORT || dst_bt == T_INT, "unsupported");
1579     _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1580     if (dst_bt == T_INT) {
1581       _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1582     }
1583   } else if (src_bt == T_SHORT) {
1584     // 2S to 2I/2L, 4S to 4I
1585     assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1586     assert(dst_bt == T_INT || dst_bt == T_LONG, "unsupported");
1587     _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1588     if (dst_bt == T_LONG) {
1589       _xshll(is_unsigned, dst, T2D, dst, T2S, 0);
1590     }
1591   } else if (src_bt == T_INT) {
1592     // 2I to 2L
1593     assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1594     _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1595   } else {
1596     ShouldNotReachHere();
1597   }
1598 }
1599 
1600 // Narrow integer vector src down to dst with the same lane count
1601 // but smaller element size, e.g. 4I -> 4B
1602 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1603                                            FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1604   if (src_bt == T_SHORT) {
1605     // 4S/8S to 4B/8B
1606     assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1607     assert(dst_bt == T_BYTE, "unsupported");
1608     xtn(dst, T8B, src, T8H);
1609   } else if (src_bt == T_INT) {
1610     // 2I to 2S, 4I to 4B/4S
1611     assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1612     assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1613     xtn(dst, T4H, src, T4S);
1614     if (dst_bt == T_BYTE) {
1615       xtn(dst, T8B, dst, T8H);
1616     }
1617   } else if (src_bt == T_LONG) {
1618     // 2L to 2S/2I
1619     assert(src_vlen_in_bytes == 16, "unsupported");
1620     assert(dst_bt == T_INT || dst_bt == T_SHORT, "unsupported");
1621     xtn(dst, T2S, src, T2D);
1622     if (dst_bt == T_SHORT) {
1623       xtn(dst, T4H, dst, T4S);
1624     }
1625   } else {
1626     ShouldNotReachHere();
1627   }
1628 }
1629 
1630 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1631                                           FloatRegister src, SIMD_RegVariant src_size,
1632                                           bool is_unsigned) {
1633   assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
1634 
1635   if (src_size == B) {
1636     switch (dst_size) {
1637     case H:
1638       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1639       break;
1640     case S:
1641       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1642       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1643       break;
1644     case D:
1645       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1646       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1647       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1648       break;
1649     default:
1650       ShouldNotReachHere();
1651     }
1652   } else if (src_size == H) {
1653     if (dst_size == S) {
1654       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1655     } else { // D
1656       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1657       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1658     }
1659   } else if (src_size == S) {
1660     _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
1661   }
1662 }
1663 
1664 // Vector narrow from src to dst with specified element sizes.
1665 // High part of dst vector will be filled with zero.
1666 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1667                                           FloatRegister src, SIMD_RegVariant src_size,
1668                                           FloatRegister tmp) {
1669   assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1670   assert_different_registers(src, tmp);
1671   sve_dup(tmp, src_size, 0);
1672   if (src_size == D) {
1673     switch (dst_size) {
1674     case S:
1675       sve_uzp1(dst, S, src, tmp);
1676       break;
1677     case H:
1678       assert_different_registers(dst, tmp);
1679       sve_uzp1(dst, S, src, tmp);
1680       sve_uzp1(dst, H, dst, tmp);
1681       break;
1682     case B:
1683       assert_different_registers(dst, tmp);
1684       sve_uzp1(dst, S, src, tmp);
1685       sve_uzp1(dst, H, dst, tmp);
1686       sve_uzp1(dst, B, dst, tmp);
1687       break;
1688     default:
1689       ShouldNotReachHere();
1690     }
1691   } else if (src_size == S) {
1692     if (dst_size == H) {
1693       sve_uzp1(dst, H, src, tmp);
1694     } else { // B
1695       assert_different_registers(dst, tmp);
1696       sve_uzp1(dst, H, src, tmp);
1697       sve_uzp1(dst, B, dst, tmp);
1698     }
1699   } else if (src_size == H) {
1700     sve_uzp1(dst, B, src, tmp);
1701   }
1702 }
1703 
1704 // Extend src predicate to dst predicate with the same lane count but larger
1705 // element size, e.g. 64Byte -> 512Long
1706 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1707                                              uint dst_element_length_in_bytes,
1708                                              uint src_element_length_in_bytes) {
1709   if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1710     sve_punpklo(dst, src);
1711   } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1712     sve_punpklo(dst, src);
1713     sve_punpklo(dst, dst);
1714   } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1715     sve_punpklo(dst, src);
1716     sve_punpklo(dst, dst);
1717     sve_punpklo(dst, dst);
1718   } else {
1719     assert(false, "unsupported");
1720     ShouldNotReachHere();
1721   }
1722 }
1723 
1724 // Narrow src predicate to dst predicate with the same lane count but
1725 // smaller element size, e.g. 512Long -> 64Byte
1726 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1727                                              uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
1728   // The insignificant bits in src predicate are expected to be zero.
1729   // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
1730   // passed as the second argument. An example narrowing operation with a given mask would be -
1731   // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I
1732   // Mask (for 2 Longs) : TF
1733   // Predicate register for the above mask (16 bits) : 00000001 00000000
1734   // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
1735   // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0)
1736   assert_different_registers(src, ptmp);
1737   assert_different_registers(dst, ptmp);
1738   sve_pfalse(ptmp);
1739   if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1740     sve_uzp1(dst, B, src, ptmp);
1741   } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1742     sve_uzp1(dst, H, src, ptmp);
1743     sve_uzp1(dst, B, dst, ptmp);
1744   } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1745     sve_uzp1(dst, S, src, ptmp);
1746     sve_uzp1(dst, H, dst, ptmp);
1747     sve_uzp1(dst, B, dst, ptmp);
1748   } else {
1749     assert(false, "unsupported");
1750     ShouldNotReachHere();
1751   }
1752 }
1753 
1754 // Vector reduction add for integral type with ASIMD instructions.
1755 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1756                                                  Register isrc, FloatRegister vsrc,
1757                                                  unsigned vector_length_in_bytes,
1758                                                  FloatRegister vtmp) {
1759   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1760   assert_different_registers(dst, isrc);
1761   bool isQ = vector_length_in_bytes == 16;
1762 
1763   BLOCK_COMMENT("neon_reduce_add_integral {");
1764     switch(bt) {
1765       case T_BYTE:
1766         addv(vtmp, isQ ? T16B : T8B, vsrc);
1767         smov(dst, vtmp, B, 0);
1768         addw(dst, dst, isrc, ext::sxtb);
1769         break;
1770       case T_SHORT:
1771         addv(vtmp, isQ ? T8H : T4H, vsrc);
1772         smov(dst, vtmp, H, 0);
1773         addw(dst, dst, isrc, ext::sxth);
1774         break;
1775       case T_INT:
1776         isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1777         umov(dst, vtmp, S, 0);
1778         addw(dst, dst, isrc);
1779         break;
1780       case T_LONG:
1781         assert(isQ, "unsupported");
1782         addpd(vtmp, vsrc);
1783         umov(dst, vtmp, D, 0);
1784         add(dst, dst, isrc);
1785         break;
1786       default:
1787         assert(false, "unsupported");
1788         ShouldNotReachHere();
1789     }
1790   BLOCK_COMMENT("} neon_reduce_add_integral");
1791 }
1792 
1793 // Vector reduction multiply for integral type with ASIMD instructions.
1794 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
1795 // Clobbers: rscratch1
1796 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
1797                                                  Register isrc, FloatRegister vsrc,
1798                                                  unsigned vector_length_in_bytes,
1799                                                  FloatRegister vtmp1, FloatRegister vtmp2) {
1800   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1801   bool isQ = vector_length_in_bytes == 16;
1802 
1803   BLOCK_COMMENT("neon_reduce_mul_integral {");
1804     switch(bt) {
1805       case T_BYTE:
1806         if (isQ) {
1807           // Multiply the lower half and higher half of vector iteratively.
1808           // vtmp1 = vsrc[8:15]
1809           ins(vtmp1, D, vsrc, 0, 1);
1810           // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
1811           mulv(vtmp1, T8B, vtmp1, vsrc);
1812           // vtmp2 = vtmp1[4:7]
1813           ins(vtmp2, S, vtmp1, 0, 1);
1814           // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
1815           mulv(vtmp1, T8B, vtmp2, vtmp1);
1816         } else {
1817           ins(vtmp1, S, vsrc, 0, 1);
1818           mulv(vtmp1, T8B, vtmp1, vsrc);
1819         }
1820         // vtmp2 = vtmp1[2:3]
1821         ins(vtmp2, H, vtmp1, 0, 1);
1822         // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
1823         mulv(vtmp2, T8B, vtmp2, vtmp1);
1824         // dst = vtmp2[0] * isrc * vtmp2[1]
1825         umov(rscratch1, vtmp2, B, 0);
1826         mulw(dst, rscratch1, isrc);
1827         sxtb(dst, dst);
1828         umov(rscratch1, vtmp2, B, 1);
1829         mulw(dst, rscratch1, dst);
1830         sxtb(dst, dst);
1831         break;
1832       case T_SHORT:
1833         if (isQ) {
1834           ins(vtmp2, D, vsrc, 0, 1);
1835           mulv(vtmp2, T4H, vtmp2, vsrc);
1836           ins(vtmp1, S, vtmp2, 0, 1);
1837           mulv(vtmp1, T4H, vtmp1, vtmp2);
1838         } else {
1839           ins(vtmp1, S, vsrc, 0, 1);
1840           mulv(vtmp1, T4H, vtmp1, vsrc);
1841         }
1842         umov(rscratch1, vtmp1, H, 0);
1843         mulw(dst, rscratch1, isrc);
1844         sxth(dst, dst);
1845         umov(rscratch1, vtmp1, H, 1);
1846         mulw(dst, rscratch1, dst);
1847         sxth(dst, dst);
1848         break;
1849       case T_INT:
1850         if (isQ) {
1851           ins(vtmp1, D, vsrc, 0, 1);
1852           mulv(vtmp1, T2S, vtmp1, vsrc);
1853         } else {
1854           vtmp1 = vsrc;
1855         }
1856         umov(rscratch1, vtmp1, S, 0);
1857         mul(dst, rscratch1, isrc);
1858         umov(rscratch1, vtmp1, S, 1);
1859         mul(dst, rscratch1, dst);
1860         break;
1861       case T_LONG:
1862         umov(rscratch1, vsrc, D, 0);
1863         mul(dst, isrc, rscratch1);
1864         umov(rscratch1, vsrc, D, 1);
1865         mul(dst, dst, rscratch1);
1866         break;
1867       default:
1868         assert(false, "unsupported");
1869         ShouldNotReachHere();
1870     }
1871   BLOCK_COMMENT("} neon_reduce_mul_integral");
1872 }
1873 
1874 // Vector reduction multiply for floating-point type with ASIMD instructions.
1875 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
1876                                            FloatRegister fsrc, FloatRegister vsrc,
1877                                            unsigned vector_length_in_bytes,
1878                                            FloatRegister vtmp) {
1879   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1880   bool isQ = vector_length_in_bytes == 16;
1881 
1882   BLOCK_COMMENT("neon_reduce_mul_fp {");
1883     switch(bt) {
1884       // The T_SHORT type below is for Float16 type which also uses floating-point
1885       // instructions.
1886       case T_SHORT:
1887         fmulh(dst, fsrc, vsrc);
1888         ext(vtmp, T8B, vsrc, vsrc, 2);
1889         fmulh(dst, dst, vtmp);
1890         ext(vtmp, T8B, vsrc, vsrc, 4);
1891         fmulh(dst, dst, vtmp);
1892         ext(vtmp, T8B, vsrc, vsrc, 6);
1893         fmulh(dst, dst, vtmp);
1894         if (isQ) {
1895           ext(vtmp, T16B, vsrc, vsrc, 8);
1896           fmulh(dst, dst, vtmp);
1897           ext(vtmp, T16B, vsrc, vsrc, 10);
1898           fmulh(dst, dst, vtmp);
1899           ext(vtmp, T16B, vsrc, vsrc, 12);
1900           fmulh(dst, dst, vtmp);
1901           ext(vtmp, T16B, vsrc, vsrc, 14);
1902           fmulh(dst, dst, vtmp);
1903         }
1904         break;
1905       case T_FLOAT:
1906         fmuls(dst, fsrc, vsrc);
1907         ins(vtmp, S, vsrc, 0, 1);
1908         fmuls(dst, dst, vtmp);
1909         if (isQ) {
1910           ins(vtmp, S, vsrc, 0, 2);
1911           fmuls(dst, dst, vtmp);
1912           ins(vtmp, S, vsrc, 0, 3);
1913           fmuls(dst, dst, vtmp);
1914          }
1915         break;
1916       case T_DOUBLE:
1917         assert(isQ, "unsupported");
1918         fmuld(dst, fsrc, vsrc);
1919         ins(vtmp, D, vsrc, 0, 1);
1920         fmuld(dst, dst, vtmp);
1921         break;
1922       default:
1923         assert(false, "unsupported");
1924         ShouldNotReachHere();
1925     }
1926   BLOCK_COMMENT("} neon_reduce_mul_fp");
1927 }
1928 
1929 // Vector reduction add for half float type with ASIMD instructions.
1930 void C2_MacroAssembler::neon_reduce_add_fp16(FloatRegister dst, FloatRegister fsrc, FloatRegister vsrc,
1931                                              unsigned vector_length_in_bytes, FloatRegister vtmp) {
1932   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1933   bool isQ = vector_length_in_bytes == 16;
1934 
1935   BLOCK_COMMENT("neon_reduce_add_fp16 {");
1936     faddh(dst, fsrc, vsrc);
1937     ext(vtmp, T8B, vsrc, vsrc, 2);
1938     faddh(dst, dst, vtmp);
1939     ext(vtmp, T8B, vsrc, vsrc, 4);
1940     faddh(dst, dst, vtmp);
1941     ext(vtmp, T8B, vsrc, vsrc, 6);
1942     faddh(dst, dst, vtmp);
1943     if (isQ) {
1944       ext(vtmp, T16B, vsrc, vsrc, 8);
1945       faddh(dst, dst, vtmp);
1946       ext(vtmp, T16B, vsrc, vsrc, 10);
1947       faddh(dst, dst, vtmp);
1948       ext(vtmp, T16B, vsrc, vsrc, 12);
1949       faddh(dst, dst, vtmp);
1950       ext(vtmp, T16B, vsrc, vsrc, 14);
1951       faddh(dst, dst, vtmp);
1952     }
1953   BLOCK_COMMENT("} neon_reduce_add_fp16");
1954 }
1955 
1956 // Helper to select logical instruction
1957 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
1958                                                    Register Rn, Register Rm,
1959                                                    enum shift_kind kind, unsigned shift) {
1960   switch(opc) {
1961     case Op_AndReductionV:
1962       is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
1963       break;
1964     case Op_OrReductionV:
1965       is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
1966       break;
1967     case Op_XorReductionV:
1968       is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
1969       break;
1970     default:
1971       assert(false, "unsupported");
1972       ShouldNotReachHere();
1973   }
1974 }
1975 
1976 // Vector reduction logical operations And, Or, Xor
1977 // Clobbers: rscratch1
1978 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
1979                                             Register isrc, FloatRegister vsrc,
1980                                             unsigned vector_length_in_bytes) {
1981   assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
1982          "unsupported");
1983   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1984   assert_different_registers(dst, isrc);
1985   bool isQ = vector_length_in_bytes == 16;
1986 
1987   BLOCK_COMMENT("neon_reduce_logical {");
1988     umov(rscratch1, vsrc, isQ ? D : S, 0);
1989     umov(dst, vsrc, isQ ? D : S, 1);
1990     neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
1991     switch(bt) {
1992       case T_BYTE:
1993         if (isQ) {
1994           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1995         }
1996         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1997         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
1998         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1999         sxtb(dst, dst);
2000         break;
2001       case T_SHORT:
2002         if (isQ) {
2003           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2004         }
2005         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
2006         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2007         sxth(dst, dst);
2008         break;
2009       case T_INT:
2010         if (isQ) {
2011           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2012         }
2013         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2014         break;
2015       case T_LONG:
2016         assert(isQ, "unsupported");
2017         neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
2018         break;
2019       default:
2020         assert(false, "unsupported");
2021         ShouldNotReachHere();
2022     }
2023   BLOCK_COMMENT("} neon_reduce_logical");
2024 }
2025 
2026 // Helper function to decode min/max reduction operation properties
2027 void C2_MacroAssembler::decode_minmax_reduction_opc(int opc, bool* is_min,
2028                                                     bool* is_unsigned,
2029                                                     Condition* cond) {
2030   switch(opc) {
2031     case Op_MinReductionV:
2032       *is_min = true;  *is_unsigned = false; *cond = LT; break;
2033     case Op_MaxReductionV:
2034       *is_min = false; *is_unsigned = false; *cond = GT; break;
2035     case Op_UMinReductionV:
2036       *is_min = true;  *is_unsigned = true;  *cond = LO; break;
2037     case Op_UMaxReductionV:
2038       *is_min = false; *is_unsigned = true;  *cond = HI; break;
2039     default:
2040       ShouldNotReachHere();
2041   }
2042 }
2043 
2044 // Vector reduction min/max/umin/umax for integral type with ASIMD instructions.
2045 // Note: vtmp is not used and expected to be fnoreg for T_LONG case.
2046 // Clobbers: rscratch1, rflags
2047 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
2048                                                     Register isrc, FloatRegister vsrc,
2049                                                     unsigned vector_length_in_bytes,
2050                                                     FloatRegister vtmp) {
2051   assert(opc == Op_MinReductionV || opc == Op_MaxReductionV ||
2052          opc == Op_UMinReductionV || opc == Op_UMaxReductionV, "unsupported");
2053   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2054   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
2055   assert_different_registers(dst, isrc);
2056   bool isQ = vector_length_in_bytes == 16;
2057   bool is_min;
2058   bool is_unsigned;
2059   Condition cond;
2060   decode_minmax_reduction_opc(opc, &is_min, &is_unsigned, &cond);
2061   BLOCK_COMMENT("neon_reduce_minmax_integral {");
2062     if (bt == T_LONG) {
2063       assert(vtmp == fnoreg, "should be");
2064       assert(isQ, "should be");
2065       umov(rscratch1, vsrc, D, 0);
2066       cmp(isrc, rscratch1);
2067       csel(dst, isrc, rscratch1, cond);
2068       umov(rscratch1, vsrc, D, 1);
2069       cmp(dst, rscratch1);
2070       csel(dst, dst, rscratch1, cond);
2071     } else {
2072       SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
2073       if (size == T2S) {
2074         // For T2S (2x32-bit elements), use pairwise instructions because
2075         // uminv/umaxv/sminv/smaxv don't support arrangement 2S.
2076         neon_minmaxp(is_unsigned, is_min, vtmp, size, vsrc, vsrc);
2077       } else {
2078         // For other sizes, use reduction to scalar instructions.
2079         neon_minmaxv(is_unsigned, is_min, vtmp, size, vsrc);
2080       }
2081       if (bt == T_INT) {
2082         umov(dst, vtmp, S, 0);
2083       } else if (is_unsigned) {
2084         umov(dst, vtmp, elemType_to_regVariant(bt), 0);
2085       } else {
2086         smov(dst, vtmp, elemType_to_regVariant(bt), 0);
2087       }
2088       cmpw(dst, isrc);
2089       cselw(dst, dst, isrc, cond);
2090     }
2091   BLOCK_COMMENT("} neon_reduce_minmax_integral");
2092 }
2093 
2094 // Vector reduction for integral type with SVE instruction.
2095 // Supported operations are Add, And, Or, Xor, Max, Min, UMax, UMin.
2096 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
2097 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
2098                                             FloatRegister src2, PRegister pg, FloatRegister tmp) {
2099   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2100   assert(pg->is_governing(), "This register has to be a governing predicate register");
2101   assert_different_registers(src1, dst);
2102   // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
2103   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2104   switch (opc) {
2105     case Op_AddReductionVI: {
2106       sve_uaddv(tmp, size, pg, src2);
2107       if (bt == T_BYTE) {
2108         smov(dst, tmp, size, 0);
2109         addw(dst, src1, dst, ext::sxtb);
2110       } else if (bt == T_SHORT) {
2111         smov(dst, tmp, size, 0);
2112         addw(dst, src1, dst, ext::sxth);
2113       } else {
2114         umov(dst, tmp, size, 0);
2115         addw(dst, dst, src1);
2116       }
2117       break;
2118     }
2119     case Op_AddReductionVL: {
2120       sve_uaddv(tmp, size, pg, src2);
2121       umov(dst, tmp, size, 0);
2122       add(dst, dst, src1);
2123       break;
2124     }
2125     case Op_AndReductionV: {
2126       sve_andv(tmp, size, pg, src2);
2127       if (bt == T_INT || bt == T_LONG) {
2128         umov(dst, tmp, size, 0);
2129       } else {
2130         smov(dst, tmp, size, 0);
2131       }
2132       if (bt == T_LONG) {
2133         andr(dst, dst, src1);
2134       } else {
2135         andw(dst, dst, src1);
2136       }
2137       break;
2138     }
2139     case Op_OrReductionV: {
2140       sve_orv(tmp, size, pg, src2);
2141       if (bt == T_INT || bt == T_LONG) {
2142         umov(dst, tmp, size, 0);
2143       } else {
2144         smov(dst, tmp, size, 0);
2145       }
2146       if (bt == T_LONG) {
2147         orr(dst, dst, src1);
2148       } else {
2149         orrw(dst, dst, src1);
2150       }
2151       break;
2152     }
2153     case Op_XorReductionV: {
2154       sve_eorv(tmp, size, pg, src2);
2155       if (bt == T_INT || bt == T_LONG) {
2156         umov(dst, tmp, size, 0);
2157       } else {
2158         smov(dst, tmp, size, 0);
2159       }
2160       if (bt == T_LONG) {
2161         eor(dst, dst, src1);
2162       } else {
2163         eorw(dst, dst, src1);
2164       }
2165       break;
2166     }
2167     case Op_MaxReductionV:
2168     case Op_MinReductionV:
2169     case Op_UMaxReductionV:
2170     case Op_UMinReductionV: {
2171       bool is_min;
2172       bool is_unsigned;
2173       Condition cond;
2174       decode_minmax_reduction_opc(opc, &is_min, &is_unsigned, &cond);
2175       sve_minmaxv(is_unsigned, is_min, tmp, size, pg, src2);
2176       // Move result from vector to general register
2177       if (is_unsigned || bt == T_INT || bt == T_LONG) {
2178         umov(dst, tmp, size, 0);
2179       } else {
2180         smov(dst, tmp, size, 0);
2181       }
2182       if (bt == T_LONG) {
2183         cmp(dst, src1);
2184         csel(dst, dst, src1, cond);
2185       } else {
2186         cmpw(dst, src1);
2187         cselw(dst, dst, src1, cond);
2188       }
2189       break;
2190     }
2191     default:
2192       assert(false, "unsupported");
2193       ShouldNotReachHere();
2194   }
2195 
2196   if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
2197     if (bt == T_BYTE) {
2198       sxtb(dst, dst);
2199     } else if (bt == T_SHORT) {
2200       sxth(dst, dst);
2201     }
2202   }
2203 }
2204 
2205 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
2206 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
2207 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
2208 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
2209   uint32_t max_vector_length = Matcher::max_vector_size(bt);
2210   assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
2211 
2212   // Set all elements to false if the input "lane_cnt" is zero.
2213   if (lane_cnt == 0) {
2214     sve_pfalse(dst);
2215     return;
2216   }
2217 
2218   SIMD_RegVariant size = elemType_to_regVariant(bt);
2219   assert(size != Q, "invalid size");
2220 
2221   // Set all true if "lane_cnt" equals to the max lane count.
2222   if (lane_cnt == max_vector_length) {
2223     sve_ptrue(dst, size, /* ALL */ 0b11111);
2224     return;
2225   }
2226 
2227   // Fixed numbers for "ptrue".
2228   switch(lane_cnt) {
2229   case 1: /* VL1 */
2230   case 2: /* VL2 */
2231   case 3: /* VL3 */
2232   case 4: /* VL4 */
2233   case 5: /* VL5 */
2234   case 6: /* VL6 */
2235   case 7: /* VL7 */
2236   case 8: /* VL8 */
2237     sve_ptrue(dst, size, lane_cnt);
2238     return;
2239   case 16:
2240     sve_ptrue(dst, size, /* VL16 */ 0b01001);
2241     return;
2242   case 32:
2243     sve_ptrue(dst, size, /* VL32 */ 0b01010);
2244     return;
2245   case 64:
2246     sve_ptrue(dst, size, /* VL64 */ 0b01011);
2247     return;
2248   case 128:
2249     sve_ptrue(dst, size, /* VL128 */ 0b01100);
2250     return;
2251   case 256:
2252     sve_ptrue(dst, size, /* VL256 */ 0b01101);
2253     return;
2254   default:
2255     break;
2256   }
2257 
2258   // Special patterns for "ptrue".
2259   if (lane_cnt == round_down_power_of_2(max_vector_length)) {
2260     sve_ptrue(dst, size, /* POW2 */ 0b00000);
2261   } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
2262     sve_ptrue(dst, size, /* MUL4 */ 0b11101);
2263   } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
2264     sve_ptrue(dst, size, /* MUL3 */ 0b11110);
2265   } else {
2266     // Encode to "whileltw" for the remaining cases.
2267     mov(rscratch1, lane_cnt);
2268     sve_whileltw(dst, size, zr, rscratch1);
2269   }
2270 }
2271 
2272 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
2273 // Any remaining elements of dst will be filled with zero.
2274 // Clobbers: rscratch1
2275 // Preserves: mask, vzr
2276 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
2277                                            FloatRegister vzr, FloatRegister vtmp,
2278                                            PRegister pgtmp, unsigned vector_length_in_bytes) {
2279   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2280   // When called by sve_compress_byte, src and vtmp may be the same register.
2281   assert_different_registers(dst, src, vzr);
2282   assert_different_registers(dst, vtmp, vzr);
2283   assert_different_registers(mask, pgtmp);
2284   // high <-- low
2285   // Example input:   src   = hh gg ff ee dd cc bb aa, one character is 8 bits.
2286   //                  mask  = 01 00 00 01 01 00 01 01, one character is 1 bit.
2287   // Expected result: dst   = 00 00 00 hh ee dd bb aa
2288 
2289   // Extend lowest half to type INT.
2290   // dst   =  00dd  00cc  00bb  00aa
2291   sve_uunpklo(dst, S, src);
2292   // pgtmp =  0001  0000  0001  0001
2293   sve_punpklo(pgtmp, mask);
2294   // Pack the active elements in size of type INT to the right,
2295   // and fill the remainings with zero.
2296   // dst   =  0000  00dd  00bb  00aa
2297   sve_compact(dst, S, dst, pgtmp);
2298   // Narrow the result back to type SHORT.
2299   // dst   = 00 00 00 00 00 dd bb aa
2300   sve_uzp1(dst, H, dst, vzr);
2301 
2302   // Return if the vector length is no more than MaxVectorSize/2, since the
2303   // highest half is invalid.
2304   if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
2305     return;
2306   }
2307 
2308   // Count the active elements of lowest half.
2309   // rscratch1 = 3
2310   sve_cntp(rscratch1, S, ptrue, pgtmp);
2311 
2312   // Repeat to the highest half.
2313   // pgtmp =  0001  0000  0000  0001
2314   sve_punpkhi(pgtmp, mask);
2315   // vtmp  =  00hh  00gg  00ff  00ee
2316   sve_uunpkhi(vtmp, S, src);
2317   // vtmp  =  0000  0000  00hh  00ee
2318   sve_compact(vtmp, S, vtmp, pgtmp);
2319   // vtmp  = 00 00 00 00 00 00 hh ee
2320   sve_uzp1(vtmp, H, vtmp, vzr);
2321 
2322   // pgtmp = 00 00 00 00 00 01 01 01
2323   sve_whilelt(pgtmp, H, zr, rscratch1);
2324   // Compressed low:  dst  = 00 00 00 00 00 dd bb aa
2325   // Compressed high: vtmp = 00 00 00 00 00 00 hh ee
2326   // Combine the compressed low with the compressed high:
2327   //                  dst  = 00 00 00 hh ee dd bb aa
2328   sve_splice(dst, H, pgtmp, vtmp);
2329 }
2330 
2331 // Clobbers: rscratch1, rscratch2
2332 // Preserves: src, mask
2333 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
2334                                           FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
2335                                           PRegister ptmp, PRegister pgtmp, unsigned vector_length_in_bytes) {
2336   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2337   assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3);
2338   assert_different_registers(mask, ptmp, pgtmp);
2339   // high <-- low
2340   // Example input:   src   = q p n m l k j i h g f e d c b a, one character is 8 bits.
2341   //                  mask  = 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 1, one character is 1 bit.
2342   // Expected result: dst   = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
2343   FloatRegister vzr = vtmp3;
2344   sve_dup(vzr, B, 0);
2345 
2346   // Extend lowest half to type SHORT.
2347   // vtmp1 =  0h  0g  0f  0e  0d  0c  0b  0a
2348   sve_uunpklo(vtmp1, H, src);
2349   // ptmp  =  00  01  00  00  00  01  00  01
2350   sve_punpklo(ptmp, mask);
2351   // Pack the active elements in size of type SHORT to the right,
2352   // and fill the remainings with zero.
2353   // dst   =  00  00  00  00  00  0g  0c  0a
2354   unsigned extended_size = vector_length_in_bytes << 1;
2355   sve_compress_short(dst, vtmp1, ptmp, vzr, vtmp2, pgtmp, extended_size > MaxVectorSize ? MaxVectorSize : extended_size);
2356   // Narrow the result back to type BYTE.
2357   // dst   = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
2358   sve_uzp1(dst, B, dst, vzr);
2359 
2360   // Return if the vector length is no more than MaxVectorSize/2, since the
2361   // highest half is invalid.
2362   if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
2363     return;
2364   }
2365   // Count the active elements of lowest half.
2366   // rscratch2 = 3
2367   sve_cntp(rscratch2, H, ptrue, ptmp);
2368 
2369   // Repeat to the highest half.
2370   // ptmp  =  00  01  00  00  00  00  00  01
2371   sve_punpkhi(ptmp, mask);
2372   // vtmp2 =  0q  0p  0n  0m  0l  0k  0j  0i
2373   sve_uunpkhi(vtmp2, H, src);
2374   // vtmp1 =  00  00  00  00  00  00  0p  0i
2375   sve_compress_short(vtmp1, vtmp2, ptmp, vzr, vtmp2, pgtmp, extended_size - MaxVectorSize);
2376   // vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
2377   sve_uzp1(vtmp1, B, vtmp1, vzr);
2378 
2379   // ptmp  = 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
2380   sve_whilelt(ptmp, B, zr, rscratch2);
2381   // Compressed low:  dst   = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
2382   // Compressed high: vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
2383   // Combine the compressed low with the compressed high:
2384   //                  dst   = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
2385   sve_splice(dst, B, ptmp, vtmp1);
2386 }
2387 
2388 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2389   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2390   SIMD_Arrangement size = isQ ? T16B : T8B;
2391   if (bt == T_BYTE) {
2392     rbit(dst, size, src);
2393   } else {
2394     neon_reverse_bytes(dst, src, bt, isQ);
2395     rbit(dst, size, dst);
2396   }
2397 }
2398 
2399 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2400   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2401   SIMD_Arrangement size = isQ ? T16B : T8B;
2402   switch (bt) {
2403     case T_BYTE:
2404       if (dst != src) {
2405         orr(dst, size, src, src);
2406       }
2407       break;
2408     case T_SHORT:
2409       rev16(dst, size, src);
2410       break;
2411     case T_INT:
2412       rev32(dst, size, src);
2413       break;
2414     case T_LONG:
2415       rev64(dst, size, src);
2416       break;
2417     default:
2418       assert(false, "unsupported");
2419       ShouldNotReachHere();
2420   }
2421 }
2422 
2423 // VectorRearrange implementation for short/int/float/long/double types with NEON
2424 // instructions. For VectorRearrange short/int/float, we use NEON tbl instruction.
2425 // But since it supports bytes table only, we need to lookup 2/4 bytes as a group.
2426 // For VectorRearrange long/double, we compare the shuffle input with iota indices,
2427 // and use bsl to implement the operation.
2428 void C2_MacroAssembler::neon_rearrange_hsd(FloatRegister dst, FloatRegister src,
2429                                            FloatRegister shuffle, FloatRegister tmp,
2430                                            BasicType bt, bool isQ) {
2431   assert_different_registers(dst, src, shuffle, tmp);
2432   SIMD_Arrangement size1 = isQ ? T16B : T8B;
2433   SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2434 
2435   // Here is an example that rearranges a NEON vector with 4 ints:
2436   // Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1]
2437   //   1. We assume the shuffle input is Vi int[2, 3, 0, 1].
2438   //   2. Multiply Vi int[2, 3, 0, 1] with constant int vector
2439   //      [0x04040404, 0x04040404, 0x04040404, 0x04040404], and get
2440   //      tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404].
2441   //   3. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100],
2442   //      and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504]
2443   //   4. Use Vm as index register, and use V1 as table register.
2444   //      Then get V2 as the result by tbl NEON instructions.
2445   switch (bt) {
2446     case T_SHORT:
2447       mov(tmp, size1, 0x02);
2448       mulv(dst, size2, shuffle, tmp);
2449       mov(tmp, size2, 0x0100);
2450       addv(dst, size1, dst, tmp);
2451       tbl(dst, size1, src, 1, dst);
2452       break;
2453     case T_INT:
2454     case T_FLOAT:
2455       mov(tmp, size1, 0x04);
2456       mulv(dst, size2, shuffle, tmp);
2457       mov(tmp, size2, 0x03020100);
2458       addv(dst, size1, dst, tmp);
2459       tbl(dst, size1, src, 1, dst);
2460       break;
2461     case T_LONG:
2462     case T_DOUBLE:
2463       {
2464         int idx = vector_iota_entry_index(T_LONG);
2465         lea(rscratch1,
2466             ExternalAddress(StubRoutines::aarch64::vector_iota_indices(idx)));
2467         ldrq(tmp, rscratch1);
2468         // Check whether the input "shuffle" is the same with iota indices.
2469         // Return "src" if true, otherwise swap the two elements of "src".
2470         cm(EQ, dst, size2, shuffle, tmp);
2471         ext(tmp, size1, src, src, 8);
2472         bsl(dst, size1, src, tmp);
2473       }
2474       break;
2475     default:
2476       assert(false, "unsupported element type");
2477       ShouldNotReachHere();
2478   }
2479 }
2480 
2481 // Extract a scalar element from an sve vector at position 'idx'.
2482 // The input elements in src are expected to be of integral type.
2483 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2484                                              int idx, FloatRegister vtmp) {
2485   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2486   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2487   if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
2488     if (bt == T_INT || bt == T_LONG) {
2489       umov(dst, src, size, idx);
2490     } else {
2491       smov(dst, src, size, idx);
2492     }
2493   } else {
2494     sve_movprfx(vtmp, src);
2495     // Although vtmp and src hold the same value after movprfx, we must use src
2496     // (not vtmp) as the second source of ext. The movprfx destination register
2497     // must not appear in any source operand of the following instruction except
2498     // as the destructive operand.
2499     sve_ext(vtmp, src, idx << size);
2500     if (bt == T_INT || bt == T_LONG) {
2501       umov(dst, vtmp, size, 0);
2502     } else {
2503       smov(dst, vtmp, size, 0);
2504     }
2505   }
2506 }
2507 
2508 // java.lang.Math::round intrinsics
2509 
2510 // Clobbers: rscratch1, rflags
2511 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2512                                           FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
2513   assert_different_registers(tmp1, tmp2, tmp3, src, dst);
2514   switch (T) {
2515     case T2S:
2516     case T4S:
2517       fmovs(tmp1, T, 0.5f);
2518       mov(rscratch1, jint_cast(0x1.0p23f));
2519       break;
2520     case T2D:
2521       fmovd(tmp1, T, 0.5);
2522       mov(rscratch1, julong_cast(0x1.0p52));
2523       break;
2524     default:
2525       assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2526   }
2527   fadd(tmp1, T, tmp1, src);
2528   fcvtms(tmp1, T, tmp1);
2529   // tmp1 = floor(src + 0.5, ties to even)
2530 
2531   fcvtas(dst, T, src);
2532   // dst = round(src), ties to away
2533 
2534   fneg(tmp3, T, src);
2535   dup(tmp2, T, rscratch1);
2536   cm(HS, tmp3, T, tmp3, tmp2);
2537   // tmp3 is now a set of flags
2538 
2539   bif(dst, T16B, tmp1, tmp3);
2540   // result in dst
2541 }
2542 
2543 // Clobbers: rscratch1, rflags
2544 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2545                                          FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
2546   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2547   assert_different_registers(tmp1, tmp2, src, dst);
2548 
2549   switch (T) {
2550     case S:
2551       mov(rscratch1, jint_cast(0x1.0p23f));
2552       break;
2553     case D:
2554       mov(rscratch1, julong_cast(0x1.0p52));
2555       break;
2556     default:
2557       assert(T == S || T == D, "invalid register variant");
2558   }
2559 
2560   sve_frinta(dst, T, ptrue, src);
2561   // dst = round(src), ties to away
2562 
2563   Label none;
2564 
2565   sve_fneg(tmp1, T, ptrue, src);
2566   sve_dup(tmp2, T, rscratch1);
2567   sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
2568   br(EQ, none);
2569   {
2570     sve_cpy(tmp1, T, pgtmp, 0.5);
2571     sve_fadd(tmp1, T, pgtmp, src);
2572     sve_frintm(dst, T, pgtmp, tmp1);
2573     // dst = floor(src + 0.5, ties to even)
2574   }
2575   bind(none);
2576 
2577   sve_fcvtzs(dst, T, ptrue, dst, T);
2578   // result in dst
2579 }
2580 
2581 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2582                                            FloatRegister one, SIMD_Arrangement T) {
2583   assert_different_registers(dst, src, zero, one);
2584   assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2585 
2586   facgt(dst, T, src, zero);
2587   ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
2588   bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2589 }
2590 
2591 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2592                                           FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
2593     assert_different_registers(dst, src, zero, one, vtmp);
2594     assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2595 
2596     sve_orr(vtmp, src, src);
2597     sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pmtp=0 for +-0.0 and NaN. 0x1 otherwise
2598     switch (T) {
2599     case S:
2600       sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
2601       sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
2602                                         // on the sign of the float value
2603       break;
2604     case D:
2605       sve_and(vtmp, T, min_jlong);
2606       sve_orr(vtmp, T, jlong_cast(1.0));
2607       break;
2608     default:
2609       assert(false, "unsupported");
2610       ShouldNotReachHere();
2611     }
2612     sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
2613                                        // Result in dst
2614 }
2615 
2616 bool C2_MacroAssembler::in_scratch_emit_size() {
2617   if (ciEnv::current()->task() != nullptr) {
2618     PhaseOutput* phase_output = Compile::current()->output();
2619     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2620       return true;
2621     }
2622   }
2623   return MacroAssembler::in_scratch_emit_size();
2624 }
2625 
2626 static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
2627   fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
2628 }
2629 
2630 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register rval, Register rtmp) {
2631   assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
2632   if (t == TypeInt::INT) {
2633     return;
2634   }
2635 
2636   BLOCK_COMMENT("verify_int_in_range {");
2637   Label L_success, L_failure;
2638 
2639   jint lo = t->_lo;
2640   jint hi = t->_hi;
2641 
2642   if (lo != min_jint) {
2643     subsw(rtmp, rval, lo);
2644     br(Assembler::LT, L_failure);
2645   }
2646   if (hi != max_jint) {
2647     subsw(rtmp, rval, hi);
2648     br(Assembler::GT, L_failure);
2649   }
2650   b(L_success);
2651 
2652   bind(L_failure);
2653   movw(c_rarg0, idx);
2654   mov(c_rarg1, rval);
2655   movw(c_rarg2, lo);
2656   movw(c_rarg3, hi);
2657   reconstruct_frame_pointer(rtmp);
2658   rt_call(CAST_FROM_FN_PTR(address, abort_verify_int_in_range), rtmp);
2659   hlt(0);
2660 
2661   bind(L_success);
2662   BLOCK_COMMENT("} verify_int_in_range");
2663 }
2664 
2665 static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
2666   fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
2667 }
2668 
2669 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register rval, Register rtmp) {
2670   assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
2671   if (t == TypeLong::LONG) {
2672     return;
2673   }
2674 
2675   BLOCK_COMMENT("verify_long_in_range {");
2676   Label L_success, L_failure;
2677 
2678   jlong lo = t->_lo;
2679   jlong hi = t->_hi;
2680 
2681   if (lo != min_jlong) {
2682     subs(rtmp, rval, lo);
2683     br(Assembler::LT, L_failure);
2684   }
2685   if (hi != max_jlong) {
2686     subs(rtmp, rval, hi);
2687     br(Assembler::GT, L_failure);
2688   }
2689   b(L_success);
2690 
2691   bind(L_failure);
2692   movw(c_rarg0, idx);
2693   mov(c_rarg1, rval);
2694   mov(c_rarg2, lo);
2695   mov(c_rarg3, hi);
2696   reconstruct_frame_pointer(rtmp);
2697   rt_call(CAST_FROM_FN_PTR(address, abort_verify_long_in_range), rtmp);
2698   hlt(0);
2699 
2700   bind(L_success);
2701   BLOCK_COMMENT("} verify_long_in_range");
2702 }
2703 
2704 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
2705   const int framesize = Compile::current()->output()->frame_size_in_bytes();
2706   if (PreserveFramePointer) {
2707     // frame pointer is valid
2708 #ifdef ASSERT
2709     // Verify frame pointer value in rfp.
2710     add(rtmp, sp, framesize - 2 * wordSize);
2711     Label L_success;
2712     cmp(rfp, rtmp);
2713     br(Assembler::EQ, L_success);
2714     stop("frame pointer mismatch");
2715     bind(L_success);
2716 #endif // ASSERT
2717   } else {
2718     add(rfp, sp, framesize - 2 * wordSize);
2719   }
2720 }
2721 
2722 // Selects elements from two source vectors (src1, src2) based on index values in the index register
2723 // using Neon instructions and places it in the destination vector element corresponding to the
2724 // index vector element. Each index in the index register must be in the range - [0, 2 * NUM_ELEM),
2725 // where NUM_ELEM is the number of BasicType elements per vector.
2726 // If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
2727 // Otherwise, selects src2[idx – NUM_ELEM]
2728 void C2_MacroAssembler::select_from_two_vectors_neon(FloatRegister dst, FloatRegister src1,
2729                                                      FloatRegister src2, FloatRegister index,
2730                                                      FloatRegister tmp, unsigned vector_length_in_bytes) {
2731   assert_different_registers(dst, src1, src2, tmp);
2732   SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
2733 
2734   if (vector_length_in_bytes == 16) {
2735     assert(UseSVE <= 1, "sve must be <= 1");
2736     assert(src1->successor() == src2, "Source registers must be ordered");
2737     // If the vector length is 16B, then use the Neon "tbl" instruction with two vector table
2738     tbl(dst, size, src1, 2, index);
2739   } else { // vector length == 8
2740     assert(UseSVE == 0, "must be Neon only");
2741     // We need to fit both the source vectors (src1, src2) in a 128-bit register because the
2742     // Neon "tbl" instruction supports only looking up 16B vectors. We then use the Neon "tbl"
2743     // instruction with one vector lookup
2744     ins(tmp, D, src1, 0, 0);
2745     ins(tmp, D, src2, 1, 0);
2746     tbl(dst, size, tmp, 1, index);
2747   }
2748 }
2749 
2750 // Selects elements from two source vectors (src1, src2) based on index values in the index register
2751 // using SVE/SVE2 instructions and places it in the destination vector element corresponding to the
2752 // index vector element. Each index in the index register must be in the range - [0, 2 * NUM_ELEM),
2753 // where NUM_ELEM is the number of BasicType elements per vector.
2754 // If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
2755 // Otherwise, selects src2[idx – NUM_ELEM]
2756 void C2_MacroAssembler::select_from_two_vectors_sve(FloatRegister dst, FloatRegister src1,
2757                                                     FloatRegister src2, FloatRegister index,
2758                                                     FloatRegister tmp, SIMD_RegVariant T,
2759                                                     unsigned vector_length_in_bytes) {
2760   assert_different_registers(dst, src1, src2, index, tmp);
2761 
2762   if (vector_length_in_bytes == 8) {
2763     // We need to fit both the source vectors (src1, src2) in a single vector register because the
2764     // SVE "tbl" instruction is unpredicated and works on the entire vector which can lead to
2765     // incorrect results if each source vector is only partially filled. We then use the SVE "tbl"
2766     // instruction with one vector lookup
2767     assert(UseSVE >= 1, "sve must be >= 1");
2768     ins(tmp, D, src1, 0, 0);
2769     ins(tmp, D, src2, 1, 0);
2770     sve_tbl(dst, T, tmp, index);
2771   } else {  // UseSVE == 2 and vector_length_in_bytes > 8
2772     // If the vector length is > 8, then use the SVE2 "tbl" instruction with the two vector table.
2773     // The assertion - vector_length_in_bytes == MaxVectorSize ensures that this operation
2774     // is not executed on machines where vector_length_in_bytes < MaxVectorSize
2775     // with the only exception of 8B vector length.
2776     assert(UseSVE == 2 && vector_length_in_bytes == MaxVectorSize, "must be");
2777     assert(src1->successor() == src2, "Source registers must be ordered");
2778     sve_tbl(dst, T, src1, src2, index);
2779   }
2780 }
2781 
2782 void C2_MacroAssembler::select_from_two_vectors(FloatRegister dst, FloatRegister src1,
2783                                                 FloatRegister src2, FloatRegister index,
2784                                                 FloatRegister tmp, BasicType bt,
2785                                                 unsigned vector_length_in_bytes) {
2786 
2787   assert_different_registers(dst, src1, src2, index, tmp);
2788 
2789   // The cases that can reach this method are -
2790   // - UseSVE = 0/1, vector_length_in_bytes = 8 or 16, excluding double and long types
2791   // - UseSVE = 2, vector_length_in_bytes >= 8, for all types
2792   //
2793   // SVE/SVE2 tbl instructions are generated when UseSVE = 1 with vector_length_in_bytes = 8
2794   // and UseSVE = 2 with vector_length_in_bytes >= 8
2795   //
2796   // Neon instructions are generated when UseSVE = 0 with vector_length_in_bytes = 8 or 16 and
2797   // UseSVE = 1 with vector_length_in_bytes = 16
2798 
2799   if ((UseSVE == 1 && vector_length_in_bytes == 8) || UseSVE == 2) {
2800     SIMD_RegVariant T = elemType_to_regVariant(bt);
2801     select_from_two_vectors_sve(dst, src1, src2, index, tmp, T, vector_length_in_bytes);
2802     return;
2803   }
2804 
2805   // The only BasicTypes that can reach here are T_SHORT, T_BYTE, T_INT and T_FLOAT
2806   assert(bt != T_DOUBLE && bt != T_LONG, "unsupported basic type");
2807   assert(vector_length_in_bytes <= 16, "length_in_bytes must be <= 16");
2808 
2809   bool isQ = vector_length_in_bytes == 16;
2810 
2811   SIMD_Arrangement size1 = isQ ? T16B : T8B;
2812   SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2813 
2814   // Neon "tbl" instruction only supports byte tables, so we need to look at chunks of
2815   // 2B for selecting shorts or chunks of 4B for selecting ints/floats from the table.
2816   // The index values in "index" register are in the range of [0, 2 * NUM_ELEM) where NUM_ELEM
2817   // is the number of elements that can fit in a vector. For ex. for T_SHORT with 64-bit vector length,
2818   // the indices can range from [0, 8).
2819   // As an example with 64-bit vector length and T_SHORT type - let index = [2, 5, 1, 0]
2820   // Move a constant 0x02 in every byte of tmp - tmp = [0x0202, 0x0202, 0x0202, 0x0202]
2821   // Multiply index vector with tmp to yield - dst = [0x0404, 0x0a0a, 0x0202, 0x0000]
2822   // Move a constant 0x0100 in every 2B of tmp - tmp = [0x0100, 0x0100, 0x0100, 0x0100]
2823   // Add the multiplied result to the vector in tmp to obtain the byte level
2824   // offsets - dst = [0x0504, 0x0b0a, 0x0302, 0x0100]
2825   // Use these offsets in the "tbl" instruction to select chunks of 2B.
2826 
2827   if (bt == T_BYTE) {
2828     select_from_two_vectors_neon(dst, src1, src2, index, tmp, vector_length_in_bytes);
2829   } else {
2830     int elem_size = (bt == T_SHORT) ? 2 : 4;
2831     uint64_t tbl_offset = (bt == T_SHORT) ? 0x0100u : 0x03020100u;
2832 
2833     mov(tmp, size1, elem_size);
2834     mulv(dst, size2, index, tmp);
2835     mov(tmp, size2, tbl_offset);
2836     addv(dst, size1, dst, tmp); // "dst" now contains the processed index elements
2837                                 // to select a set of 2B/4B
2838     select_from_two_vectors_neon(dst, src1, src2, dst, tmp, vector_length_in_bytes);
2839   }
2840 }
2841 
2842 // Vector expand implementation. Elements from the src vector are expanded into
2843 // the dst vector under the control of the vector mask.
// Since there are no native instructions directly corresponding to expand before
// SVE2p2, the following implementations mainly leverage the TBL instruction to
// implement expand. To compute the index input for TBL, the prefix sum algorithm
// (https://en.wikipedia.org/wiki/Prefix_sum) is used. The same algorithm is used
// for NEON and SVE, but with different instructions where appropriate.
2849 
2850 // Vector expand implementation for NEON.
2851 //
2852 // An example of 128-bit Byte vector:
2853 //   Data direction: high <== low
2854 //   Input:
2855 //         src   = g  f  e  d  c  b  a  9  8  7  6  5  4  3  2  1
2856 //         mask  = 0  0 -1 -1  0  0 -1 -1  0  0 -1 -1  0  0 -1 -1
2857 //   Expected result:
2858 //         dst   = 0  0  8  7  0  0  6  5  0  0  4  3  0  0  2  1
2859 void C2_MacroAssembler::vector_expand_neon(FloatRegister dst, FloatRegister src, FloatRegister mask,
2860                                            FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
2861                                            int vector_length_in_bytes) {
2862   assert(vector_length_in_bytes <= 16, "the vector length in bytes for NEON must be <= 16");
2863   assert_different_registers(dst, src, mask, tmp1, tmp2);
2864   // Since the TBL instruction only supports byte table, we need to
2865   // compute indices in byte type for all types.
2866   SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
2867   // tmp1 =  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
2868   dup(tmp1, size, zr);
2869   // dst  =  0  0  1  1  0  0  1  1  0  0  1  1  0  0  1  1
2870   negr(dst, size, mask);
2871   // Calculate vector index for TBL with prefix sum algorithm.
2872   // dst  =  8  8  8  7  6  6  6  5  4  4  4  3  2  2  2  1
2873   for (int i = 1; i < vector_length_in_bytes; i <<= 1) {
2874     ext(tmp2, size, tmp1, dst, vector_length_in_bytes - i);
2875     addv(dst, size, tmp2, dst);
2876   }
2877   // tmp2 =  0  0 -1 -1  0  0 -1 -1  0  0 -1 -1  0  0 -1 -1
2878   orr(tmp2, size, mask, mask);
2879   // tmp2 =  0  0  8  7  0  0  6  5  0  0  4  3  0  0  2  1
2880   bsl(tmp2, size, dst, tmp1);
2881   // tmp1 =  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
2882   movi(tmp1, size, 1);
2883   // dst  = -1 -1  7  6 -1 -1  5  4 -1 -1  3  2 -1 -1  1  0
2884   subv(dst, size, tmp2, tmp1);
2885   // dst  =  0  0  8  7  0  0  6  5  0  0  4  3  0  0  2  1
2886   tbl(dst, size, src, 1, dst);
2887 }
2888 
2889 // Vector expand implementation for SVE.
2890 //
2891 // An example of 128-bit Short vector:
2892 //   Data direction: high <== low
2893 //   Input:
2894 //         src   = gf ed cb a9 87 65 43 21
2895 //         pg    = 00 01 00 01 00 01 00 01
2896 //   Expected result:
2897 //         dst   = 00 87 00 65 00 43 00 21
2898 void C2_MacroAssembler::vector_expand_sve(FloatRegister dst, FloatRegister src, PRegister pg,
2899                                           FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
2900                                           int vector_length_in_bytes) {
2901   assert(UseSVE > 0, "expand implementation only for SVE");
2902   assert_different_registers(dst, src, tmp1, tmp2);
2903   SIMD_RegVariant size = elemType_to_regVariant(bt);
2904 
2905   // tmp1 = 00 00 00 00 00 00 00 00
2906   sve_dup(tmp1, size, 0);
2907   sve_movprfx(tmp2, tmp1);
2908   // tmp2 = 00 01 00 01 00 01 00 01
2909   sve_cpy(tmp2, size, pg, 1, true);
2910   // Calculate vector index for TBL with prefix sum algorithm.
2911   // tmp2 = 04 04 03 03 02 02 01 01
2912   for (int i = type2aelembytes(bt); i < vector_length_in_bytes; i <<= 1) {
2913     sve_movprfx(dst, tmp1);
2914     // The EXT instruction operates on the full-width sve register. The correct
2915     // index calculation method is:
2916     // vector_length_in_bytes - i + MaxVectorSize - vector_length_in_bytes =>
2917     // MaxVectorSize - i.
2918     sve_ext(dst, tmp2, MaxVectorSize - i);
2919     sve_add(tmp2, size, dst, tmp2);
2920   }
2921   // dst  = 00 04 00 03 00 02 00 01
2922   sve_sel(dst, size, pg, tmp2, tmp1);
2923   // dst  = -1 03 -1 02 -1 01 -1 00
2924   sve_sub(dst, size, 1);
2925   // dst  = 00 87 00 65 00 43 00 21
2926   sve_tbl(dst, size, src, dst);
2927 }
2928 
2929 // Optimized SVE cpy (imm, zeroing) instruction.
2930 //
2931 // `movi; cpy(imm, merging)` and `cpy(imm, zeroing)` have the same
2932 // functionality, but test results show that `movi; cpy(imm, merging)` has
2933 // higher throughput on some microarchitectures. This would depend on
2934 // microarchitecture and so may vary between implementations.
2935 void C2_MacroAssembler::sve_cpy(FloatRegister dst, SIMD_RegVariant T,
2936                                 PRegister pg, int imm8, bool isMerge) {
2937   if (VM_Version::prefer_sve_merging_mode_cpy() && !isMerge) {
2938     // Generates a NEON instruction `movi V<dst>.2d, #0`.
2939     // On AArch64, Z and V registers alias in the low 128 bits, so V<dst> is
2940     // the low 128 bits of Z<dst>. A write to V<dst> also clears all bits of
2941     // Z<dst> above 128, so this `movi` instruction effectively zeroes the
2942     // entire Z<dst> register. According to the Arm Software Optimization
2943     // Guide, `movi` is zero latency.
2944     movi(dst, T2D, 0);
2945     isMerge = true;
2946   }
2947   Assembler::sve_cpy(dst, T, pg, imm8, isMerge);
2948 }
2949 
2950 int C2_MacroAssembler::vector_iota_entry_index(BasicType bt) {
2951   // The vector iota entries array is ordered by type B/S/I/L/F/D, and
2952   // the offset between two types is 16.
2953   switch(bt) {
2954   case T_BYTE:
2955     return 0;
2956   case T_SHORT:
2957     return 1;
2958   case T_INT:
2959     return 2;
2960   case T_LONG:
2961     return 3;
2962   case T_FLOAT:
2963     return 4;
2964   case T_DOUBLE:
2965     return 5;
2966   default:
2967     ShouldNotReachHere();
2968   }
2969 }