1 /*
   2  * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "asm/assembler.hpp"
  26 #include "asm/assembler.inline.hpp"
  27 #include "opto/c2_MacroAssembler.hpp"
  28 #include "opto/compile.hpp"
  29 #include "opto/intrinsicnode.hpp"
  30 #include "opto/matcher.hpp"
  31 #include "opto/output.hpp"
  32 #include "opto/subnode.hpp"
  33 #include "runtime/stubRoutines.hpp"
  34 #include "utilities/globalDefinitions.hpp"
  35 #include "utilities/powerOfTwo.hpp"
  36 
  37 #ifdef PRODUCT
  38 #define BLOCK_COMMENT(str) /* nothing */
  39 #define STOP(error) stop(error)
  40 #else
  41 #define BLOCK_COMMENT(str) block_comment(str)
  42 #define STOP(error) block_comment(error); stop(error)
  43 #endif
  44 
  45 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  46 
  47 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
  48 
  49 // jdk.internal.util.ArraysSupport.vectorizedHashCode
  50 address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
  51                                            FloatRegister vdata0, FloatRegister vdata1,
  52                                            FloatRegister vdata2, FloatRegister vdata3,
  53                                            FloatRegister vmul0, FloatRegister vmul1,
  54                                            FloatRegister vmul2, FloatRegister vmul3,
  55                                            FloatRegister vpow, FloatRegister vpowm,
  56                                            BasicType eltype) {
  57   ARRAYS_HASHCODE_REGISTERS;
  58 
  59   Register tmp1 = rscratch1, tmp2 = rscratch2;
  60 
  61   Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE;
  62 
  // Vectorization factor. Number of array elements loaded into one SIMD&FP register by the stubs.
  // We use an 8H load arrangement for chars and shorts and 8B for booleans and bytes. It's possible
  // to use 4H for chars and shorts instead, but using 8H gives better performance.
  66   const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8
  67                     : eltype == T_CHAR || eltype == T_SHORT ? 8
  68                     : eltype == T_INT                       ? 4
  69                                                             : 0;
  70   guarantee(vf, "unsupported eltype");
  71 
  72   // Unroll factor for the scalar loop below. The value is chosen based on performance analysis.
  73   const size_t unroll_factor = 4;
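
  // For reference, the value computed here is the usual polynomial hash (illustrative
  // sketch only, not emitted code):
  //
  //   int h = result;
  //   for (int i = 0; i < cnt; i++) {
  //     h = 31 * h + ary[i];   // 31 == 0x1f, the multiplier loaded into tmp2 below
  //   }
  //   result = h;
  //
  // Both the vector stub and the unrolled scalar loop below compute this value.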
  74 
  75   switch (eltype) {
  76   case T_BOOLEAN:
  77     BLOCK_COMMENT("arrays_hashcode(unsigned byte) {");
  78     break;
  79   case T_CHAR:
  80     BLOCK_COMMENT("arrays_hashcode(char) {");
  81     break;
  82   case T_BYTE:
  83     BLOCK_COMMENT("arrays_hashcode(byte) {");
  84     break;
  85   case T_SHORT:
  86     BLOCK_COMMENT("arrays_hashcode(short) {");
  87     break;
  88   case T_INT:
  89     BLOCK_COMMENT("arrays_hashcode(int) {");
  90     break;
  91   default:
  92     ShouldNotReachHere();
  93   }
  94 
  95   // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop
  96   // implemented by the stub executes just once. Call the stub only if at least two iterations will
  97   // be executed.
  98   const size_t large_threshold = eltype == T_INT ? vf * 2 : vf;
  99   cmpw(cnt, large_threshold);
 100   br(Assembler::HS, LARGE);
 101 
 102   bind(TAIL);
 103 
  // The andr computes cnt % uf, where uf = unroll_factor. The subtract, shifted left by 3, moves
  // the branch target back from BR_BASE past cnt % uf pairs of load + madd insns, so only the last
  // cnt % uf load + madd pairs are executed before BR_BASE is reached. Full passes over the loop
  // then consume the rest, uf elements at a time.
 107   assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC");
 108   andr(tmp2, cnt, unroll_factor - 1);
 109   adr(tmp1, BR_BASE);
  // For Cortex-A53 the shift is 4 because each load + madd pair is padded with 2 nops (4 instructions).
 111   sub(tmp1, tmp1, tmp2, ext::sxtw, VM_Version::supports_a53mac() ? 4 : 3);
 112   movw(tmp2, 0x1f);
 113   br(tmp1);
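  // E.g. with unroll_factor == 4 and cnt % 4 == 1, the branch lands one load + madd pair
  // (8 bytes, or 16 bytes with the Cortex-A53 nops) before BR_BASE, so exactly one element
  // is consumed before the subsw/br at BR_BASE takes over.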
 114 
 115   bind(LOOP);
 116   for (size_t i = 0; i < unroll_factor; ++i) {
 117     load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype);
 118     maddw(result, result, tmp2, tmp1);
    // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
    // Generate a 2nd nop so that each iteration is 4 instructions.
 121     if (VM_Version::supports_a53mac()) {
 122       nop();
 123     }
 124   }
 125   bind(BR_BASE);
 126   subsw(cnt, cnt, unroll_factor);
 127   br(Assembler::HS, LOOP);
 128 
 129   b(DONE);
 130 
 131   bind(LARGE);
 132 
 133   RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype));
 134   assert(stub.target() != nullptr, "array_hashcode stub has not been generated");
 135   address tpc = trampoline_call(stub);
 136   if (tpc == nullptr) {
 137     DEBUG_ONLY(reset_labels(TAIL, BR_BASE));
 138     postcond(pc() == badAddress);
 139     return nullptr;
 140   }
 141 
 142   bind(DONE);
 143 
 144   BLOCK_COMMENT("} // arrays_hashcode");
 145 
 146   postcond(pc() != badAddress);
 147   return pc();
 148 }
 149 
 150 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register t1,
 151                                               Register t2, Register t3) {
 152   assert_different_registers(obj, box, t1, t2, t3, rscratch2);
 153 
 154   // Handle inflated monitor.
 155   Label inflated;
  // Finish fast lock successfully. MUST reach this point with flag == EQ
  Label locked;
  // Finish fast lock unsuccessfully. MUST reach this point with flag == NE
  Label slow_path;
 160 
 161   if (UseObjectMonitorTable) {
 162     // Clear cache in case fast locking succeeds or we need to take the slow-path.
 163     str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 164   }
 165 
 166   if (DiagnoseSyncOnValueBasedClasses != 0) {
 167     load_klass(t1, obj);
 168     ldrb(t1, Address(t1, Klass::misc_flags_offset()));
 169     tst(t1, KlassFlags::_misc_is_value_based_class);
 170     br(Assembler::NE, slow_path);
 171   }
 172 
 173   const Register t1_mark = t1;
 174   const Register t3_t = t3;
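
  // Rough sketch of the fast-lock path emitted below (illustrative pseudo-code only):
  //
  //   if (lock-stack is full)                    goto slow_path;
  //   if (lock-stack top element == obj)         goto push;        // recursive case
  //   mark = obj->mark();
  //   if (mark has the monitor bit set)          goto inflated;
  //   if (!CAS(obj->mark: mark | unlocked_value -> mark & ~unlocked_value))
  //                                              goto slow_path;
  // push:
  //   push obj onto the lock-stack;              // fall through to locked with flag == EQ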
 175 
 176   { // Lightweight locking
 177 
    // Push lock to the lock stack and finish successfully. MUST reach this point with flag == EQ
 179     Label push;
 180 
 181     const Register t2_top = t2;
 182 
 183     // Check if lock-stack is full.
 184     ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 185     cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
 186     br(Assembler::GT, slow_path);
 187 
 188     // Check if recursive.
 189     subw(t3_t, t2_top, oopSize);
 190     ldr(t3_t, Address(rthread, t3_t));
 191     cmp(obj, t3_t);
 192     br(Assembler::EQ, push);
 193 
 194     // Relaxed normal load to check for monitor. Optimization for monitor case.
 195     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 196     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 197 
 198     // Not inflated
 199     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");
 200 
 201     // Try to lock. Transition lock-bits 0b01 => 0b00
 202     orr(t1_mark, t1_mark, markWord::unlocked_value);
 203     eor(t3_t, t1_mark, markWord::unlocked_value);
 204     cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
 205             /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
 206     br(Assembler::NE, slow_path);
 207 
 208     bind(push);
 209     // After successful lock, push object on lock-stack.
 210     str(obj, Address(rthread, t2_top));
 211     addw(t2_top, t2_top, oopSize);
 212     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 213     b(locked);
 214   }
 215 
 216   { // Handle inflated monitor.
 217     bind(inflated);
 218 
 219     const Register t1_monitor = t1;
 220 
 221     if (!UseObjectMonitorTable) {
 222       assert(t1_monitor == t1_mark, "should be the same here");
 223     } else {
 224       Label monitor_found;
 225 
 226       // Load cache address
 227       lea(t3_t, Address(rthread, JavaThread::om_cache_oops_offset()));
 228 
 229       const int num_unrolled = 2;
 230       for (int i = 0; i < num_unrolled; i++) {
 231         ldr(t1, Address(t3_t));
 232         cmp(obj, t1);
 233         br(Assembler::EQ, monitor_found);
 234         increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
 235       }
 236 
 237       Label loop;
 238 
 239       // Search for obj in cache.
 240       bind(loop);
 241 
 242       // Check for match.
 243       ldr(t1, Address(t3_t));
 244       cmp(obj, t1);
 245       br(Assembler::EQ, monitor_found);
 246 
 247       // Search until null encountered, guaranteed _null_sentinel at end.
 248       increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
 249       cbnz(t1, loop);
      // Cache miss. NE is still set from the cmp above; cbnz does not set flags.
 251       b(slow_path);
 252 
 253       bind(monitor_found);
 254       ldr(t1_monitor, Address(t3_t, OMCache::oop_to_monitor_difference()));
 255     }
 256 
 257     const Register t2_owner_addr = t2;
 258     const Register t3_owner = t3;
 259     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 260     const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
 261     const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);
 262 
 263     Label monitor_locked;
 264 
 265     // Compute owner address.
 266     lea(t2_owner_addr, owner_address);
 267 
 268     // Try to CAS owner (no owner => current thread's _monitor_owner_id).
 269     ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
 270     cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, /*acquire*/ true,
 271             /*release*/ false, /*weak*/ false, t3_owner);
 272     br(Assembler::EQ, monitor_locked);
 273 
 274     // Check if recursive.
 275     cmp(t3_owner, rscratch2);
 276     br(Assembler::NE, slow_path);
 277 
 278     // Recursive.
 279     increment(recursions_address, 1);
 280 
 281     bind(monitor_locked);
 282     if (UseObjectMonitorTable) {
 283       str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 284     }
 285   }
 286 
 287   bind(locked);
 288 
 289 #ifdef ASSERT
 290   // Check that locked label is reached with Flags == EQ.
 291   Label flag_correct;
 292   br(Assembler::EQ, flag_correct);
 293   stop("Fast Lock Flag != EQ");
 294 #endif
 295 
 296   bind(slow_path);
 297 #ifdef ASSERT
 298   // Check that slow_path label is reached with Flags == NE.
 299   br(Assembler::NE, flag_correct);
 300   stop("Fast Lock Flag != NE");
 301   bind(flag_correct);
 302 #endif
 303   // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
 304 }
 305 
 306 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register box, Register t1,
 307                                                 Register t2, Register t3) {
 308   assert_different_registers(obj, box, t1, t2, t3);
 309 
 310   // Handle inflated monitor.
 311   Label inflated, inflated_load_mark;
  // Finish fast unlock successfully. MUST reach this point with flag == EQ
  Label unlocked;
  // Finish fast unlock unsuccessfully. MUST reach this point with flag == NE
  Label slow_path;
 316 
 317   const Register t1_mark = t1;
 318   const Register t2_top = t2;
 319   const Register t3_t = t3;
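
  // Rough sketch of the fast-unlock path emitted below (illustrative pseudo-code only):
  //
  //   if (lock-stack top element != obj)         goto inflated_load_mark;  // must be a monitor
  //   pop the lock-stack;
  //   if (new lock-stack top element == obj)     goto unlocked;            // recursive case
  //   mark = obj->mark();
  //   if (mark has the monitor bit set)          goto inflated (or push back and take slow_path);
  //   if (!CAS(obj->mark: mark -> mark | unlocked_value))
  //                                              push back and goto slow_path;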
 320 
 321   { // Lightweight unlock
 322 
 323     Label push_and_slow_path;
 324 
 325     // Check if obj is top of lock-stack.
 326     ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 327     subw(t2_top, t2_top, oopSize);
 328     ldr(t3_t, Address(rthread, t2_top));
 329     cmp(obj, t3_t);
 330     // Top of lock stack was not obj. Must be monitor.
 331     br(Assembler::NE, inflated_load_mark);
 332 
 333     // Pop lock-stack.
 334     DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
 335     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 336 
 337     // Check if recursive.
 338     subw(t3_t, t2_top, oopSize);
 339     ldr(t3_t, Address(rthread, t3_t));
 340     cmp(obj, t3_t);
 341     br(Assembler::EQ, unlocked);
 342 
 343     // Not recursive.
 344     // Load Mark.
 345     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 346 
    // Check header for monitor (0b10).
    // Because we got here by popping (i.e. we pushed when locking), there is no monitor
    // in the box, so we need to push the obj back onto the lock-stack so that the runtime
    // can fix any potential anonymous owner.
 351     tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);
 352 
 353     // Try to unlock. Transition lock bits 0b00 => 0b01
 354     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
 355     orr(t3_t, t1_mark, markWord::unlocked_value);
 356     cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
 357             /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
 358     br(Assembler::EQ, unlocked);
 359 
 360     bind(push_and_slow_path);
    // The CAS failed, or the object turned out to have an inflated monitor (UseObjectMonitorTable).
    // Restore the lock-stack and handle the unlock in the runtime.
 363     DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
 364     addw(t2_top, t2_top, oopSize);
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); // top is a 32-bit field, as in the lock path
 366     b(slow_path);
 367   }
 368 
 369 
 370   { // Handle inflated monitor.
 371     bind(inflated_load_mark);
 372     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 373 #ifdef ASSERT
 374     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 375     stop("Fast Unlock not monitor");
 376 #endif
 377 
 378     bind(inflated);
 379 
 380 #ifdef ASSERT
 381     Label check_done;
 382     subw(t2_top, t2_top, oopSize);
 383     cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
 384     br(Assembler::LT, check_done);
 385     ldr(t3_t, Address(rthread, t2_top));
 386     cmp(obj, t3_t);
 387     br(Assembler::NE, inflated);
 388     stop("Fast Unlock lock on stack");
 389     bind(check_done);
 390 #endif
 391 
 392     const Register t1_monitor = t1;
 393 
 394     if (!UseObjectMonitorTable) {
 395       assert(t1_monitor == t1_mark, "should be the same here");
 396 
 397       // Untag the monitor.
 398       add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
 399     } else {
 400       ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 401       // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
 402       cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
 403       br(Assembler::LO, slow_path);
 404     }
 405 
 406     const Register t2_recursions = t2;
 407     Label not_recursive;
 408 
 409     // Check if recursive.
 410     ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
 411     cbz(t2_recursions, not_recursive);
 412 
 413     // Recursive unlock.
 414     sub(t2_recursions, t2_recursions, 1u);
 415     str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
 416     // Set flag == EQ
 417     cmp(t2_recursions, t2_recursions);
 418     b(unlocked);
 419 
 420     bind(not_recursive);
 421 
 422     const Register t2_owner_addr = t2;
 423 
 424     // Compute owner address.
 425     lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));
 426 
 427     // Set owner to null.
 428     // Release to satisfy the JMM
 429     stlr(zr, t2_owner_addr);
 430     // We need a full fence after clearing owner to avoid stranding.
 431     // StoreLoad achieves this.
 432     membar(StoreLoad);
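    // (Without this fence the owner-clearing store above could be reordered with the
    // entry_list/succ loads below, so we could miss a thread that enqueued itself
    // concurrently and leave it with nobody responsible for waking it up.)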
 433 
 434     // Check if the entry_list is empty.
 435     ldr(rscratch1, Address(t1_monitor, ObjectMonitor::entry_list_offset()));
 436     cmp(rscratch1, zr);
 437     br(Assembler::EQ, unlocked);  // If so we are done.
 438 
 439     // Check if there is a successor.
 440     ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
 441     cmp(rscratch1, zr);
 442     br(Assembler::NE, unlocked);  // If so we are done.
 443 
 444     // Save the monitor pointer in the current thread, so we can try to
 445     // reacquire the lock in SharedRuntime::monitor_exit_helper().
 446     str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));
 447 
 448     cmp(zr, rthread); // Set Flag to NE => slow path
 449     b(slow_path);
 450   }
 451 
 452   bind(unlocked);
 453   cmp(zr, zr); // Set Flags to EQ => fast path
 454 
 455 #ifdef ASSERT
 456   // Check that unlocked label is reached with Flags == EQ.
 457   Label flag_correct;
 458   br(Assembler::EQ, flag_correct);
 459   stop("Fast Unlock Flag != EQ");
 460 #endif
 461 
 462   bind(slow_path);
 463 #ifdef ASSERT
 464   // Check that slow_path label is reached with Flags == NE.
 465   br(Assembler::NE, flag_correct);
 466   stop("Fast Unlock Flag != NE");
 467   bind(flag_correct);
 468 #endif
 469   // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
 470 }
 471 
 472 // Search for str1 in str2 and return index or -1
 473 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
 474 void C2_MacroAssembler::string_indexof(Register str2, Register str1,
 475                                        Register cnt2, Register cnt1,
 476                                        Register tmp1, Register tmp2,
 477                                        Register tmp3, Register tmp4,
 478                                        Register tmp5, Register tmp6,
 479                                        int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on the specific method version
 481   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
 482 
 483   Register ch1 = rscratch1;
 484   Register ch2 = rscratch2;
 485   Register cnt1tmp = tmp1;
 486   Register cnt2tmp = tmp2;
 487   Register cnt1_neg = cnt1;
 488   Register cnt2_neg = cnt2;
 489   Register result_tmp = tmp4;
 490 
 491   bool isL = ae == StrIntrinsicNode::LL;
 492 
 493   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
 494   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
 495   int str1_chr_shift = str1_isL ? 0:1;
 496   int str2_chr_shift = str2_isL ? 0:1;
 497   int str1_chr_size = str1_isL ? 1:2;
 498   int str2_chr_size = str2_isL ? 1:2;
 499   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
 500                                       (chr_insn)&MacroAssembler::ldrh;
 501   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
 502                                       (chr_insn)&MacroAssembler::ldrh;
 503   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
 504   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
 505 
 506   // Note, inline_string_indexOf() generates checks:
 507   // if (substr.count > string.count) return -1;
 508   // if (substr.count == 0) return 0;
 509 
 510   // We have two strings, a source string in str2, cnt2 and a pattern string
 511   // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
 512 
  // For a larger pattern and source we use a simplified Boyer-Moore algorithm.
  // With a small pattern and source we use a linear scan.
 515 
 516   if (icnt1 == -1) {
 517     sub(result_tmp, cnt2, cnt1);
 518     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
 519     br(LT, LINEARSEARCH);
 520     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
 521     subs(zr, cnt1, 256);
 522     lsr(tmp1, cnt2, 2);
 523     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
 524     br(GE, LINEARSTUB);
 525   }
 526 
// The Boyer-Moore algorithm is based on the description here:
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// This describes an algorithm with 2 shift rules: the 'Bad Character' rule
// and the 'Good Suffix' rule.
//
// These rules are essentially heuristics for how far we can shift the
// pattern along the search string.
//
// The implementation here uses the 'Bad Character' rule only because of the
// complexity of initialisation for the 'Good Suffix' rule.
//
// This is also known as the Boyer-Moore-Horspool algorithm:
//
// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
//
// This particular implementation has a few Java-specific optimizations.
 545 //
 546 // #define ASIZE 256
 547 //
 548 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
 549 //       int i, j;
 550 //       unsigned c;
 551 //       unsigned char bc[ASIZE];
 552 //
 553 //       /* Preprocessing */
 554 //       for (i = 0; i < ASIZE; ++i)
 555 //          bc[i] = m;
 556 //       for (i = 0; i < m - 1; ) {
 557 //          c = x[i];
 558 //          ++i;
//          // c < 256 for a Latin1 string, so no need for a branch
 560 //          #ifdef PATTERN_STRING_IS_LATIN1
 561 //          bc[c] = m - i;
 562 //          #else
 563 //          if (c < ASIZE) bc[c] = m - i;
 564 //          #endif
 565 //       }
 566 //
 567 //       /* Searching */
 568 //       j = 0;
//       while (j <= n - m) {
//          c = y[j+m-1];
//          if (x[m-1] == c) {
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//            if (i < 0) return j;
//          }
//          // c < 256 for a Latin1 string, so no need for a branch
//          #ifdef SOURCE_STRING_IS_LATIN1
//          // LL case: (c < 256) is always true, so the branch is removed
 577 //          j += bc[y[j+m-1]];
 578 //          #endif
 579 //          #ifndef PATTERN_STRING_IS_UTF
 580 //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
 581 //          if (c < ASIZE)
 582 //            j += bc[y[j+m-1]];
 583 //          else
 584 //            j += 1
 585 //          #endif
 586 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
 587 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
 588 //          if (c < ASIZE)
 589 //            j += bc[y[j+m-1]];
 590 //          else
 591 //            j += m
 592 //          #endif
 593 //       }
 594 //    }
 595 
 596   if (icnt1 == -1) {
 597     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
 598         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
 599     Register cnt1end = tmp2;
 600     Register str2end = cnt2;
 601     Register skipch = tmp2;
 602 
    // str1 length is >= 8, so we can read at least 1 register for the cases when
    // UTF->Latin1 conversion is not needed (8 chars for LL or 4 for UU) and half a
    // register for the UL case. We'll re-read the last character in the inner
    // pre-loop code to have a single outer pre-loop load.
 607     const int firstStep = isL ? 7 : 3;
 608 
 609     const int ASIZE = 256;
    const int STORED_BYTES = 32; // number of bytes stored per instruction
 611     sub(sp, sp, ASIZE);
 612     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
 613     mov(ch1, sp);
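    // Fill the 256-byte bad-character table on the stack with the default skip value
    // (cnt1, replicated into every byte of v0 by the dup above), 32 bytes per stpq.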
 614     BIND(BM_INIT_LOOP);
 615       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
 616       subs(tmp5, tmp5, 1);
 617       br(GT, BM_INIT_LOOP);
 618 
 619       sub(cnt1tmp, cnt1, 1);
 620       mov(tmp5, str2);
 621       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
 622       sub(ch2, cnt1, 1);
 623       mov(tmp3, str1);
 624     BIND(BCLOOP);
 625       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
 626       if (!str1_isL) {
 627         subs(zr, ch1, ASIZE);
 628         br(HS, BCSKIP);
 629       }
 630       strb(ch2, Address(sp, ch1));
 631     BIND(BCSKIP);
 632       subs(ch2, ch2, 1);
 633       br(GT, BCLOOP);
 634 
 635       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
 636       if (str1_isL == str2_isL) {
 637         // load last 8 bytes (8LL/4UU symbols)
 638         ldr(tmp6, Address(tmp6, -wordSize));
 639       } else {
 640         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
        // convert Latin1 to UTF. We'll have to wait until the load completes, but
        // it's still faster than per-character loads + checks
 643         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
 644         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
 645         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
 646         andr(tmp6, tmp6, 0xFF); // str1[N-4]
 647         orr(ch2, ch1, ch2, LSL, 16);
 648         orr(tmp6, tmp6, tmp3, LSL, 48);
 649         orr(tmp6, tmp6, ch2, LSL, 16);
 650       }
 651     BIND(BMLOOPSTR2);
 652       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 653       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
 654       if (str1_isL == str2_isL) {
        // re-init tmp3. It's free because it executes in parallel with the load
        // above. The alternative is to initialize it before the loop, but that would
        // hurt performance on in-order systems with 2 or more ld/st pipelines
 658         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
 659       }
 660       if (!isL) { // UU/UL case
 661         lsl(ch2, cnt1tmp, 1); // offset in bytes
 662       }
 663       cmp(tmp3, skipch);
 664       br(NE, BMSKIP);
 665       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
 666       mov(ch1, tmp6);
 667       if (isL) {
 668         b(BMLOOPSTR1_AFTER_LOAD);
 669       } else {
 670         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
 671         b(BMLOOPSTR1_CMP);
 672       }
 673     BIND(BMLOOPSTR1);
 674       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
 675       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 676     BIND(BMLOOPSTR1_AFTER_LOAD);
 677       subs(cnt1tmp, cnt1tmp, 1);
 678       br(LT, BMLOOPSTR1_LASTCMP);
 679     BIND(BMLOOPSTR1_CMP);
 680       cmp(ch1, ch2);
 681       br(EQ, BMLOOPSTR1);
 682     BIND(BMSKIP);
 683       if (!isL) {
        // if we've met a UTF symbol while searching for a Latin1 pattern, then we can
        // skip cnt1 symbols
 686         if (str1_isL != str2_isL) {
 687           mov(result_tmp, cnt1);
 688         } else {
 689           mov(result_tmp, 1);
 690         }
 691         subs(zr, skipch, ASIZE);
 692         br(HS, BMADV);
 693       }
 694       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
 695     BIND(BMADV);
 696       sub(cnt1tmp, cnt1, 1);
 697       add(str2, str2, result_tmp, LSL, str2_chr_shift);
 698       cmp(str2, str2end);
 699       br(LE, BMLOOPSTR2);
 700       add(sp, sp, ASIZE);
 701       b(NOMATCH);
 702     BIND(BMLOOPSTR1_LASTCMP);
 703       cmp(ch1, ch2);
 704       br(NE, BMSKIP);
 705     BIND(BMMATCH);
 706       sub(result, str2, tmp5);
 707       if (!str2_isL) lsr(result, result, 1);
 708       add(sp, sp, ASIZE);
 709       b(DONE);
 710 
 711     BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
 713     br(LT, LINEAR_MEDIUM);
 714     mov(result, zr);
 715     RuntimeAddress stub = nullptr;
 716     if (isL) {
 717       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
 718       assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
 719     } else if (str1_isL) {
 720       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
 722     } else {
 723       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
 724       assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
 725     }
 726     address call = trampoline_call(stub);
 727     if (call == nullptr) {
 728       DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
 729       ciEnv::current()->record_failure("CodeCache is full");
 730       return;
 731     }
 732     b(DONE);
 733   }
 734 
 735   BIND(LINEARSEARCH);
 736   {
 737     Label DO1, DO2, DO3;
 738 
 739     Register str2tmp = tmp2;
 740     Register first = tmp3;
 741 
 742     if (icnt1 == -1)
 743     {
 744         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
 745 
 746         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
 747         br(LT, DOSHORT);
 748       BIND(LINEAR_MEDIUM);
 749         (this->*str1_load_1chr)(first, Address(str1));
 750         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
 751         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
 752         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 753         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 754 
 755       BIND(FIRST_LOOP);
 756         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 757         cmp(first, ch2);
 758         br(EQ, STR1_LOOP);
 759       BIND(STR2_NEXT);
 760         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 761         br(LE, FIRST_LOOP);
 762         b(NOMATCH);
 763 
 764       BIND(STR1_LOOP);
 765         adds(cnt1tmp, cnt1_neg, str1_chr_size);
 766         add(cnt2tmp, cnt2_neg, str2_chr_size);
 767         br(GE, MATCH);
 768 
 769       BIND(STR1_NEXT);
 770         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
 771         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 772         cmp(ch1, ch2);
 773         br(NE, STR2_NEXT);
 774         adds(cnt1tmp, cnt1tmp, str1_chr_size);
 775         add(cnt2tmp, cnt2tmp, str2_chr_size);
 776         br(LT, STR1_NEXT);
 777         b(MATCH);
 778 
 779       BIND(DOSHORT);
 780       if (str1_isL == str2_isL) {
 781         cmp(cnt1, (u1)2);
 782         br(LT, DO1);
 783         br(GT, DO3);
 784       }
 785     }
 786 
 787     if (icnt1 == 4) {
 788       Label CH1_LOOP;
 789 
 790         (this->*load_4chr)(ch1, str1);
 791         sub(result_tmp, cnt2, 4);
 792         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 793         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 794 
 795       BIND(CH1_LOOP);
 796         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
 797         cmp(ch1, ch2);
 798         br(EQ, MATCH);
 799         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 800         br(LE, CH1_LOOP);
 801         b(NOMATCH);
 802       }
 803 
 804     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
 805       Label CH1_LOOP;
 806 
 807       BIND(DO2);
 808         (this->*load_2chr)(ch1, str1);
 809         if (icnt1 == 2) {
 810           sub(result_tmp, cnt2, 2);
 811         }
 812         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 813         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 814       BIND(CH1_LOOP);
 815         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 816         cmp(ch1, ch2);
 817         br(EQ, MATCH);
 818         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 819         br(LE, CH1_LOOP);
 820         b(NOMATCH);
 821     }
 822 
 823     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
 824       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
 825 
 826       BIND(DO3);
 827         (this->*load_2chr)(first, str1);
 828         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
 829         if (icnt1 == 3) {
 830           sub(result_tmp, cnt2, 3);
 831         }
 832         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 833         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 834       BIND(FIRST_LOOP);
 835         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 836         cmpw(first, ch2);
 837         br(EQ, STR1_LOOP);
 838       BIND(STR2_NEXT);
 839         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 840         br(LE, FIRST_LOOP);
 841         b(NOMATCH);
 842 
 843       BIND(STR1_LOOP);
 844         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
 845         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 846         cmp(ch1, ch2);
 847         br(NE, STR2_NEXT);
 848         b(MATCH);
 849     }
 850 
 851     if (icnt1 == -1 || icnt1 == 1) {
 852       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
 853 
 854       BIND(DO1);
 855         (this->*str1_load_1chr)(ch1, str1);
 856         cmp(cnt2, (u1)8);
 857         br(LT, DO1_SHORT);
 858 
 859         sub(result_tmp, cnt2, 8/str2_chr_size);
 860         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 861         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 862         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 863 
 864         if (str2_isL) {
 865           orr(ch1, ch1, ch1, LSL, 8);
 866         }
 867         orr(ch1, ch1, ch1, LSL, 16);
 868         orr(ch1, ch1, ch1, LSL, 32);
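        // The loop below uses the classic SWAR zero-detection trick on ch2 = text ^ pattern:
        // (ch2 - 0x01..01) & ~(ch2 | 0x7f..7f) (or the 16-bit analogue for UTF-16) is non-zero
        // iff some byte/halfword of ch2 is zero, i.e. iff some character of the word matched.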
 869       BIND(CH1_LOOP);
 870         ldr(ch2, Address(str2, cnt2_neg));
 871         eor(ch2, ch1, ch2);
 872         sub(tmp1, ch2, tmp3);
 873         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 874         bics(tmp1, tmp1, tmp2);
 875         br(NE, HAS_ZERO);
 876         adds(cnt2_neg, cnt2_neg, 8);
 877         br(LT, CH1_LOOP);
 878 
 879         cmp(cnt2_neg, (u1)8);
 880         mov(cnt2_neg, 0);
 881         br(LT, CH1_LOOP);
 882         b(NOMATCH);
 883 
 884       BIND(HAS_ZERO);
 885         rev(tmp1, tmp1);
 886         clz(tmp1, tmp1);
 887         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
 888         b(MATCH);
 889 
 890       BIND(DO1_SHORT);
 891         mov(result_tmp, cnt2);
 892         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
 893         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
 894       BIND(DO1_LOOP);
 895         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 896         cmpw(ch1, ch2);
 897         br(EQ, MATCH);
 898         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 899         br(LT, DO1_LOOP);
 900     }
 901   }
 902   BIND(NOMATCH);
 903     mov(result, -1);
 904     b(DONE);
 905   BIND(MATCH);
 906     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
 907   BIND(DONE);
 908 }
 909 
 910 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
 911 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
 912 
 913 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
 914                                             Register ch, Register result,
 915                                             Register tmp1, Register tmp2, Register tmp3)
 916 {
 917   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
 918   Register cnt1_neg = cnt1;
 919   Register ch1 = rscratch1;
 920   Register result_tmp = rscratch2;
 921 
 922   cbz(cnt1, NOMATCH);
 923 
 924   cmp(cnt1, (u1)4);
 925   br(LT, DO1_SHORT);
 926 
 927   orr(ch, ch, ch, LSL, 16);
 928   orr(ch, ch, ch, LSL, 32);
 929 
 930   sub(cnt1, cnt1, 4);
 931   mov(result_tmp, cnt1);
 932   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
 933   sub(cnt1_neg, zr, cnt1, LSL, 1);
 934 
 935   mov(tmp3, 0x0001000100010001);
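  // Same SWAR zero-detection trick as in string_indexof: (ch1 - 0x0001..0001) & ~(ch1 | 0x7fff..7fff)
  // is non-zero iff some 16-bit lane of ch1 (= text ^ pattern) is zero, i.e. the char was found.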
 936 
 937   BIND(CH1_LOOP);
 938     ldr(ch1, Address(str1, cnt1_neg));
 939     eor(ch1, ch, ch1);
 940     sub(tmp1, ch1, tmp3);
 941     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
 942     bics(tmp1, tmp1, tmp2);
 943     br(NE, HAS_ZERO);
 944     adds(cnt1_neg, cnt1_neg, 8);
 945     br(LT, CH1_LOOP);
 946 
 947     cmp(cnt1_neg, (u1)8);
 948     mov(cnt1_neg, 0);
 949     br(LT, CH1_LOOP);
 950     b(NOMATCH);
 951 
 952   BIND(HAS_ZERO);
 953     rev(tmp1, tmp1);
 954     clz(tmp1, tmp1);
 955     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
 956     b(MATCH);
 957 
 958   BIND(DO1_SHORT);
 959     mov(result_tmp, cnt1);
 960     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
 961     sub(cnt1_neg, zr, cnt1, LSL, 1);
 962   BIND(DO1_LOOP);
 963     ldrh(ch1, Address(str1, cnt1_neg));
 964     cmpw(ch, ch1);
 965     br(EQ, MATCH);
 966     adds(cnt1_neg, cnt1_neg, 2);
 967     br(LT, DO1_LOOP);
 968   BIND(NOMATCH);
 969     mov(result, -1);
 970     b(DONE);
 971   BIND(MATCH);
 972     add(result, result_tmp, cnt1_neg, ASR, 1);
 973   BIND(DONE);
 974 }
 975 
 976 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
 977                                                 Register ch, Register result,
 978                                                 FloatRegister ztmp1,
 979                                                 FloatRegister ztmp2,
 980                                                 PRegister tmp_pg,
 981                                                 PRegister tmp_pdn, bool isL)
 982 {
 983   // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
 984   assert(tmp_pg->is_governing(),
 985          "this register has to be a governing predicate register");
 986 
 987   Label LOOP, MATCH, DONE, NOMATCH;
 988   Register vec_len = rscratch1;
 989   Register idx = rscratch2;
 990 
  SIMD_RegVariant T = isL ? B : H;
 992 
 993   cbz(cnt1, NOMATCH);
 994 
 995   // Assign the particular char throughout the vector.
 996   sve_dup(ztmp2, T, ch);
 997   if (isL) {
 998     sve_cntb(vec_len);
 999   } else {
1000     sve_cnth(vec_len);
1001   }
1002   mov(idx, 0);
1003 
1004   // Generate a predicate to control the reading of input string.
1005   sve_whilelt(tmp_pg, T, idx, cnt1);
1006 
1007   BIND(LOOP);
1008     // Read a vector of 8- or 16-bit data depending on the string type. Note
1009     // that inactive elements indicated by the predicate register won't cause
1010     // a data read from memory to the destination vector.
1011     if (isL) {
1012       sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
1013     } else {
1014       sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
1015     }
1016     add(idx, idx, vec_len);
1017 
1018     // Perform the comparison. An element of the destination predicate is set
1019     // to active if the particular char is matched.
1020     sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);
1021 
1022     // Branch if the particular char is found.
1023     br(NE, MATCH);
1024 
1025     sve_whilelt(tmp_pg, T, idx, cnt1);
1026 
1027     // Loop back if the particular char not found.
1028     br(MI, LOOP);
1029 
1030   BIND(NOMATCH);
1031     mov(result, -1);
1032     b(DONE);
1033 
1034   BIND(MATCH);
1035     // Undo the index increment.
1036     sub(idx, idx, vec_len);
1037 
1038     // Crop the vector to find its location.
1039     sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
1040     add(result, idx, -1);
1041     sve_incp(result, T, tmp_pdn);
1042   BIND(DONE);
1043 }
1044 
1045 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
1046                                             Register ch, Register result,
1047                                             Register tmp1, Register tmp2, Register tmp3)
1048 {
1049   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
1050   Register cnt1_neg = cnt1;
1051   Register ch1 = rscratch1;
1052   Register result_tmp = rscratch2;
1053 
1054   cbz(cnt1, NOMATCH);
1055 
1056   cmp(cnt1, (u1)8);
1057   br(LT, DO1_SHORT);
1058 
1059   orr(ch, ch, ch, LSL, 8);
1060   orr(ch, ch, ch, LSL, 16);
1061   orr(ch, ch, ch, LSL, 32);
1062 
1063   sub(cnt1, cnt1, 8);
1064   mov(result_tmp, cnt1);
1065   lea(str1, Address(str1, cnt1));
1066   sub(cnt1_neg, zr, cnt1);
1067 
1068   mov(tmp3, 0x0101010101010101);
1069 
1070   BIND(CH1_LOOP);
1071     ldr(ch1, Address(str1, cnt1_neg));
1072     eor(ch1, ch, ch1);
1073     sub(tmp1, ch1, tmp3);
1074     orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
1075     bics(tmp1, tmp1, tmp2);
1076     br(NE, HAS_ZERO);
1077     adds(cnt1_neg, cnt1_neg, 8);
1078     br(LT, CH1_LOOP);
1079 
1080     cmp(cnt1_neg, (u1)8);
1081     mov(cnt1_neg, 0);
1082     br(LT, CH1_LOOP);
1083     b(NOMATCH);
1084 
1085   BIND(HAS_ZERO);
1086     rev(tmp1, tmp1);
1087     clz(tmp1, tmp1);
1088     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
1089     b(MATCH);
1090 
1091   BIND(DO1_SHORT);
1092     mov(result_tmp, cnt1);
1093     lea(str1, Address(str1, cnt1));
1094     sub(cnt1_neg, zr, cnt1);
1095   BIND(DO1_LOOP);
1096     ldrb(ch1, Address(str1, cnt1_neg));
1097     cmp(ch, ch1);
1098     br(EQ, MATCH);
1099     adds(cnt1_neg, cnt1_neg, 1);
1100     br(LT, DO1_LOOP);
1101   BIND(NOMATCH);
1102     mov(result, -1);
1103     b(DONE);
1104   BIND(MATCH);
1105     add(result, result_tmp, cnt1_neg);
1106   BIND(DONE);
1107 }
1108 
1109 // Compare strings.
1110 void C2_MacroAssembler::string_compare(Register str1, Register str2,
1111     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
1112     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
1113     PRegister pgtmp1, PRegister pgtmp2, int ae) {
1114   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
1115       DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
1116       SHORT_LOOP_START, TAIL_CHECK;
1117 
1118   bool isLL = ae == StrIntrinsicNode::LL;
1119   bool isLU = ae == StrIntrinsicNode::LU;
1120   bool isUL = ae == StrIntrinsicNode::UL;
1121 
1122   // The stub threshold for LL strings is: 72 (64 + 8) chars
1123   // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
1124   // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
1125   const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);
1126 
1127   bool str1_isL = isLL || isLU;
1128   bool str2_isL = isLL || isUL;
1129 
1130   int str1_chr_shift = str1_isL ? 0 : 1;
1131   int str2_chr_shift = str2_isL ? 0 : 1;
1132   int str1_chr_size = str1_isL ? 1 : 2;
1133   int str2_chr_size = str2_isL ? 1 : 2;
1134   int minCharsInWord = isLL ? wordSize : wordSize/2;
1135 
1136   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
1137   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
1138                                       (chr_insn)&MacroAssembler::ldrh;
1139   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
1140                                       (chr_insn)&MacroAssembler::ldrh;
1141   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
1142                             (uxt_insn)&MacroAssembler::uxthw;
1143 
1144   BLOCK_COMMENT("string_compare {");
1145 
  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; the result, however, is always in characters.
1148   if (!str1_isL) asrw(cnt1, cnt1, 1);
1149   if (!str2_isL) asrw(cnt2, cnt2, 1);
1150 
1151   // Compute the minimum of the string lengths and save the difference.
1152   subsw(result, cnt1, cnt2);
1153   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
1154 
1155   // A very short string
1156   cmpw(cnt2, minCharsInWord);
1157   br(Assembler::LE, SHORT_STRING);
1158 
1159   // Compare longwords
1160   // load first parts of strings and finish initialization while loading
1161   {
1162     if (str1_isL == str2_isL) { // LL or UU
1163       ldr(tmp1, Address(str1));
1164       cmp(str1, str2);
1165       br(Assembler::EQ, DONE);
1166       ldr(tmp2, Address(str2));
1167       cmp(cnt2, stub_threshold);
1168       br(GE, STUB);
1169       subsw(cnt2, cnt2, minCharsInWord);
1170       br(EQ, TAIL_CHECK);
1171       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1172       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1173       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1174     } else if (isLU) {
1175       ldrs(vtmp, Address(str1));
1176       ldr(tmp2, Address(str2));
1177       cmp(cnt2, stub_threshold);
1178       br(GE, STUB);
1179       subw(cnt2, cnt2, 4);
1180       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1181       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1182       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1183       zip1(vtmp, T8B, vtmp, vtmpZ);
1184       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1185       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1186       add(cnt1, cnt1, 4);
1187       fmovd(tmp1, vtmp);
1188     } else { // UL case
1189       ldr(tmp1, Address(str1));
1190       ldrs(vtmp, Address(str2));
1191       cmp(cnt2, stub_threshold);
1192       br(GE, STUB);
1193       subw(cnt2, cnt2, 4);
1194       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1195       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1196       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1197       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1198       zip1(vtmp, T8B, vtmp, vtmpZ);
1199       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1200       add(cnt1, cnt1, 8);
1201       fmovd(tmp2, vtmp);
1202     }
1203     adds(cnt2, cnt2, isUL ? 4 : 8);
1204     br(GE, TAIL);
1205     eor(rscratch2, tmp1, tmp2);
1206     cbnz(rscratch2, DIFF);
1207     // main loop
1208     bind(NEXT_WORD);
1209     if (str1_isL == str2_isL) {
1210       ldr(tmp1, Address(str1, cnt2));
1211       ldr(tmp2, Address(str2, cnt2));
1212       adds(cnt2, cnt2, 8);
1213     } else if (isLU) {
1214       ldrs(vtmp, Address(str1, cnt1));
1215       ldr(tmp2, Address(str2, cnt2));
1216       add(cnt1, cnt1, 4);
1217       zip1(vtmp, T8B, vtmp, vtmpZ);
1218       fmovd(tmp1, vtmp);
1219       adds(cnt2, cnt2, 8);
1220     } else { // UL
1221       ldrs(vtmp, Address(str2, cnt2));
1222       ldr(tmp1, Address(str1, cnt1));
1223       zip1(vtmp, T8B, vtmp, vtmpZ);
1224       add(cnt1, cnt1, 8);
1225       fmovd(tmp2, vtmp);
1226       adds(cnt2, cnt2, 4);
1227     }
1228     br(GE, TAIL);
1229 
1230     eor(rscratch2, tmp1, tmp2);
1231     cbz(rscratch2, NEXT_WORD);
1232     b(DIFF);
1233     bind(TAIL);
1234     eor(rscratch2, tmp1, tmp2);
1235     cbnz(rscratch2, DIFF);
1236     // Last longword.  In the case where length == 4 we compare the
1237     // same longword twice, but that's still faster than another
1238     // conditional branch.
1239     if (str1_isL == str2_isL) {
1240       ldr(tmp1, Address(str1));
1241       ldr(tmp2, Address(str2));
1242     } else if (isLU) {
1243       ldrs(vtmp, Address(str1));
1244       ldr(tmp2, Address(str2));
1245       zip1(vtmp, T8B, vtmp, vtmpZ);
1246       fmovd(tmp1, vtmp);
1247     } else { // UL
1248       ldrs(vtmp, Address(str2));
1249       ldr(tmp1, Address(str1));
1250       zip1(vtmp, T8B, vtmp, vtmpZ);
1251       fmovd(tmp2, vtmp);
1252     }
1253     bind(TAIL_CHECK);
1254     eor(rscratch2, tmp1, tmp2);
1255     cbz(rscratch2, DONE);
1256 
1257     // Find the first different characters in the longwords and
1258     // compute their difference.
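    // rev + clz compute the bit position of the first differing byte in memory order
    // (the lowest-order differing byte of the little-endian words); rounding it down to
    // a multiple of 8 (LL) or 16 (otherwise) gives the shift that brings the first
    // differing character into the low bits of tmp1/tmp2 for the subtraction below.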
1259     bind(DIFF);
1260     rev(rscratch2, rscratch2);
1261     clz(rscratch2, rscratch2);
1262     andr(rscratch2, rscratch2, isLL ? -8 : -16);
1263     lsrv(tmp1, tmp1, rscratch2);
1264     (this->*ext_chr)(tmp1, tmp1);
1265     lsrv(tmp2, tmp2, rscratch2);
1266     (this->*ext_chr)(tmp2, tmp2);
1267     subw(result, tmp1, tmp2);
1268     b(DONE);
1269   }
1270 
1271   bind(STUB);
1272     RuntimeAddress stub = nullptr;
1273     switch(ae) {
1274       case StrIntrinsicNode::LL:
1275         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
1276         break;
1277       case StrIntrinsicNode::UU:
1278         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
1279         break;
1280       case StrIntrinsicNode::LU:
1281         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
1282         break;
1283       case StrIntrinsicNode::UL:
1284         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
1285         break;
1286       default:
1287         ShouldNotReachHere();
1288      }
1289     assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
1290     address call = trampoline_call(stub);
1291     if (call == nullptr) {
1292       DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
1293       ciEnv::current()->record_failure("CodeCache is full");
1294       return;
1295     }
1296     b(DONE);
1297 
1298   bind(SHORT_STRING);
1299   // Is the minimum length zero?
1300   cbz(cnt2, DONE);
  // Arrange the code so that most branches happen while loading, and the next
  // characters are loaded while the previous ones are being compared.
1303   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1304   subs(cnt2, cnt2, 1);
1305   br(EQ, SHORT_LAST_INIT);
1306   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1307   b(SHORT_LOOP_START);
1308   bind(SHORT_LOOP);
1309   subs(cnt2, cnt2, 1);
1310   br(EQ, SHORT_LAST);
1311   bind(SHORT_LOOP_START);
1312   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
1313   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
1314   cmp(tmp1, cnt1);
1315   br(NE, SHORT_LOOP_TAIL);
1316   subs(cnt2, cnt2, 1);
1317   br(EQ, SHORT_LAST2);
1318   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1319   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1320   cmp(tmp2, rscratch1);
1321   br(EQ, SHORT_LOOP);
1322   sub(result, tmp2, rscratch1);
1323   b(DONE);
1324   bind(SHORT_LOOP_TAIL);
1325   sub(result, tmp1, cnt1);
1326   b(DONE);
1327   bind(SHORT_LAST2);
1328   cmp(tmp2, rscratch1);
1329   br(EQ, DONE);
1330   sub(result, tmp2, rscratch1);
1331 
1332   b(DONE);
1333   bind(SHORT_LAST_INIT);
1334   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1335   bind(SHORT_LAST);
1336   cmp(tmp1, cnt1);
1337   br(EQ, DONE);
1338   sub(result, tmp1, cnt1);
1339 
1340   bind(DONE);
1341 
1342   BLOCK_COMMENT("} string_compare");
1343 }
1344 
1345 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
1346                                      FloatRegister src2, Condition cond, bool isQ) {
1347   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1348   FloatRegister zn = src1, zm = src2;
1349   bool needs_negation = false;
1350   switch (cond) {
1351     case LT: cond = GT; zn = src2; zm = src1; break;
1352     case LE: cond = GE; zn = src2; zm = src1; break;
1353     case LO: cond = HI; zn = src2; zm = src1; break;
1354     case LS: cond = HS; zn = src2; zm = src1; break;
1355     case NE: cond = EQ; needs_negation = true; break;
1356     default:
1357       break;
1358   }
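  // E.g. LT is computed as GT with the operands swapped, and NE is computed as EQ followed
  // by a bitwise NOT of the result, since NEON has no direct register-register "not equal"
  // or "less than" compares.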
1359 
1360   if (is_floating_point_type(bt)) {
1361     fcm(cond, dst, size, zn, zm);
1362   } else {
1363     cm(cond, dst, size, zn, zm);
1364   }
1365 
1366   if (needs_negation) {
1367     notr(dst, isQ ? T16B : T8B, dst);
1368   }
1369 }
1370 
1371 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1372                                           Condition cond, bool isQ) {
1373   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1374   if (bt == T_FLOAT || bt == T_DOUBLE) {
1375     if (cond == Assembler::NE) {
1376       fcm(Assembler::EQ, dst, size, src);
1377       notr(dst, isQ ? T16B : T8B, dst);
1378     } else {
1379       fcm(cond, dst, size, src);
1380     }
1381   } else {
1382     if (cond == Assembler::NE) {
1383       cm(Assembler::EQ, dst, size, src);
1384       notr(dst, isQ ? T16B : T8B, dst);
1385     } else {
1386       cm(cond, dst, size, src);
1387     }
1388   }
1389 }
1390 
// Compress the least significant bit of each byte of dst into the low 8 bits of dst
// and clear the higher garbage bits.
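//
// Equivalent scalar sketch (illustrative only):
//
//   uint64_t r = 0;
//   for (int i = 0; i < 8; i++) {
//     r |= ((dst >> (8 * i)) & 1) << i;   // take bit 0 of byte i
//   }
//   dst = r;   // e.g. 0x01 00 00 00 01 01 00 01 -> 0x8D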
1393 void C2_MacroAssembler::bytemask_compress(Register dst) {
1394   // Example input, dst = 0x01 00 00 00 01 01 00 01
1395   // The "??" bytes are garbage.
1396   orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
1397   orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
1398   orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
1399   andr(dst, dst, 0xff);                   // dst = 0x8D
1400 }
1401 
1402 // Pack the lowest-numbered bit of each mask element in src into a long value
1403 // in dst, at most the first 64 lane elements.
1404 // Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM.
1405 void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
1406                                          FloatRegister vtmp1, FloatRegister vtmp2) {
1407   assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1408   assert_different_registers(dst, rscratch1);
1409   assert_different_registers(vtmp1, vtmp2);
1410 
1411   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1412   // Example:   src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
1413   // Expected:  dst = 0x658D
1414 
1415   // Convert the mask into vector with sequential bytes.
1416   // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
1417   sve_cpy(vtmp1, size, src, 1, false);
1418   if (bt != T_BYTE) {
1419     sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
1420   }
1421 
1422   if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
1423     // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
1424     // is to compress each significant bit of the byte in a cross-lane way. Due
1425     // to the lack of a cross-lane bit-compress instruction, we use BEXT
1426     // (bit-compress in each lane) with the biggest lane size (T = D) then
1427     // concatenate the results.
1428 
1429     // The second source input of BEXT, initialized with 0x01 in each byte.
1430     // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1431     sve_dup(vtmp2, B, 1);
1432 
1433     // BEXT vtmp1.D, vtmp1.D, vtmp2.D
1434     // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1435     // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1436     //         ---------------------------------------
1437     // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1438     sve_bext(vtmp1, D, vtmp1, vtmp2);
1439 
1440     // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the
1441     // result to dst.
1442     // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1443     // dst   = 0x658D
1444     if (lane_cnt <= 8) {
1445       // No need to concatenate.
1446       umov(dst, vtmp1, B, 0);
1447     } else if (lane_cnt <= 16) {
1448       ins(vtmp1, B, vtmp1, 1, 8);
1449       umov(dst, vtmp1, H, 0);
1450     } else {
1451       // As the lane count is 64 at most, the final expected value must be in
1452       // the lowest 64 bits after narrowing vtmp1 from D to B.
1453       sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
1454       umov(dst, vtmp1, D, 0);
1455     }
1456   } else if (UseSVE > 0) {
1457     // Compress the lowest 8 bytes.
1458     fmovd(dst, vtmp1);
1459     bytemask_compress(dst);
1460     if (lane_cnt <= 8) return;
1461 
1462     // Repeat on higher bytes and join the results.
1463     // Compress 8 bytes in each iteration.
1464     for (int idx = 1; idx < (lane_cnt / 8); idx++) {
1465       sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
1466       bytemask_compress(rscratch1);
1467       orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
1468     }
1469   } else {
1470     assert(false, "unsupported");
1471     ShouldNotReachHere();
1472   }
1473 }
1474 
1475 // Unpack the mask, a long value in src, into predicate register dst based on the
1476 // corresponding data type. Note that dst can support at most 64 lanes.
// The example below gives the expected dst predicate register for different types,
// with a valid src (0x658D) on a machine with a 1024-bit vector size.
1479 // BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
1480 // SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
1481 // INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
1482 // LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1483 //
1484 // The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D which
1485 // has 24 significant bits would be an invalid input if dst predicate register refers to
1486 // a LONG type 1024-bit vector, which has at most 16 lanes.
1487 void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
1488                                            FloatRegister vtmp1, FloatRegister vtmp2) {
1489   assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
1490          lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
1491   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1492   // Example:   src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
  // Expected:  dst = 0b01100101 10001101
1494 
1495   // Put long value from general purpose register into the first lane of vector.
1496   // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1497   sve_dup(vtmp1, B, 0);
1498   mov(vtmp1, D, 0, src);
1499 
  // As sve_cmp generates the mask with a minimum granularity of one byte, we need
  // to transform the bit-level mask in the first lane into a byte-level mask, which
  // can be done with SVE2's BDEP instruction.
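  // For reference, BDEP per 64-bit lane is the inverse scatter of BEXT
  // (illustrative sketch only, not generated code):
  //   uint64_t bdep(uint64_t data, uint64_t mask) {
  //     uint64_t res = 0;
  //     int j = 0;
  //     for (int i = 0; i < 64; i++) {
  //       if ((mask >> i) & 1) res |= ((data >> j++) & 1) << i;
  //     }
  //     return res;  // low bits of data scattered to the mask positions
  //   }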
1503 
  // The first source input of BDEP. Deposit each mask byte into its own 8-byte lane.
1505   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1506   if (lane_cnt <= 8) {
    // Nothing to do, as only one byte exists.
1508   } else if (lane_cnt <= 16) {
1509     ins(vtmp1, B, vtmp1, 8, 1);
1510     mov(vtmp1, B, 1, zr);
1511   } else {
1512     sve_vector_extend(vtmp1, D, vtmp1, B);
1513   }
1514 
  // The second source input of the BDEP instruction, initialized with 0x01 in each byte.
1516   // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1517   sve_dup(vtmp2, B, 1);
1518 
1519   // BDEP vtmp1.D, vtmp1.D, vtmp2.D
1520   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1521   // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1522   //         ---------------------------------------
1523   // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1524   sve_bdep(vtmp1, D, vtmp1, vtmp2);
1525 
1526   if (bt != T_BYTE) {
1527     sve_vector_extend(vtmp1, size, vtmp1, B);
1528   }
1529   // Generate mask according to the given vector, in which the elements have been
1530   // extended to expected type.
  // dst = 0b01100101 10001101
1532   sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
1533 }
1534 
1535 // Clobbers: rflags
1536 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1537                                     FloatRegister zn, FloatRegister zm, Condition cond) {
1538   assert(pg->is_governing(), "This register has to be a governing predicate register");
1539   FloatRegister z1 = zn, z2 = zm;
1540   switch (cond) {
1541     case LE: z1 = zm; z2 = zn; cond = GE; break;
1542     case LT: z1 = zm; z2 = zn; cond = GT; break;
1543     case LO: z1 = zm; z2 = zn; cond = HI; break;
1544     case LS: z1 = zm; z2 = zn; cond = HS; break;
1545     default:
1546       break;
1547   }
1548 
1549   SIMD_RegVariant size = elemType_to_regVariant(bt);
1550   if (is_floating_point_type(bt)) {
1551     sve_fcm(cond, pd, size, pg, z1, z2);
1552   } else {
1553     assert(is_integral_type(bt), "unsupported element type");
1554     sve_cmp(cond, pd, size, pg, z1, z2);
1555   }
1556 }
1557 
1558 // Get index of the last mask lane that is set
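// The idea, as a scalar sketch (illustrative only): reverse the lane order of the
// mask, count the inactive lanes that precede the first active lane of the reversed
// mask (REV + BRKB + CNTP), and subtract that count from (max lane count - 1) to
// recover the original index.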
1559 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
1560   SIMD_RegVariant size = elemType_to_regVariant(bt);
1561   sve_rev(ptmp, size, src);
1562   sve_brkb(ptmp, ptrue, ptmp, false);
1563   sve_cntp(dst, size, ptrue, ptmp);
1564   movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
1565   subw(dst, rscratch1, dst);
1566 }
1567 
1568 // Extend integer vector src to dst with the same lane count
1569 // but larger element size, e.g. 4B -> 4I
1570 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1571                                            FloatRegister src, BasicType src_bt, bool is_unsigned) {
1572   if (src_bt == T_BYTE) {
1573     // 4B to 4S/4I, 8B to 8S
1574     assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1575     assert(dst_bt == T_SHORT || dst_bt == T_INT, "unsupported");
1576     _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1577     if (dst_bt == T_INT) {
1578       _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1579     }
1580   } else if (src_bt == T_SHORT) {
1581     // 2S to 2I/2L, 4S to 4I
1582     assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1583     assert(dst_bt == T_INT || dst_bt == T_LONG, "unsupported");
1584     _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1585     if (dst_bt == T_LONG) {
1586       _xshll(is_unsigned, dst, T2D, dst, T2S, 0);
1587     }
1588   } else if (src_bt == T_INT) {
1589     // 2I to 2L
1590     assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1591     _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1592   } else {
1593     ShouldNotReachHere();
1594   }
1595 }
1596 
1597 // Narrow integer vector src down to dst with the same lane count
1598 // but smaller element size, e.g. 4I -> 4B
1599 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1600                                            FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1601   if (src_bt == T_SHORT) {
1602     // 4S/8S to 4B/8B
1603     assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1604     assert(dst_bt == T_BYTE, "unsupported");
1605     xtn(dst, T8B, src, T8H);
1606   } else if (src_bt == T_INT) {
1607     // 2I to 2S, 4I to 4B/4S
1608     assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1609     assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1610     xtn(dst, T4H, src, T4S);
1611     if (dst_bt == T_BYTE) {
1612       xtn(dst, T8B, dst, T8H);
1613     }
1614   } else if (src_bt == T_LONG) {
1615     // 2L to 2S/2I
1616     assert(src_vlen_in_bytes == 16, "unsupported");
1617     assert(dst_bt == T_INT || dst_bt == T_SHORT, "unsupported");
1618     xtn(dst, T2S, src, T2D);
1619     if (dst_bt == T_SHORT) {
1620       xtn(dst, T4H, dst, T4S);
1621     }
1622   } else {
1623     ShouldNotReachHere();
1624   }
1625 }
1626 
1627 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1628                                           FloatRegister src, SIMD_RegVariant src_size,
1629                                           bool is_unsigned) {
1630   assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
1631 
1632   if (src_size == B) {
1633     switch (dst_size) {
1634     case H:
1635       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1636       break;
1637     case S:
1638       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1639       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1640       break;
1641     case D:
1642       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1643       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1644       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1645       break;
1646     default:
1647       ShouldNotReachHere();
1648     }
1649   } else if (src_size == H) {
1650     if (dst_size == S) {
1651       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1652     } else { // D
1653       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1654       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1655     }
1656   } else if (src_size == S) {
1657     _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
1658   }
1659 }
1660 
1661 // Vector narrow from src to dst with specified element sizes.
1662 // High part of dst vector will be filled with zero.
1663 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1664                                           FloatRegister src, SIMD_RegVariant src_size,
1665                                           FloatRegister tmp) {
1666   assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1667   assert_different_registers(src, tmp);
1668   sve_dup(tmp, src_size, 0);
1669   if (src_size == D) {
1670     switch (dst_size) {
1671     case S:
1672       sve_uzp1(dst, S, src, tmp);
1673       break;
1674     case H:
1675       assert_different_registers(dst, tmp);
1676       sve_uzp1(dst, S, src, tmp);
1677       sve_uzp1(dst, H, dst, tmp);
1678       break;
1679     case B:
1680       assert_different_registers(dst, tmp);
1681       sve_uzp1(dst, S, src, tmp);
1682       sve_uzp1(dst, H, dst, tmp);
1683       sve_uzp1(dst, B, dst, tmp);
1684       break;
1685     default:
1686       ShouldNotReachHere();
1687     }
1688   } else if (src_size == S) {
1689     if (dst_size == H) {
1690       sve_uzp1(dst, H, src, tmp);
1691     } else { // B
1692       assert_different_registers(dst, tmp);
1693       sve_uzp1(dst, H, src, tmp);
1694       sve_uzp1(dst, B, dst, tmp);
1695     }
1696   } else if (src_size == H) {
1697     sve_uzp1(dst, B, src, tmp);
1698   }
1699 }
1700 
1701 // Extend src predicate to dst predicate with the same lane count but larger
1702 // element size, e.g. 64Byte -> 512Long
1703 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1704                                              uint dst_element_length_in_bytes,
1705                                              uint src_element_length_in_bytes) {
1706   if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1707     sve_punpklo(dst, src);
1708   } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1709     sve_punpklo(dst, src);
1710     sve_punpklo(dst, dst);
1711   } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1712     sve_punpklo(dst, src);
1713     sve_punpklo(dst, dst);
1714     sve_punpklo(dst, dst);
1715   } else {
1716     assert(false, "unsupported");
1717     ShouldNotReachHere();
1718   }
1719 }
1720 
1721 // Narrow src predicate to dst predicate with the same lane count but
1722 // smaller element size, e.g. 512Long -> 64Byte
1723 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1724                                              uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
1725   // The insignificant bits in src predicate are expected to be zero.
1726   // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
1727   // passed as the second argument. An example narrowing operation with a given mask would be -
1728   // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I
1729   // Mask (for 2 Longs) : TF
1730   // Predicate register for the above mask (16 bits) : 00000001 00000000
1731   // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
1732   // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0)
1733   assert_different_registers(src, ptmp);
1734   assert_different_registers(dst, ptmp);
1735   sve_pfalse(ptmp);
1736   if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1737     sve_uzp1(dst, B, src, ptmp);
1738   } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1739     sve_uzp1(dst, H, src, ptmp);
1740     sve_uzp1(dst, B, dst, ptmp);
1741   } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1742     sve_uzp1(dst, S, src, ptmp);
1743     sve_uzp1(dst, H, dst, ptmp);
1744     sve_uzp1(dst, B, dst, ptmp);
1745   } else {
1746     assert(false, "unsupported");
1747     ShouldNotReachHere();
1748   }
1749 }
1750 
1751 // Vector reduction add for integral type with ASIMD instructions.
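// Scalar equivalent (illustrative only): dst = isrc + v[0] + v[1] + ... over all
// lanes of vsrc, with the lane sum accumulated in the element type.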
1752 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1753                                                  Register isrc, FloatRegister vsrc,
1754                                                  unsigned vector_length_in_bytes,
1755                                                  FloatRegister vtmp) {
1756   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1757   assert_different_registers(dst, isrc);
1758   bool isQ = vector_length_in_bytes == 16;
1759 
1760   BLOCK_COMMENT("neon_reduce_add_integral {");
1761     switch(bt) {
1762       case T_BYTE:
1763         addv(vtmp, isQ ? T16B : T8B, vsrc);
1764         smov(dst, vtmp, B, 0);
1765         addw(dst, dst, isrc, ext::sxtb);
1766         break;
1767       case T_SHORT:
1768         addv(vtmp, isQ ? T8H : T4H, vsrc);
1769         smov(dst, vtmp, H, 0);
1770         addw(dst, dst, isrc, ext::sxth);
1771         break;
1772       case T_INT:
1773         isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1774         umov(dst, vtmp, S, 0);
1775         addw(dst, dst, isrc);
1776         break;
1777       case T_LONG:
1778         assert(isQ, "unsupported");
1779         addpd(vtmp, vsrc);
1780         umov(dst, vtmp, D, 0);
1781         add(dst, dst, isrc);
1782         break;
1783       default:
1784         assert(false, "unsupported");
1785         ShouldNotReachHere();
1786     }
1787   BLOCK_COMMENT("} neon_reduce_add_integral");
1788 }
1789 
1790 // Vector reduction multiply for integral type with ASIMD instructions.
1791 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
1792 // Clobbers: rscratch1
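// Scalar equivalent (illustrative only): dst = isrc; for each lane v of vsrc,
// dst = (element type)(dst * v).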
1793 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
1794                                                  Register isrc, FloatRegister vsrc,
1795                                                  unsigned vector_length_in_bytes,
1796                                                  FloatRegister vtmp1, FloatRegister vtmp2) {
1797   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1798   bool isQ = vector_length_in_bytes == 16;
1799 
1800   BLOCK_COMMENT("neon_reduce_mul_integral {");
1801     switch(bt) {
1802       case T_BYTE:
1803         if (isQ) {
1804           // Multiply the lower half and higher half of vector iteratively.
1805           // vtmp1 = vsrc[8:15]
1806           ins(vtmp1, D, vsrc, 0, 1);
1807           // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
1808           mulv(vtmp1, T8B, vtmp1, vsrc);
1809           // vtmp2 = vtmp1[4:7]
1810           ins(vtmp2, S, vtmp1, 0, 1);
1811           // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
1812           mulv(vtmp1, T8B, vtmp2, vtmp1);
1813         } else {
1814           ins(vtmp1, S, vsrc, 0, 1);
1815           mulv(vtmp1, T8B, vtmp1, vsrc);
1816         }
1817         // vtmp2 = vtmp1[2:3]
1818         ins(vtmp2, H, vtmp1, 0, 1);
1819         // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
1820         mulv(vtmp2, T8B, vtmp2, vtmp1);
1821         // dst = vtmp2[0] * isrc * vtmp2[1]
1822         umov(rscratch1, vtmp2, B, 0);
1823         mulw(dst, rscratch1, isrc);
1824         sxtb(dst, dst);
1825         umov(rscratch1, vtmp2, B, 1);
1826         mulw(dst, rscratch1, dst);
1827         sxtb(dst, dst);
1828         break;
1829       case T_SHORT:
1830         if (isQ) {
1831           ins(vtmp2, D, vsrc, 0, 1);
1832           mulv(vtmp2, T4H, vtmp2, vsrc);
1833           ins(vtmp1, S, vtmp2, 0, 1);
1834           mulv(vtmp1, T4H, vtmp1, vtmp2);
1835         } else {
1836           ins(vtmp1, S, vsrc, 0, 1);
1837           mulv(vtmp1, T4H, vtmp1, vsrc);
1838         }
1839         umov(rscratch1, vtmp1, H, 0);
1840         mulw(dst, rscratch1, isrc);
1841         sxth(dst, dst);
1842         umov(rscratch1, vtmp1, H, 1);
1843         mulw(dst, rscratch1, dst);
1844         sxth(dst, dst);
1845         break;
1846       case T_INT:
1847         if (isQ) {
1848           ins(vtmp1, D, vsrc, 0, 1);
1849           mulv(vtmp1, T2S, vtmp1, vsrc);
1850         } else {
1851           vtmp1 = vsrc;
1852         }
1853         umov(rscratch1, vtmp1, S, 0);
1854         mul(dst, rscratch1, isrc);
1855         umov(rscratch1, vtmp1, S, 1);
1856         mul(dst, rscratch1, dst);
1857         break;
1858       case T_LONG:
1859         umov(rscratch1, vsrc, D, 0);
1860         mul(dst, isrc, rscratch1);
1861         umov(rscratch1, vsrc, D, 1);
1862         mul(dst, dst, rscratch1);
1863         break;
1864       default:
1865         assert(false, "unsupported");
1866         ShouldNotReachHere();
1867     }
1868   BLOCK_COMMENT("} neon_reduce_mul_integral");
1869 }
1870 
1871 // Vector reduction multiply for floating-point type with ASIMD instructions.
1872 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
1873                                            FloatRegister fsrc, FloatRegister vsrc,
1874                                            unsigned vector_length_in_bytes,
1875                                            FloatRegister vtmp) {
1876   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1877   bool isQ = vector_length_in_bytes == 16;
1878 
1879   BLOCK_COMMENT("neon_reduce_mul_fp {");
1880     switch(bt) {
1881       case T_FLOAT:
1882         fmuls(dst, fsrc, vsrc);
1883         ins(vtmp, S, vsrc, 0, 1);
1884         fmuls(dst, dst, vtmp);
1885         if (isQ) {
1886           ins(vtmp, S, vsrc, 0, 2);
1887           fmuls(dst, dst, vtmp);
1888           ins(vtmp, S, vsrc, 0, 3);
1889           fmuls(dst, dst, vtmp);
1890          }
1891         break;
1892       case T_DOUBLE:
1893         assert(isQ, "unsupported");
1894         fmuld(dst, fsrc, vsrc);
1895         ins(vtmp, D, vsrc, 0, 1);
1896         fmuld(dst, dst, vtmp);
1897         break;
1898       default:
1899         assert(false, "unsupported");
1900         ShouldNotReachHere();
1901     }
1902   BLOCK_COMMENT("} neon_reduce_mul_fp");
1903 }
1904 
1905 // Helper to select logical instruction
1906 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
1907                                                    Register Rn, Register Rm,
1908                                                    enum shift_kind kind, unsigned shift) {
1909   switch(opc) {
1910     case Op_AndReductionV:
1911       is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
1912       break;
1913     case Op_OrReductionV:
1914       is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
1915       break;
1916     case Op_XorReductionV:
1917       is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
1918       break;
1919     default:
1920       assert(false, "unsupported");
1921       ShouldNotReachHere();
1922   }
1923 }
1924 
1925 // Vector reduction logical operations And, Or, Xor
1926 // Clobbers: rscratch1
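// Scalar equivalent (illustrative only): dst = isrc; for each lane v of vsrc,
// dst = dst <op> v, with the result sign-extended to the element type for
// sub-int element types.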
1927 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
1928                                             Register isrc, FloatRegister vsrc,
1929                                             unsigned vector_length_in_bytes) {
1930   assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
1931          "unsupported");
1932   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1933   assert_different_registers(dst, isrc);
1934   bool isQ = vector_length_in_bytes == 16;
1935 
1936   BLOCK_COMMENT("neon_reduce_logical {");
1937     umov(rscratch1, vsrc, isQ ? D : S, 0);
1938     umov(dst, vsrc, isQ ? D : S, 1);
1939     neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
1940     switch(bt) {
1941       case T_BYTE:
1942         if (isQ) {
1943           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1944         }
1945         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1946         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
1947         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1948         sxtb(dst, dst);
1949         break;
1950       case T_SHORT:
1951         if (isQ) {
1952           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1953         }
1954         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1955         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1956         sxth(dst, dst);
1957         break;
1958       case T_INT:
1959         if (isQ) {
1960           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1961         }
1962         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1963         break;
1964       case T_LONG:
1965         assert(isQ, "unsupported");
1966         neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
1967         break;
1968       default:
1969         assert(false, "unsupported");
1970         ShouldNotReachHere();
1971     }
1972   BLOCK_COMMENT("} neon_reduce_logical");
1973 }
1974 
1975 // Vector reduction min/max for integral type with ASIMD instructions.
// Note: vtmp is not used and is expected to be fnoreg for the T_LONG case.
1977 // Clobbers: rscratch1, rflags
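// Scalar equivalent (illustrative only): dst = min/max(isrc, v[0], v[1], ...) over
// all lanes of vsrc, using signed comparisons.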
1978 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
1979                                                     Register isrc, FloatRegister vsrc,
1980                                                     unsigned vector_length_in_bytes,
1981                                                     FloatRegister vtmp) {
1982   assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
1983   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1984   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
1985   assert_different_registers(dst, isrc);
1986   bool isQ = vector_length_in_bytes == 16;
1987   bool is_min = opc == Op_MinReductionV;
1988 
1989   BLOCK_COMMENT("neon_reduce_minmax_integral {");
1990     if (bt == T_LONG) {
1991       assert(vtmp == fnoreg, "should be");
1992       assert(isQ, "should be");
1993       umov(rscratch1, vsrc, D, 0);
1994       cmp(isrc, rscratch1);
1995       csel(dst, isrc, rscratch1, is_min ? LT : GT);
1996       umov(rscratch1, vsrc, D, 1);
1997       cmp(dst, rscratch1);
1998       csel(dst, dst, rscratch1, is_min ? LT : GT);
1999     } else {
2000       SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
2001       if (size == T2S) {
2002         is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
2003       } else {
2004         is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
2005       }
2006       if (bt == T_INT) {
2007         umov(dst, vtmp, S, 0);
2008       } else {
2009         smov(dst, vtmp, elemType_to_regVariant(bt), 0);
2010       }
2011       cmpw(dst, isrc);
2012       cselw(dst, dst, isrc, is_min ? LT : GT);
2013     }
2014   BLOCK_COMMENT("} neon_reduce_minmax_integral");
2015 }
2016 
// Vector reduction for integral type with SVE instructions.
2018 // Supported operations are Add, And, Or, Xor, Max, Min.
2019 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
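// Scalar equivalent (illustrative only): dst = src1 <op> (reduction of the active
// lanes of src2), e.g. for Op_AddReductionVI dst = src1 + sum(src2[i]) over the
// lanes where pg is true.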
2020 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
2021                                             FloatRegister src2, PRegister pg, FloatRegister tmp) {
2022   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2023   assert(pg->is_governing(), "This register has to be a governing predicate register");
2024   assert_different_registers(src1, dst);
2025   // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
2026   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2027   switch (opc) {
2028     case Op_AddReductionVI: {
2029       sve_uaddv(tmp, size, pg, src2);
2030       if (bt == T_BYTE) {
2031         smov(dst, tmp, size, 0);
2032         addw(dst, src1, dst, ext::sxtb);
2033       } else if (bt == T_SHORT) {
2034         smov(dst, tmp, size, 0);
2035         addw(dst, src1, dst, ext::sxth);
2036       } else {
2037         umov(dst, tmp, size, 0);
2038         addw(dst, dst, src1);
2039       }
2040       break;
2041     }
2042     case Op_AddReductionVL: {
2043       sve_uaddv(tmp, size, pg, src2);
2044       umov(dst, tmp, size, 0);
2045       add(dst, dst, src1);
2046       break;
2047     }
2048     case Op_AndReductionV: {
2049       sve_andv(tmp, size, pg, src2);
2050       if (bt == T_INT || bt == T_LONG) {
2051         umov(dst, tmp, size, 0);
2052       } else {
2053         smov(dst, tmp, size, 0);
2054       }
2055       if (bt == T_LONG) {
2056         andr(dst, dst, src1);
2057       } else {
2058         andw(dst, dst, src1);
2059       }
2060       break;
2061     }
2062     case Op_OrReductionV: {
2063       sve_orv(tmp, size, pg, src2);
2064       if (bt == T_INT || bt == T_LONG) {
2065         umov(dst, tmp, size, 0);
2066       } else {
2067         smov(dst, tmp, size, 0);
2068       }
2069       if (bt == T_LONG) {
2070         orr(dst, dst, src1);
2071       } else {
2072         orrw(dst, dst, src1);
2073       }
2074       break;
2075     }
2076     case Op_XorReductionV: {
2077       sve_eorv(tmp, size, pg, src2);
2078       if (bt == T_INT || bt == T_LONG) {
2079         umov(dst, tmp, size, 0);
2080       } else {
2081         smov(dst, tmp, size, 0);
2082       }
2083       if (bt == T_LONG) {
2084         eor(dst, dst, src1);
2085       } else {
2086         eorw(dst, dst, src1);
2087       }
2088       break;
2089     }
2090     case Op_MaxReductionV: {
2091       sve_smaxv(tmp, size, pg, src2);
2092       if (bt == T_INT || bt == T_LONG) {
2093         umov(dst, tmp, size, 0);
2094       } else {
2095         smov(dst, tmp, size, 0);
2096       }
2097       if (bt == T_LONG) {
2098         cmp(dst, src1);
2099         csel(dst, dst, src1, Assembler::GT);
2100       } else {
2101         cmpw(dst, src1);
2102         cselw(dst, dst, src1, Assembler::GT);
2103       }
2104       break;
2105     }
2106     case Op_MinReductionV: {
2107       sve_sminv(tmp, size, pg, src2);
2108       if (bt == T_INT || bt == T_LONG) {
2109         umov(dst, tmp, size, 0);
2110       } else {
2111         smov(dst, tmp, size, 0);
2112       }
2113       if (bt == T_LONG) {
2114         cmp(dst, src1);
2115         csel(dst, dst, src1, Assembler::LT);
2116       } else {
2117         cmpw(dst, src1);
2118         cselw(dst, dst, src1, Assembler::LT);
2119       }
2120       break;
2121     }
2122     default:
2123       assert(false, "unsupported");
2124       ShouldNotReachHere();
2125   }
2126 
2127   if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
2128     if (bt == T_BYTE) {
2129       sxtb(dst, dst);
2130     } else if (bt == T_SHORT) {
2131       sxth(dst, dst);
2132     }
2133   }
2134 }
2135 
// Set the elements of the dst predicate to true for lanes in the range [0, lane_cnt),
// and to false otherwise. The input "lane_cnt" must be smaller than or equal to the
// supported max vector length of the basic type. Clobbers: rscratch1 and rFlagsReg.
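// For instance, with bt == T_BYTE on a 512-bit vector (max_vector_length == 64):
// lane_cnt 64 selects ALL, 32 selects VL32, 63 matches the MUL3 pattern, and a
// value such as 48 falls back to the "whileltw" encoding.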
2139 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
2140   uint32_t max_vector_length = Matcher::max_vector_size(bt);
2141   assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
2142 
2143   // Set all elements to false if the input "lane_cnt" is zero.
2144   if (lane_cnt == 0) {
2145     sve_pfalse(dst);
2146     return;
2147   }
2148 
2149   SIMD_RegVariant size = elemType_to_regVariant(bt);
2150   assert(size != Q, "invalid size");
2151 
2152   // Set all true if "lane_cnt" equals to the max lane count.
2153   if (lane_cnt == max_vector_length) {
2154     sve_ptrue(dst, size, /* ALL */ 0b11111);
2155     return;
2156   }
2157 
2158   // Fixed numbers for "ptrue".
2159   switch(lane_cnt) {
2160   case 1: /* VL1 */
2161   case 2: /* VL2 */
2162   case 3: /* VL3 */
2163   case 4: /* VL4 */
2164   case 5: /* VL5 */
2165   case 6: /* VL6 */
2166   case 7: /* VL7 */
2167   case 8: /* VL8 */
2168     sve_ptrue(dst, size, lane_cnt);
2169     return;
2170   case 16:
2171     sve_ptrue(dst, size, /* VL16 */ 0b01001);
2172     return;
2173   case 32:
2174     sve_ptrue(dst, size, /* VL32 */ 0b01010);
2175     return;
2176   case 64:
2177     sve_ptrue(dst, size, /* VL64 */ 0b01011);
2178     return;
2179   case 128:
2180     sve_ptrue(dst, size, /* VL128 */ 0b01100);
2181     return;
2182   case 256:
2183     sve_ptrue(dst, size, /* VL256 */ 0b01101);
2184     return;
2185   default:
2186     break;
2187   }
2188 
2189   // Special patterns for "ptrue".
2190   if (lane_cnt == round_down_power_of_2(max_vector_length)) {
2191     sve_ptrue(dst, size, /* POW2 */ 0b00000);
2192   } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
2193     sve_ptrue(dst, size, /* MUL4 */ 0b11101);
2194   } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
2195     sve_ptrue(dst, size, /* MUL3 */ 0b11110);
2196   } else {
2197     // Encode to "whileltw" for the remaining cases.
2198     mov(rscratch1, lane_cnt);
2199     sve_whileltw(dst, size, zr, rscratch1);
2200   }
2201 }
2202 
2203 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
2204 // Any remaining elements of dst will be filled with zero.
2205 // Clobbers: rscratch1
2206 // Preserves: src, mask
2207 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
2208                                            FloatRegister vtmp1, FloatRegister vtmp2,
2209                                            PRegister pgtmp) {
2210   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2211   assert_different_registers(dst, src, vtmp1, vtmp2);
2212   assert_different_registers(mask, pgtmp);
2213 
2214   // Example input:   src   = 8888 7777 6666 5555 4444 3333 2222 1111
2215   //                  mask  = 0001 0000 0000 0001 0001 0000 0001 0001
2216   // Expected result: dst   = 0000 0000 0000 8888 5555 4444 2222 1111
2217   sve_dup(vtmp2, H, 0);
2218 
2219   // Extend lowest half to type INT.
2220   // dst = 00004444 00003333 00002222 00001111
2221   sve_uunpklo(dst, S, src);
2222   // pgtmp = 00000001 00000000 00000001 00000001
2223   sve_punpklo(pgtmp, mask);
  // Pack the active INT-sized elements to the right,
  // and fill the remaining elements with zero.
2226   // dst = 00000000 00004444 00002222 00001111
2227   sve_compact(dst, S, dst, pgtmp);
2228   // Narrow the result back to type SHORT.
2229   // dst = 0000 0000 0000 0000 0000 4444 2222 1111
2230   sve_uzp1(dst, H, dst, vtmp2);
2231   // Count the active elements of lowest half.
2232   // rscratch1 = 3
2233   sve_cntp(rscratch1, S, ptrue, pgtmp);
2234 
2235   // Repeat to the highest half.
2236   // pgtmp = 00000001 00000000 00000000 00000001
2237   sve_punpkhi(pgtmp, mask);
2238   // vtmp1 = 00008888 00007777 00006666 00005555
2239   sve_uunpkhi(vtmp1, S, src);
2240   // vtmp1 = 00000000 00000000 00008888 00005555
2241   sve_compact(vtmp1, S, vtmp1, pgtmp);
2242   // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2243   sve_uzp1(vtmp1, H, vtmp1, vtmp2);
2244 
2245   // Compressed low:   dst   = 0000 0000 0000 0000 0000 4444 2222 1111
  // Compressed high:  vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  // Left shift (cross-lane) the compressed high part by TRUE_CNT lanes, where
  // TRUE_CNT is the number of active elements in the compressed low part.
2249   neg(rscratch1, rscratch1);
2250   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2251   sve_index(vtmp2, H, rscratch1, 1);
2252   // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
2253   sve_tbl(vtmp1, H, vtmp1, vtmp2);
2254 
  // Combine the compressed high part (after the shift) with the compressed low part.
2256   // dst = 0000 0000 0000 8888 5555 4444 2222 1111
2257   sve_orr(dst, dst, vtmp1);
2258 }
2259 
2260 // Clobbers: rscratch1, rscratch2
2261 // Preserves: src, mask
2262 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
2263                                           FloatRegister vtmp1, FloatRegister vtmp2,
2264                                           FloatRegister vtmp3, FloatRegister vtmp4,
2265                                           PRegister ptmp, PRegister pgtmp) {
2266   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2267   assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
2268   assert_different_registers(mask, ptmp, pgtmp);
2269   // Example input:   src   = 88 77 66 55 44 33 22 11
2270   //                  mask  = 01 00 00 01 01 00 01 01
2271   // Expected result: dst   = 00 00 00 88 55 44 22 11
2272 
2273   sve_dup(vtmp4, B, 0);
2274   // Extend lowest half to type SHORT.
2275   // vtmp1 = 0044 0033 0022 0011
2276   sve_uunpklo(vtmp1, H, src);
2277   // ptmp = 0001 0000 0001 0001
2278   sve_punpklo(ptmp, mask);
2279   // Count the active elements of lowest half.
2280   // rscratch2 = 3
2281   sve_cntp(rscratch2, H, ptrue, ptmp);
  // Pack the active SHORT-sized elements to the right,
  // and fill the remaining elements with zero.
2284   // dst = 0000 0044 0022 0011
2285   sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
2286   // Narrow the result back to type BYTE.
2287   // dst = 00 00 00 00 00 44 22 11
2288   sve_uzp1(dst, B, dst, vtmp4);
2289 
2290   // Repeat to the highest half.
2291   // ptmp = 0001 0000 0000 0001
2292   sve_punpkhi(ptmp, mask);
  // vtmp2 = 0088 0077 0066 0055
2294   sve_uunpkhi(vtmp2, H, src);
2295   // vtmp1 = 0000 0000 0088 0055
2296   sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);
2297 
2298   sve_dup(vtmp4, B, 0);
2299   // vtmp1 = 00 00 00 00 00 00 88 55
2300   sve_uzp1(vtmp1, B, vtmp1, vtmp4);
2301 
2302   // Compressed low:   dst   = 00 00 00 00 00 44 22 11
2303   // Compressed high:  vtmp1 = 00 00 00 00 00 00 88 55
  // Left shift (cross-lane) the compressed high part by TRUE_CNT lanes, where
  // TRUE_CNT is the number of active elements in the compressed low part.
2306   neg(rscratch2, rscratch2);
2307   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2308   sve_index(vtmp2, B, rscratch2, 1);
2309   // vtmp1 = 00 00 00 88 55 00 00 00
2310   sve_tbl(vtmp1, B, vtmp1, vtmp2);
  // Combine the compressed high part (after the shift) with the compressed low part.
2312   // dst = 00 00 00 88 55 44 22 11
2313   sve_orr(dst, dst, vtmp1);
2314 }
2315 
2316 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2317   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2318   SIMD_Arrangement size = isQ ? T16B : T8B;
2319   if (bt == T_BYTE) {
2320     rbit(dst, size, src);
2321   } else {
2322     neon_reverse_bytes(dst, src, bt, isQ);
2323     rbit(dst, size, dst);
2324   }
2325 }
2326 
2327 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2328   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2329   SIMD_Arrangement size = isQ ? T16B : T8B;
2330   switch (bt) {
2331     case T_BYTE:
2332       if (dst != src) {
2333         orr(dst, size, src, src);
2334       }
2335       break;
2336     case T_SHORT:
2337       rev16(dst, size, src);
2338       break;
2339     case T_INT:
2340       rev32(dst, size, src);
2341       break;
2342     case T_LONG:
2343       rev64(dst, size, src);
2344       break;
2345     default:
2346       assert(false, "unsupported");
2347       ShouldNotReachHere();
2348   }
2349 }
2350 
2351 // VectorRearrange implementation for short/int/float/long/double types with NEON
// instructions. For VectorRearrange short/int/float, we use the NEON tbl instruction.
// But since it supports byte tables only, we need to look up 2/4 bytes as a group.
2354 // For VectorRearrange long/double, we compare the shuffle input with iota indices,
2355 // and use bsl to implement the operation.
2356 void C2_MacroAssembler::neon_rearrange_hsd(FloatRegister dst, FloatRegister src,
2357                                            FloatRegister shuffle, FloatRegister tmp,
2358                                            BasicType bt, bool isQ) {
2359   assert_different_registers(dst, src, shuffle, tmp);
2360   SIMD_Arrangement size1 = isQ ? T16B : T8B;
2361   SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2362 
2363   // Here is an example that rearranges a NEON vector with 4 ints:
2364   // Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1]
2365   //   1. We assume the shuffle input is Vi int[2, 3, 0, 1].
2366   //   2. Multiply Vi int[2, 3, 0, 1] with constant int vector
2367   //      [0x04040404, 0x04040404, 0x04040404, 0x04040404], and get
2368   //      tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404].
2369   //   3. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100],
2370   //      and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504]
2371   //   4. Use Vm as index register, and use V1 as table register.
2372   //      Then get V2 as the result by tbl NEON instructions.
2373   switch (bt) {
2374     case T_SHORT:
2375       mov(tmp, size1, 0x02);
2376       mulv(dst, size2, shuffle, tmp);
2377       mov(tmp, size2, 0x0100);
2378       addv(dst, size1, dst, tmp);
2379       tbl(dst, size1, src, 1, dst);
2380       break;
2381     case T_INT:
2382     case T_FLOAT:
2383       mov(tmp, size1, 0x04);
2384       mulv(dst, size2, shuffle, tmp);
2385       mov(tmp, size2, 0x03020100);
2386       addv(dst, size1, dst, tmp);
2387       tbl(dst, size1, src, 1, dst);
2388       break;
2389     case T_LONG:
2390     case T_DOUBLE:
2391       // Load the iota indices for Long type. The indices are ordered by
2392       // type B/S/I/L/F/D, and the offset between two types is 16; Hence
2393       // the offset for L is 48.
2394       lea(rscratch1,
2395           ExternalAddress(StubRoutines::aarch64::vector_iota_indices() + 48));
2396       ldrq(tmp, rscratch1);
2397       // Check whether the input "shuffle" is the same with iota indices.
2398       // Return "src" if true, otherwise swap the two elements of "src".
2399       cm(EQ, dst, size2, shuffle, tmp);
2400       ext(tmp, size1, src, src, 8);
2401       bsl(dst, size1, src, tmp);
2402       break;
2403     default:
2404       assert(false, "unsupported element type");
2405       ShouldNotReachHere();
2406   }
2407 }
2408 
2409 // Extract a scalar element from an sve vector at position 'idx'.
2410 // The input elements in src are expected to be of integral type.
2411 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2412                                              int idx, FloatRegister vtmp) {
2413   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2414   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2415   if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
2416     if (bt == T_INT || bt == T_LONG) {
2417       umov(dst, src, size, idx);
2418     } else {
2419       smov(dst, src, size, idx);
2420     }
2421   } else {
2422     sve_orr(vtmp, src, src);
2423     sve_ext(vtmp, vtmp, idx << size);
2424     if (bt == T_INT || bt == T_LONG) {
2425       umov(dst, vtmp, size, 0);
2426     } else {
2427       smov(dst, vtmp, size, 0);
2428     }
2429   }
2430 }
2431 
2432 // java.lang.Math::round intrinsics
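// Math.round returns the closest integer, with ties rounding toward positive
// infinity. Rounding to nearest with ties away from zero (fcvtas) already matches
// that for non-negative inputs, for NaN (both give 0) and for magnitudes too large
// to carry a fraction, so the helpers below also compute floor(src + 0.5) and
// select between the two results per lane.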
2433 
2434 // Clobbers: rscratch1, rflags
2435 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2436                                           FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
2437   assert_different_registers(tmp1, tmp2, tmp3, src, dst);
2438   switch (T) {
2439     case T2S:
2440     case T4S:
2441       fmovs(tmp1, T, 0.5f);
2442       mov(rscratch1, jint_cast(0x1.0p23f));
2443       break;
2444     case T2D:
2445       fmovd(tmp1, T, 0.5);
2446       mov(rscratch1, julong_cast(0x1.0p52));
2447       break;
2448     default:
2449       assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2450   }
2451   fadd(tmp1, T, tmp1, src);
2452   fcvtms(tmp1, T, tmp1);
2453   // tmp1 = floor(src + 0.5, ties to even)
2454 
2455   fcvtas(dst, T, src);
2456   // dst = round(src), ties to away
2457 
2458   fneg(tmp3, T, src);
2459   dup(tmp2, T, rscratch1);
2460   cm(HS, tmp3, T, tmp3, tmp2);
2461   // tmp3 is now a set of flags
2462 
2463   bif(dst, T16B, tmp1, tmp3);
2464   // result in dst
2465 }
2466 
2467 // Clobbers: rscratch1, rflags
2468 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2469                                          FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
2470   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2471   assert_different_registers(tmp1, tmp2, src, dst);
2472 
2473   switch (T) {
2474     case S:
2475       mov(rscratch1, jint_cast(0x1.0p23f));
2476       break;
2477     case D:
2478       mov(rscratch1, julong_cast(0x1.0p52));
2479       break;
2480     default:
2481       assert(T == S || T == D, "invalid register variant");
2482   }
2483 
2484   sve_frinta(dst, T, ptrue, src);
2485   // dst = round(src), ties to away
2486 
2487   Label none;
2488 
2489   sve_fneg(tmp1, T, ptrue, src);
2490   sve_dup(tmp2, T, rscratch1);
2491   sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
2492   br(EQ, none);
2493   {
2494     sve_cpy(tmp1, T, pgtmp, 0.5);
2495     sve_fadd(tmp1, T, pgtmp, src);
2496     sve_frintm(dst, T, pgtmp, tmp1);
2497     // dst = floor(src + 0.5, ties to even)
2498   }
2499   bind(none);
2500 
2501   sve_fcvtzs(dst, T, ptrue, dst, T);
2502   // result in dst
2503 }
2504 
2505 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2506                                            FloatRegister one, SIMD_Arrangement T) {
2507   assert_different_registers(dst, src, zero, one);
2508   assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2509 
2510   facgt(dst, T, src, zero);
2511   ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
2512   bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2513 }
2514 
2515 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2516                                           FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
2517     assert_different_registers(dst, src, zero, one, vtmp);
2518     assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2519 
2520     sve_orr(vtmp, src, src);
    sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN, 0x1 otherwise
2522     switch (T) {
2523     case S:
2524       sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
2525       sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
2526                                         // on the sign of the float value
2527       break;
2528     case D:
2529       sve_and(vtmp, T, min_jlong);
2530       sve_orr(vtmp, T, jlong_cast(1.0));
2531       break;
2532     default:
2533       assert(false, "unsupported");
2534       ShouldNotReachHere();
2535     }
2536     sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
2537                                        // Result in dst
2538 }
2539 
2540 bool C2_MacroAssembler::in_scratch_emit_size() {
2541   if (ciEnv::current()->task() != nullptr) {
2542     PhaseOutput* phase_output = Compile::current()->output();
2543     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2544       return true;
2545     }
2546   }
2547   return MacroAssembler::in_scratch_emit_size();
2548 }
2549 
2550 static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
2551   fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
2552 }
2553 
2554 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register rval, Register rtmp) {
2555   assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
2556   if (t == TypeInt::INT) {
2557     return;
2558   }
2559   BLOCK_COMMENT("verify_int_in_range {");
2560   Label L_success, L_failure;
2561 
2562   jint lo = t->_lo;
2563   jint hi = t->_hi;
2564 
2565   if (lo != min_jint && hi != max_jint) {
2566     subsw(rtmp, rval, lo);
2567     br(Assembler::LT, L_failure);
2568     subsw(rtmp, rval, hi);
2569     br(Assembler::LE, L_success);
2570   } else if (lo != min_jint) {
2571     subsw(rtmp, rval, lo);
2572     br(Assembler::GE, L_success);
2573   } else if (hi != max_jint) {
2574     subsw(rtmp, rval, hi);
2575     br(Assembler::LE, L_success);
2576   } else {
2577     ShouldNotReachHere();
2578   }
2579 
2580   bind(L_failure);
2581   movw(c_rarg0, idx);
2582   mov(c_rarg1, rval);
2583   movw(c_rarg2, lo);
2584   movw(c_rarg3, hi);
2585   reconstruct_frame_pointer(rtmp);
2586   rt_call(CAST_FROM_FN_PTR(address, abort_verify_int_in_range), rtmp);
2587   hlt(0);
2588 
2589   bind(L_success);
2590   BLOCK_COMMENT("} verify_int_in_range");
2591 }
2592 
2593 static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
2594   fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
2595 }
2596 
2597 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register rval, Register rtmp) {
2598   assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
2599   if (t == TypeLong::LONG) {
2600     return;
2601   }
2602   BLOCK_COMMENT("verify_long_in_range {");
2603   Label L_success, L_failure;
2604 
2605   jlong lo = t->_lo;
2606   jlong hi = t->_hi;
2607 
2608   if (lo != min_jlong && hi != max_jlong) {
2609     subs(rtmp, rval, lo);
2610     br(Assembler::LT, L_failure);
2611     subs(rtmp, rval, hi);
2612     br(Assembler::LE, L_success);
2613   } else if (lo != min_jlong) {
2614     subs(rtmp, rval, lo);
2615     br(Assembler::GE, L_success);
2616   } else if (hi != max_jlong) {
2617     subs(rtmp, rval, hi);
2618     br(Assembler::LE, L_success);
2619   } else {
2620     ShouldNotReachHere();
2621   }
2622 
2623   bind(L_failure);
2624   movw(c_rarg0, idx);
2625   mov(c_rarg1, rval);
2626   mov(c_rarg2, lo);
2627   mov(c_rarg3, hi);
2628   reconstruct_frame_pointer(rtmp);
2629   rt_call(CAST_FROM_FN_PTR(address, abort_verify_long_in_range), rtmp);
2630   hlt(0);
2631 
2632   bind(L_success);
2633   BLOCK_COMMENT("} verify_long_in_range");
2634 }
2635 
2636 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
2637   const int framesize = Compile::current()->output()->frame_size_in_bytes();
2638   if (PreserveFramePointer) {
2639     // frame pointer is valid
2640 #ifdef ASSERT
2641     // Verify frame pointer value in rfp.
2642     add(rtmp, sp, framesize - 2 * wordSize);
2643     Label L_success;
2644     cmp(rfp, rtmp);
2645     br(Assembler::EQ, L_success);
2646     stop("frame pointer mismatch");
2647     bind(L_success);
2648 #endif // ASSERT
2649   } else {
2650     add(rfp, sp, framesize - 2 * wordSize);
2651   }
2652 }
2653 
// Selects elements from two source vectors (src1, src2) based on index values in the index register
// using Neon instructions and places them in the destination vector elements corresponding to the
// index vector elements. Each index in the index register must be in the range - [0, 2 * NUM_ELEM),
// where NUM_ELEM is the number of BasicType elements per vector.
// If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
// Otherwise, selects src2[idx - NUM_ELEM]
2660 void C2_MacroAssembler::select_from_two_vectors_neon(FloatRegister dst, FloatRegister src1,
2661                                                      FloatRegister src2, FloatRegister index,
2662                                                      FloatRegister tmp, unsigned vector_length_in_bytes) {
2663   assert_different_registers(dst, src1, src2, tmp);
2664   SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
2665 
2666   if (vector_length_in_bytes == 16) {
2667     assert(UseSVE <= 1, "sve must be <= 1");
2668     assert(src1->successor() == src2, "Source registers must be ordered");
2669     // If the vector length is 16B, then use the Neon "tbl" instruction with two vector table
2670     tbl(dst, size, src1, 2, index);
2671   } else { // vector length == 8
2672     assert(UseSVE == 0, "must be Neon only");
2673     // We need to fit both the source vectors (src1, src2) in a 128-bit register because the
2674     // Neon "tbl" instruction supports only looking up 16B vectors. We then use the Neon "tbl"
2675     // instruction with one vector lookup
2676     ins(tmp, D, src1, 0, 0);
2677     ins(tmp, D, src2, 1, 0);
2678     tbl(dst, size, tmp, 1, index);
2679   }
2680 }
2681 
// Selects elements from two source vectors (src1, src2) based on index values in the index register
// using SVE/SVE2 instructions and places them in the destination vector elements corresponding to the
// index vector elements. Each index in the index register must be in the range - [0, 2 * NUM_ELEM),
// where NUM_ELEM is the number of BasicType elements per vector.
// If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
// Otherwise, selects src2[idx - NUM_ELEM]
2688 void C2_MacroAssembler::select_from_two_vectors_sve(FloatRegister dst, FloatRegister src1,
2689                                                     FloatRegister src2, FloatRegister index,
2690                                                     FloatRegister tmp, SIMD_RegVariant T,
2691                                                     unsigned vector_length_in_bytes) {
2692   assert_different_registers(dst, src1, src2, index, tmp);
2693 
2694   if (vector_length_in_bytes == 8) {
2695     // We need to fit both the source vectors (src1, src2) in a single vector register because the
2696     // SVE "tbl" instruction is unpredicated and works on the entire vector which can lead to
2697     // incorrect results if each source vector is only partially filled. We then use the SVE "tbl"
2698     // instruction with one vector lookup
2699     assert(UseSVE >= 1, "sve must be >= 1");
2700     ins(tmp, D, src1, 0, 0);
2701     ins(tmp, D, src2, 1, 0);
2702     sve_tbl(dst, T, tmp, index);
2703   } else {  // UseSVE == 2 and vector_length_in_bytes > 8
2704     // If the vector length is > 8, then use the SVE2 "tbl" instruction with the two vector table.
2705     // The assertion - vector_length_in_bytes == MaxVectorSize ensures that this operation
2706     // is not executed on machines where vector_length_in_bytes < MaxVectorSize
2707     // with the only exception of 8B vector length.
2708     assert(UseSVE == 2 && vector_length_in_bytes == MaxVectorSize, "must be");
2709     assert(src1->successor() == src2, "Source registers must be ordered");
2710     sve_tbl(dst, T, src1, src2, index);
2711   }
2712 }
2713 
2714 void C2_MacroAssembler::select_from_two_vectors(FloatRegister dst, FloatRegister src1,
2715                                                 FloatRegister src2, FloatRegister index,
2716                                                 FloatRegister tmp, BasicType bt,
2717                                                 unsigned vector_length_in_bytes) {
2718 
2719   assert_different_registers(dst, src1, src2, index, tmp);
2720 
2721   // The cases that can reach this method are -
2722   // - UseSVE = 0, vector_length_in_bytes = 8 or 16
2723   // - UseSVE = 1, vector_length_in_bytes = 8 or 16
2724   // - UseSVE = 2, vector_length_in_bytes >= 8
2725   //
2726   // SVE/SVE2 tbl instructions are generated when UseSVE = 1 with vector_length_in_bytes = 8
2727   // and UseSVE = 2 with vector_length_in_bytes >= 8
2728   //
2729   // Neon instructions are generated when UseSVE = 0 with vector_length_in_bytes = 8 or 16 and
2730   // UseSVE = 1 with vector_length_in_bytes = 16
2731 
2732   if ((UseSVE == 1 && vector_length_in_bytes == 8) || UseSVE == 2) {
2733     SIMD_RegVariant T = elemType_to_regVariant(bt);
2734     select_from_two_vectors_sve(dst, src1, src2, index, tmp, T, vector_length_in_bytes);
2735     return;
2736   }
2737 
2738   // The only BasicTypes that can reach here are T_SHORT, T_BYTE, T_INT and T_FLOAT
2739   assert(bt != T_DOUBLE && bt != T_LONG, "unsupported basic type");
2740   assert(vector_length_in_bytes <= 16, "length_in_bytes must be <= 16");
2741 
2742   bool isQ = vector_length_in_bytes == 16;
2743 
2744   SIMD_Arrangement size1 = isQ ? T16B : T8B;
2745   SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2746 
2747   // Neon "tbl" instruction only supports byte tables, so we need to look at chunks of
2748   // 2B for selecting shorts or chunks of 4B for selecting ints/floats from the table.
2749   // The index values in "index" register are in the range of [0, 2 * NUM_ELEM) where NUM_ELEM
2750   // is the number of elements that can fit in a vector. For ex. for T_SHORT with 64-bit vector length,
2751   // the indices can range from [0, 8).
2752   // As an example with 64-bit vector length and T_SHORT type - let index = [2, 5, 1, 0]
2753   // Move a constant 0x02 in every byte of tmp - tmp = [0x0202, 0x0202, 0x0202, 0x0202]
2754   // Multiply index vector with tmp to yield - dst = [0x0404, 0x0a0a, 0x0202, 0x0000]
2755   // Move a constant 0x0100 in every 2B of tmp - tmp = [0x0100, 0x0100, 0x0100, 0x0100]
2756   // Add the multiplied result to the vector in tmp to obtain the byte level
2757   // offsets - dst = [0x0504, 0x0b0a, 0x0302, 0x0100]
2758   // Use these offsets in the "tbl" instruction to select chunks of 2B.
2759 
2760   if (bt == T_BYTE) {
2761     select_from_two_vectors_neon(dst, src1, src2, index, tmp, vector_length_in_bytes);
2762   } else {
2763     int elem_size = (bt == T_SHORT) ? 2 : 4;
2764     uint64_t tbl_offset = (bt == T_SHORT) ? 0x0100u : 0x03020100u;
2765 
2766     mov(tmp, size1, elem_size);
2767     mulv(dst, size2, index, tmp);
2768     mov(tmp, size2, tbl_offset);
2769     addv(dst, size1, dst, tmp); // "dst" now contains the processed index elements
2770                                 // to select a set of 2B/4B
2771     select_from_two_vectors_neon(dst, src1, src2, dst, tmp, vector_length_in_bytes);
2772   }
2773 }