/*
 * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/matcher.hpp"
#include "opto/output.hpp"
#include "opto/subnode.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

// jdk.internal.util.ArraysSupport.vectorizedHashCode
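// In scalar form this computes the usual polynomial hash; a sketch (the
// per-eltype widening of the loaded elements is elided):
//
//    int h = result;
//    for (int i = 0; i < cnt; i++)
//      h = 31 * h + ary[i];
//    return h;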
address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
                                           FloatRegister vdata0, FloatRegister vdata1,
                                           FloatRegister vdata2, FloatRegister vdata3,
                                           FloatRegister vmul0, FloatRegister vmul1,
                                           FloatRegister vmul2, FloatRegister vmul3,
                                           FloatRegister vpow, FloatRegister vpowm,
                                           BasicType eltype) {
  ARRAYS_HASHCODE_REGISTERS;

  Register tmp1 = rscratch1, tmp2 = rscratch2;

  Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE;

  // Vectorization factor. Number of array elements loaded into one SIMD&FP register by the stubs.
  // We use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible
  // to use 4H for chars and shorts instead, but using 8H gives better performance.
  const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8
                    : eltype == T_CHAR || eltype == T_SHORT ? 8
                    : eltype == T_INT                       ? 4
                                                            : 0;
  guarantee(vf, "unsupported eltype");

  // Unroll factor for the scalar loop below. The value is chosen based on performance analysis.
  const size_t unroll_factor = 4;

  switch (eltype) {
  case T_BOOLEAN:
    BLOCK_COMMENT("arrays_hashcode(unsigned byte) {");
    break;
  case T_CHAR:
    BLOCK_COMMENT("arrays_hashcode(char) {");
    break;
  case T_BYTE:
    BLOCK_COMMENT("arrays_hashcode(byte) {");
    break;
  case T_SHORT:
    BLOCK_COMMENT("arrays_hashcode(short) {");
    break;
  case T_INT:
    BLOCK_COMMENT("arrays_hashcode(int) {");
    break;
  default:
    ShouldNotReachHere();
  }

  // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop
  // implemented by the stub executes just once. Call the stub only if at least two iterations will
  // be executed.
  const size_t large_threshold = eltype == T_INT ? vf * 2 : vf;
  cmpw(cnt, large_threshold);
  br(Assembler::HS, LARGE);

  bind(TAIL);

  // The andr computes cnt % uf, where uf = unroll_factor. The subtract, shifted left by 3 (or 4,
  // see below), moves the jump target back from BR_BASE past cnt % uf pairs of load + madd insns,
  // i.e. it skips the first uf - (cnt % uf) pairs so that only cnt % uf pairs execute on the
  // first pass. Iteration then eats up the remainder, uf elements at a time.
  assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC");
  andr(tmp2, cnt, unroll_factor - 1);
  adr(tmp1, BR_BASE);
  // For Cortex-A53 the shift is 4 because 2 nops are generated per pair, making each pair
  // 4 instructions (16 bytes).
  sub(tmp1, tmp1, tmp2, ext::sxtw, VM_Version::supports_a53mac() ? 4 : 3);
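  // Hash multiplier 31 (0x1f), used by the maddw in the loop below.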
  movw(tmp2, 0x1f);
  br(tmp1);

  bind(LOOP);
  for (size_t i = 0; i < unroll_factor; ++i) {
    load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype);
    maddw(result, result, tmp2, tmp1);
    // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
    // Generate 2nd nop to have 4 instructions per iteration.
    if (VM_Version::supports_a53mac()) {
      nop();
    }
  }
  bind(BR_BASE);
  subsw(cnt, cnt, unroll_factor);
  br(Assembler::HS, LOOP);

  b(DONE);

  bind(LARGE);

  RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype));
  assert(stub.target() != nullptr, "array_hashcode stub has not been generated");
  address tpc = trampoline_call(stub);
  if (tpc == nullptr) {
    DEBUG_ONLY(reset_labels(TAIL, BR_BASE));
    postcond(pc() == badAddress);
    return nullptr;
  }

  bind(DONE);

  BLOCK_COMMENT("} // arrays_hashcode");

  postcond(pc() != badAddress);
  return pc();
}

void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register t1,
                                              Register t2, Register t3) {
  assert_different_registers(obj, box, t1, t2, t3, rscratch2);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. MUST branch to this label with flag == EQ.
  Label locked;
  // Finish fast lock unsuccessfully. MUST branch to this label with flag == NE.
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(t1, obj);
    ldrb(t1, Address(t1, Klass::misc_flags_offset()));
    tst(t1, KlassFlags::_misc_is_value_based_class);
    br(Assembler::NE, slow_path);
  }

  const Register t1_mark = t1;
  const Register t3_t = t3;

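  // Mark word lock bits: 0b01 = unlocked, 0b00 = fast-locked, 0b10 = inflated monitor.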
  { // Lightweight locking

    // Push lock to the lock stack and finish successfully. MUST branch to this label with flag == EQ.
    Label push;

    const Register t2_top = t2;

    // Check if lock-stack is full.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
    br(Assembler::GT, slow_path);

    // Check if recursive.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, push);

    // Relaxed normal load to check for monitor. Optimization for monitor case.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);

    // Not inflated
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");

    // Try to lock. Transition lock-bits 0b01 => 0b00
    orr(t1_mark, t1_mark, markWord::unlocked_value);
    eor(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
    br(Assembler::NE, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    str(obj, Address(rthread, t2_top));
    addw(t2_top, t2_top, oopSize);
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register t1_monitor = t1;

    if (!UseObjectMonitorTable) {
      assert(t1_monitor == t1_mark, "should be the same here");
    } else {
      Label monitor_found;

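      // The per-thread OMCache is searched as a sequence of (oop, monitor)
      // entries terminated by a null sentinel; see the loop below.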
      // Load cache address
      lea(t3_t, Address(rthread, JavaThread::om_cache_oops_offset()));

      const int num_unrolled = 2;
      for (int i = 0; i < num_unrolled; i++) {
        ldr(t1, Address(t3_t));
        cmp(obj, t1);
        br(Assembler::EQ, monitor_found);
        increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
      }

      Label loop;

      // Search for obj in cache.
      bind(loop);

      // Check for match.
      ldr(t1, Address(t3_t));
      cmp(obj, t1);
      br(Assembler::EQ, monitor_found);

      // Search until null encountered, guaranteed _null_sentinel at end.
      increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
      cbnz(t1, loop);
      // Cache miss. NE is set by the cmp above; cbnz does not set flags.
      b(slow_path);

      bind(monitor_found);
      ldr(t1_monitor, Address(t3_t, OMCache::oop_to_monitor_difference()));
    }

    const Register t2_owner_addr = t2;
    const Register t3_owner = t3;
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
    const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);

    Label monitor_locked;

    // Compute owner address.
    lea(t2_owner_addr, owner_address);

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
    cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, /*acquire*/ true,
            /*release*/ false, /*weak*/ false, t3_owner);
    br(Assembler::EQ, monitor_locked);

    // Check if recursive.
    cmp(t3_owner, rscratch2);
    br(Assembler::NE, slow_path);

    // Recursive.
    increment(recursions_address, 1);

    bind(monitor_locked);
    if (UseObjectMonitorTable) {
      str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
    }
  }

  bind(locked);

#ifdef ASSERT
  // Check that locked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Lock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Lock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}

void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register box, Register t1,
                                                Register t2, Register t3) {
  assert_different_registers(obj, box, t1, t2, t3);

  // Handle inflated monitor.
  Label inflated, inflated_load_mark;
  // Finish fast unlock successfully. MUST branch to this label with flag == EQ.
  Label unlocked;
  // Finish fast unlock unsuccessfully. MUST branch to this label with flag == NE.
  Label slow_path;

  const Register t1_mark = t1;
  const Register t2_top = t2;
  const Register t3_t = t3;

  { // Lightweight unlock

    Label push_and_slow_path;

    // Check if obj is top of lock-stack.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    subw(t2_top, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    // Top of lock stack was not obj. Must be monitor.
    br(Assembler::NE, inflated_load_mark);

    // Pop lock-stack.
    DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));

    // Check if recursive.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, unlocked);

    // Not recursive.
    // Load Mark.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Check header for monitor (0b10).
    // Because we got here by popping (meaning we pushed in locked)
    // there will be no monitor in the box. So we need to push back the obj
    // so that the runtime can fix any potential anonymous owner.
    tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);

    // Try to unlock. Transition lock bits 0b00 => 0b01
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
    orr(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
    br(Assembler::EQ, unlocked);

    bind(push_and_slow_path);
    // Compare and exchange failed.
    // Restore lock-stack and handle the unlock in runtime.
    DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
    addw(t2_top, t2_top, oopSize);
    // The lock-stack top is a 32-bit field; use strw, matching the strw above.
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(slow_path);
  }

  { // Handle inflated monitor.
    bind(inflated_load_mark);
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
#ifdef ASSERT
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

#ifdef ASSERT
    Label check_done;
    subw(t2_top, t2_top, oopSize);
    cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
    br(Assembler::LT, check_done);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    br(Assembler::NE, inflated);
    stop("Fast Unlock lock on stack");
    bind(check_done);
#endif

    const Register t1_monitor = t1;

    if (!UseObjectMonitorTable) {
      assert(t1_monitor == t1_mark, "should be the same here");

      // Untag the monitor.
      add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
    } else {
      ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
      cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
      br(Assembler::LO, slow_path);
    }

    const Register t2_recursions = t2;
    Label not_recursive;

    // Check if recursive.
    ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
    cbz(t2_recursions, not_recursive);

    // Recursive unlock.
    sub(t2_recursions, t2_recursions, 1u);
    str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
    // Set flag == EQ
    cmp(t2_recursions, t2_recursions);
    b(unlocked);

    bind(not_recursive);

    const Register t2_owner_addr = t2;

    // Compute owner address.
    lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));

    // Set owner to null.
    // Release to satisfy the JMM
    stlr(zr, t2_owner_addr);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry_list is empty.
    ldr(rscratch1, Address(t1_monitor, ObjectMonitor::entry_list_offset()));
    cmp(rscratch1, zr);
    br(Assembler::EQ, unlocked);  // If so we are done.

    // Check if there is a successor.
    ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
    cmp(rscratch1, zr);
    br(Assembler::NE, unlocked);  // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));

    cmp(zr, rthread); // Set Flag to NE => slow path
    b(slow_path);
  }

  bind(unlocked);
  cmp(zr, zr); // Set Flags to EQ => fast path

#ifdef ASSERT
  // Check that unlocked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Unlock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Unlock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}

// Search for str1 in str2 and return index or -1
// Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

// The Boyer Moore algorithm is based on the description here:-
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// This describes an algorithm with 2 shift rules. The 'Bad Character' rule
// and the 'Good Suffix' rule.
//
// These rules are essentially heuristics for how far we can shift the
// pattern along the search string.
//
// The implementation here uses the 'Bad Character' rule only because of the
// complexity of initialisation for the 'Good Suffix' rule.
//
// This is also known as the Boyer-Moore-Horspool algorithm:-
//
// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
//
// This particular implementation has a few java-specific optimizations.
//
// #define ASIZE 256
//
//    int bm(unsigned char *x, int m, unsigned char *y, int n) {
//       int i, j;
//       unsigned c;
//       unsigned char bc[ASIZE];
//
//       /* Preprocessing */
//       for (i = 0; i < ASIZE; ++i)
//          bc[i] = m;
//       for (i = 0; i < m - 1; ) {
//          c = x[i];
//          ++i;
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef PATTERN_STRING_IS_LATIN1
//          bc[c] = m - i;
//          #else
//          if (c < ASIZE) bc[c] = m - i;
//          #endif
//       }
//
//       /* Searching */
//       j = 0;
//       while (j <= n - m) {
//          c = y[j + m - 1];
//          i = m - 1;
//          if (x[m-1] == c)
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//          if (i < 0) return j;
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef SOURCE_STRING_IS_LATIN1
//          // LL case: (c < 256) always true. Remove branch
//          j += bc[y[j+m-1]];
//          #endif
//          #ifndef PATTERN_STRING_IS_UTF
//          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += 1;
//          #endif
//          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
//          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += m;
//          #endif
//       }
//    }
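//
// Example of the bad-character shift: with m == 8 and a source character
// that never occurs in the pattern, bc[] keeps its default value m, so the
// pattern slides past that character by a whole pattern length in one step.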

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >= 8, so we can read at least one full register for the
    // cases where no UTF->Latin1 conversion is needed (8 chars for LL, 4 for
    // UU) and half a register for the UL case. We re-read the last character
    // in the inner pre-loop code to keep a single outer pre-loop load.
    const int firstStep = isL ? 7 : 3;

    const int ASIZE = 256;
    const int STORED_BYTES = 32; // number of bytes stored per instruction
    sub(sp, sp, ASIZE);
    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
    mov(ch1, sp);
    BIND(BM_INIT_LOOP);
      stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
      subs(tmp5, tmp5, 1);
      br(GT, BM_INIT_LOOP);

      sub(cnt1tmp, cnt1, 1);
      mov(tmp5, str2);
      add(str2end, str2, result_tmp, LSL, str2_chr_shift);
      sub(ch2, cnt1, 1);
      mov(tmp3, str1);
    BIND(BCLOOP);
      (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
      if (!str1_isL) {
        subs(zr, ch1, ASIZE);
        br(HS, BCSKIP);
      }
      strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
      subs(ch2, ch2, 1);
      br(GT, BCLOOP);

      add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
      if (str1_isL == str2_isL) {
        // load last 8 bytes (8LL/4UU symbols)
        ldr(tmp6, Address(tmp6, -wordSize));
      } else {
        ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes (4 symbols)
        // convert Latin1 to UTF. We have to wait until the load completes,
        // but it's still faster than per-character loads + checks.
        lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
        ubfx(ch1, tmp6, 8, 8); // str1[N-2]
        ubfx(ch2, tmp6, 16, 8); // str1[N-3]
        andr(tmp6, tmp6, 0xFF); // str1[N-4]
        orr(ch2, ch1, ch2, LSL, 16);
        orr(tmp6, tmp6, tmp3, LSL, 48);
        orr(tmp6, tmp6, ch2, LSL, 16);
      }
    BIND(BMLOOPSTR2);
      (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
      sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
      if (str1_isL == str2_isL) {
        // re-init tmp3. It's for free because it's executed in parallel with
        // the load above. The alternative is to initialize it before the
        // loop, but that hurts performance on in-order systems with 2 or
        // more ld/st pipelines.
        lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
      }
      if (!isL) { // UU/UL case
        lsl(ch2, cnt1tmp, 1); // offset in bytes
      }
      cmp(tmp3, skipch);
      br(NE, BMSKIP);
      ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
      mov(ch1, tmp6);
      if (isL) {
        b(BMLOOPSTR1_AFTER_LOAD);
      } else {
        sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
        b(BMLOOPSTR1_CMP);
      }
    BIND(BMLOOPSTR1);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    BIND(BMLOOPSTR1_AFTER_LOAD);
      subs(cnt1tmp, cnt1tmp, 1);
      br(LT, BMLOOPSTR1_LASTCMP);
    BIND(BMLOOPSTR1_CMP);
      cmp(ch1, ch2);
      br(EQ, BMLOOPSTR1);
    BIND(BMSKIP);
      if (!isL) {
        // if we've met a UTF symbol while searching a Latin1 pattern, then
        // we can skip cnt1 symbols
        if (str1_isL != str2_isL) {
          mov(result_tmp, cnt1);
        } else {
          mov(result_tmp, 1);
        }
        subs(zr, skipch, ASIZE);
        br(HS, BMADV);
      }
      ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
      sub(cnt1tmp, cnt1, 1);
      add(str2, str2, result_tmp, LSL, str2_chr_shift);
      cmp(str2, str2end);
      br(LE, BMLOOPSTR2);
      add(sp, sp, ASIZE);
      b(NOMATCH);
    BIND(BMLOOPSTR1_LASTCMP);
      cmp(ch1, ch2);
      br(NE, BMSKIP);
    BIND(BMMATCH);
      sub(result, str2, tmp5);
      if (!str2_isL) lsr(result, result, 1);
      add(sp, sp, ASIZE);
      b(DONE);

    BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
    br(LT, LINEAR_MEDIUM);
    mov(result, zr);
    RuntimeAddress stub = nullptr;
    if (isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
      assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
    } else if (str1_isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
    } else {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
      assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
    }
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);
  }

  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    Register str2tmp = tmp2;
    Register first = tmp3;

    if (icnt1 == -1)
    {
        Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

        cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
        br(LT, DOSHORT);
      BIND(LINEAR_MEDIUM);
        (this->*str1_load_1chr)(first, Address(str1));
        lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
        sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(FIRST_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmp(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        adds(cnt1tmp, cnt1_neg, str1_chr_size);
        add(cnt2tmp, cnt2_neg, str2_chr_size);
        br(GE, MATCH);

      BIND(STR1_NEXT);
        (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        adds(cnt1tmp, cnt1tmp, str1_chr_size);
        add(cnt2tmp, cnt2tmp, str2_chr_size);
        br(LT, STR1_NEXT);
        b(MATCH);

      BIND(DOSHORT);
      if (str1_isL == str2_isL) {
        cmp(cnt1, (u1)2);
        br(LT, DO1);
        br(GT, DO3);
      }
    }

    if (icnt1 == 4) {
      Label CH1_LOOP;

        (this->*load_4chr)(ch1, str1);
        sub(result_tmp, cnt2, 4);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(CH1_LOOP);
        (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
      Label CH1_LOOP;

      BIND(DO2);
        (this->*load_2chr)(ch1, str1);
        if (icnt1 == 2) {
          sub(result_tmp, cnt2, 2);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(CH1_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

      BIND(DO3);
        (this->*load_2chr)(first, str1);
        (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
        if (icnt1 == 3) {
          sub(result_tmp, cnt2, 3);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(FIRST_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmpw(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        b(MATCH);
    }

    if (icnt1 == -1 || icnt1 == 1) {
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;

      BIND(DO1);
        (this->*str1_load_1chr)(ch1, str1);
        cmp(cnt2, (u1)8);
        br(LT, DO1_SHORT);

        sub(result_tmp, cnt2, 8/str2_chr_size);
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
        mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

        if (str2_isL) {
          orr(ch1, ch1, ch1, LSL, 8);
        }
        orr(ch1, ch1, ch1, LSL, 16);
        orr(ch1, ch1, ch1, LSL, 32);
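      // SWAR matching: the eor below turns every matching element of the
      // loaded word into a zero byte/halfword. The classic zero-detect
      // (v - 0x01..01) & ~v & 0x80..80 is then computed as
      // bics(tmp1, v - tmp3, v | 0x7f..7f), setting flags iff a match exists.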
      BIND(CH1_LOOP);
        ldr(ch2, Address(str2, cnt2_neg));
        eor(ch2, ch1, ch2);
        sub(tmp1, ch2, tmp3);
        orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
        bics(tmp1, tmp1, tmp2);
        br(NE, HAS_ZERO);
        adds(cnt2_neg, cnt2_neg, 8);
        br(LT, CH1_LOOP);

        cmp(cnt2_neg, (u1)8);
        mov(cnt2_neg, 0);
        br(LT, CH1_LOOP);
        b(NOMATCH);

      BIND(HAS_ZERO);
        rev(tmp1, tmp1);
        clz(tmp1, tmp1);
        add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
        b(MATCH);

      BIND(DO1_SHORT);
        mov(result_tmp, cnt2);
        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
      BIND(DO1_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmpw(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);

void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);

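  // SWAR zero-halfword detection, the same technique as in string_indexof above.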
  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7fff7fff7fff7fff);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1, Address::uxtw(1)));
    sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
    ldrh(ch1, Address(str1, cnt1_neg));
    cmpw(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 2);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}

void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
                                                Register ch, Register result,
                                                FloatRegister ztmp1,
                                                FloatRegister ztmp2,
                                                PRegister tmp_pg,
                                                PRegister tmp_pdn, bool isL)
{
  // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
  assert(tmp_pg->is_governing(),
         "this register has to be a governing predicate register");

  Label LOOP, MATCH, DONE, NOMATCH;
  Register vec_len = rscratch1;
  Register idx = rscratch2;

  SIMD_RegVariant T = isL ? B : H;

  cbz(cnt1, NOMATCH);

  // Assign the particular char throughout the vector.
  sve_dup(ztmp2, T, ch);
  if (isL) {
    sve_cntb(vec_len);
  } else {
    sve_cnth(vec_len);
  }
  mov(idx, 0);

  // Generate a predicate to control the reading of input string.
  sve_whilelt(tmp_pg, T, idx, cnt1);

  BIND(LOOP);
    // Read a vector of 8- or 16-bit data depending on the string type. Note
    // that inactive elements indicated by the predicate register won't cause
    // a data read from memory to the destination vector.
    if (isL) {
      sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
    } else {
      sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
    }
    add(idx, idx, vec_len);

    // Perform the comparison. An element of the destination predicate is set
    // to active if the particular char is matched.
    sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);

    // Branch if the particular char is found.
    br(NE, MATCH);

    sve_whilelt(tmp_pg, T, idx, cnt1);

    // Loop back if the particular char is not found.
    br(MI, LOOP);

  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);

  BIND(MATCH);
    // Undo the index increment.
    sub(idx, idx, vec_len);

    // Crop the vector to find its location.
    sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
    add(result, idx, -1);
    sve_incp(result, T, tmp_pdn);
  BIND(DONE);
}

void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)8);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 8);
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 8);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);

  mov(tmp3, 0x0101010101010101);

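  // SWAR zero-byte detection over 8 Latin-1 chars at a time.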
  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1));
    sub(cnt1_neg, zr, cnt1);
  BIND(DO1_LOOP);
    ldrb(ch1, Address(str1, cnt1_neg));
    cmp(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 1);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg);
  BIND(DONE);
}

// Compare strings.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
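    // For mixed encodings (LU/UL), zip1 with a zeroed register interleaves
    // the loaded Latin-1 bytes with 0x00, which matches the byte layout of
    // little-endian UTF-16 and so widens 4 chars per step.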
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      ldr(tmp1, Address(str1));
      ldrs(vtmp, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFF);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFF);
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
    RuntimeAddress stub = nullptr;
    switch(ae) {
      case StrIntrinsicNode::LL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
        break;
      case StrIntrinsicNode::UU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
        break;
      case StrIntrinsicNode::LU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
        break;
      case StrIntrinsicNode::UL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
        break;
      default:
        ShouldNotReachHere();
    }
    assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // Arrange the code to do most of the branching while loading, and to load
  // the next characters while comparing the previous ones.
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}

void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
                                     FloatRegister src2, Condition cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
  FloatRegister zn = src1, zm = src2;
  bool needs_negation = false;
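  // Two-register NEON compares only exist for GT/GE/HI/HS/EQ, so map
  // LT/LE/LO/LS onto them by swapping the operands, and synthesize NE as
  // EQ followed by a bitwise NOT of the result.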
  switch (cond) {
    case LT: cond = GT; zn = src2; zm = src1; break;
    case LE: cond = GE; zn = src2; zm = src1; break;
    case LO: cond = HI; zn = src2; zm = src1; break;
    case LS: cond = HS; zn = src2; zm = src1; break;
    case NE: cond = EQ; needs_negation = true; break;
    default:
      break;
  }

  if (is_floating_point_type(bt)) {
    fcm(cond, dst, size, zn, zm);
  } else {
    cm(cond, dst, size, zn, zm);
  }

  if (needs_negation) {
    notr(dst, isQ ? T16B : T8B, dst);
  }
}

void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
                                          Condition cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
  if (bt == T_FLOAT || bt == T_DOUBLE) {
    if (cond == Assembler::NE) {
      fcm(Assembler::EQ, dst, size, src);
      notr(dst, isQ ? T16B : T8B, dst);
    } else {
      fcm(cond, dst, size, src);
    }
  } else {
    if (cond == Assembler::NE) {
      cm(Assembler::EQ, dst, size, src);
      notr(dst, isQ ? T16B : T8B, dst);
    } else {
      cm(cond, dst, size, src);
    }
  }
}

// Compress the least significant bit of each byte to the rightmost and clear
// the higher garbage bits.
void C2_MacroAssembler::bytemask_compress(Register dst) {
  // Example input, dst = 0x01 00 00 00 01 01 00 01
  // The "??" bytes are garbage.
  orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
  orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x?? ?? ?? 08 ?? ?? ?? 0D
  orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x?? ?? ?? ?? ?? ?? ?? 8D
  andr(dst, dst, 0xff);                   // dst = 0x8D
}
1401 
1402 // Pack the lowest-numbered bit of each mask element in src into a long value
1403 // in dst, at most the first 64 lane elements.
1404 // Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM.
1405 void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
1406                                          FloatRegister vtmp1, FloatRegister vtmp2) {
1407   assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1408   assert_different_registers(dst, rscratch1);
1409   assert_different_registers(vtmp1, vtmp2);
1410 
1411   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1412   // Example:   src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
1413   // Expected:  dst = 0x658D
1414 
1415   // Convert the mask into vector with sequential bytes.
1416   // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
1417   sve_cpy(vtmp1, size, src, 1, false);
1418   if (bt != T_BYTE) {
1419     sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
1420   }
1421 
1422   if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
1423     // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
1424     // is to compress each significant bit of the byte in a cross-lane way. Due
1425     // to the lack of a cross-lane bit-compress instruction, we use BEXT
1426     // (bit-compress in each lane) with the biggest lane size (T = D) then
1427     // concatenate the results.
1428 
1429     // The second source input of BEXT, initialized with 0x01 in each byte.
1430     // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1431     sve_dup(vtmp2, B, 1);
1432 
1433     // BEXT vtmp1.D, vtmp1.D, vtmp2.D
1434     // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1435     // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1436     //         ---------------------------------------
1437     // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1438     sve_bext(vtmp1, D, vtmp1, vtmp2);
1439 
1440     // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the
1441     // result to dst.
1442     // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1443     // dst   = 0x658D
1444     if (lane_cnt <= 8) {
1445       // No need to concatenate.
1446       umov(dst, vtmp1, B, 0);
1447     } else if (lane_cnt <= 16) {
1448       ins(vtmp1, B, vtmp1, 1, 8);
1449       umov(dst, vtmp1, H, 0);
1450     } else {
1451       // As the lane count is 64 at most, the final expected value must be in
1452       // the lowest 64 bits after narrowing vtmp1 from D to B.
1453       sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
1454       umov(dst, vtmp1, D, 0);
1455     }
1456   } else if (UseSVE > 0) {
1457     // Compress the lowest 8 bytes.
1458     fmovd(dst, vtmp1);
1459     bytemask_compress(dst);
1460     if (lane_cnt <= 8) return;
1461 
1462     // Repeat on higher bytes and join the results.
1463     // Compress 8 bytes in each iteration.
1464     for (int idx = 1; idx < (lane_cnt / 8); idx++) {
1465       sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
1466       bytemask_compress(rscratch1);
1467       orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
1468     }
1469   } else {
1470     assert(false, "unsupported");
1471     ShouldNotReachHere();
1472   }
1473 }
1474 
1475 // Unpack the mask, a long value in src, into predicate register dst based on the
1476 // corresponding data type. Note that dst can support at most 64 lanes.
1477 // Below example gives the expected dst predicate register in different types, with
1478 // a valid src(0x658D) on a 1024-bit vector size machine.
1479 // BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
1480 // SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
1481 // INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
1482 // LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1483 //
1484 // The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D which
1485 // has 24 significant bits would be an invalid input if dst predicate register refers to
1486 // a LONG type 1024-bit vector, which has at most 16 lanes.
1487 void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
1488                                            FloatRegister vtmp1, FloatRegister vtmp2) {
1489   assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
1490          lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
1491   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1492   // Example:   src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
  // Expected:  dst = 0b01100101 10001101
1494 
1495   // Put long value from general purpose register into the first lane of vector.
1496   // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1497   sve_dup(vtmp1, B, 0);
1498   mov(vtmp1, D, 0, src);
1499 
  // As sve_cmp generates the mask with a minimum granularity of one byte, we
  // need to transform the bit mask in the first lane into a byte mask, which
  // can be done with SVE2's BDEP instruction.
1503 
  // The first source input of the BDEP instruction. Deposit each mask byte
  // into a separate 8-byte lane.
1505   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1506   if (lane_cnt <= 8) {
    // Nothing to do, as only one byte exists.
1508   } else if (lane_cnt <= 16) {
1509     ins(vtmp1, B, vtmp1, 8, 1);
1510     mov(vtmp1, B, 1, zr);
1511   } else {
1512     sve_vector_extend(vtmp1, D, vtmp1, B);
1513   }
1514 
1515   // The second source input of BDEP instruction, initialized with 0x01 for each byte.
1516   // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1517   sve_dup(vtmp2, B, 1);
1518 
1519   // BDEP vtmp1.D, vtmp1.D, vtmp2.D
1520   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1521   // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1522   //         ---------------------------------------
1523   // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1524   sve_bdep(vtmp1, D, vtmp1, vtmp2);
1525 
1526   if (bt != T_BYTE) {
1527     sve_vector_extend(vtmp1, size, vtmp1, B);
1528   }
1529   // Generate mask according to the given vector, in which the elements have been
1530   // extended to expected type.
  // dst = 0b01100101 10001101
1532   sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
1533 }
1534 
1535 // Clobbers: rflags
1536 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1537                                     FloatRegister zn, FloatRegister zm, Condition cond) {
1538   assert(pg->is_governing(), "This register has to be a governing predicate register");
1539   FloatRegister z1 = zn, z2 = zm;
1540   switch (cond) {
1541     case LE: z1 = zm; z2 = zn; cond = GE; break;
1542     case LT: z1 = zm; z2 = zn; cond = GT; break;
1543     case LO: z1 = zm; z2 = zn; cond = HI; break;
1544     case LS: z1 = zm; z2 = zn; cond = HS; break;
1545     default:
1546       break;
1547   }
1548 
1549   SIMD_RegVariant size = elemType_to_regVariant(bt);
1550   if (is_floating_point_type(bt)) {
1551     sve_fcm(cond, pd, size, pg, z1, z2);
1552   } else {
1553     assert(is_integral_type(bt), "unsupported element type");
1554     sve_cmp(cond, pd, size, pg, z1, z2);
1555   }
1556 }
1557 
1558 // Get index of the last mask lane that is set
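// An illustrative example (assumed), with bt = T_INT on a 128-bit vector and
// lanes shown as "high <-- low", where lanes 0 and 2 are set:
//   src  = 0 1 0 1                   (the last set lane is lane 2)
//   After sve_rev:  ptmp = 1 0 1 0   (lane order reversed)
//   After sve_brkb: ptmp = 0 0 0 1   (true only for lanes before the first set one)
//   sve_cntp counts 1 active lane, so dst = (4 - 1) - 1 = 2.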
1559 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
1560   SIMD_RegVariant size = elemType_to_regVariant(bt);
1561   sve_rev(ptmp, size, src);
1562   sve_brkb(ptmp, ptrue, ptmp, false);
1563   sve_cntp(dst, size, ptrue, ptmp);
1564   movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
1565   subw(dst, rscratch1, dst);
1566 }
1567 
1568 // Extend integer vector src to dst with the same lane count
1569 // but larger element size, e.g. 4B -> 4I
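// An illustrative example, assuming src_bt = T_BYTE, dst_bt = T_INT and
// is_unsigned = false (high <-- low, 4 lanes):
//   src = 0x84 03 02 81                                 (4 bytes)
//   After the first sxtl:  dst = 0xFF84 0003 0002 FF81  (4 shorts)
//   After the second sxtl: dst = 0xFFFFFF84 00000003 00000002 FFFFFF81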
1570 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1571                                            FloatRegister src, BasicType src_bt, bool is_unsigned) {
1572   if (src_bt == T_BYTE) {
1573     // 4B to 4S/4I, 8B to 8S
1574     assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1575     assert(dst_bt == T_SHORT || dst_bt == T_INT, "unsupported");
1576     _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1577     if (dst_bt == T_INT) {
1578       _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1579     }
1580   } else if (src_bt == T_SHORT) {
1581     // 2S to 2I/2L, 4S to 4I
1582     assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1583     assert(dst_bt == T_INT || dst_bt == T_LONG, "unsupported");
1584     _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1585     if (dst_bt == T_LONG) {
1586       _xshll(is_unsigned, dst, T2D, dst, T2S, 0);
1587     }
1588   } else if (src_bt == T_INT) {
1589     // 2I to 2L
1590     assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1591     _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1592   } else {
1593     ShouldNotReachHere();
1594   }
1595 }
1596 
1597 // Narrow integer vector src down to dst with the same lane count
1598 // but smaller element size, e.g. 4I -> 4B
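// An illustrative example, assuming src_bt = T_INT, dst_bt = T_BYTE and a
// 16-byte src (high <-- low, 4 lanes):
//   src = 0xFFFFFF84 00000003 00000002 FFFFFF81
//   After the first xtn:  dst = 0xFF84 0003 0002 FF81   (4 shorts)
//   After the second xtn: dst = 0x84 03 02 81           (4 bytes)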
1599 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1600                                            FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1601   if (src_bt == T_SHORT) {
1602     // 4S/8S to 4B/8B
1603     assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1604     assert(dst_bt == T_BYTE, "unsupported");
1605     xtn(dst, T8B, src, T8H);
1606   } else if (src_bt == T_INT) {
1607     // 2I to 2S, 4I to 4B/4S
1608     assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1609     assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1610     xtn(dst, T4H, src, T4S);
1611     if (dst_bt == T_BYTE) {
1612       xtn(dst, T8B, dst, T8H);
1613     }
1614   } else if (src_bt == T_LONG) {
1615     // 2L to 2S/2I
1616     assert(src_vlen_in_bytes == 16, "unsupported");
1617     assert(dst_bt == T_INT || dst_bt == T_SHORT, "unsupported");
1618     xtn(dst, T2S, src, T2D);
1619     if (dst_bt == T_SHORT) {
1620       xtn(dst, T4H, dst, T4S);
1621     }
1622   } else {
1623     ShouldNotReachHere();
1624   }
1625 }
1626 
1627 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1628                                           FloatRegister src, SIMD_RegVariant src_size,
1629                                           bool is_unsigned) {
1630   assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
1631 
1632   if (src_size == B) {
1633     switch (dst_size) {
1634     case H:
1635       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1636       break;
1637     case S:
1638       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1639       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1640       break;
1641     case D:
1642       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1643       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1644       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1645       break;
1646     default:
1647       ShouldNotReachHere();
1648     }
1649   } else if (src_size == H) {
1650     if (dst_size == S) {
1651       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1652     } else { // D
1653       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1654       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1655     }
1656   } else if (src_size == S) {
1657     _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
1658   }
1659 }
1660 
1661 // Vector narrow from src to dst with specified element sizes.
1662 // High part of dst vector will be filled with zero.
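// An illustrative example, assuming a 128-bit vector with src_size = D and
// dst_size = S (high <-- low):
//   src = 0x????????bbbbbbbb 0x????????aaaaaaaa  (2 D lanes, "?" bits dropped)
//   tmp = 0x0000000000000000 0x0000000000000000
//   sve_uzp1(dst, S, src, tmp) concatenates the even-numbered S lanes of src
//   with those of tmp:
//   dst = 0x00000000 00000000 bbbbbbbb aaaaaaaa  (4 S lanes)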
1663 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1664                                           FloatRegister src, SIMD_RegVariant src_size,
1665                                           FloatRegister tmp) {
1666   assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1667   assert_different_registers(src, tmp);
1668   sve_dup(tmp, src_size, 0);
1669   if (src_size == D) {
1670     switch (dst_size) {
1671     case S:
1672       sve_uzp1(dst, S, src, tmp);
1673       break;
1674     case H:
1675       assert_different_registers(dst, tmp);
1676       sve_uzp1(dst, S, src, tmp);
1677       sve_uzp1(dst, H, dst, tmp);
1678       break;
1679     case B:
1680       assert_different_registers(dst, tmp);
1681       sve_uzp1(dst, S, src, tmp);
1682       sve_uzp1(dst, H, dst, tmp);
1683       sve_uzp1(dst, B, dst, tmp);
1684       break;
1685     default:
1686       ShouldNotReachHere();
1687     }
1688   } else if (src_size == S) {
1689     if (dst_size == H) {
1690       sve_uzp1(dst, H, src, tmp);
1691     } else { // B
1692       assert_different_registers(dst, tmp);
1693       sve_uzp1(dst, H, src, tmp);
1694       sve_uzp1(dst, B, dst, tmp);
1695     }
1696   } else if (src_size == H) {
1697     sve_uzp1(dst, B, src, tmp);
1698   }
1699 }
1700 
1701 // Extend src predicate to dst predicate with the same lane count but larger
1702 // element size, e.g. 64Byte -> 512Long
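// An illustrative example, assuming a B to H extension on a 128-bit machine
// (high <-- low), where only the low 8 byte lanes of src are significant:
//   src bits (one per byte lane) = ? ? ? ? ? ? ? ? 1 0 0 1 0 1 1 1
//   After sve_punpklo, each of the low 8 predicate elements occupies two bits,
//   of which only the lower one can be set:
//   dst bits = 01 00 00 01 00 01 01 01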
1703 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1704                                              uint dst_element_length_in_bytes,
1705                                              uint src_element_length_in_bytes) {
1706   if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1707     sve_punpklo(dst, src);
1708   } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1709     sve_punpklo(dst, src);
1710     sve_punpklo(dst, dst);
1711   } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1712     sve_punpklo(dst, src);
1713     sve_punpklo(dst, dst);
1714     sve_punpklo(dst, dst);
1715   } else {
1716     assert(false, "unsupported");
1717     ShouldNotReachHere();
1718   }
1719 }
1720 
1721 // Narrow src predicate to dst predicate with the same lane count but
1722 // smaller element size, e.g. 512Long -> 64Byte
1723 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1724                                              uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
1725   // The insignificant bits in src predicate are expected to be zero.
1726   // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
  // passed as the second argument. An example narrowing operation with a given mask would be:
  // 128Long -> 64Int on a 128-bit machine, i.e. 2L -> 2I
  // Mask (for 2 Longs) : TF
  // Predicate register for the above mask (16 bits) : 00000001 00000000
  // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
  // Which translates to a mask for 2 integers as : TF (lower half is considered while upper half is 0)
1733   assert_different_registers(src, ptmp);
1734   assert_different_registers(dst, ptmp);
1735   sve_pfalse(ptmp);
1736   if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1737     sve_uzp1(dst, B, src, ptmp);
1738   } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1739     sve_uzp1(dst, H, src, ptmp);
1740     sve_uzp1(dst, B, dst, ptmp);
1741   } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1742     sve_uzp1(dst, S, src, ptmp);
1743     sve_uzp1(dst, H, dst, ptmp);
1744     sve_uzp1(dst, B, dst, ptmp);
1745   } else {
1746     assert(false, "unsupported");
1747     ShouldNotReachHere();
1748   }
1749 }
1750 
1751 // Vector reduction add for integral type with ASIMD instructions.
1752 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1753                                                  Register isrc, FloatRegister vsrc,
1754                                                  unsigned vector_length_in_bytes,
1755                                                  FloatRegister vtmp) {
1756   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1757   assert_different_registers(dst, isrc);
1758   bool isQ = vector_length_in_bytes == 16;
1759 
1760   BLOCK_COMMENT("neon_reduce_add_integral {");
1761     switch(bt) {
1762       case T_BYTE:
1763         addv(vtmp, isQ ? T16B : T8B, vsrc);
1764         smov(dst, vtmp, B, 0);
1765         addw(dst, dst, isrc, ext::sxtb);
1766         break;
1767       case T_SHORT:
1768         addv(vtmp, isQ ? T8H : T4H, vsrc);
1769         smov(dst, vtmp, H, 0);
1770         addw(dst, dst, isrc, ext::sxth);
1771         break;
1772       case T_INT:
1773         isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1774         umov(dst, vtmp, S, 0);
1775         addw(dst, dst, isrc);
1776         break;
1777       case T_LONG:
1778         assert(isQ, "unsupported");
1779         addpd(vtmp, vsrc);
1780         umov(dst, vtmp, D, 0);
1781         add(dst, dst, isrc);
1782         break;
1783       default:
1784         assert(false, "unsupported");
1785         ShouldNotReachHere();
1786     }
1787   BLOCK_COMMENT("} neon_reduce_add_integral");
1788 }
1789 
1790 // Vector reduction multiply for integral type with ASIMD instructions.
1791 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
1792 // Clobbers: rscratch1
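// An illustrative example (assumed), with bt = T_INT, a 16-byte vector holding
// [a3, a2, a1, a0] and isrc = s: the lanes are folded pairwise into
// [a1 * a3, a0 * a2], and the final result is (a1 * a3) * ((a0 * a2) * s),
// i.e. the product of all lanes and the scalar input (modulo 2^32).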
1793 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
1794                                                  Register isrc, FloatRegister vsrc,
1795                                                  unsigned vector_length_in_bytes,
1796                                                  FloatRegister vtmp1, FloatRegister vtmp2) {
1797   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1798   bool isQ = vector_length_in_bytes == 16;
1799 
1800   BLOCK_COMMENT("neon_reduce_mul_integral {");
1801     switch(bt) {
1802       case T_BYTE:
1803         if (isQ) {
1804           // Multiply the lower half and higher half of vector iteratively.
1805           // vtmp1 = vsrc[8:15]
1806           ins(vtmp1, D, vsrc, 0, 1);
1807           // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
1808           mulv(vtmp1, T8B, vtmp1, vsrc);
1809           // vtmp2 = vtmp1[4:7]
1810           ins(vtmp2, S, vtmp1, 0, 1);
1811           // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
1812           mulv(vtmp1, T8B, vtmp2, vtmp1);
1813         } else {
1814           ins(vtmp1, S, vsrc, 0, 1);
1815           mulv(vtmp1, T8B, vtmp1, vsrc);
1816         }
1817         // vtmp2 = vtmp1[2:3]
1818         ins(vtmp2, H, vtmp1, 0, 1);
1819         // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
1820         mulv(vtmp2, T8B, vtmp2, vtmp1);
1821         // dst = vtmp2[0] * isrc * vtmp2[1]
1822         umov(rscratch1, vtmp2, B, 0);
1823         mulw(dst, rscratch1, isrc);
1824         sxtb(dst, dst);
1825         umov(rscratch1, vtmp2, B, 1);
1826         mulw(dst, rscratch1, dst);
1827         sxtb(dst, dst);
1828         break;
1829       case T_SHORT:
1830         if (isQ) {
1831           ins(vtmp2, D, vsrc, 0, 1);
1832           mulv(vtmp2, T4H, vtmp2, vsrc);
1833           ins(vtmp1, S, vtmp2, 0, 1);
1834           mulv(vtmp1, T4H, vtmp1, vtmp2);
1835         } else {
1836           ins(vtmp1, S, vsrc, 0, 1);
1837           mulv(vtmp1, T4H, vtmp1, vsrc);
1838         }
1839         umov(rscratch1, vtmp1, H, 0);
1840         mulw(dst, rscratch1, isrc);
1841         sxth(dst, dst);
1842         umov(rscratch1, vtmp1, H, 1);
1843         mulw(dst, rscratch1, dst);
1844         sxth(dst, dst);
1845         break;
1846       case T_INT:
1847         if (isQ) {
1848           ins(vtmp1, D, vsrc, 0, 1);
1849           mulv(vtmp1, T2S, vtmp1, vsrc);
1850         } else {
1851           vtmp1 = vsrc;
1852         }
1853         umov(rscratch1, vtmp1, S, 0);
1854         mul(dst, rscratch1, isrc);
1855         umov(rscratch1, vtmp1, S, 1);
1856         mul(dst, rscratch1, dst);
1857         break;
1858       case T_LONG:
1859         umov(rscratch1, vsrc, D, 0);
1860         mul(dst, isrc, rscratch1);
1861         umov(rscratch1, vsrc, D, 1);
1862         mul(dst, dst, rscratch1);
1863         break;
1864       default:
1865         assert(false, "unsupported");
1866         ShouldNotReachHere();
1867     }
1868   BLOCK_COMMENT("} neon_reduce_mul_integral");
1869 }
1870 
1871 // Vector reduction multiply for floating-point type with ASIMD instructions.
1872 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
1873                                            FloatRegister fsrc, FloatRegister vsrc,
1874                                            unsigned vector_length_in_bytes,
1875                                            FloatRegister vtmp) {
1876   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1877   bool isQ = vector_length_in_bytes == 16;
1878 
1879   BLOCK_COMMENT("neon_reduce_mul_fp {");
1880     switch(bt) {
1881       case T_FLOAT:
1882         fmuls(dst, fsrc, vsrc);
1883         ins(vtmp, S, vsrc, 0, 1);
1884         fmuls(dst, dst, vtmp);
1885         if (isQ) {
1886           ins(vtmp, S, vsrc, 0, 2);
1887           fmuls(dst, dst, vtmp);
1888           ins(vtmp, S, vsrc, 0, 3);
1889           fmuls(dst, dst, vtmp);
1890          }
1891         break;
1892       case T_DOUBLE:
1893         assert(isQ, "unsupported");
1894         fmuld(dst, fsrc, vsrc);
1895         ins(vtmp, D, vsrc, 0, 1);
1896         fmuld(dst, dst, vtmp);
1897         break;
1898       default:
1899         assert(false, "unsupported");
1900         ShouldNotReachHere();
1901     }
1902   BLOCK_COMMENT("} neon_reduce_mul_fp");
1903 }
1904 
1905 // Helper to select logical instruction
1906 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
1907                                                    Register Rn, Register Rm,
1908                                                    enum shift_kind kind, unsigned shift) {
1909   switch(opc) {
1910     case Op_AndReductionV:
1911       is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
1912       break;
1913     case Op_OrReductionV:
1914       is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
1915       break;
1916     case Op_XorReductionV:
1917       is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
1918       break;
1919     default:
1920       assert(false, "unsupported");
1921       ShouldNotReachHere();
1922   }
1923 }
1924 
1925 // Vector reduction logical operations And, Or, Xor
1926 // Clobbers: rscratch1
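// An illustrative example (assumed), with opc = Op_XorReductionV, bt = T_BYTE
// and an 8-byte vector holding bytes b7 .. b0: the two umov steps split the
// vector into 32-bit halves, whose eor yields (b7^b3)(b6^b2)(b5^b1)(b4^b0);
// the LSR-16 and LSR-8 folds then reduce that to a single byte, which is
// combined with isrc and sign-extended.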
1927 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
1928                                             Register isrc, FloatRegister vsrc,
1929                                             unsigned vector_length_in_bytes) {
1930   assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
1931          "unsupported");
1932   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1933   assert_different_registers(dst, isrc);
1934   bool isQ = vector_length_in_bytes == 16;
1935 
1936   BLOCK_COMMENT("neon_reduce_logical {");
1937     umov(rscratch1, vsrc, isQ ? D : S, 0);
1938     umov(dst, vsrc, isQ ? D : S, 1);
1939     neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
1940     switch(bt) {
1941       case T_BYTE:
1942         if (isQ) {
1943           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1944         }
1945         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1946         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
1947         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1948         sxtb(dst, dst);
1949         break;
1950       case T_SHORT:
1951         if (isQ) {
1952           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1953         }
1954         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1955         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1956         sxth(dst, dst);
1957         break;
1958       case T_INT:
1959         if (isQ) {
1960           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1961         }
1962         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1963         break;
1964       case T_LONG:
1965         assert(isQ, "unsupported");
1966         neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
1967         break;
1968       default:
1969         assert(false, "unsupported");
1970         ShouldNotReachHere();
1971     }
1972   BLOCK_COMMENT("} neon_reduce_logical");
1973 }
1974 
1975 // Vector reduction min/max for integral type with ASIMD instructions.
1976 // Note: vtmp is not used and expected to be fnoreg for T_LONG case.
1977 // Clobbers: rscratch1, rflags
1978 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
1979                                                     Register isrc, FloatRegister vsrc,
1980                                                     unsigned vector_length_in_bytes,
1981                                                     FloatRegister vtmp) {
1982   assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
1983   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1984   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
1985   assert_different_registers(dst, isrc);
1986   bool isQ = vector_length_in_bytes == 16;
1987   bool is_min = opc == Op_MinReductionV;
1988 
1989   BLOCK_COMMENT("neon_reduce_minmax_integral {");
1990     if (bt == T_LONG) {
1991       assert(vtmp == fnoreg, "should be");
1992       assert(isQ, "should be");
1993       umov(rscratch1, vsrc, D, 0);
1994       cmp(isrc, rscratch1);
1995       csel(dst, isrc, rscratch1, is_min ? LT : GT);
1996       umov(rscratch1, vsrc, D, 1);
1997       cmp(dst, rscratch1);
1998       csel(dst, dst, rscratch1, is_min ? LT : GT);
1999     } else {
2000       SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
2001       if (size == T2S) {
2002         is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
2003       } else {
2004         is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
2005       }
2006       if (bt == T_INT) {
2007         umov(dst, vtmp, S, 0);
2008       } else {
2009         smov(dst, vtmp, elemType_to_regVariant(bt), 0);
2010       }
2011       cmpw(dst, isrc);
2012       cselw(dst, dst, isrc, is_min ? LT : GT);
2013     }
2014   BLOCK_COMMENT("} neon_reduce_minmax_integral");
2015 }
2016 
2017 // Vector reduction for integral type with SVE instruction.
2018 // Supported operations are Add, And, Or, Xor, Max, Min.
2019 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
2020 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
2021                                             FloatRegister src2, PRegister pg, FloatRegister tmp) {
2022   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2023   assert(pg->is_governing(), "This register has to be a governing predicate register");
2024   assert_different_registers(src1, dst);
2025   // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
2026   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2027   switch (opc) {
2028     case Op_AddReductionVI: {
2029       sve_uaddv(tmp, size, pg, src2);
2030       if (bt == T_BYTE) {
2031         smov(dst, tmp, size, 0);
2032         addw(dst, src1, dst, ext::sxtb);
2033       } else if (bt == T_SHORT) {
2034         smov(dst, tmp, size, 0);
2035         addw(dst, src1, dst, ext::sxth);
2036       } else {
2037         umov(dst, tmp, size, 0);
2038         addw(dst, dst, src1);
2039       }
2040       break;
2041     }
2042     case Op_AddReductionVL: {
2043       sve_uaddv(tmp, size, pg, src2);
2044       umov(dst, tmp, size, 0);
2045       add(dst, dst, src1);
2046       break;
2047     }
2048     case Op_AndReductionV: {
2049       sve_andv(tmp, size, pg, src2);
2050       if (bt == T_INT || bt == T_LONG) {
2051         umov(dst, tmp, size, 0);
2052       } else {
2053         smov(dst, tmp, size, 0);
2054       }
2055       if (bt == T_LONG) {
2056         andr(dst, dst, src1);
2057       } else {
2058         andw(dst, dst, src1);
2059       }
2060       break;
2061     }
2062     case Op_OrReductionV: {
2063       sve_orv(tmp, size, pg, src2);
2064       if (bt == T_INT || bt == T_LONG) {
2065         umov(dst, tmp, size, 0);
2066       } else {
2067         smov(dst, tmp, size, 0);
2068       }
2069       if (bt == T_LONG) {
2070         orr(dst, dst, src1);
2071       } else {
2072         orrw(dst, dst, src1);
2073       }
2074       break;
2075     }
2076     case Op_XorReductionV: {
2077       sve_eorv(tmp, size, pg, src2);
2078       if (bt == T_INT || bt == T_LONG) {
2079         umov(dst, tmp, size, 0);
2080       } else {
2081         smov(dst, tmp, size, 0);
2082       }
2083       if (bt == T_LONG) {
2084         eor(dst, dst, src1);
2085       } else {
2086         eorw(dst, dst, src1);
2087       }
2088       break;
2089     }
2090     case Op_MaxReductionV: {
2091       sve_smaxv(tmp, size, pg, src2);
2092       if (bt == T_INT || bt == T_LONG) {
2093         umov(dst, tmp, size, 0);
2094       } else {
2095         smov(dst, tmp, size, 0);
2096       }
2097       if (bt == T_LONG) {
2098         cmp(dst, src1);
2099         csel(dst, dst, src1, Assembler::GT);
2100       } else {
2101         cmpw(dst, src1);
2102         cselw(dst, dst, src1, Assembler::GT);
2103       }
2104       break;
2105     }
2106     case Op_MinReductionV: {
2107       sve_sminv(tmp, size, pg, src2);
2108       if (bt == T_INT || bt == T_LONG) {
2109         umov(dst, tmp, size, 0);
2110       } else {
2111         smov(dst, tmp, size, 0);
2112       }
2113       if (bt == T_LONG) {
2114         cmp(dst, src1);
2115         csel(dst, dst, src1, Assembler::LT);
2116       } else {
2117         cmpw(dst, src1);
2118         cselw(dst, dst, src1, Assembler::LT);
2119       }
2120       break;
2121     }
2122     default:
2123       assert(false, "unsupported");
2124       ShouldNotReachHere();
2125   }
2126 
2127   if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
2128     if (bt == T_BYTE) {
2129       sxtb(dst, dst);
2130     } else if (bt == T_SHORT) {
2131       sxth(dst, dst);
2132     }
2133   }
2134 }
2135 
2136 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
2137 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
2138 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
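// An illustrative example (assumed): for bt = T_BYTE and lane_cnt = 12 on a
// 256-bit machine (32 byte lanes), 12 matches none of the fixed patterns
// (VL1 .. VL8, VL16, VL32, ...) and none of the special patterns (POW2, MUL4
// and MUL3 resolve to 32, 32 and 30 lanes here), so the code falls back to
// "whileltw", which sets each lane whose index is less than 12.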
2139 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
2140   uint32_t max_vector_length = Matcher::max_vector_size(bt);
2141   assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
2142 
2143   // Set all elements to false if the input "lane_cnt" is zero.
2144   if (lane_cnt == 0) {
2145     sve_pfalse(dst);
2146     return;
2147   }
2148 
2149   SIMD_RegVariant size = elemType_to_regVariant(bt);
2150   assert(size != Q, "invalid size");
2151 
  // Set all lanes to true if "lane_cnt" equals the max lane count.
2153   if (lane_cnt == max_vector_length) {
2154     sve_ptrue(dst, size, /* ALL */ 0b11111);
2155     return;
2156   }
2157 
2158   // Fixed numbers for "ptrue".
2159   switch(lane_cnt) {
2160   case 1: /* VL1 */
2161   case 2: /* VL2 */
2162   case 3: /* VL3 */
2163   case 4: /* VL4 */
2164   case 5: /* VL5 */
2165   case 6: /* VL6 */
2166   case 7: /* VL7 */
2167   case 8: /* VL8 */
2168     sve_ptrue(dst, size, lane_cnt);
2169     return;
2170   case 16:
2171     sve_ptrue(dst, size, /* VL16 */ 0b01001);
2172     return;
2173   case 32:
2174     sve_ptrue(dst, size, /* VL32 */ 0b01010);
2175     return;
2176   case 64:
2177     sve_ptrue(dst, size, /* VL64 */ 0b01011);
2178     return;
2179   case 128:
2180     sve_ptrue(dst, size, /* VL128 */ 0b01100);
2181     return;
2182   case 256:
2183     sve_ptrue(dst, size, /* VL256 */ 0b01101);
2184     return;
2185   default:
2186     break;
2187   }
2188 
2189   // Special patterns for "ptrue".
2190   if (lane_cnt == round_down_power_of_2(max_vector_length)) {
2191     sve_ptrue(dst, size, /* POW2 */ 0b00000);
2192   } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
2193     sve_ptrue(dst, size, /* MUL4 */ 0b11101);
2194   } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
2195     sve_ptrue(dst, size, /* MUL3 */ 0b11110);
2196   } else {
2197     // Encode to "whileltw" for the remaining cases.
2198     mov(rscratch1, lane_cnt);
2199     sve_whileltw(dst, size, zr, rscratch1);
2200   }
2201 }
2202 
2203 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
2204 // Any remaining elements of dst will be filled with zero.
2205 // Clobbers: rscratch1
2206 // Preserves: mask, vzr
2207 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
2208                                            FloatRegister vzr, FloatRegister vtmp,
2209                                            PRegister pgtmp, unsigned vector_length_in_bytes) {
2210   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2211   // When called by sve_compress_byte, src and vtmp may be the same register.
2212   assert_different_registers(dst, src, vzr);
2213   assert_different_registers(dst, vtmp, vzr);
2214   assert_different_registers(mask, pgtmp);
2215   // high <-- low
2216   // Example input:   src   = hh gg ff ee dd cc bb aa, one character is 8 bits.
2217   //                  mask  = 01 00 00 01 01 00 01 01, one character is 1 bit.
2218   // Expected result: dst   = 00 00 00 hh ee dd bb aa
2219 
2220   // Extend lowest half to type INT.
2221   // dst   =  00dd  00cc  00bb  00aa
2222   sve_uunpklo(dst, S, src);
2223   // pgtmp =  0001  0000  0001  0001
2224   sve_punpklo(pgtmp, mask);
2225   // Pack the active elements in size of type INT to the right,
  // and fill the remaining elements with zero.
2227   // dst   =  0000  00dd  00bb  00aa
2228   sve_compact(dst, S, dst, pgtmp);
2229   // Narrow the result back to type SHORT.
2230   // dst   = 00 00 00 00 00 dd bb aa
2231   sve_uzp1(dst, H, dst, vzr);
2232 
2233   // Return if the vector length is no more than MaxVectorSize/2, since the
2234   // highest half is invalid.
2235   if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
2236     return;
2237   }
2238 
2239   // Count the active elements of lowest half.
2240   // rscratch1 = 3
2241   sve_cntp(rscratch1, S, ptrue, pgtmp);
2242 
2243   // Repeat to the highest half.
2244   // pgtmp =  0001  0000  0000  0001
2245   sve_punpkhi(pgtmp, mask);
2246   // vtmp  =  00hh  00gg  00ff  00ee
2247   sve_uunpkhi(vtmp, S, src);
2248   // vtmp  =  0000  0000  00hh  00ee
2249   sve_compact(vtmp, S, vtmp, pgtmp);
2250   // vtmp  = 00 00 00 00 00 00 hh ee
2251   sve_uzp1(vtmp, H, vtmp, vzr);
2252 
2253   // pgtmp = 00 00 00 00 00 01 01 01
2254   sve_whilelt(pgtmp, H, zr, rscratch1);
2255   // Compressed low:  dst  = 00 00 00 00 00 dd bb aa
2256   // Compressed high: vtmp = 00 00 00 00 00 00 hh ee
2257   // Combine the compressed low with the compressed high:
2258   //                  dst  = 00 00 00 hh ee dd bb aa
2259   sve_splice(dst, H, pgtmp, vtmp);
2260 }
2261 
2262 // Clobbers: rscratch1, rscratch2
2263 // Preserves: src, mask
2264 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
2265                                           FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
2266                                           PRegister ptmp, PRegister pgtmp, unsigned vector_length_in_bytes) {
2267   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2268   assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3);
2269   assert_different_registers(mask, ptmp, pgtmp);
2270   // high <-- low
2271   // Example input:   src   = q p n m l k j i h g f e d c b a, one character is 8 bits.
2272   //                  mask  = 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 1, one character is 1 bit.
2273   // Expected result: dst   = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
2274   FloatRegister vzr = vtmp3;
2275   sve_dup(vzr, B, 0);
2276 
2277   // Extend lowest half to type SHORT.
2278   // vtmp1 =  0h  0g  0f  0e  0d  0c  0b  0a
2279   sve_uunpklo(vtmp1, H, src);
2280   // ptmp  =  00  01  00  00  00  01  00  01
2281   sve_punpklo(ptmp, mask);
2282   // Pack the active elements in size of type SHORT to the right,
  // and fill the remaining elements with zero.
2284   // dst   =  00  00  00  00  00  0g  0c  0a
2285   unsigned extended_size = vector_length_in_bytes << 1;
2286   sve_compress_short(dst, vtmp1, ptmp, vzr, vtmp2, pgtmp, extended_size > MaxVectorSize ? MaxVectorSize : extended_size);
2287   // Narrow the result back to type BYTE.
2288   // dst   = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
2289   sve_uzp1(dst, B, dst, vzr);
2290 
2291   // Return if the vector length is no more than MaxVectorSize/2, since the
2292   // highest half is invalid.
2293   if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
2294     return;
2295   }
2296   // Count the active elements of lowest half.
2297   // rscratch2 = 3
2298   sve_cntp(rscratch2, H, ptrue, ptmp);
2299 
2300   // Repeat to the highest half.
2301   // ptmp  =  00  01  00  00  00  00  00  01
2302   sve_punpkhi(ptmp, mask);
2303   // vtmp2 =  0q  0p  0n  0m  0l  0k  0j  0i
2304   sve_uunpkhi(vtmp2, H, src);
2305   // vtmp1 =  00  00  00  00  00  00  0p  0i
2306   sve_compress_short(vtmp1, vtmp2, ptmp, vzr, vtmp2, pgtmp, extended_size - MaxVectorSize);
2307   // vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
2308   sve_uzp1(vtmp1, B, vtmp1, vzr);
2309 
2310   // ptmp  = 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
2311   sve_whilelt(ptmp, B, zr, rscratch2);
2312   // Compressed low:  dst   = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
2313   // Compressed high: vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
2314   // Combine the compressed low with the compressed high:
2315   //                  dst   = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
2316   sve_splice(dst, B, ptmp, vtmp1);
2317 }
2318 
2319 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2320   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2321   SIMD_Arrangement size = isQ ? T16B : T8B;
2322   if (bt == T_BYTE) {
2323     rbit(dst, size, src);
2324   } else {
2325     neon_reverse_bytes(dst, src, bt, isQ);
2326     rbit(dst, size, dst);
2327   }
2328 }
2329 
2330 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2331   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2332   SIMD_Arrangement size = isQ ? T16B : T8B;
2333   switch (bt) {
2334     case T_BYTE:
2335       if (dst != src) {
2336         orr(dst, size, src, src);
2337       }
2338       break;
2339     case T_SHORT:
2340       rev16(dst, size, src);
2341       break;
2342     case T_INT:
2343       rev32(dst, size, src);
2344       break;
2345     case T_LONG:
2346       rev64(dst, size, src);
2347       break;
2348     default:
2349       assert(false, "unsupported");
2350       ShouldNotReachHere();
2351   }
2352 }
2353 
2354 // VectorRearrange implementation for short/int/float/long/double types with NEON
2355 // instructions. For VectorRearrange short/int/float, we use NEON tbl instruction.
// But since it supports byte tables only, we need to look up 2/4 bytes as a group.
2357 // For VectorRearrange long/double, we compare the shuffle input with iota indices,
2358 // and use bsl to implement the operation.
2359 void C2_MacroAssembler::neon_rearrange_hsd(FloatRegister dst, FloatRegister src,
2360                                            FloatRegister shuffle, FloatRegister tmp,
2361                                            BasicType bt, bool isQ) {
2362   assert_different_registers(dst, src, shuffle, tmp);
2363   SIMD_Arrangement size1 = isQ ? T16B : T8B;
2364   SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2365 
2366   // Here is an example that rearranges a NEON vector with 4 ints:
2367   // Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1]
2368   //   1. We assume the shuffle input is Vi int[2, 3, 0, 1].
2369   //   2. Multiply Vi int[2, 3, 0, 1] with constant int vector
2370   //      [0x04040404, 0x04040404, 0x04040404, 0x04040404], and get
2371   //      tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404].
2372   //   3. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100],
2373   //      and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504]
2374   //   4. Use Vm as index register, and use V1 as table register.
2375   //      Then get V2 as the result by tbl NEON instructions.
2376   switch (bt) {
2377     case T_SHORT:
2378       mov(tmp, size1, 0x02);
2379       mulv(dst, size2, shuffle, tmp);
2380       mov(tmp, size2, 0x0100);
2381       addv(dst, size1, dst, tmp);
2382       tbl(dst, size1, src, 1, dst);
2383       break;
2384     case T_INT:
2385     case T_FLOAT:
2386       mov(tmp, size1, 0x04);
2387       mulv(dst, size2, shuffle, tmp);
2388       mov(tmp, size2, 0x03020100);
2389       addv(dst, size1, dst, tmp);
2390       tbl(dst, size1, src, 1, dst);
2391       break;
2392     case T_LONG:
2393     case T_DOUBLE:
2394       // Load the iota indices for Long type. The indices are ordered by
      // type B/S/I/L/F/D, and the offset between two types is 16; hence
2396       // the offset for L is 48.
2397       lea(rscratch1,
2398           ExternalAddress(StubRoutines::aarch64::vector_iota_indices() + 48));
2399       ldrq(tmp, rscratch1);
2400       // Check whether the input "shuffle" is the same with iota indices.
2401       // Return "src" if true, otherwise swap the two elements of "src".
2402       cm(EQ, dst, size2, shuffle, tmp);
2403       ext(tmp, size1, src, src, 8);
2404       bsl(dst, size1, src, tmp);
2405       break;
2406     default:
2407       assert(false, "unsupported element type");
2408       ShouldNotReachHere();
2409   }
2410 }
2411 
2412 // Extract a scalar element from an sve vector at position 'idx'.
2413 // The input elements in src are expected to be of integral type.
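// An illustrative example, assuming bt = T_INT and idx = 5 on a 256-bit SVE
// vector: the element starts at bit 160 >= 128, which umov/smov cannot reach,
// so the vector is copied and shifted down by idx << size = 20 bytes with
// sve_ext, after which lane 0 of the copy holds the wanted element.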
2414 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2415                                              int idx, FloatRegister vtmp) {
2416   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2417   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2418   if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
2419     if (bt == T_INT || bt == T_LONG) {
2420       umov(dst, src, size, idx);
2421     } else {
2422       smov(dst, src, size, idx);
2423     }
2424   } else {
2425     sve_orr(vtmp, src, src);
2426     sve_ext(vtmp, vtmp, idx << size);
2427     if (bt == T_INT || bt == T_LONG) {
2428       umov(dst, vtmp, size, 0);
2429     } else {
2430       smov(dst, vtmp, size, 0);
2431     }
2432   }
2433 }
2434 
2435 // java.lang.Math::round intrinsics
2436 
2437 // Clobbers: rscratch1, rflags
2438 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2439                                           FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
2440   assert_different_registers(tmp1, tmp2, tmp3, src, dst);
2441   switch (T) {
2442     case T2S:
2443     case T4S:
2444       fmovs(tmp1, T, 0.5f);
2445       mov(rscratch1, jint_cast(0x1.0p23f));
2446       break;
2447     case T2D:
2448       fmovd(tmp1, T, 0.5);
2449       mov(rscratch1, julong_cast(0x1.0p52));
2450       break;
2451     default:
2452       assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2453   }
2454   fadd(tmp1, T, tmp1, src);
2455   fcvtms(tmp1, T, tmp1);
2456   // tmp1 = floor(src + 0.5, ties to even)
2457 
2458   fcvtas(dst, T, src);
2459   // dst = round(src), ties to away
2460 
2461   fneg(tmp3, T, src);
2462   dup(tmp2, T, rscratch1);
2463   cm(HS, tmp3, T, tmp3, tmp2);
2464   // tmp3 is now a set of flags
2465 
2466   bif(dst, T16B, tmp1, tmp3);
2467   // result in dst
2468 }
2469 
2470 // Clobbers: rscratch1, rflags
2471 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2472                                          FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
2473   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2474   assert_different_registers(tmp1, tmp2, src, dst);
2475 
2476   switch (T) {
2477     case S:
2478       mov(rscratch1, jint_cast(0x1.0p23f));
2479       break;
2480     case D:
2481       mov(rscratch1, julong_cast(0x1.0p52));
2482       break;
2483     default:
2484       assert(T == S || T == D, "invalid register variant");
2485   }
2486 
2487   sve_frinta(dst, T, ptrue, src);
2488   // dst = round(src), ties to away
2489 
2490   Label none;
2491 
2492   sve_fneg(tmp1, T, ptrue, src);
2493   sve_dup(tmp2, T, rscratch1);
2494   sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
2495   br(EQ, none);
2496   {
2497     sve_cpy(tmp1, T, pgtmp, 0.5);
2498     sve_fadd(tmp1, T, pgtmp, src);
2499     sve_frintm(dst, T, pgtmp, tmp1);
2500     // dst = floor(src + 0.5, ties to even)
2501   }
2502   bind(none);
2503 
2504   sve_fcvtzs(dst, T, ptrue, dst, T);
2505   // result in dst
2506 }
2507 
2508 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2509                                            FloatRegister one, SIMD_Arrangement T) {
2510   assert_different_registers(dst, src, zero, one);
2511   assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2512 
2513   facgt(dst, T, src, zero);
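  // dst lanes are now all ones where |src| > 0.0, and all zeros for +-0.0 and
  // NaN (an assumed reading of the sequence below). The ushr turns all ones
  // into 0x7FF..F, a mask of every bit but the sign bit; bsl then combines the
  // magnitude bits of "one" with the sign bit of "src" to produce +-1.0, while
  // zero-mask lanes pass "src" through unchanged.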
2514   ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
2515   bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2516 }
2517 
2518 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2519                                           FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
  assert_different_registers(dst, src, zero, one, vtmp);
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");

  sve_orr(vtmp, src, src);
  sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise
  switch (T) {
  case S:
    sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
    sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
                                      // on the sign of the float value
    break;
  case D:
    sve_and(vtmp, T, min_jlong);
    sve_orr(vtmp, T, jlong_cast(1.0));
    break;
  default:
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
  sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
                                     // Result in dst
2541 }
2542 
2543 bool C2_MacroAssembler::in_scratch_emit_size() {
2544   if (ciEnv::current()->task() != nullptr) {
2545     PhaseOutput* phase_output = Compile::current()->output();
2546     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2547       return true;
2548     }
2549   }
2550   return MacroAssembler::in_scratch_emit_size();
2551 }
2552 
2553 static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
2554   fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
2555 }
2556 
2557 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register rval, Register rtmp) {
2558   assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
2559   if (t == TypeInt::INT) {
2560     return;
2561   }
2562   BLOCK_COMMENT("verify_int_in_range {");
2563   Label L_success, L_failure;
2564 
2565   jint lo = t->_lo;
2566   jint hi = t->_hi;
2567 
2568   if (lo != min_jint && hi != max_jint) {
2569     subsw(rtmp, rval, lo);
2570     br(Assembler::LT, L_failure);
2571     subsw(rtmp, rval, hi);
2572     br(Assembler::LE, L_success);
2573   } else if (lo != min_jint) {
2574     subsw(rtmp, rval, lo);
2575     br(Assembler::GE, L_success);
2576   } else if (hi != max_jint) {
2577     subsw(rtmp, rval, hi);
2578     br(Assembler::LE, L_success);
2579   } else {
2580     ShouldNotReachHere();
2581   }
2582 
2583   bind(L_failure);
2584   movw(c_rarg0, idx);
2585   mov(c_rarg1, rval);
2586   movw(c_rarg2, lo);
2587   movw(c_rarg3, hi);
2588   reconstruct_frame_pointer(rtmp);
2589   rt_call(CAST_FROM_FN_PTR(address, abort_verify_int_in_range), rtmp);
2590   hlt(0);
2591 
2592   bind(L_success);
2593   BLOCK_COMMENT("} verify_int_in_range");
2594 }
2595 
2596 static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
2597   fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
2598 }
2599 
2600 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register rval, Register rtmp) {
2601   assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
2602   if (t == TypeLong::LONG) {
2603     return;
2604   }
2605   BLOCK_COMMENT("verify_long_in_range {");
2606   Label L_success, L_failure;
2607 
2608   jlong lo = t->_lo;
2609   jlong hi = t->_hi;
2610 
2611   if (lo != min_jlong && hi != max_jlong) {
2612     subs(rtmp, rval, lo);
2613     br(Assembler::LT, L_failure);
2614     subs(rtmp, rval, hi);
2615     br(Assembler::LE, L_success);
2616   } else if (lo != min_jlong) {
2617     subs(rtmp, rval, lo);
2618     br(Assembler::GE, L_success);
2619   } else if (hi != max_jlong) {
2620     subs(rtmp, rval, hi);
2621     br(Assembler::LE, L_success);
2622   } else {
2623     ShouldNotReachHere();
2624   }
2625 
2626   bind(L_failure);
2627   movw(c_rarg0, idx);
2628   mov(c_rarg1, rval);
2629   mov(c_rarg2, lo);
2630   mov(c_rarg3, hi);
2631   reconstruct_frame_pointer(rtmp);
2632   rt_call(CAST_FROM_FN_PTR(address, abort_verify_long_in_range), rtmp);
2633   hlt(0);
2634 
2635   bind(L_success);
2636   BLOCK_COMMENT("} verify_long_in_range");
2637 }
2638 
2639 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
2640   const int framesize = Compile::current()->output()->frame_size_in_bytes();
2641   if (PreserveFramePointer) {
2642     // frame pointer is valid
2643 #ifdef ASSERT
2644     // Verify frame pointer value in rfp.
2645     add(rtmp, sp, framesize - 2 * wordSize);
2646     Label L_success;
2647     cmp(rfp, rtmp);
2648     br(Assembler::EQ, L_success);
2649     stop("frame pointer mismatch");
2650     bind(L_success);
2651 #endif // ASSERT
2652   } else {
2653     add(rfp, sp, framesize - 2 * wordSize);
2654   }
2655 }
2656 
// Selects elements from two source vectors (src1, src2) based on index values in the index register
// using Neon instructions, placing each selected element in the destination vector element that
// corresponds to the index vector element. Each index in the index register must be in the range
// [0, 2 * NUM_ELEM), where NUM_ELEM is the number of BasicType elements per vector.
// If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
// Otherwise, selects src2[idx - NUM_ELEM]
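// For instance (illustrative), with T_BYTE and 16-byte vectors, NUM_ELEM = 16:
// an index value of 3 selects src1[3], while an index value of 19 selects
// src2[19 - 16] = src2[3].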
2663 void C2_MacroAssembler::select_from_two_vectors_neon(FloatRegister dst, FloatRegister src1,
2664                                                      FloatRegister src2, FloatRegister index,
2665                                                      FloatRegister tmp, unsigned vector_length_in_bytes) {
2666   assert_different_registers(dst, src1, src2, tmp);
2667   SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
2668 
2669   if (vector_length_in_bytes == 16) {
2670     assert(UseSVE <= 1, "sve must be <= 1");
2671     assert(src1->successor() == src2, "Source registers must be ordered");
2672     // If the vector length is 16B, then use the Neon "tbl" instruction with two vector table
2673     tbl(dst, size, src1, 2, index);
2674   } else { // vector length == 8
2675     assert(UseSVE == 0, "must be Neon only");
2676     // We need to fit both the source vectors (src1, src2) in a 128-bit register because the
2677     // Neon "tbl" instruction supports only looking up 16B vectors. We then use the Neon "tbl"
2678     // instruction with one vector lookup
2679     ins(tmp, D, src1, 0, 0);
2680     ins(tmp, D, src2, 1, 0);
2681     tbl(dst, size, tmp, 1, index);
2682   }
2683 }
2684 
// Selects elements from two source vectors (src1, src2) based on index values in the index register
// using SVE/SVE2 instructions, placing each selected element in the destination vector element that
// corresponds to the index vector element. Each index in the index register must be in the range
// [0, 2 * NUM_ELEM), where NUM_ELEM is the number of BasicType elements per vector.
// If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
// Otherwise, selects src2[idx - NUM_ELEM]
2691 void C2_MacroAssembler::select_from_two_vectors_sve(FloatRegister dst, FloatRegister src1,
2692                                                     FloatRegister src2, FloatRegister index,
2693                                                     FloatRegister tmp, SIMD_RegVariant T,
2694                                                     unsigned vector_length_in_bytes) {
2695   assert_different_registers(dst, src1, src2, index, tmp);
2696 
2697   if (vector_length_in_bytes == 8) {
2698     // We need to fit both the source vectors (src1, src2) in a single vector register because the
2699     // SVE "tbl" instruction is unpredicated and works on the entire vector which can lead to
2700     // incorrect results if each source vector is only partially filled. We then use the SVE "tbl"
2701     // instruction with one vector lookup
2702     assert(UseSVE >= 1, "sve must be >= 1");
2703     ins(tmp, D, src1, 0, 0);
2704     ins(tmp, D, src2, 1, 0);
2705     sve_tbl(dst, T, tmp, index);
2706   } else {  // UseSVE == 2 and vector_length_in_bytes > 8
2707     // If the vector length is > 8, then use the SVE2 "tbl" instruction with the two vector table.
2708     // The assertion - vector_length_in_bytes == MaxVectorSize ensures that this operation
2709     // is not executed on machines where vector_length_in_bytes < MaxVectorSize
2710     // with the only exception of 8B vector length.
2711     assert(UseSVE == 2 && vector_length_in_bytes == MaxVectorSize, "must be");
2712     assert(src1->successor() == src2, "Source registers must be ordered");
2713     sve_tbl(dst, T, src1, src2, index);
2714   }
2715 }
2716 
2717 void C2_MacroAssembler::select_from_two_vectors(FloatRegister dst, FloatRegister src1,
2718                                                 FloatRegister src2, FloatRegister index,
2719                                                 FloatRegister tmp, BasicType bt,
2720                                                 unsigned vector_length_in_bytes) {
2721 
2722   assert_different_registers(dst, src1, src2, index, tmp);
2723 
2724   // The cases that can reach this method are -
2725   // - UseSVE = 0, vector_length_in_bytes = 8 or 16
2726   // - UseSVE = 1, vector_length_in_bytes = 8 or 16
2727   // - UseSVE = 2, vector_length_in_bytes >= 8
2728   //
2729   // SVE/SVE2 tbl instructions are generated when UseSVE = 1 with vector_length_in_bytes = 8
2730   // and UseSVE = 2 with vector_length_in_bytes >= 8
2731   //
2732   // Neon instructions are generated when UseSVE = 0 with vector_length_in_bytes = 8 or 16 and
2733   // UseSVE = 1 with vector_length_in_bytes = 16
2734 
2735   if ((UseSVE == 1 && vector_length_in_bytes == 8) || UseSVE == 2) {
2736     SIMD_RegVariant T = elemType_to_regVariant(bt);
2737     select_from_two_vectors_sve(dst, src1, src2, index, tmp, T, vector_length_in_bytes);
2738     return;
2739   }
2740 
2741   // The only BasicTypes that can reach here are T_SHORT, T_BYTE, T_INT and T_FLOAT
2742   assert(bt != T_DOUBLE && bt != T_LONG, "unsupported basic type");
2743   assert(vector_length_in_bytes <= 16, "length_in_bytes must be <= 16");
2744 
2745   bool isQ = vector_length_in_bytes == 16;
2746 
2747   SIMD_Arrangement size1 = isQ ? T16B : T8B;
2748   SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2749 
  // The Neon "tbl" instruction only supports byte tables, so we need to look at chunks of
  // 2B for selecting shorts or chunks of 4B for selecting ints/floats from the table.
  // The index values in the "index" register are in the range of [0, 2 * NUM_ELEM), where NUM_ELEM
  // is the number of elements that can fit in a vector. For example, for T_SHORT with 64-bit
  // vector length, the indices lie in [0, 8).
  // As an example with 64-bit vector length and T_SHORT type, let index = [2, 5, 1, 0]:
  // Move a constant 0x02 into every byte of tmp:  tmp = [0x0202, 0x0202, 0x0202, 0x0202]
  // Multiply the index vector with tmp to yield:  dst = [0x0404, 0x0a0a, 0x0202, 0x0000]
  // Move a constant 0x0100 into every 2B of tmp:  tmp = [0x0100, 0x0100, 0x0100, 0x0100]
  // Add the multiplied result to the vector in tmp to obtain the byte level
  // offsets:                                      dst = [0x0504, 0x0b0a, 0x0302, 0x0100]
  // Use these offsets in the "tbl" instruction to select chunks of 2B.
2762 
2763   if (bt == T_BYTE) {
2764     select_from_two_vectors_neon(dst, src1, src2, index, tmp, vector_length_in_bytes);
2765   } else {
2766     int elem_size = (bt == T_SHORT) ? 2 : 4;
2767     uint64_t tbl_offset = (bt == T_SHORT) ? 0x0100u : 0x03020100u;
2768 
2769     mov(tmp, size1, elem_size);
2770     mulv(dst, size2, index, tmp);
2771     mov(tmp, size2, tbl_offset);
2772     addv(dst, size1, dst, tmp); // "dst" now contains the processed index elements
2773                                 // to select a set of 2B/4B
2774     select_from_two_vectors_neon(dst, src1, src2, dst, tmp, vector_length_in_bytes);
2775   }
2776 }
2777 
2778 // Vector expand implementation. Elements from the src vector are expanded into
2779 // the dst vector under the control of the vector mask.
2780 // Since there are no native instructions directly corresponding to expand before
2781 // SVE2p2, the following implementations mainly leverages the TBL instruction to
2782 // implement expand. To compute the index input for TBL, the prefix sum algorithm
2783 // (https://en.wikipedia.org/wiki/Prefix_sum) is used. The same algorithm is used
2784 // for NEON and SVE, but with different instructions where appropriate.
2785 
2786 // Vector expand implementation for NEON.
2787 //
2788 // An example of 128-bit Byte vector:
2789 //   Data direction: high <== low
2790 //   Input:
2791 //         src   = g  f  e  d  c  b  a  9  8  7  6  5  4  3  2  1
2792 //         mask  = 0  0 -1 -1  0  0 -1 -1  0  0 -1 -1  0  0 -1 -1
2793 //   Expected result:
2794 //         dst   = 0  0  8  7  0  0  6  5  0  0  4  3  0  0  2  1
2795 void C2_MacroAssembler::vector_expand_neon(FloatRegister dst, FloatRegister src, FloatRegister mask,
2796                                            FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
2797                                            int vector_length_in_bytes) {
2798   assert(vector_length_in_bytes <= 16, "the vector length in bytes for NEON must be <= 16");
2799   assert_different_registers(dst, src, mask, tmp1, tmp2);
  // Since the TBL instruction only supports byte tables, we need to
  // compute the indices in byte type for all element types.
2802   SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
2803   // tmp1 =  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
2804   dup(tmp1, size, zr);
2805   // dst  =  0  0  1  1  0  0  1  1  0  0  1  1  0  0  1  1
2806   negr(dst, size, mask);
2807   // Calculate vector index for TBL with prefix sum algorithm.
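  // Each iteration adds to "dst" a copy of itself shifted up by i bytes, with zero
  // bytes from "tmp1" shifted in at the bottom, so byte j of "dst" ends up holding
  // the prefix sum of the mask bytes at positions [0, j]. For the example above,
  // "dst" evolves as follows (high <== low):
  //   after i = 1:  0  1  2  1  0  1  2  1  0  1  2  1  0  1  2  1
  //   after i = 2:  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  1
  //   after i = 4:  4  4  4  4  4  4  4  4  4  4  4  3  2  2  2  1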
2808   // dst  =  8  8  8  7  6  6  6  5  4  4  4  3  2  2  2  1
2809   for (int i = 1; i < vector_length_in_bytes; i <<= 1) {
2810     ext(tmp2, size, tmp1, dst, vector_length_in_bytes - i);
2811     addv(dst, size, tmp2, dst);
2812   }
2813   // tmp2 =  0  0 -1 -1  0  0 -1 -1  0  0 -1 -1  0  0 -1 -1
2814   orr(tmp2, size, mask, mask);
2815   // tmp2 =  0  0  8  7  0  0  6  5  0  0  4  3  0  0  2  1
2816   bsl(tmp2, size, dst, tmp1);
2817   // tmp1 =  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
2818   movi(tmp1, size, 1);
2819   // dst  = -1 -1  7  6 -1 -1  5  4 -1 -1  3  2 -1 -1  1  0
2820   subv(dst, size, tmp2, tmp1);
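  // A byte index of 0xff is out of range for the table, so the "tbl" below writes
  // zero for those lanes, clearing the positions where the mask is unset.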
2821   // dst  =  0  0  8  7  0  0  6  5  0  0  4  3  0  0  2  1
2822   tbl(dst, size, src, 1, dst);
2823 }
2824 
2825 // Vector expand implementation for SVE.
2826 //
2827 // An example of 128-bit Short vector:
2828 //   Data direction: high <== low
2829 //   Input:
2830 //         src   = gf ed cb a9 87 65 43 21
2831 //         pg    = 00 01 00 01 00 01 00 01
2832 //   Expected result:
2833 //         dst   = 00 87 00 65 00 43 00 21
2834 void C2_MacroAssembler::vector_expand_sve(FloatRegister dst, FloatRegister src, PRegister pg,
2835                                           FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
2836                                           int vector_length_in_bytes) {
2837   assert(UseSVE > 0, "expand implementation only for SVE");
2838   assert_different_registers(dst, src, tmp1, tmp2);
2839   SIMD_RegVariant size = elemType_to_regVariant(bt);
2840 
2841   // tmp1 = 00 00 00 00 00 00 00 00
2842   sve_dup(tmp1, size, 0);
2843   sve_movprfx(tmp2, tmp1);
2844   // tmp2 = 00 01 00 01 00 01 00 01
2845   sve_cpy(tmp2, size, pg, 1, true);
2846   // Calculate vector index for TBL with prefix sum algorithm.
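  // For the example above, "tmp2" evolves as follows (high <== low):
  //   after i = 2:  01 01 01 01 01 01 01 01
  //   after i = 4:  02 02 02 02 02 02 01 01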
2847   // tmp2 = 04 04 03 03 02 02 01 01
2848   for (int i = type2aelembytes(bt); i < vector_length_in_bytes; i <<= 1) {
2849     sve_movprfx(dst, tmp1);
    // The EXT instruction operates on the full-width SVE register rather than just
    // the first vector_length_in_bytes bytes, so the shift index must be adjusted:
    // (vector_length_in_bytes - i) + (MaxVectorSize - vector_length_in_bytes)
    //   => MaxVectorSize - i
2854     sve_ext(dst, tmp2, MaxVectorSize - i);
2855     sve_add(tmp2, size, dst, tmp2);
2856   }
2857   // dst  = 00 04 00 03 00 02 00 01
2858   sve_sel(dst, size, pg, tmp2, tmp1);
2859   // dst  = -1 03 -1 02 -1 01 -1 00
2860   sve_sub(dst, size, 1);
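  // An index of -1 is out of range for the table, so the "tbl" below writes zero
  // for those lanes, clearing the positions where the predicate is unset.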
2861   // dst  = 00 87 00 65 00 43 00 21
2862   sve_tbl(dst, size, src, dst);
2863 }