/*
 * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/matcher.hpp"
#include "opto/output.hpp"
#include "opto/subnode.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

// jdk.internal.util.ArraysSupport.vectorizedHashCode
address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
                                           FloatRegister vdata0, FloatRegister vdata1,
                                           FloatRegister vdata2, FloatRegister vdata3,
                                           FloatRegister vmul0, FloatRegister vmul1,
                                           FloatRegister vmul2, FloatRegister vmul3,
                                           FloatRegister vpow, FloatRegister vpowm,
                                           BasicType eltype) {
  ARRAYS_HASHCODE_REGISTERS;

  Register tmp1 = rscratch1, tmp2 = rscratch2;

  Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE;

  // Vectorization factor. Number of array elements loaded into one SIMD&FP register by the
  // stubs. We use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's
  // possible to use 4H for chars and shorts instead, but using 8H gives better performance.
  const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8
                    : eltype == T_CHAR || eltype == T_SHORT ? 8
                    : eltype == T_INT                       ? 4
                                                            : 0;
  guarantee(vf, "unsupported eltype");

  // Unroll factor for the scalar loop below. The value is chosen based on performance analysis.
  const size_t unroll_factor = 4;
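
  // For reference, the scalar tail below computes the standard polynomial
  // hash; a rough sketch of the equivalent logic (0x1f loaded into tmp2 is
  // the multiplier 31):
  //
  //   for (size_t i = 0; i < cnt; i++) {
  //     result = 31 * result + ary[i];
  //   }
  //
  // unrolled by unroll_factor, with a computed branch into the middle of the
  // unrolled body so that only cnt % unroll_factor load + madd pairs execute
  // on the first pass.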

  switch (eltype) {
  case T_BOOLEAN:
    BLOCK_COMMENT("arrays_hashcode(unsigned byte) {");
    break;
  case T_CHAR:
    BLOCK_COMMENT("arrays_hashcode(char) {");
    break;
  case T_BYTE:
    BLOCK_COMMENT("arrays_hashcode(byte) {");
    break;
  case T_SHORT:
    BLOCK_COMMENT("arrays_hashcode(short) {");
    break;
  case T_INT:
    BLOCK_COMMENT("arrays_hashcode(int) {");
    break;
  default:
    ShouldNotReachHere();
  }

  // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop
  // implemented by the stub executes just once. Call the stub only if at least two iterations will
  // be executed.
  const size_t large_threshold = eltype == T_INT ? vf * 2 : vf;
  cmpw(cnt, large_threshold);
  br(Assembler::HS, LARGE);

  bind(TAIL);

  // The andr computes cnt % uf, where uf = unroll_factor. The subtract, shifted by 3, offsets
  // past uf - (cnt % uf) pairs of load + madd insns, i.e. only cnt % uf load + madd pairs are
  // executed. The loop then consumes the remainder, uf elements at a time.
  assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC");
  andr(tmp2, cnt, unroll_factor - 1);
  adr(tmp1, BR_BASE);
  // For Cortex-A53 the offset is 4 because 2 nops are generated.
  sub(tmp1, tmp1, tmp2, ext::sxtw, VM_Version::supports_a53mac() ? 4 : 3);
  movw(tmp2, 0x1f);
  br(tmp1);

  bind(LOOP);
  for (size_t i = 0; i < unroll_factor; ++i) {
    load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype);
    maddw(result, result, tmp2, tmp1);
    // maddw generates an extra nop for Cortex-A53 (see the maddw definition in macroAssembler).
    // Generate a 2nd nop to have 4 instructions per iteration.
    if (VM_Version::supports_a53mac()) {
      nop();
    }
  }
  bind(BR_BASE);
  subsw(cnt, cnt, unroll_factor);
  br(Assembler::HS, LOOP);

  b(DONE);

  bind(LARGE);

  RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype));
  assert(stub.target() != nullptr, "array_hashcode stub has not been generated");
  address tpc = trampoline_call(stub);
  if (tpc == nullptr) {
    DEBUG_ONLY(reset_labels(TAIL, BR_BASE));
    postcond(pc() == badAddress);
    return nullptr;
  }

  bind(DONE);

  BLOCK_COMMENT("} // arrays_hashcode");

  postcond(pc() != badAddress);
  return pc();
}

void C2_MacroAssembler::fast_lock(Register obj, Register box, Register t1,
                                  Register t2, Register t3) {
  assert_different_registers(obj, box, t1, t2, t3, rscratch2);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. MUST branch to this label with flags == EQ.
  Label locked;
  // Finish fast lock unsuccessfully. MUST branch to this label with flags == NE.
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(t1, obj);
    ldrb(t1, Address(t1, Klass::misc_flags_offset()));
    tst(t1, KlassFlags::_misc_is_value_based_class);
    br(Assembler::NE, slow_path);
  }

  const Register t1_mark = t1;
  const Register t3_t = t3;
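
  // Condensed pseudocode sketch of the lightweight-locking fast path emitted
  // below (field and helper names are illustrative, not the exact VM names):
  //
  //   if (lock_stack is full)                     goto slow_path;
  //   if (lock_stack[top - 1] == obj)             goto push;       // recursive
  //   mark = obj->mark;
  //   if (mark & monitor_value)                   goto inflated;
  //   // try lock bits 0b01 (unlocked) -> 0b00 (locked)
  //   if (!CAS(&obj->mark, mark | unlocked, mark & ~unlocked)) goto slow_path;
  //  push:
  //   lock_stack[top++] = obj;                    // success, flags == EQ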

  { // Fast locking

    // Push lock to the lock stack and finish successfully. MUST branch to this label with flags == EQ.
    Label push;

    const Register t2_top = t2;

    // Check if lock-stack is full.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
    br(Assembler::GT, slow_path);

    // Check if recursive.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, push);

    // Relaxed normal load to check for monitor. Optimization for the monitor case.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);

    // Not inflated.
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");

    // Try to lock. Transition lock bits 0b01 => 0b00.
    orr(t1_mark, t1_mark, markWord::unlocked_value);
    eor(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
    br(Assembler::NE, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    str(obj, Address(rthread, t2_top));
    addw(t2_top, t2_top, oopSize);
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register t1_monitor = t1;

    if (!UseObjectMonitorTable) {
      assert(t1_monitor == t1_mark, "should be the same here");
    } else {
      Label monitor_found;

      // Load cache address.
      lea(t3_t, Address(rthread, JavaThread::om_cache_oops_offset()));

      const int num_unrolled = 2;
      for (int i = 0; i < num_unrolled; i++) {
        ldr(t1, Address(t3_t));
        cmp(obj, t1);
        br(Assembler::EQ, monitor_found);
        increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
      }

      Label loop;

      // Search for obj in cache.
      bind(loop);

      // Check for match.
      ldr(t1, Address(t3_t));
      cmp(obj, t1);
      br(Assembler::EQ, monitor_found);

      // Search until null encountered, guaranteed _null_sentinel at end.
      increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
      cbnz(t1, loop);
      // Cache miss. NE set from cmp above, cbnz does not set flags.
      b(slow_path);

      bind(monitor_found);
      ldr(t1_monitor, Address(t3_t, OMCache::oop_to_monitor_difference()));
    }

    const Register t2_owner_addr = t2;
    const Register t3_owner = t3;
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
    const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);

    Label monitor_locked;
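
    // Sketch of the monitor-locking fast path below (pseudocode, names
    // abbreviated):
    //
    //   owner_id = thread->_monitor_owner_id;
    //   if (CAS(&monitor->owner, null, owner_id)) goto monitor_locked;
    //   if (observed owner == owner_id)           { monitor->recursions++; goto monitor_locked; }
    //   goto slow_path;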

    // Compute owner address.
    lea(t2_owner_addr, owner_address);

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
    cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, /*acquire*/ true,
            /*release*/ false, /*weak*/ false, t3_owner);
    br(Assembler::EQ, monitor_locked);

    // Check if recursive.
    cmp(t3_owner, rscratch2);
    br(Assembler::NE, slow_path);

    // Recursive.
    increment(recursions_address, 1);

    bind(monitor_locked);
    if (UseObjectMonitorTable) {
      str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
    }
  }

  bind(locked);

#ifdef ASSERT
  // Check that the locked label is reached with flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Lock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that the slow_path label is reached with flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Lock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of flags (NE vs EQ) to determine the continuation.
}

void C2_MacroAssembler::fast_unlock(Register obj, Register box, Register t1,
                                    Register t2, Register t3) {
  assert_different_registers(obj, box, t1, t2, t3);

  // Handle inflated monitor.
  Label inflated, inflated_load_mark;
  // Finish fast unlock successfully. MUST branch to this label with flags == EQ.
  Label unlocked;
  // Finish fast unlock unsuccessfully. MUST branch to this label with flags == NE.
  Label slow_path;

  const Register t1_mark = t1;
  const Register t2_top = t2;
  const Register t3_t = t3;
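
  // Condensed pseudocode sketch of the fast-unlock path below (illustrative
  // names, not the exact VM identifiers):
  //
  //   if (lock_stack[--top] != obj)               goto inflated_load_mark;
  //   if (lock_stack[top - 1] == obj)             goto unlocked;   // recursive
  //   mark = obj->mark;
  //   if (mark & monitor_value)                   goto inflated;   // or re-push + slow_path
  //   // try lock bits 0b00 (locked) -> 0b01 (unlocked)
  //   if (CAS(&obj->mark, mark, mark | unlocked)) goto unlocked;
  //   re-push obj onto the lock-stack; goto slow_path;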

  { // Fast unlock

    Label push_and_slow_path;

    // Check if obj is the top of the lock-stack.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    subw(t2_top, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    // Top of lock stack was not obj. Must be monitor.
    br(Assembler::NE, inflated_load_mark);

    // Pop lock-stack.
    DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));

    // Check if recursive.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, unlocked);

    // Not recursive.
    // Load mark.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Check header for monitor (0b10).
    // Because we got here by popping (meaning we pushed in locked),
    // there will be no monitor in the box. So we need to push the obj back
    // so that the runtime can fix any potential anonymous owner.
    tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);

    // Try to unlock. Transition lock bits 0b00 => 0b01.
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
    orr(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
    br(Assembler::EQ, unlocked);

    bind(push_and_slow_path);
    // Compare and exchange failed.
    // Restore the lock-stack and handle the unlock in the runtime.
    DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
    addw(t2_top, t2_top, oopSize);
    str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(slow_path);
  }

  { // Handle inflated monitor.
    bind(inflated_load_mark);
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
#ifdef ASSERT
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

#ifdef ASSERT
    Label check_done;
    subw(t2_top, t2_top, oopSize);
    cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
    br(Assembler::LT, check_done);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    br(Assembler::NE, inflated);
    stop("Fast Unlock lock on stack");
    bind(check_done);
#endif

    const Register t1_monitor = t1;

    if (!UseObjectMonitorTable) {
      assert(t1_monitor == t1_mark, "should be the same here");

      // Untag the monitor.
      add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
    } else {
      ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // Null check with flags == NE; no valid pointer below alignof(ObjectMonitor*).
      cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
      br(Assembler::LO, slow_path);
    }

    const Register t2_recursions = t2;
    Label not_recursive;

    // Check if recursive.
    ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
    cbz(t2_recursions, not_recursive);

    // Recursive unlock.
    sub(t2_recursions, t2_recursions, 1u);
    str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
    // Set flags == EQ.
    cmp(t2_recursions, t2_recursions);
    b(unlocked);

    bind(not_recursive);

    const Register t2_owner_addr = t2;

    // Compute owner address.
    lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));

    // Set owner to null.
    // Release to satisfy the JMM.
    stlr(zr, t2_owner_addr);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);
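    // (The ordering matters: the StoreLoad fence makes the owner = null store
    // visible before the entry_list/succ loads below; without it we could
    // miss a waiter that was added concurrently and leave it stranded with
    // no thread responsible for waking it.)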

    // Check if the entry_list is empty.
    ldr(rscratch1, Address(t1_monitor, ObjectMonitor::entry_list_offset()));
    cmp(rscratch1, zr);
    br(Assembler::EQ, unlocked);  // If so we are done.

    // Check if there is a successor.
    ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
    cmp(rscratch1, zr);
    br(Assembler::NE, unlocked);  // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));

    cmp(zr, rthread); // Set flags to NE => slow path.
    b(slow_path);
  }

  bind(unlocked);
  cmp(zr, zr); // Set flags to EQ => fast path.

#ifdef ASSERT
  // Check that the unlocked label is reached with flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Unlock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that the slow_path label is reached with flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Unlock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of flags (NE vs EQ) to determine the continuation.
}

// Search for str1 in str2 and return the index or -1.
// Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1 when icnt1 == -1.
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on the specific method version.
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of the pattern in the source or return -1.

  // For a larger pattern and source we use a simplified Boyer-Moore algorithm.
  // With a small pattern and source we use a linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use linear scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1); // done in a separate FPU pipeline, almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

// The Boyer-Moore algorithm is based on the description here:
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// This describes an algorithm with 2 shift rules: the 'Bad Character' rule
// and the 'Good Suffix' rule.
//
// These rules are essentially heuristics for how far we can shift the
// pattern along the search string.
//
// The implementation here uses the 'Bad Character' rule only because of the
// complexity of initialisation for the 'Good Suffix' rule.
//
// This is also known as the Boyer-Moore-Horspool algorithm:
//
// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
//
// This particular implementation has a few Java-specific optimizations.
//
// #define ASIZE 256
//
//    int bm(unsigned char *x, int m, unsigned char *y, int n) {
//       int i, j;
//       unsigned c;
//       unsigned char bc[ASIZE];
//
//       /* Preprocessing */
//       for (i = 0; i < ASIZE; ++i)
//          bc[i] = m;
//       for (i = 0; i < m - 1; ) {
//          c = x[i];
//          ++i;
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef PATTERN_STRING_IS_LATIN1
//          bc[c] = m - i;
//          #else
//          if (c < ASIZE) bc[c] = m - i;
//          #endif
//       }
//
//       /* Searching */
//       j = 0;
//       while (j <= n - m) {
//          c = y[j + m - 1];
//          if (x[m-1] == c) {
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//            if (i < 0) return j;
//          }
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef SOURCE_STRING_IS_LATIN1
//          // LL case: (c < 256) always true. Remove branch.
//          j += bc[y[j+m-1]];
//          #endif
//          #ifdef PATTERN_STRING_IS_UTF
//          // UU case: need if (c < ASIZE) check. Skip 1 character if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += 1;
//          #endif
//          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
//          // UL case: need if (c < ASIZE) check. Skip <pattern length> if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += m;
//          #endif
//       }
//    }

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >= 8, so we can read at least 1 register for the cases
    // when UTF->Latin1 conversion is not needed (8 LL or 4 UU symbols) and
    // half a register for the UL case. We'll re-read the last character in
    // the inner pre-loop code to have a single outer pre-loop load.
    const int firstStep = isL ? 7 : 3;

    const int ASIZE = 256;
    const int STORED_BYTES = 32; // number of bytes stored per instruction
    sub(sp, sp, ASIZE);
    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
    mov(ch1, sp);
    BIND(BM_INIT_LOOP);
      stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
      subs(tmp5, tmp5, 1);
      br(GT, BM_INIT_LOOP);

      sub(cnt1tmp, cnt1, 1);
      mov(tmp5, str2);
      add(str2end, str2, result_tmp, LSL, str2_chr_shift);
      sub(ch2, cnt1, 1);
      mov(tmp3, str1);
    BIND(BCLOOP);
      (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
      if (!str1_isL) {
        subs(zr, ch1, ASIZE);
        br(HS, BCSKIP);
      }
      strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
      subs(ch2, ch2, 1);
      br(GT, BCLOOP);

      add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
      if (str1_isL == str2_isL) {
        // load last 8 bytes (8 LL / 4 UU symbols)
        ldr(tmp6, Address(tmp6, -wordSize));
      } else {
        ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes (4 symbols)
        // convert Latin1 to UTF. We'll have to wait until the load completes,
        // but it's still faster than per-character loads + checks.
        lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
        ubfx(ch1, tmp6, 8, 8); // str1[N-2]
        ubfx(ch2, tmp6, 16, 8); // str1[N-3]
        andr(tmp6, tmp6, 0xFF); // str1[N-4]
        orr(ch2, ch1, ch2, LSL, 16);
        orr(tmp6, tmp6, tmp3, LSL, 48);
        orr(tmp6, tmp6, ch2, LSL, 16);
      }
    BIND(BMLOOPSTR2);
      (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
      sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
      if (str1_isL == str2_isL) {
        // re-init tmp3. It's free because it executes in parallel with the
        // load above. The alternative is to initialize it before the loop,
        // but that would hurt performance on in-order systems with 2 or more
        // ld/st pipelines.
        lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
      }
      if (!isL) { // UU/UL case
        lsl(ch2, cnt1tmp, 1); // offset in bytes
      }
      cmp(tmp3, skipch);
      br(NE, BMSKIP);
      ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
      mov(ch1, tmp6);
      if (isL) {
        b(BMLOOPSTR1_AFTER_LOAD);
      } else {
        sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
        b(BMLOOPSTR1_CMP);
      }
    BIND(BMLOOPSTR1);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    BIND(BMLOOPSTR1_AFTER_LOAD);
      subs(cnt1tmp, cnt1tmp, 1);
      br(LT, BMLOOPSTR1_LASTCMP);
    BIND(BMLOOPSTR1_CMP);
      cmp(ch1, ch2);
      br(EQ, BMLOOPSTR1);
    BIND(BMSKIP);
      if (!isL) {
        // if we've met a UTF symbol while searching for a Latin1 pattern,
        // then we can skip cnt1 symbols
        if (str1_isL != str2_isL) {
          mov(result_tmp, cnt1);
        } else {
          mov(result_tmp, 1);
        }
        subs(zr, skipch, ASIZE);
        br(HS, BMADV);
      }
      ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
      sub(cnt1tmp, cnt1, 1);
      add(str2, str2, result_tmp, LSL, str2_chr_shift);
      cmp(str2, str2end);
      br(LE, BMLOOPSTR2);
      add(sp, sp, ASIZE);
      b(NOMATCH);
    BIND(BMLOOPSTR1_LASTCMP);
      cmp(ch1, ch2);
      br(NE, BMSKIP);
    BIND(BMMATCH);
      sub(result, str2, tmp5);
      if (!str2_isL) lsr(result, result, 1);
      add(sp, sp, ASIZE);
      b(DONE);

    BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
    br(LT, LINEAR_MEDIUM);
    mov(result, zr);
    RuntimeAddress stub = nullptr;
    if (isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
      assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
    } else if (str1_isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
    } else {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
      assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
    }
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);
  }

  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    Register str2tmp = tmp2;
    Register first = tmp3;

    if (icnt1 == -1)
    {
        Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

        cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
        br(LT, DOSHORT);
      BIND(LINEAR_MEDIUM);
        (this->*str1_load_1chr)(first, Address(str1));
        lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
        sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(FIRST_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmp(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        adds(cnt1tmp, cnt1_neg, str1_chr_size);
        add(cnt2tmp, cnt2_neg, str2_chr_size);
        br(GE, MATCH);

      BIND(STR1_NEXT);
        (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        adds(cnt1tmp, cnt1tmp, str1_chr_size);
        add(cnt2tmp, cnt2tmp, str2_chr_size);
        br(LT, STR1_NEXT);
        b(MATCH);

      BIND(DOSHORT);
      if (str1_isL == str2_isL) {
        cmp(cnt1, (u1)2);
        br(LT, DO1);
        br(GT, DO3);
      }
    }

    if (icnt1 == 4) {
      Label CH1_LOOP;

        (this->*load_4chr)(ch1, str1);
        sub(result_tmp, cnt2, 4);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(CH1_LOOP);
        (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
      Label CH1_LOOP;

      BIND(DO2);
        (this->*load_2chr)(ch1, str1);
        if (icnt1 == 2) {
          sub(result_tmp, cnt2, 2);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(CH1_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

      BIND(DO3);
        (this->*load_2chr)(first, str1);
        (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
        if (icnt1 == 3) {
          sub(result_tmp, cnt2, 3);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(FIRST_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmpw(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        b(MATCH);
    }

    if (icnt1 == -1 || icnt1 == 1) {
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;

      BIND(DO1);
        (this->*str1_load_1chr)(ch1, str1);
        cmp(cnt2, (u1)8);
        br(LT, DO1_SHORT);

        sub(result_tmp, cnt2, 8/str2_chr_size);
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
        mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

        if (str2_isL) {
          orr(ch1, ch1, ch1, LSL, 8);
        }
        orr(ch1, ch1, ch1, LSL, 16);
        orr(ch1, ch1, ch1, LSL, 32);
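
        // The loop below applies a SWAR (SIMD-within-a-register) zero-lane
        // test to ch2 = loaded_word ^ broadcast_pattern. With
        // ones = 0x0101... and highs = 0x7f7f... (or 0x0001... and 0x7fff...
        // for 16-bit chars), the sub/orr/bics sequence computes
        //   (ch2 - ones) & ~(ch2 | highs)
        // which is nonzero iff some lane of ch2 is zero, i.e. iff some
        // character in the word equals the pattern character.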
      BIND(CH1_LOOP);
        ldr(ch2, Address(str2, cnt2_neg));
        eor(ch2, ch1, ch2);
        sub(tmp1, ch2, tmp3);
        orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
        bics(tmp1, tmp1, tmp2);
        br(NE, HAS_ZERO);
        adds(cnt2_neg, cnt2_neg, 8);
        br(LT, CH1_LOOP);

        cmp(cnt2_neg, (u1)8);
        mov(cnt2_neg, 0);
        br(LT, CH1_LOOP);
        b(NOMATCH);

      BIND(HAS_ZERO);
        rev(tmp1, tmp1);
        clz(tmp1, tmp1);
        add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
        b(MATCH);

      BIND(DO1_SHORT);
        mov(result_tmp, cnt2);
        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
      BIND(DO1_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmpw(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);

void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);
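
  // Same SWAR zero-detection trick as in string_indexof above, here with
  // 16-bit lanes (0x0001... / 0x7fff... constants).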

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7fff7fff7fff7fff);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1, Address::uxtw(1)));
    sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
    ldrh(ch1, Address(str1, cnt1_neg));
    cmpw(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 2);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}

void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
                                                Register ch, Register result,
                                                FloatRegister ztmp1,
                                                FloatRegister ztmp2,
                                                PRegister tmp_pg,
                                                PRegister tmp_pdn, bool isL)
{
  // Note that `tmp_pdn` should *NOT* be used as a governing predicate register.
  assert(tmp_pg->is_governing(),
         "this register has to be a governing predicate register");

  Label LOOP, MATCH, DONE, NOMATCH;
  Register vec_len = rscratch1;
  Register idx = rscratch2;

  SIMD_RegVariant T = isL ? B : H;

  cbz(cnt1, NOMATCH);

  // Duplicate the search char across all lanes of the vector.
  sve_dup(ztmp2, T, ch);
  if (isL) {
    sve_cntb(vec_len);
  } else {
    sve_cnth(vec_len);
  }
  mov(idx, 0);

  // Generate a predicate to control the reading of the input string.
  sve_whilelt(tmp_pg, T, idx, cnt1);
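
  // Loop sketch: idx advances by one vector's worth of elements per
  // iteration, and whilelt regenerates the governing predicate each time, so
  // the final partial vector is handled without a scalar tail. Roughly:
  //
  //   while (pg = whilelt(idx, cnt1), pg has an active first element) {
  //     vec = ld1(pg, str1 + idx); idx += vec_len;
  //     if (any(vec == ch)) goto MATCH;
  //   }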

  BIND(LOOP);
    // Read a vector of 8- or 16-bit data depending on the string type. Note
    // that inactive elements indicated by the predicate register won't cause
    // a data read from memory into the destination vector.
    if (isL) {
      sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
    } else {
      sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
    }
    add(idx, idx, vec_len);

    // Perform the comparison. An element of the destination predicate is set
    // to active if the particular char is matched.
    sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);

    // Branch if the particular char is found.
    br(NE, MATCH);

    sve_whilelt(tmp_pg, T, idx, cnt1);

    // Loop back if the particular char is not found.
    br(MI, LOOP);

  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);

  BIND(MATCH);
    // Undo the index increment.
    sub(idx, idx, vec_len);

    // Find the position of the first match: BRKA keeps the predicate lanes up
    // to and including the first match, and INCP counts them onto the result.
    sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
    add(result, idx, -1);
    sve_incp(result, T, tmp_pdn);
  BIND(DONE);
}

void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
                                             Register ch, Register result,
                                             Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)8);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 8);
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 8);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);

  mov(tmp3, 0x0101010101010101);
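
  // Same SWAR zero-detection trick as in string_indexof above, here with
  // 8-bit lanes (0x0101... / 0x7f7f... constants).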

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1));
    sub(cnt1_neg, zr, cnt1);
  BIND(DO1_LOOP);
    ldrb(ch1, Address(str1, cnt1_neg));
    cmp(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 1);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg);
  BIND(DONE);
}

// Compare strings.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub thresholds:
  // LL:    72 (64 + 8) chars
  // UU:    36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;
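
  // A rough sketch of the structure below: compute min(cnt1, cnt2) and the
  // length difference; compare one longword (8 bytes = 8 Latin1 or 4 UTF-16
  // chars) per iteration, widening Latin1 bytes to UTF-16 on the fly (zip1
  // with a zero vector) for the mixed LU/UL encodings; on the first differing
  // longword jump to DIFF to locate the differing character; if all
  // min-length characters are equal, the result is the length difference.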

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; the result, however, is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      ldr(tmp1, Address(str1));
      ldrs(vtmp, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFF);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);
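
    // How DIFF works: byte-reversing the XOR moves the first
    // (lowest-addressed) differing byte to the most significant end, so clz
    // yields its bit offset; masking with -8 (or -16 for UTF-16) rounds that
    // offset down to a character boundary, and the variable right shifts then
    // bring the two differing characters into the low bits for extraction.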

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFF);
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
    RuntimeAddress stub = nullptr;
    switch (ae) {
      case StrIntrinsicNode::LL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
        break;
      case StrIntrinsicNode::UU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
        break;
      case StrIntrinsicNode::LU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
        break;
      case StrIntrinsicNode::UL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
        break;
      default:
        ShouldNotReachHere();
    }
    assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // Arrange the code to do most branches while loading, and to load the next
  // characters while comparing the previous ones.
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}

void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
                                     FloatRegister src2, Condition cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
  FloatRegister zn = src1, zm = src2;
  bool needs_negation = false;
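  // NEON integer compares only come in the EQ/GE/GT/HI/HS forms; the
  // remaining conditions are derived by swapping the operands
  // (e.g. a < b  <=>  b > a), and NE is synthesized by negating the EQ result.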
  switch (cond) {
    case LT: cond = GT; zn = src2; zm = src1; break;
    case LE: cond = GE; zn = src2; zm = src1; break;
    case LO: cond = HI; zn = src2; zm = src1; break;
    case LS: cond = HS; zn = src2; zm = src1; break;
    case NE: cond = EQ; needs_negation = true; break;
    default:
      break;
  }

  if (is_floating_point_type(bt)) {
    fcm(cond, dst, size, zn, zm);
  } else {
    cm(cond, dst, size, zn, zm);
  }

  if (needs_negation) {
    notr(dst, isQ ? T16B : T8B, dst);
  }
}

void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
                                          Condition cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
  if (bt == T_FLOAT || bt == T_DOUBLE) {
    if (cond == Assembler::NE) {
      fcm(Assembler::EQ, dst, size, src);
      notr(dst, isQ ? T16B : T8B, dst);
    } else {
      fcm(cond, dst, size, src);
    }
  } else {
    if (cond == Assembler::NE) {
      cm(Assembler::EQ, dst, size, src);
      notr(dst, isQ ? T16B : T8B, dst);
    } else {
      cm(cond, dst, size, src);
    }
  }
}

// Compress the least significant bit of each byte to the rightmost and clear
// the higher garbage bits.
void C2_MacroAssembler::bytemask_compress(Register dst) {
  // Example input, dst = 0x01 00 00 00 01 01 00 01
  // The "??" bytes are garbage.
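  // Each orr folds in the value shifted right so that the gathered low bits
  // of adjacent groups land side by side, doubling the number of packed bits
  // per step (1 -> 2 -> 4 -> 8); the final andr keeps the 8 packed bits.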
  orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
  orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
  orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
  andr(dst, dst, 0xff);                   // dst = 0x8D
}

// Pack the value of each mask element in "src" into a long value in "dst",
// for at most the first 64 lane elements. The input "src" is a vector of
// booleans represented as bytes with 0x00/0x01 as element values. Each lane
// value from "src" is packed into one bit in "dst".
//
// Example:   src = 0x0001010000010001 0100000001010001, lane_cnt = 16
// Expected:  dst = 0x658D
//
// Clobbers: rscratch1
void C2_MacroAssembler::sve_vmask_tolong(Register dst, FloatRegister src,
                                         FloatRegister vtmp, int lane_cnt) {
  assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
  assert_different_registers(dst, rscratch1);
  assert_different_registers(src, vtmp);
  assert(UseSVE > 0, "must be");

  // Compress the lowest 8 bytes.
  fmovd(dst, src);
  bytemask_compress(dst);
  if (lane_cnt <= 8) return;

  // Repeat on the higher bytes and join the results.
  // Compress 8 bytes in each iteration.
  for (int idx = 1; idx < (lane_cnt / 8); idx++) {
    sve_extract_integral(rscratch1, T_LONG, src, idx, vtmp);
    bytemask_compress(rscratch1);
    orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
  }
}
1432 // The function is same as above "sve_vmask_tolong", but it uses SVE2's BEXT
1433 // instruction which requires the FEAT_BITPERM feature.
1434 void C2_MacroAssembler::sve2_vmask_tolong(Register dst, FloatRegister src,
1435                                           FloatRegister vtmp1, FloatRegister vtmp2,
1436                                           int lane_cnt) {
1437   assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1438   assert_different_registers(src, vtmp1, vtmp2);
1439   assert(UseSVE > 1 && VM_Version::supports_svebitperm(), "must be");
1440 
1441   // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
1442   // is to compress each significant bit of the byte in a cross-lane way. Due
1443   // to the lack of a cross-lane bit-compress instruction, we use BEXT
1444   // (bit-compress in each lane) with the biggest lane size (T = D) then
1445   // concatenate the results.
1446 
1447   // The second source input of BEXT, initialized with 0x01 in each byte.
1448   // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1449   sve_dup(vtmp2, B, 1);
1450 
1451   // BEXT vtmp1.D, src.D, vtmp2.D
1452   // src   = 0x0001010000010001 | 0x0100000001010001
1453   // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1454   //         ---------------------------------------
1455   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1456   sve_bext(vtmp1, D, src, vtmp2);
1457 
1458   // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the
1459   // result to dst.
1460   // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1461   // dst   = 0x658D
1462   if (lane_cnt <= 8) {
1463     // No need to concatenate.
1464     umov(dst, vtmp1, B, 0);
1465   } else if (lane_cnt <= 16) {
1466     ins(vtmp1, B, vtmp1, 1, 8);
1467     umov(dst, vtmp1, H, 0);
1468   } else {
1469     // As the lane count is 64 at most, the final expected value must be in
1470     // the lowest 64 bits after narrowing vtmp1 from D to B.
1471     sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
1472     umov(dst, vtmp1, D, 0);
1473   }
1474 }
1475 
// Unpack the mask, a long value in "src", into a vector register of booleans
// represented as bytes with 0x00/0x01 as element values in "dst". Each bit in
// "src" is unpacked into one byte lane in "dst". Note that "dst" can support at
// most 64 lanes.
//
// The example below shows the expected dst vector register for a valid
// src (0x658D) on a machine with a 128-bit vector size.
1483 // dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1484 void C2_MacroAssembler::sve_vmask_fromlong(FloatRegister dst, Register src,
1485                                            FloatRegister vtmp, int lane_cnt) {
1486   assert_different_registers(dst, vtmp);
1487   assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
1488          lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
1489 
1490   // Example:   src = 0x658D, lane_cnt = 16
1491   // Expected:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1492 
  // Put the long value from the general-purpose register into the first lane of the vector.
1494   // vtmp = 0x0000000000000000 | 0x000000000000658D
1495   sve_dup(vtmp, B, 0);
1496   mov(vtmp, D, 0, src);
1497 
  // Transform the value in the first lane, which is currently a bit mask, into
  // a byte mask. This can be done with the SVE2 BDEP instruction.
1500 
  // The first source input of the BDEP instruction. Place one mask byte in
  // each 8-byte lane.
1502   // vtmp = 0x0000000000000065 | 0x000000000000008D
1503   if (lane_cnt <= 8) {
    // Nothing to do, as only one byte exists.
1505   } else if (lane_cnt <= 16) {
1506     ins(vtmp, B, vtmp, 8, 1);
1507   } else {
1508     sve_vector_extend(vtmp, D, vtmp, B);
1509   }
1510 
  // The second source input of the BDEP instruction, initialized with 0x01 in each byte.
1512   // dst = 0x01010101 0x01010101 0x01010101 0x01010101
1513   sve_dup(dst, B, 1);
1514 
1515   // BDEP dst.D, vtmp.D, dst.D
1516   // vtmp = 0x0000000000000065 | 0x000000000000008D
1517   // dst  = 0x0101010101010101 | 0x0101010101010101
1518   //        ---------------------------------------
1519   // dst  = 0x0001010000010001 | 0x0100000001010001
1520   sve_bdep(dst, D, vtmp, dst);
1521 }
1522 
1523 // Clobbers: rflags
1524 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1525                                     FloatRegister zn, FloatRegister zm, Condition cond) {
1526   assert(pg->is_governing(), "This register has to be a governing predicate register");
1527   FloatRegister z1 = zn, z2 = zm;
1528   switch (cond) {
1529     case LE: z1 = zm; z2 = zn; cond = GE; break;
1530     case LT: z1 = zm; z2 = zn; cond = GT; break;
1531     case LO: z1 = zm; z2 = zn; cond = HI; break;
1532     case LS: z1 = zm; z2 = zn; cond = HS; break;
1533     default:
1534       break;
1535   }
1536 
1537   SIMD_RegVariant size = elemType_to_regVariant(bt);
1538   if (is_floating_point_type(bt)) {
1539     sve_fcm(cond, pd, size, pg, z1, z2);
1540   } else {
1541     assert(is_integral_type(bt), "unsupported element type");
1542     sve_cmp(cond, pd, size, pg, z1, z2);
1543   }
1544 }
1545 
1546 // Get index of the last mask lane that is set
1547 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
1548   SIMD_RegVariant size = elemType_to_regVariant(bt);
1549   sve_rev(ptmp, size, src);
1550   sve_brkb(ptmp, ptrue, ptmp, false);
1551   sve_cntp(dst, size, ptrue, ptmp);
1552   movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
1553   subw(dst, rscratch1, dst);
1554 }
1555 
1556 // Extend integer vector src to dst with the same lane count
1557 // but larger element size, e.g. 4B -> 4I
1558 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1559                                            FloatRegister src, BasicType src_bt, bool is_unsigned) {
1560   if (src_bt == T_BYTE) {
1561     // 4B to 4S/4I, 8B to 8S
1562     assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1563     assert(dst_bt == T_SHORT || dst_bt == T_INT, "unsupported");
1564     _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1565     if (dst_bt == T_INT) {
1566       _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1567     }
1568   } else if (src_bt == T_SHORT) {
1569     // 2S to 2I/2L, 4S to 4I
1570     assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1571     assert(dst_bt == T_INT || dst_bt == T_LONG, "unsupported");
1572     _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1573     if (dst_bt == T_LONG) {
1574       _xshll(is_unsigned, dst, T2D, dst, T2S, 0);
1575     }
1576   } else if (src_bt == T_INT) {
1577     // 2I to 2L
1578     assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1579     _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1580   } else {
1581     ShouldNotReachHere();
1582   }
1583 }
1584 
1585 // Narrow integer vector src down to dst with the same lane count
1586 // but smaller element size, e.g. 4I -> 4B
1587 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1588                                            FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1589   if (src_bt == T_SHORT) {
1590     // 4S/8S to 4B/8B
1591     assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1592     assert(dst_bt == T_BYTE, "unsupported");
1593     xtn(dst, T8B, src, T8H);
1594   } else if (src_bt == T_INT) {
1595     // 2I to 2S, 4I to 4B/4S
1596     assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1597     assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1598     xtn(dst, T4H, src, T4S);
1599     if (dst_bt == T_BYTE) {
1600       xtn(dst, T8B, dst, T8H);
1601     }
1602   } else if (src_bt == T_LONG) {
1603     // 2L to 2S/2I
1604     assert(src_vlen_in_bytes == 16, "unsupported");
1605     assert(dst_bt == T_INT || dst_bt == T_SHORT, "unsupported");
1606     xtn(dst, T2S, src, T2D);
1607     if (dst_bt == T_SHORT) {
1608       xtn(dst, T4H, dst, T4S);
1609     }
1610   } else {
1611     ShouldNotReachHere();
1612   }
1613 }
1614 
1615 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1616                                           FloatRegister src, SIMD_RegVariant src_size,
1617                                           bool is_unsigned) {
1618   assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
1619 
1620   if (src_size == B) {
1621     switch (dst_size) {
1622     case H:
1623       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1624       break;
1625     case S:
1626       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1627       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1628       break;
1629     case D:
1630       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1631       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1632       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1633       break;
1634     default:
1635       ShouldNotReachHere();
1636     }
1637   } else if (src_size == H) {
1638     if (dst_size == S) {
1639       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1640     } else { // D
1641       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1642       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1643     }
1644   } else if (src_size == S) {
1645     _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
1646   }
1647 }
1648 
// Vector narrow from src to dst with the specified element sizes.
// The high part of the dst vector will be filled with zeros.
1651 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1652                                           FloatRegister src, SIMD_RegVariant src_size,
1653                                           FloatRegister tmp) {
1654   assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1655   assert_different_registers(src, tmp);
1656   sve_dup(tmp, src_size, 0);
1657   if (src_size == D) {
1658     switch (dst_size) {
1659     case S:
1660       sve_uzp1(dst, S, src, tmp);
1661       break;
1662     case H:
1663       assert_different_registers(dst, tmp);
1664       sve_uzp1(dst, S, src, tmp);
1665       sve_uzp1(dst, H, dst, tmp);
1666       break;
1667     case B:
1668       assert_different_registers(dst, tmp);
1669       sve_uzp1(dst, S, src, tmp);
1670       sve_uzp1(dst, H, dst, tmp);
1671       sve_uzp1(dst, B, dst, tmp);
1672       break;
1673     default:
1674       ShouldNotReachHere();
1675     }
1676   } else if (src_size == S) {
1677     if (dst_size == H) {
1678       sve_uzp1(dst, H, src, tmp);
1679     } else { // B
1680       assert_different_registers(dst, tmp);
1681       sve_uzp1(dst, H, src, tmp);
1682       sve_uzp1(dst, B, dst, tmp);
1683     }
1684   } else if (src_size == H) {
1685     sve_uzp1(dst, B, src, tmp);
1686   }
1687 }
1688 
// Extend src predicate to dst predicate with the same lane count but larger
// element size, e.g. a mask for 8 bytes (64 bits) -> a mask for 8 longs (512 bits)
1691 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1692                                              uint dst_element_length_in_bytes,
1693                                              uint src_element_length_in_bytes) {
1694   if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1695     sve_punpklo(dst, src);
1696   } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1697     sve_punpklo(dst, src);
1698     sve_punpklo(dst, dst);
1699   } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1700     sve_punpklo(dst, src);
1701     sve_punpklo(dst, dst);
1702     sve_punpklo(dst, dst);
1703   } else {
1704     assert(false, "unsupported");
1705     ShouldNotReachHere();
1706   }
1707 }
1708 
// Narrow src predicate to dst predicate with the same lane count but
// smaller element size, e.g. a mask for 8 longs (512 bits) -> a mask for 8 bytes (64 bits)
1711 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1712                                              uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
  // The insignificant bits in the src predicate are expected to be zero.
  // To ensure that the higher-order bits of the resulting narrowed vector are zero, an all-zero
  // predicate is passed as the second argument. An example narrowing operation with a given mask:
  // 128Long -> 64Int on a 128-bit machine, i.e. 2L -> 2I
  // Mask (for 2 longs)                               : TF
  // Predicate register for the above mask (16 bits)  : 00000001 00000000
  // After narrowing (uzp1 dst.b, src.b, ptmp.b)      : 0000 0000 0001 0000
  // which translates to a mask for 2 ints            : TF (the lower half is considered, the upper half is 0)
1721   assert_different_registers(src, ptmp);
1722   assert_different_registers(dst, ptmp);
1723   sve_pfalse(ptmp);
1724   if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1725     sve_uzp1(dst, B, src, ptmp);
1726   } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1727     sve_uzp1(dst, H, src, ptmp);
1728     sve_uzp1(dst, B, dst, ptmp);
1729   } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1730     sve_uzp1(dst, S, src, ptmp);
1731     sve_uzp1(dst, H, dst, ptmp);
1732     sve_uzp1(dst, B, dst, ptmp);
1733   } else {
1734     assert(false, "unsupported");
1735     ShouldNotReachHere();
1736   }
1737 }
1738 
1739 // Vector reduction add for integral type with ASIMD instructions.
1740 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1741                                                  Register isrc, FloatRegister vsrc,
1742                                                  unsigned vector_length_in_bytes,
1743                                                  FloatRegister vtmp) {
1744   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1745   assert_different_registers(dst, isrc);
1746   bool isQ = vector_length_in_bytes == 16;
1747 
1748   BLOCK_COMMENT("neon_reduce_add_integral {");
1749     switch(bt) {
1750       case T_BYTE:
1751         addv(vtmp, isQ ? T16B : T8B, vsrc);
1752         smov(dst, vtmp, B, 0);
1753         addw(dst, dst, isrc, ext::sxtb);
1754         break;
1755       case T_SHORT:
1756         addv(vtmp, isQ ? T8H : T4H, vsrc);
1757         smov(dst, vtmp, H, 0);
1758         addw(dst, dst, isrc, ext::sxth);
1759         break;
1760       case T_INT:
1761         isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1762         umov(dst, vtmp, S, 0);
1763         addw(dst, dst, isrc);
1764         break;
1765       case T_LONG:
1766         assert(isQ, "unsupported");
1767         addpd(vtmp, vsrc);
1768         umov(dst, vtmp, D, 0);
1769         add(dst, dst, isrc);
1770         break;
1771       default:
1772         assert(false, "unsupported");
1773         ShouldNotReachHere();
1774     }
1775   BLOCK_COMMENT("} neon_reduce_add_integral");
1776 }
1777 
1778 // Vector reduction multiply for integral type with ASIMD instructions.
1779 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
1780 // Clobbers: rscratch1
1781 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
1782                                                  Register isrc, FloatRegister vsrc,
1783                                                  unsigned vector_length_in_bytes,
1784                                                  FloatRegister vtmp1, FloatRegister vtmp2) {
1785   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1786   bool isQ = vector_length_in_bytes == 16;
1787 
1788   BLOCK_COMMENT("neon_reduce_mul_integral {");
1789     switch(bt) {
1790       case T_BYTE:
1791         if (isQ) {
          // Multiply the lower half and the higher half of the vector iteratively.
1793           // vtmp1 = vsrc[8:15]
1794           ins(vtmp1, D, vsrc, 0, 1);
1795           // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
1796           mulv(vtmp1, T8B, vtmp1, vsrc);
1797           // vtmp2 = vtmp1[4:7]
1798           ins(vtmp2, S, vtmp1, 0, 1);
1799           // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
1800           mulv(vtmp1, T8B, vtmp2, vtmp1);
1801         } else {
1802           ins(vtmp1, S, vsrc, 0, 1);
1803           mulv(vtmp1, T8B, vtmp1, vsrc);
1804         }
1805         // vtmp2 = vtmp1[2:3]
1806         ins(vtmp2, H, vtmp1, 0, 1);
1807         // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
1808         mulv(vtmp2, T8B, vtmp2, vtmp1);
1809         // dst = vtmp2[0] * isrc * vtmp2[1]
1810         umov(rscratch1, vtmp2, B, 0);
1811         mulw(dst, rscratch1, isrc);
1812         sxtb(dst, dst);
1813         umov(rscratch1, vtmp2, B, 1);
1814         mulw(dst, rscratch1, dst);
1815         sxtb(dst, dst);
1816         break;
1817       case T_SHORT:
1818         if (isQ) {
1819           ins(vtmp2, D, vsrc, 0, 1);
1820           mulv(vtmp2, T4H, vtmp2, vsrc);
1821           ins(vtmp1, S, vtmp2, 0, 1);
1822           mulv(vtmp1, T4H, vtmp1, vtmp2);
1823         } else {
1824           ins(vtmp1, S, vsrc, 0, 1);
1825           mulv(vtmp1, T4H, vtmp1, vsrc);
1826         }
1827         umov(rscratch1, vtmp1, H, 0);
1828         mulw(dst, rscratch1, isrc);
1829         sxth(dst, dst);
1830         umov(rscratch1, vtmp1, H, 1);
1831         mulw(dst, rscratch1, dst);
1832         sxth(dst, dst);
1833         break;
1834       case T_INT:
1835         if (isQ) {
1836           ins(vtmp1, D, vsrc, 0, 1);
1837           mulv(vtmp1, T2S, vtmp1, vsrc);
1838         } else {
1839           vtmp1 = vsrc;
1840         }
1841         umov(rscratch1, vtmp1, S, 0);
1842         mul(dst, rscratch1, isrc);
1843         umov(rscratch1, vtmp1, S, 1);
1844         mul(dst, rscratch1, dst);
1845         break;
1846       case T_LONG:
1847         umov(rscratch1, vsrc, D, 0);
1848         mul(dst, isrc, rscratch1);
1849         umov(rscratch1, vsrc, D, 1);
1850         mul(dst, dst, rscratch1);
1851         break;
1852       default:
1853         assert(false, "unsupported");
1854         ShouldNotReachHere();
1855     }
1856   BLOCK_COMMENT("} neon_reduce_mul_integral");
1857 }
1858 
1859 // Vector reduction multiply for floating-point type with ASIMD instructions.
1860 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
1861                                            FloatRegister fsrc, FloatRegister vsrc,
1862                                            unsigned vector_length_in_bytes,
1863                                            FloatRegister vtmp) {
1864   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1865   bool isQ = vector_length_in_bytes == 16;
1866 
1867   BLOCK_COMMENT("neon_reduce_mul_fp {");
1868     switch(bt) {
1869       case T_FLOAT:
1870         fmuls(dst, fsrc, vsrc);
1871         ins(vtmp, S, vsrc, 0, 1);
1872         fmuls(dst, dst, vtmp);
1873         if (isQ) {
1874           ins(vtmp, S, vsrc, 0, 2);
1875           fmuls(dst, dst, vtmp);
1876           ins(vtmp, S, vsrc, 0, 3);
1877           fmuls(dst, dst, vtmp);
1878          }
1879         break;
1880       case T_DOUBLE:
1881         assert(isQ, "unsupported");
1882         fmuld(dst, fsrc, vsrc);
1883         ins(vtmp, D, vsrc, 0, 1);
1884         fmuld(dst, dst, vtmp);
1885         break;
1886       default:
1887         assert(false, "unsupported");
1888         ShouldNotReachHere();
1889     }
1890   BLOCK_COMMENT("} neon_reduce_mul_fp");
1891 }
1892 
1893 // Helper to select logical instruction
1894 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
1895                                                    Register Rn, Register Rm,
1896                                                    enum shift_kind kind, unsigned shift) {
1897   switch(opc) {
1898     case Op_AndReductionV:
1899       is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
1900       break;
1901     case Op_OrReductionV:
1902       is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
1903       break;
1904     case Op_XorReductionV:
1905       is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
1906       break;
1907     default:
1908       assert(false, "unsupported");
1909       ShouldNotReachHere();
1910   }
1911 }
1912 
1913 // Vector reduction logical operations And, Or, Xor
1914 // Clobbers: rscratch1
1915 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
1916                                             Register isrc, FloatRegister vsrc,
1917                                             unsigned vector_length_in_bytes) {
1918   assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
1919          "unsupported");
1920   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1921   assert_different_registers(dst, isrc);
1922   bool isQ = vector_length_in_bytes == 16;
1923 
1924   BLOCK_COMMENT("neon_reduce_logical {");
1925     umov(rscratch1, vsrc, isQ ? D : S, 0);
1926     umov(dst, vsrc, isQ ? D : S, 1);
1927     neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
1928     switch(bt) {
1929       case T_BYTE:
1930         if (isQ) {
1931           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1932         }
1933         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1934         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
1935         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1936         sxtb(dst, dst);
1937         break;
1938       case T_SHORT:
1939         if (isQ) {
1940           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1941         }
1942         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1943         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1944         sxth(dst, dst);
1945         break;
1946       case T_INT:
1947         if (isQ) {
1948           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1949         }
1950         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1951         break;
1952       case T_LONG:
1953         assert(isQ, "unsupported");
1954         neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
1955         break;
1956       default:
1957         assert(false, "unsupported");
1958         ShouldNotReachHere();
1959     }
1960   BLOCK_COMMENT("} neon_reduce_logical");
1961 }
1962 
1963 // Vector reduction min/max for integral type with ASIMD instructions.
// Note: vtmp is not used and is expected to be fnoreg for the T_LONG case.
1965 // Clobbers: rscratch1, rflags
1966 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
1967                                                     Register isrc, FloatRegister vsrc,
1968                                                     unsigned vector_length_in_bytes,
1969                                                     FloatRegister vtmp) {
1970   assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
1971   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1972   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
1973   assert_different_registers(dst, isrc);
1974   bool isQ = vector_length_in_bytes == 16;
1975   bool is_min = opc == Op_MinReductionV;
1976 
1977   BLOCK_COMMENT("neon_reduce_minmax_integral {");
1978     if (bt == T_LONG) {
1979       assert(vtmp == fnoreg, "should be");
1980       assert(isQ, "should be");
1981       umov(rscratch1, vsrc, D, 0);
1982       cmp(isrc, rscratch1);
1983       csel(dst, isrc, rscratch1, is_min ? LT : GT);
1984       umov(rscratch1, vsrc, D, 1);
1985       cmp(dst, rscratch1);
1986       csel(dst, dst, rscratch1, is_min ? LT : GT);
1987     } else {
1988       SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1989       if (size == T2S) {
1990         is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
1991       } else {
1992         is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
1993       }
1994       if (bt == T_INT) {
1995         umov(dst, vtmp, S, 0);
1996       } else {
1997         smov(dst, vtmp, elemType_to_regVariant(bt), 0);
1998       }
1999       cmpw(dst, isrc);
2000       cselw(dst, dst, isrc, is_min ? LT : GT);
2001     }
2002   BLOCK_COMMENT("} neon_reduce_minmax_integral");
2003 }
2004 
// Vector reduction for integral type with SVE instructions.
2006 // Supported operations are Add, And, Or, Xor, Max, Min.
2007 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
2008 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
2009                                             FloatRegister src2, PRegister pg, FloatRegister tmp) {
2010   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2011   assert(pg->is_governing(), "This register has to be a governing predicate register");
2012   assert_different_registers(src1, dst);
  // Registers "dst" and "tmp" will be clobbered, while "src1" and "src2" should be preserved.
2014   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2015   switch (opc) {
2016     case Op_AddReductionVI: {
2017       sve_uaddv(tmp, size, pg, src2);
2018       if (bt == T_BYTE) {
2019         smov(dst, tmp, size, 0);
2020         addw(dst, src1, dst, ext::sxtb);
2021       } else if (bt == T_SHORT) {
2022         smov(dst, tmp, size, 0);
2023         addw(dst, src1, dst, ext::sxth);
2024       } else {
2025         umov(dst, tmp, size, 0);
2026         addw(dst, dst, src1);
2027       }
2028       break;
2029     }
2030     case Op_AddReductionVL: {
2031       sve_uaddv(tmp, size, pg, src2);
2032       umov(dst, tmp, size, 0);
2033       add(dst, dst, src1);
2034       break;
2035     }
2036     case Op_AndReductionV: {
2037       sve_andv(tmp, size, pg, src2);
2038       if (bt == T_INT || bt == T_LONG) {
2039         umov(dst, tmp, size, 0);
2040       } else {
2041         smov(dst, tmp, size, 0);
2042       }
2043       if (bt == T_LONG) {
2044         andr(dst, dst, src1);
2045       } else {
2046         andw(dst, dst, src1);
2047       }
2048       break;
2049     }
2050     case Op_OrReductionV: {
2051       sve_orv(tmp, size, pg, src2);
2052       if (bt == T_INT || bt == T_LONG) {
2053         umov(dst, tmp, size, 0);
2054       } else {
2055         smov(dst, tmp, size, 0);
2056       }
2057       if (bt == T_LONG) {
2058         orr(dst, dst, src1);
2059       } else {
2060         orrw(dst, dst, src1);
2061       }
2062       break;
2063     }
2064     case Op_XorReductionV: {
2065       sve_eorv(tmp, size, pg, src2);
2066       if (bt == T_INT || bt == T_LONG) {
2067         umov(dst, tmp, size, 0);
2068       } else {
2069         smov(dst, tmp, size, 0);
2070       }
2071       if (bt == T_LONG) {
2072         eor(dst, dst, src1);
2073       } else {
2074         eorw(dst, dst, src1);
2075       }
2076       break;
2077     }
2078     case Op_MaxReductionV: {
2079       sve_smaxv(tmp, size, pg, src2);
2080       if (bt == T_INT || bt == T_LONG) {
2081         umov(dst, tmp, size, 0);
2082       } else {
2083         smov(dst, tmp, size, 0);
2084       }
2085       if (bt == T_LONG) {
2086         cmp(dst, src1);
2087         csel(dst, dst, src1, Assembler::GT);
2088       } else {
2089         cmpw(dst, src1);
2090         cselw(dst, dst, src1, Assembler::GT);
2091       }
2092       break;
2093     }
2094     case Op_MinReductionV: {
2095       sve_sminv(tmp, size, pg, src2);
2096       if (bt == T_INT || bt == T_LONG) {
2097         umov(dst, tmp, size, 0);
2098       } else {
2099         smov(dst, tmp, size, 0);
2100       }
2101       if (bt == T_LONG) {
2102         cmp(dst, src1);
2103         csel(dst, dst, src1, Assembler::LT);
2104       } else {
2105         cmpw(dst, src1);
2106         cselw(dst, dst, src1, Assembler::LT);
2107       }
2108       break;
2109     }
2110     default:
2111       assert(false, "unsupported");
2112       ShouldNotReachHere();
2113   }
2114 
2115   if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
2116     if (bt == T_BYTE) {
2117       sxtb(dst, dst);
2118     } else if (bt == T_SHORT) {
2119       sxth(dst, dst);
2120     }
2121   }
2122 }
2123 
// Set elements of the dst predicate to true for lanes in the range [0, lane_cnt),
// or to false otherwise. The input "lane_cnt" should be smaller than or equal to
// the supported max vector length of the basic type. Clobbers: rscratch1, rflags.
2127 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
2128   uint32_t max_vector_length = Matcher::max_vector_size(bt);
2129   assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
2130 
2131   // Set all elements to false if the input "lane_cnt" is zero.
2132   if (lane_cnt == 0) {
2133     sve_pfalse(dst);
2134     return;
2135   }
2136 
2137   SIMD_RegVariant size = elemType_to_regVariant(bt);
2138   assert(size != Q, "invalid size");
2139 
  // Set all elements to true if "lane_cnt" equals the max lane count.
2141   if (lane_cnt == max_vector_length) {
2142     sve_ptrue(dst, size, /* ALL */ 0b11111);
2143     return;
2144   }
2145 
2146   // Fixed numbers for "ptrue".
2147   switch(lane_cnt) {
2148   case 1: /* VL1 */
2149   case 2: /* VL2 */
2150   case 3: /* VL3 */
2151   case 4: /* VL4 */
2152   case 5: /* VL5 */
2153   case 6: /* VL6 */
2154   case 7: /* VL7 */
2155   case 8: /* VL8 */
2156     sve_ptrue(dst, size, lane_cnt);
2157     return;
2158   case 16:
2159     sve_ptrue(dst, size, /* VL16 */ 0b01001);
2160     return;
2161   case 32:
2162     sve_ptrue(dst, size, /* VL32 */ 0b01010);
2163     return;
2164   case 64:
2165     sve_ptrue(dst, size, /* VL64 */ 0b01011);
2166     return;
2167   case 128:
2168     sve_ptrue(dst, size, /* VL128 */ 0b01100);
2169     return;
2170   case 256:
2171     sve_ptrue(dst, size, /* VL256 */ 0b01101);
2172     return;
2173   default:
2174     break;
2175   }
2176 
2177   // Special patterns for "ptrue".
2178   if (lane_cnt == round_down_power_of_2(max_vector_length)) {
2179     sve_ptrue(dst, size, /* POW2 */ 0b00000);
2180   } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
2181     sve_ptrue(dst, size, /* MUL4 */ 0b11101);
2182   } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
2183     sve_ptrue(dst, size, /* MUL3 */ 0b11110);
2184   } else {
2185     // Encode to "whileltw" for the remaining cases.
2186     mov(rscratch1, lane_cnt);
2187     sve_whileltw(dst, size, zr, rscratch1);
2188   }
2189 }
2190 
2191 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
2192 // Any remaining elements of dst will be filled with zero.
2193 // Clobbers: rscratch1
2194 // Preserves: mask, vzr
2195 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
2196                                            FloatRegister vzr, FloatRegister vtmp,
2197                                            PRegister pgtmp, unsigned vector_length_in_bytes) {
2198   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2199   // When called by sve_compress_byte, src and vtmp may be the same register.
2200   assert_different_registers(dst, src, vzr);
2201   assert_different_registers(dst, vtmp, vzr);
2202   assert_different_registers(mask, pgtmp);
2203   // high <-- low
2204   // Example input:   src   = hh gg ff ee dd cc bb aa, one character is 8 bits.
2205   //                  mask  = 01 00 00 01 01 00 01 01, one character is 1 bit.
2206   // Expected result: dst   = 00 00 00 hh ee dd bb aa
2207 
2208   // Extend lowest half to type INT.
2209   // dst   =  00dd  00cc  00bb  00aa
2210   sve_uunpklo(dst, S, src);
2211   // pgtmp =  0001  0000  0001  0001
2212   sve_punpklo(pgtmp, mask);
  // Pack the active elements, as INT-sized elements, to the right,
  // and fill the remainder with zeros.
2215   // dst   =  0000  00dd  00bb  00aa
2216   sve_compact(dst, S, dst, pgtmp);
2217   // Narrow the result back to type SHORT.
2218   // dst   = 00 00 00 00 00 dd bb aa
2219   sve_uzp1(dst, H, dst, vzr);
2220 
2221   // Return if the vector length is no more than MaxVectorSize/2, since the
2222   // highest half is invalid.
2223   if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
2224     return;
2225   }
2226 
2227   // Count the active elements of lowest half.
2228   // rscratch1 = 3
2229   sve_cntp(rscratch1, S, ptrue, pgtmp);
2230 
2231   // Repeat to the highest half.
2232   // pgtmp =  0001  0000  0000  0001
2233   sve_punpkhi(pgtmp, mask);
2234   // vtmp  =  00hh  00gg  00ff  00ee
2235   sve_uunpkhi(vtmp, S, src);
2236   // vtmp  =  0000  0000  00hh  00ee
2237   sve_compact(vtmp, S, vtmp, pgtmp);
2238   // vtmp  = 00 00 00 00 00 00 hh ee
2239   sve_uzp1(vtmp, H, vtmp, vzr);
2240 
2241   // pgtmp = 00 00 00 00 00 01 01 01
2242   sve_whilelt(pgtmp, H, zr, rscratch1);
2243   // Compressed low:  dst  = 00 00 00 00 00 dd bb aa
2244   // Compressed high: vtmp = 00 00 00 00 00 00 hh ee
2245   // Combine the compressed low with the compressed high:
2246   //                  dst  = 00 00 00 hh ee dd bb aa
2247   sve_splice(dst, H, pgtmp, vtmp);
2248 }
2249 
2250 // Clobbers: rscratch1, rscratch2
2251 // Preserves: src, mask
2252 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
2253                                           FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
2254                                           PRegister ptmp, PRegister pgtmp, unsigned vector_length_in_bytes) {
2255   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2256   assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3);
2257   assert_different_registers(mask, ptmp, pgtmp);
2258   // high <-- low
2259   // Example input:   src   = q p n m l k j i h g f e d c b a, one character is 8 bits.
2260   //                  mask  = 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 1, one character is 1 bit.
2261   // Expected result: dst   = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
2262   FloatRegister vzr = vtmp3;
2263   sve_dup(vzr, B, 0);
2264 
2265   // Extend lowest half to type SHORT.
2266   // vtmp1 =  0h  0g  0f  0e  0d  0c  0b  0a
2267   sve_uunpklo(vtmp1, H, src);
2268   // ptmp  =  00  01  00  00  00  01  00  01
2269   sve_punpklo(ptmp, mask);
  // Pack the active elements, as SHORT-sized elements, to the right,
  // and fill the remainder with zeros.
2272   // dst   =  00  00  00  00  00  0g  0c  0a
2273   unsigned extended_size = vector_length_in_bytes << 1;
2274   sve_compress_short(dst, vtmp1, ptmp, vzr, vtmp2, pgtmp, extended_size > MaxVectorSize ? MaxVectorSize : extended_size);
2275   // Narrow the result back to type BYTE.
2276   // dst   = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
2277   sve_uzp1(dst, B, dst, vzr);
2278 
2279   // Return if the vector length is no more than MaxVectorSize/2, since the
2280   // highest half is invalid.
2281   if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
2282     return;
2283   }
2284   // Count the active elements of lowest half.
2285   // rscratch2 = 3
2286   sve_cntp(rscratch2, H, ptrue, ptmp);
2287 
2288   // Repeat to the highest half.
2289   // ptmp  =  00  01  00  00  00  00  00  01
2290   sve_punpkhi(ptmp, mask);
2291   // vtmp2 =  0q  0p  0n  0m  0l  0k  0j  0i
2292   sve_uunpkhi(vtmp2, H, src);
2293   // vtmp1 =  00  00  00  00  00  00  0p  0i
2294   sve_compress_short(vtmp1, vtmp2, ptmp, vzr, vtmp2, pgtmp, extended_size - MaxVectorSize);
2295   // vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
2296   sve_uzp1(vtmp1, B, vtmp1, vzr);
2297 
2298   // ptmp  = 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
2299   sve_whilelt(ptmp, B, zr, rscratch2);
2300   // Compressed low:  dst   = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
2301   // Compressed high: vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
2302   // Combine the compressed low with the compressed high:
2303   //                  dst   = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
2304   sve_splice(dst, B, ptmp, vtmp1);
2305 }
2306 
2307 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2308   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2309   SIMD_Arrangement size = isQ ? T16B : T8B;
2310   if (bt == T_BYTE) {
2311     rbit(dst, size, src);
2312   } else {
2313     neon_reverse_bytes(dst, src, bt, isQ);
2314     rbit(dst, size, dst);
2315   }
2316 }
2317 
2318 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2319   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2320   SIMD_Arrangement size = isQ ? T16B : T8B;
2321   switch (bt) {
2322     case T_BYTE:
2323       if (dst != src) {
2324         orr(dst, size, src, src);
2325       }
2326       break;
2327     case T_SHORT:
2328       rev16(dst, size, src);
2329       break;
2330     case T_INT:
2331       rev32(dst, size, src);
2332       break;
2333     case T_LONG:
2334       rev64(dst, size, src);
2335       break;
2336     default:
2337       assert(false, "unsupported");
2338       ShouldNotReachHere();
2339   }
2340 }
2341 
2342 // VectorRearrange implementation for short/int/float/long/double types with NEON
// instructions. For VectorRearrange short/int/float, we use the NEON tbl instruction.
// But since tbl supports byte tables only, we need to look up 2/4 bytes as a group.
2345 // For VectorRearrange long/double, we compare the shuffle input with iota indices,
2346 // and use bsl to implement the operation.
2347 void C2_MacroAssembler::neon_rearrange_hsd(FloatRegister dst, FloatRegister src,
2348                                            FloatRegister shuffle, FloatRegister tmp,
2349                                            BasicType bt, bool isQ) {
2350   assert_different_registers(dst, src, shuffle, tmp);
2351   SIMD_Arrangement size1 = isQ ? T16B : T8B;
2352   SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2353 
2354   // Here is an example that rearranges a NEON vector with 4 ints:
2355   // Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1]
2356   //   1. We assume the shuffle input is Vi int[2, 3, 0, 1].
2357   //   2. Multiply Vi int[2, 3, 0, 1] with constant int vector
2358   //      [0x04040404, 0x04040404, 0x04040404, 0x04040404], and get
2359   //      tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404].
2360   //   3. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100],
2361   //      and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504]
2362   //   4. Use Vm as index register, and use V1 as table register.
2363   //      Then get V2 as the result by tbl NEON instructions.
2364   switch (bt) {
2365     case T_SHORT:
2366       mov(tmp, size1, 0x02);
2367       mulv(dst, size2, shuffle, tmp);
2368       mov(tmp, size2, 0x0100);
2369       addv(dst, size1, dst, tmp);
2370       tbl(dst, size1, src, 1, dst);
2371       break;
2372     case T_INT:
2373     case T_FLOAT:
2374       mov(tmp, size1, 0x04);
2375       mulv(dst, size2, shuffle, tmp);
2376       mov(tmp, size2, 0x03020100);
2377       addv(dst, size1, dst, tmp);
2378       tbl(dst, size1, src, 1, dst);
2379       break;
2380     case T_LONG:
2381     case T_DOUBLE:
2382       // Load the iota indices for Long type. The indices are ordered by
      // type B/S/I/L/F/D, and the offset between two types is 16; hence
      // the offset for L is 48.
2385       lea(rscratch1,
2386           ExternalAddress(StubRoutines::aarch64::vector_iota_indices() + 48));
2387       ldrq(tmp, rscratch1);
2388       // Check whether the input "shuffle" is the same with iota indices.
2389       // Return "src" if true, otherwise swap the two elements of "src".
2390       cm(EQ, dst, size2, shuffle, tmp);
2391       ext(tmp, size1, src, src, 8);
2392       bsl(dst, size1, src, tmp);
2393       break;
2394     default:
2395       assert(false, "unsupported element type");
2396       ShouldNotReachHere();
2397   }
2398 }
2399 
// Extract a scalar element from an SVE vector at position 'idx'.
2401 // The input elements in src are expected to be of integral type.
2402 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2403                                              int idx, FloatRegister vtmp) {
2404   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2405   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (regVariant_to_elemBits(size) * idx < 128) { // generate a lower-cost NEON instruction
2407     if (bt == T_INT || bt == T_LONG) {
2408       umov(dst, src, size, idx);
2409     } else {
2410       smov(dst, src, size, idx);
2411     }
2412   } else {
2413     sve_orr(vtmp, src, src);
2414     sve_ext(vtmp, vtmp, idx << size);
2415     if (bt == T_INT || bt == T_LONG) {
2416       umov(dst, vtmp, size, 0);
2417     } else {
2418       smov(dst, vtmp, size, 0);
2419     }
2420   }
2421 }
2422 
2423 // java.lang.Math::round intrinsics
2424 
2425 // Clobbers: rscratch1, rflags
2426 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2427                                           FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
2428   assert_different_registers(tmp1, tmp2, tmp3, src, dst);
2429   switch (T) {
2430     case T2S:
2431     case T4S:
2432       fmovs(tmp1, T, 0.5f);
2433       mov(rscratch1, jint_cast(0x1.0p23f));
2434       break;
2435     case T2D:
2436       fmovd(tmp1, T, 0.5);
2437       mov(rscratch1, julong_cast(0x1.0p52));
2438       break;
2439     default:
2440       assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2441   }
2442   fadd(tmp1, T, tmp1, src);
2443   fcvtms(tmp1, T, tmp1);
2444   // tmp1 = floor(src + 0.5, ties to even)
2445 
2446   fcvtas(dst, T, src);
2447   // dst = round(src), ties to away
2448 
2449   fneg(tmp3, T, src);
2450   dup(tmp2, T, rscratch1);
2451   cm(HS, tmp3, T, tmp3, tmp2);
2452   // tmp3 is now a set of flags
2453 
2454   bif(dst, T16B, tmp1, tmp3);
2455   // result in dst
2456 }
2457 
2458 // Clobbers: rscratch1, rflags
2459 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2460                                          FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
2461   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2462   assert_different_registers(tmp1, tmp2, src, dst);
2463 
2464   switch (T) {
2465     case S:
2466       mov(rscratch1, jint_cast(0x1.0p23f));
2467       break;
2468     case D:
2469       mov(rscratch1, julong_cast(0x1.0p52));
2470       break;
2471     default:
2472       assert(T == S || T == D, "invalid register variant");
2473   }
2474 
2475   sve_frinta(dst, T, ptrue, src);
2476   // dst = round(src), ties to away
2477 
2478   Label none;
2479 
2480   sve_fneg(tmp1, T, ptrue, src);
2481   sve_dup(tmp2, T, rscratch1);
2482   sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
2483   br(EQ, none);
2484   {
2485     sve_cpy(tmp1, T, pgtmp, 0.5);
2486     sve_fadd(tmp1, T, pgtmp, src);
2487     sve_frintm(dst, T, pgtmp, tmp1);
2488     // dst = floor(src + 0.5, ties to even)
2489   }
2490   bind(none);
2491 
2492   sve_fcvtzs(dst, T, ptrue, dst, T);
2493   // result in dst
2494 }
2495 
2496 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2497                                            FloatRegister one, SIMD_Arrangement T) {
2498   assert_different_registers(dst, src, zero, one);
2499   assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2500 
2501   facgt(dst, T, src, zero);
2502   ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
2503   bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2504 }
2505 
2506 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2507                                           FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
2508     assert_different_registers(dst, src, zero, one, vtmp);
2509     assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2510 
2511     sve_orr(vtmp, src, src);
    sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN, 0x1 otherwise
2513     switch (T) {
2514     case S:
2515       sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
2516       sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
2517                                         // on the sign of the float value
2518       break;
2519     case D:
2520       sve_and(vtmp, T, min_jlong);
2521       sve_orr(vtmp, T, jlong_cast(1.0));
2522       break;
2523     default:
2524       assert(false, "unsupported");
2525       ShouldNotReachHere();
2526     }
2527     sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
2528                                        // Result in dst
2529 }
2530 
2531 bool C2_MacroAssembler::in_scratch_emit_size() {
2532   if (ciEnv::current()->task() != nullptr) {
2533     PhaseOutput* phase_output = Compile::current()->output();
2534     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2535       return true;
2536     }
2537   }
2538   return MacroAssembler::in_scratch_emit_size();
2539 }
2540 
2541 static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
2542   fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
2543 }
2544 
2545 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register rval, Register rtmp) {
2546   assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
2547   if (t == TypeInt::INT) {
2548     return;
2549   }
2550   BLOCK_COMMENT("verify_int_in_range {");
2551   Label L_success, L_failure;
2552 
2553   jint lo = t->_lo;
2554   jint hi = t->_hi;
2555 
2556   if (lo != min_jint && hi != max_jint) {
2557     subsw(rtmp, rval, lo);
2558     br(Assembler::LT, L_failure);
2559     subsw(rtmp, rval, hi);
2560     br(Assembler::LE, L_success);
2561   } else if (lo != min_jint) {
2562     subsw(rtmp, rval, lo);
2563     br(Assembler::GE, L_success);
2564   } else if (hi != max_jint) {
2565     subsw(rtmp, rval, hi);
2566     br(Assembler::LE, L_success);
2567   } else {
2568     ShouldNotReachHere();
2569   }
2570 
2571   bind(L_failure);
2572   movw(c_rarg0, idx);
2573   mov(c_rarg1, rval);
2574   movw(c_rarg2, lo);
2575   movw(c_rarg3, hi);
2576   reconstruct_frame_pointer(rtmp);
2577   rt_call(CAST_FROM_FN_PTR(address, abort_verify_int_in_range), rtmp);
2578   hlt(0);
2579 
2580   bind(L_success);
2581   BLOCK_COMMENT("} verify_int_in_range");
2582 }
2583 
2584 static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
2585   fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
2586 }
2587 
2588 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register rval, Register rtmp) {
2589   assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
2590   if (t == TypeLong::LONG) {
2591     return;
2592   }
2593   BLOCK_COMMENT("verify_long_in_range {");
2594   Label L_success, L_failure;
2595 
2596   jlong lo = t->_lo;
2597   jlong hi = t->_hi;
2598 
2599   if (lo != min_jlong && hi != max_jlong) {
2600     subs(rtmp, rval, lo);
2601     br(Assembler::LT, L_failure);
2602     subs(rtmp, rval, hi);
2603     br(Assembler::LE, L_success);
2604   } else if (lo != min_jlong) {
2605     subs(rtmp, rval, lo);
2606     br(Assembler::GE, L_success);
2607   } else if (hi != max_jlong) {
2608     subs(rtmp, rval, hi);
2609     br(Assembler::LE, L_success);
2610   } else {
2611     ShouldNotReachHere();
2612   }
2613 
2614   bind(L_failure);
2615   movw(c_rarg0, idx);
2616   mov(c_rarg1, rval);
2617   mov(c_rarg2, lo);
2618   mov(c_rarg3, hi);
2619   reconstruct_frame_pointer(rtmp);
2620   rt_call(CAST_FROM_FN_PTR(address, abort_verify_long_in_range), rtmp);
2621   hlt(0);
2622 
2623   bind(L_success);
2624   BLOCK_COMMENT("} verify_long_in_range");
2625 }
2626 
2627 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
2628   const int framesize = Compile::current()->output()->frame_size_in_bytes();
2629   if (PreserveFramePointer) {
2630     // frame pointer is valid
2631 #ifdef ASSERT
2632     // Verify frame pointer value in rfp.
2633     add(rtmp, sp, framesize - 2 * wordSize);
2634     Label L_success;
2635     cmp(rfp, rtmp);
2636     br(Assembler::EQ, L_success);
2637     stop("frame pointer mismatch");
2638     bind(L_success);
2639 #endif // ASSERT
2640   } else {
2641     add(rfp, sp, framesize - 2 * wordSize);
2642   }
2643 }
2644 
2645 // Selects elements from two source vectors (src1, src2) based on index values in the index register
// using Neon instructions and places them in the destination vector elements corresponding to the
// index vector elements. Each index in the index register must be in the range [0, 2 * NUM_ELEM),
// where NUM_ELEM is the number of BasicType elements per vector.
// If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
// Otherwise, selects src2[idx - NUM_ELEM]
2651 void C2_MacroAssembler::select_from_two_vectors_neon(FloatRegister dst, FloatRegister src1,
2652                                                      FloatRegister src2, FloatRegister index,
2653                                                      FloatRegister tmp, unsigned vector_length_in_bytes) {
2654   assert_different_registers(dst, src1, src2, tmp);
2655   SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
2656 
2657   if (vector_length_in_bytes == 16) {
2658     assert(UseSVE <= 1, "sve must be <= 1");
2659     assert(src1->successor() == src2, "Source registers must be ordered");
    // If the vector length is 16B, then use the Neon "tbl" instruction with a two-vector table
2661     tbl(dst, size, src1, 2, index);
2662   } else { // vector length == 8
2663     assert(UseSVE == 0, "must be Neon only");
2664     // We need to fit both the source vectors (src1, src2) in a 128-bit register because the
2665     // Neon "tbl" instruction supports only looking up 16B vectors. We then use the Neon "tbl"
    // instruction with a one-vector lookup.
2667     ins(tmp, D, src1, 0, 0);
2668     ins(tmp, D, src2, 1, 0);
2669     tbl(dst, size, tmp, 1, index);
2670   }
2671 }
2672 
2673 // Selects elements from two source vectors (src1, src2) based on index values in the index register
// using SVE/SVE2 instructions and places them in the destination vector elements corresponding to the
// index vector elements. Each index in the index register must be in the range [0, 2 * NUM_ELEM),
// where NUM_ELEM is the number of BasicType elements per vector.
// If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
// Otherwise, selects src2[idx - NUM_ELEM]
2679 void C2_MacroAssembler::select_from_two_vectors_sve(FloatRegister dst, FloatRegister src1,
2680                                                     FloatRegister src2, FloatRegister index,
2681                                                     FloatRegister tmp, SIMD_RegVariant T,
2682                                                     unsigned vector_length_in_bytes) {
2683   assert_different_registers(dst, src1, src2, index, tmp);
2684 
2685   if (vector_length_in_bytes == 8) {
2686     // We need to fit both the source vectors (src1, src2) in a single vector register because the
2687     // SVE "tbl" instruction is unpredicated and works on the entire vector which can lead to
2688     // incorrect results if each source vector is only partially filled. We then use the SVE "tbl"
    // instruction with a one-vector lookup.
2690     assert(UseSVE >= 1, "sve must be >= 1");
2691     ins(tmp, D, src1, 0, 0);
2692     ins(tmp, D, src2, 1, 0);
2693     sve_tbl(dst, T, tmp, index);
2694   } else {  // UseSVE == 2 and vector_length_in_bytes > 8
    // If the vector length is > 8, then use the SVE2 "tbl" instruction with a two-vector table.
    // The assertion vector_length_in_bytes == MaxVectorSize ensures that this operation
    // is not executed on machines where vector_length_in_bytes < MaxVectorSize,
    // with the only exception of the 8B vector length.
2699     assert(UseSVE == 2 && vector_length_in_bytes == MaxVectorSize, "must be");
2700     assert(src1->successor() == src2, "Source registers must be ordered");
2701     sve_tbl(dst, T, src1, src2, index);
2702   }
2703 }
2704 
2705 void C2_MacroAssembler::select_from_two_vectors(FloatRegister dst, FloatRegister src1,
2706                                                 FloatRegister src2, FloatRegister index,
2707                                                 FloatRegister tmp, BasicType bt,
2708                                                 unsigned vector_length_in_bytes) {
2709 
2710   assert_different_registers(dst, src1, src2, index, tmp);
2711 
  // The cases that can reach this method are:
2713   // - UseSVE = 0/1, vector_length_in_bytes = 8 or 16, excluding double and long types
2714   // - UseSVE = 2, vector_length_in_bytes >= 8, for all types
2715   //
2716   // SVE/SVE2 tbl instructions are generated when UseSVE = 1 with vector_length_in_bytes = 8
2717   // and UseSVE = 2 with vector_length_in_bytes >= 8
2718   //
2719   // Neon instructions are generated when UseSVE = 0 with vector_length_in_bytes = 8 or 16 and
2720   // UseSVE = 1 with vector_length_in_bytes = 16
2721 
2722   if ((UseSVE == 1 && vector_length_in_bytes == 8) || UseSVE == 2) {
2723     SIMD_RegVariant T = elemType_to_regVariant(bt);
2724     select_from_two_vectors_sve(dst, src1, src2, index, tmp, T, vector_length_in_bytes);
2725     return;
2726   }
2727 
2728   // The only BasicTypes that can reach here are T_SHORT, T_BYTE, T_INT and T_FLOAT
2729   assert(bt != T_DOUBLE && bt != T_LONG, "unsupported basic type");
2730   assert(vector_length_in_bytes <= 16, "length_in_bytes must be <= 16");
2731 
2732   bool isQ = vector_length_in_bytes == 16;
2733 
2734   SIMD_Arrangement size1 = isQ ? T16B : T8B;
2735   SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2736 
2737   // Neon "tbl" instruction only supports byte tables, so we need to look at chunks of
2738   // 2B for selecting shorts or chunks of 4B for selecting ints/floats from the table.
2739   // The index values in "index" register are in the range of [0, 2 * NUM_ELEM) where NUM_ELEM
  // is the number of elements that can fit in a vector. For example, for T_SHORT with a 64-bit
  // vector length, the indices can range over [0, 8).
  // As an example, with a 64-bit vector length and the T_SHORT type, let index = [2, 5, 1, 0]:
  // Move the constant 0x02 into every byte of tmp:    tmp = [0x0202, 0x0202, 0x0202, 0x0202]
  // Multiply the index vector with tmp to yield:      dst = [0x0404, 0x0a0a, 0x0202, 0x0000]
  // Move the constant 0x0100 into every 2B of tmp:    tmp = [0x0100, 0x0100, 0x0100, 0x0100]
  // Add the multiplied result to the vector in tmp to
  // obtain the byte-level offsets:                    dst = [0x0504, 0x0b0a, 0x0302, 0x0100]
  // Use these offsets in the "tbl" instruction to select chunks of 2B.
2749 
2750   if (bt == T_BYTE) {
2751     select_from_two_vectors_neon(dst, src1, src2, index, tmp, vector_length_in_bytes);
2752   } else {
2753     int elem_size = (bt == T_SHORT) ? 2 : 4;
2754     uint64_t tbl_offset = (bt == T_SHORT) ? 0x0100u : 0x03020100u;
2755 
2756     mov(tmp, size1, elem_size);
2757     mulv(dst, size2, index, tmp);
2758     mov(tmp, size2, tbl_offset);
2759     addv(dst, size1, dst, tmp); // "dst" now contains the processed index elements
2760                                 // to select a set of 2B/4B
2761     select_from_two_vectors_neon(dst, src1, src2, dst, tmp, vector_length_in_bytes);
2762   }
2763 }
2764 
2765 // Vector expand implementation. Elements from the src vector are expanded into
2766 // the dst vector under the control of the vector mask.
2767 // Since there are no native instructions directly corresponding to expand before
// SVE2p2, the following implementations mainly leverage the TBL instruction to
// implement expand. To compute the index input for TBL, the prefix sum algorithm
2770 // (https://en.wikipedia.org/wiki/Prefix_sum) is used. The same algorithm is used
2771 // for NEON and SVE, but with different instructions where appropriate.
2772 
2773 // Vector expand implementation for NEON.
2774 //
2775 // An example of 128-bit Byte vector:
2776 //   Data direction: high <== low
2777 //   Input:
2778 //         src   = g  f  e  d  c  b  a  9  8  7  6  5  4  3  2  1
2779 //         mask  = 0  0 -1 -1  0  0 -1 -1  0  0 -1 -1  0  0 -1 -1
2780 //   Expected result:
2781 //         dst   = 0  0  8  7  0  0  6  5  0  0  4  3  0  0  2  1
2782 void C2_MacroAssembler::vector_expand_neon(FloatRegister dst, FloatRegister src, FloatRegister mask,
2783                                            FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
2784                                            int vector_length_in_bytes) {
2785   assert(vector_length_in_bytes <= 16, "the vector length in bytes for NEON must be <= 16");
2786   assert_different_registers(dst, src, mask, tmp1, tmp2);
2787   // Since the TBL instruction only supports byte table, we need to
2788   // compute indices in byte type for all types.
2789   SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
2790   // tmp1 =  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
2791   dup(tmp1, size, zr);
2792   // dst  =  0  0  1  1  0  0  1  1  0  0  1  1  0  0  1  1
2793   negr(dst, size, mask);
2794   // Calculate vector index for TBL with prefix sum algorithm.
2795   // dst  =  8  8  8  7  6  6  6  5  4  4  4  3  2  2  2  1
2796   for (int i = 1; i < vector_length_in_bytes; i <<= 1) {
2797     ext(tmp2, size, tmp1, dst, vector_length_in_bytes - i);
2798     addv(dst, size, tmp2, dst);
2799   }
2800   // tmp2 =  0  0 -1 -1  0  0 -1 -1  0  0 -1 -1  0  0 -1 -1
2801   orr(tmp2, size, mask, mask);
2802   // tmp2 =  0  0  8  7  0  0  6  5  0  0  4  3  0  0  2  1
2803   bsl(tmp2, size, dst, tmp1);
2804   // tmp1 =  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
2805   movi(tmp1, size, 1);
2806   // dst  = -1 -1  7  6 -1 -1  5  4 -1 -1  3  2 -1 -1  1  0
2807   subv(dst, size, tmp2, tmp1);
2808   // dst  =  0  0  8  7  0  0  6  5  0  0  4  3  0  0  2  1
2809   tbl(dst, size, src, 1, dst);
2810 }
2811 
2812 // Vector expand implementation for SVE.
2813 //
2814 // An example of 128-bit Short vector:
2815 //   Data direction: high <== low
2816 //   Input:
2817 //         src   = gf ed cb a9 87 65 43 21
2818 //         pg    = 00 01 00 01 00 01 00 01
2819 //   Expected result:
2820 //         dst   = 00 87 00 65 00 43 00 21
2821 void C2_MacroAssembler::vector_expand_sve(FloatRegister dst, FloatRegister src, PRegister pg,
2822                                           FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
2823                                           int vector_length_in_bytes) {
2824   assert(UseSVE > 0, "expand implementation only for SVE");
2825   assert_different_registers(dst, src, tmp1, tmp2);
2826   SIMD_RegVariant size = elemType_to_regVariant(bt);
2827 
2828   // tmp1 = 00 00 00 00 00 00 00 00
2829   sve_dup(tmp1, size, 0);
2830   sve_movprfx(tmp2, tmp1);
2831   // tmp2 = 00 01 00 01 00 01 00 01
2832   sve_cpy(tmp2, size, pg, 1, true);
2833   // Calculate vector index for TBL with prefix sum algorithm.
2834   // tmp2 = 04 04 03 03 02 02 01 01
2835   for (int i = type2aelembytes(bt); i < vector_length_in_bytes; i <<= 1) {
2836     sve_movprfx(dst, tmp1);
    // The EXT instruction operates on the full-width SVE register. The correct
2838     // index calculation method is:
2839     // vector_length_in_bytes - i + MaxVectorSize - vector_length_in_bytes =>
2840     // MaxVectorSize - i.
2841     sve_ext(dst, tmp2, MaxVectorSize - i);
2842     sve_add(tmp2, size, dst, tmp2);
2843   }
2844   // dst  = 00 04 00 03 00 02 00 01
2845   sve_sel(dst, size, pg, tmp2, tmp1);
2846   // dst  = -1 03 -1 02 -1 01 -1 00
2847   sve_sub(dst, size, 1);
2848   // dst  = 00 87 00 65 00 43 00 21
2849   sve_tbl(dst, size, src, dst);
2850 }