1 /*
   2  * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "asm/assembler.hpp"
  26 #include "asm/assembler.inline.hpp"
  27 #include "opto/c2_MacroAssembler.hpp"
  28 #include "opto/compile.hpp"
  29 #include "opto/intrinsicnode.hpp"
  30 #include "opto/matcher.hpp"
  31 #include "opto/output.hpp"
  32 #include "opto/subnode.hpp"
  33 #include "runtime/objectMonitorTable.hpp"
  34 #include "runtime/stubRoutines.hpp"
  35 #include "runtime/synchronizer.hpp"
  36 #include "utilities/globalDefinitions.hpp"
  37 #include "utilities/powerOfTwo.hpp"
  38 
  39 #ifdef PRODUCT
  40 #define BLOCK_COMMENT(str) /* nothing */
  41 #define STOP(error) stop(error)
  42 #else
  43 #define BLOCK_COMMENT(str) block_comment(str)
  44 #define STOP(error) block_comment(error); stop(error)
  45 #endif
  46 
  47 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  48 
  49 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
  50 
  51 // jdk.internal.util.ArraysSupport.vectorizedHashCode
// Emits the intrinsic for jdk.internal.util.ArraysSupport.vectorizedHashCode.
// Accumulates result = result * 31 + element over all cnt elements of ary
// (tmp2 is loaded with 0x1f == 31, the hash multiplier, below). Short arrays
// are handled inline by a partially-unrolled scalar loop; arrays with at
// least large_threshold elements are delegated to the per-type
// large_arrays_hashcode stub, which uses the vdata*/vmul*/vpow* SIMD
// registers. Returns pc() on success, or nullptr when the trampoline call to
// the stub could not be emitted (code cache full).
address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
                                           FloatRegister vdata0, FloatRegister vdata1,
                                           FloatRegister vdata2, FloatRegister vdata3,
                                           FloatRegister vmul0, FloatRegister vmul1,
                                           FloatRegister vmul2, FloatRegister vmul3,
                                           FloatRegister vpow, FloatRegister vpowm,
                                           BasicType eltype) {
  ARRAYS_HASHCODE_REGISTERS;

  Register tmp1 = rscratch1, tmp2 = rscratch2;

  Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE;

  // Vectorization factor. Number of array elements loaded to one SIMD&FP registers by the stubs. We
  // use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible to
  // use 4H for chars and shorts instead, but using 8H gives better performance.
  const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8
                    : eltype == T_CHAR || eltype == T_SHORT ? 8
                    : eltype == T_INT                       ? 4
                                                            : 0;
  guarantee(vf, "unsupported eltype");

  // Unroll factor for the scalar loop below. The value is chosen based on performance analysis.
  const size_t unroll_factor = 4;

  switch (eltype) {
  case T_BOOLEAN:
    BLOCK_COMMENT("arrays_hashcode(unsigned byte) {");
    break;
  case T_CHAR:
    BLOCK_COMMENT("arrays_hashcode(char) {");
    break;
  case T_BYTE:
    BLOCK_COMMENT("arrays_hashcode(byte) {");
    break;
  case T_SHORT:
    BLOCK_COMMENT("arrays_hashcode(short) {");
    break;
  case T_INT:
    BLOCK_COMMENT("arrays_hashcode(int) {");
    break;
  default:
    ShouldNotReachHere();
  }

  // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop
  // implemented by the stub executes just once. Call the stub only if at least two iterations will
  // be executed.
  const size_t large_threshold = eltype == T_INT ? vf * 2 : vf;
  cmpw(cnt, large_threshold);
  br(Assembler::HS, LARGE);

  // Scalar tail: processes cnt (< large_threshold) elements, also used to
  // finish the remainder after the stub returns.
  bind(TAIL);

  // The andr performs cnt % uf where uf = unroll_factor. The subtract shifted by 3 offsets past
  // uf - (cnt % uf) pairs of load + madd insns i.e. it only executes cnt % uf load + madd pairs.
  // Iteration eats up the remainder, uf elements at a time.
  assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC");
  andr(tmp2, cnt, unroll_factor - 1);
  adr(tmp1, BR_BASE);
  // For Cortex-A53 offset is 4 because 2 nops are generated.
  sub(tmp1, tmp1, tmp2, ext::sxtw, VM_Version::supports_a53mac() ? 4 : 3);
  movw(tmp2, 0x1f);  // tmp2 := 31, the hash multiplier
  br(tmp1);          // computed jump into the middle of the unrolled loop body

  bind(LOOP);
  for (size_t i = 0; i < unroll_factor; ++i) {
    load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype);
    maddw(result, result, tmp2, tmp1);
    // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
    // Generate 2nd nop to have 4 instructions per iteration.
    if (VM_Version::supports_a53mac()) {
      nop();
    }
  }
  bind(BR_BASE);
  subsw(cnt, cnt, unroll_factor);
  br(Assembler::HS, LOOP);

  b(DONE);

  bind(LARGE);

  RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype));
  assert(stub.target() != nullptr, "array_hashcode stub has not been generated");
  address tpc = trampoline_call(stub);
  if (tpc == nullptr) {
    // Code cache is full: abandon the partially-emitted intrinsic.
    DEBUG_ONLY(reset_labels(TAIL, BR_BASE));
    postcond(pc() == badAddress);
    return nullptr;
  }

  bind(DONE);

  BLOCK_COMMENT("} // arrays_hashcode");

  postcond(pc() != badAddress);
  return pc();
}
 151 
// Emits the C2 fast path for monitorenter (lightweight locking).
// Contract with the C2 generated code: on exit, flags == EQ means the lock
// was acquired (continue inline), flags == NE means the caller must take
// the slow path. Clobbers t1, t2, t3 and rscratch2. box is only accessed
// when UseObjectMonitorTable is enabled, as the per-BasicLock monitor cache.
void C2_MacroAssembler::fast_lock(Register obj, Register box, Register t1,
                                  Register t2, Register t3) {
  assert_different_registers(obj, box, t1, t2, t3, rscratch2);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. MUST branch to with flag == EQ
  Label locked;
  // Finish fast lock unsuccessfully. MUST branch to with flag == NE
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Value-based classes must never be synchronized on; divert to the
    // runtime which raises the diagnostic.
    load_klass(t1, obj);
    ldrb(t1, Address(t1, Klass::misc_flags_offset()));
    tst(t1, KlassFlags::_misc_is_value_based_class);
    br(Assembler::NE, slow_path);
  }

  const Register t1_mark = t1;
  const Register t3_t = t3;

  { // Fast locking

    // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ
    Label push;

    const Register t2_top = t2;

    // Check if lock-stack is full.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
    br(Assembler::GT, slow_path);

    // Check if recursive.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, push);

    // Relaxed normal load to check for monitor. Optimization for monitor case.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);

    // Not inflated
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");

    // Try to lock. Transition lock-bits 0b01 => 0b00
    orr(t1_mark, t1_mark, markWord::unlocked_value);
    eor(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
    br(Assembler::NE, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    str(obj, Address(rthread, t2_top));
    addw(t2_top, t2_top, oopSize);
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register t1_monitor = t1;

    if (!UseObjectMonitorTable) {
      assert(t1_monitor == t1_mark, "should be the same here");
    } else {
      // With UseObjectMonitorTable the mark word does not carry the monitor
      // pointer; it must be looked up via the per-thread cache or the table.
      const Register t1_hash = t1;
      Label monitor_found;

      // Save the mark, we might need it to extract the hash.
      mov(t3, t1_mark);

      // Look for the monitor in the om_cache.

      // Unrolled linear scan over the OMCache's (oop, monitor) pairs.
      ByteSize cache_offset   = JavaThread::om_cache_oops_offset();
      ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
      const int num_unrolled  = OMCache::CAPACITY;
      for (int i = 0; i < num_unrolled; i++) {
        ldr(t1_monitor, Address(rthread, cache_offset + monitor_offset));
        ldr(t2, Address(rthread, cache_offset));
        cmp(obj, t2);
        br(Assembler::EQ, monitor_found);
        cache_offset = cache_offset + OMCache::oop_to_oop_difference();
      }

      if (UseCompactObjectHeaders) {
        // TODO: The fast-path table lookup currently doesn't work with Lilliput's
        // compact identity-hashcode implementation.
        // See: https://bugs.openjdk.org/browse/JDK-8380981
        b(slow_path);
      } else {
        // Look for the monitor in the table.

        // Get the hash code.
        ubfx(t1_hash, t3, markWord::hash_shift, markWord::hash_bits);

        // Get the table and calculate the bucket's address
        lea(t3, ExternalAddress(ObjectMonitorTable::current_table_address()));
        ldr(t3, Address(t3));
        ldr(t2, Address(t3, ObjectMonitorTable::table_capacity_mask_offset()));
        ands(t1_hash, t1_hash, t2);  // bucket index = hash & capacity_mask
        ldr(t3, Address(t3, ObjectMonitorTable::table_buckets_offset()));

        // Read the monitor from the bucket.
        ldr(t1_monitor, Address(t3, t1_hash, Address::lsl(LogBytesPerWord)));

        // Check if the monitor in the bucket is special (empty, tombstone or removed).
        cmp(t1_monitor, (unsigned char)ObjectMonitorTable::SpecialPointerValues::below_is_special);
        br(Assembler::LO, slow_path);

        // Check if object matches.
        ldr(t3, Address(t1_monitor, ObjectMonitor::object_offset()));
        BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
        bs_asm->try_resolve_weak_handle_in_c2(this, t3, t2, slow_path);
        cmp(t3, obj);
        br(Assembler::NE, slow_path);
      }
      bind(monitor_found);
    }

    const Register t2_owner_addr = t2;
    const Register t3_owner = t3;
    // Without the table, t1_monitor still carries the mark-word tag bits and
    // field offsets must be de-biased by monitor_value.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
    const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);

    Label monitor_locked;

    // Compute owner address.
    lea(t2_owner_addr, owner_address);

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
    cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, /*acquire*/ true,
            /*release*/ false, /*weak*/ false, t3_owner);  // t3_owner := previous owner
    br(Assembler::EQ, monitor_locked);

    // Check if recursive.
    cmp(t3_owner, rscratch2);
    br(Assembler::NE, slow_path);

    // Recursive.
    increment(recursions_address, 1);

    bind(monitor_locked);
    if (UseObjectMonitorTable) {
      // Cache the monitor in the BasicLock so fast_unlock can find it.
      str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
    }
  }

  bind(locked);

#ifdef ASSERT
  // Check that locked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Lock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Lock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}
 328 
 329 void C2_MacroAssembler::fast_unlock(Register obj, Register box, Register t1,
 330                                     Register t2, Register t3) {
 331   assert_different_registers(obj, box, t1, t2, t3);
 332 
 333   // Handle inflated monitor.
 334   Label inflated, inflated_load_mark;
 335   // Finish fast unlock successfully. MUST branch to with flag == EQ
 336   Label unlocked;
 337   // Finish fast unlock unsuccessfully. MUST branch to with flag == NE
 338   Label slow_path;
 339 
 340   const Register t1_mark = t1;
 341   const Register t2_top = t2;
 342   const Register t3_t = t3;
 343 
 344   { // Fast unlock
 345 
 346     Label push_and_slow_path;
 347 
 348     // Check if obj is top of lock-stack.
 349     ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 350     subw(t2_top, t2_top, oopSize);
 351     ldr(t3_t, Address(rthread, t2_top));
 352     cmp(obj, t3_t);
 353     // Top of lock stack was not obj. Must be monitor.
 354     br(Assembler::NE, inflated_load_mark);
 355 
 356     // Pop lock-stack.
 357     DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
 358     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 359 
 360     // Check if recursive.
 361     subw(t3_t, t2_top, oopSize);
 362     ldr(t3_t, Address(rthread, t3_t));
 363     cmp(obj, t3_t);
 364     br(Assembler::EQ, unlocked);
 365 
 366     // Not recursive.
 367     // Load Mark.
 368     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 369 
 370     // Check header for monitor (0b10).
 371     // Because we got here by popping (meaning we pushed in locked)
 372     // there will be no monitor in the box. So we need to push back the obj
 373     // so that the runtime can fix any potential anonymous owner.
 374     tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);
 375 
 376     // Try to unlock. Transition lock bits 0b00 => 0b01
 377     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
 378     orr(t3_t, t1_mark, markWord::unlocked_value);
 379     cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
 380             /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
 381     br(Assembler::EQ, unlocked);
 382 
 383     bind(push_and_slow_path);
 384     // Compare and exchange failed.
 385     // Restore lock-stack and handle the unlock in runtime.
 386     DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
 387     addw(t2_top, t2_top, oopSize);
 388     str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 389     b(slow_path);
 390   }
 391 
 392 
 393   { // Handle inflated monitor.
 394     bind(inflated_load_mark);
 395     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 396 #ifdef ASSERT
 397     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 398     stop("Fast Unlock not monitor");
 399 #endif
 400 
 401     bind(inflated);
 402 
 403 #ifdef ASSERT
 404     Label check_done;
 405     subw(t2_top, t2_top, oopSize);
 406     cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
 407     br(Assembler::LT, check_done);
 408     ldr(t3_t, Address(rthread, t2_top));
 409     cmp(obj, t3_t);
 410     br(Assembler::NE, inflated);
 411     stop("Fast Unlock lock on stack");
 412     bind(check_done);
 413 #endif
 414 
 415     const Register t1_monitor = t1;
 416 
 417     if (!UseObjectMonitorTable) {
 418       assert(t1_monitor == t1_mark, "should be the same here");
 419 
 420       // Untag the monitor.
 421       add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
 422     } else {
 423       ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 424       // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
 425       cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
 426       br(Assembler::LO, slow_path);
 427     }
 428 
 429     const Register t2_recursions = t2;
 430     Label not_recursive;
 431 
 432     // Check if recursive.
 433     ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
 434     cbz(t2_recursions, not_recursive);
 435 
 436     // Recursive unlock.
 437     sub(t2_recursions, t2_recursions, 1u);
 438     str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
 439     // Set flag == EQ
 440     cmp(t2_recursions, t2_recursions);
 441     b(unlocked);
 442 
 443     bind(not_recursive);
 444 
 445     const Register t2_owner_addr = t2;
 446 
 447     // Compute owner address.
 448     lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));
 449 
 450     // Set owner to null.
 451     // Release to satisfy the JMM
 452     stlr(zr, t2_owner_addr);
 453     // We need a full fence after clearing owner to avoid stranding.
 454     // StoreLoad achieves this.
 455     membar(StoreLoad);
 456 
 457     // Check if the entry_list is empty.
 458     ldr(rscratch1, Address(t1_monitor, ObjectMonitor::entry_list_offset()));
 459     cmp(rscratch1, zr);
 460     br(Assembler::EQ, unlocked);  // If so we are done.
 461 
 462     // Check if there is a successor.
 463     ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
 464     cmp(rscratch1, zr);
 465     br(Assembler::NE, unlocked);  // If so we are done.
 466 
 467     // Save the monitor pointer in the current thread, so we can try to
 468     // reacquire the lock in SharedRuntime::monitor_exit_helper().
 469     str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));
 470 
 471     cmp(zr, rthread); // Set Flag to NE => slow path
 472     b(slow_path);
 473   }
 474 
 475   bind(unlocked);
 476   cmp(zr, zr); // Set Flags to EQ => fast path
 477 
 478 #ifdef ASSERT
 479   // Check that unlocked label is reached with Flags == EQ.
 480   Label flag_correct;
 481   br(Assembler::EQ, flag_correct);
 482   stop("Fast Unlock Flag != EQ");
 483 #endif
 484 
 485   bind(slow_path);
 486 #ifdef ASSERT
 487   // Check that slow_path label is reached with Flags == NE.
 488   br(Assembler::NE, flag_correct);
 489   stop("Fast Unlock Flag != NE");
 490   bind(flag_correct);
 491 #endif
 492   // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
 493 }
 494 
 495 // Search for str1 in str2 and return index or -1
 496 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
 497 void C2_MacroAssembler::string_indexof(Register str2, Register str1,
 498                                        Register cnt2, Register cnt1,
 499                                        Register tmp1, Register tmp2,
 500                                        Register tmp3, Register tmp4,
 501                                        Register tmp5, Register tmp6,
 502                                        int icnt1, Register result, int ae) {
 503   // NOTE: tmp5, tmp6 can be zr depending on specific method version
 504   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
 505 
 506   Register ch1 = rscratch1;
 507   Register ch2 = rscratch2;
 508   Register cnt1tmp = tmp1;
 509   Register cnt2tmp = tmp2;
 510   Register cnt1_neg = cnt1;
 511   Register cnt2_neg = cnt2;
 512   Register result_tmp = tmp4;
 513 
 514   bool isL = ae == StrIntrinsicNode::LL;
 515 
 516   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
 517   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
 518   int str1_chr_shift = str1_isL ? 0:1;
 519   int str2_chr_shift = str2_isL ? 0:1;
 520   int str1_chr_size = str1_isL ? 1:2;
 521   int str2_chr_size = str2_isL ? 1:2;
 522   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
 523                                       (chr_insn)&MacroAssembler::ldrh;
 524   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
 525                                       (chr_insn)&MacroAssembler::ldrh;
 526   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
 527   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
 528 
 529   // Note, inline_string_indexOf() generates checks:
 530   // if (substr.count > string.count) return -1;
 531   // if (substr.count == 0) return 0;
 532 
 533   // We have two strings, a source string in str2, cnt2 and a pattern string
 534   // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
 535 
 536   // For larger pattern and source we use a simplified Boyer Moore algorithm.
 537   // With a small pattern and source we use linear scan.
 538 
 539   if (icnt1 == -1) {
 540     sub(result_tmp, cnt2, cnt1);
 541     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
 542     br(LT, LINEARSEARCH);
 543     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
 544     subs(zr, cnt1, 256);
 545     lsr(tmp1, cnt2, 2);
 546     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
 547     br(GE, LINEARSTUB);
 548   }
 549 
// The Boyer Moore algorithm is based on the description here:-
 551 //
 552 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
 553 //
// This describes an algorithm with 2 shift rules. The 'Bad Character' rule
 555 // and the 'Good Suffix' rule.
 556 //
 557 // These rules are essentially heuristics for how far we can shift the
 558 // pattern along the search string.
 559 //
 560 // The implementation here uses the 'Bad Character' rule only because of the
 561 // complexity of initialisation for the 'Good Suffix' rule.
 562 //
 563 // This is also known as the Boyer-Moore-Horspool algorithm:-
 564 //
 565 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
 566 //
 567 // This particular implementation has few java-specific optimizations.
 568 //
 569 // #define ASIZE 256
 570 //
 571 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
 572 //       int i, j;
 573 //       unsigned c;
 574 //       unsigned char bc[ASIZE];
 575 //
 576 //       /* Preprocessing */
 577 //       for (i = 0; i < ASIZE; ++i)
 578 //          bc[i] = m;
 579 //       for (i = 0; i < m - 1; ) {
 580 //          c = x[i];
 581 //          ++i;
 582 //          // c < 256 for Latin1 string, so, no need for branch
 583 //          #ifdef PATTERN_STRING_IS_LATIN1
 584 //          bc[c] = m - i;
 585 //          #else
 586 //          if (c < ASIZE) bc[c] = m - i;
 587 //          #endif
 588 //       }
 589 //
 590 //       /* Searching */
 591 //       j = 0;
 592 //       while (j <= n - m) {
 593 //          c = y[i+j];
 594 //          if (x[m-1] == c)
 595 //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
 596 //          if (i < 0) return j;
 597 //          // c < 256 for Latin1 string, so, no need for branch
 598 //          #ifdef SOURCE_STRING_IS_LATIN1
 599 //          // LL case: (c< 256) always true. Remove branch
 600 //          j += bc[y[j+m-1]];
 601 //          #endif
 602 //          #ifndef PATTERN_STRING_IS_UTF
 603 //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
 604 //          if (c < ASIZE)
 605 //            j += bc[y[j+m-1]];
 606 //          else
 607 //            j += 1
 608 //          #endif
 609 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
 610 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
 611 //          if (c < ASIZE)
 612 //            j += bc[y[j+m-1]];
 613 //          else
 614 //            j += m
 615 //          #endif
 616 //       }
 617 //    }
 618 
 619   if (icnt1 == -1) {
 620     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
 621         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
 622     Register cnt1end = tmp2;
 623     Register str2end = cnt2;
 624     Register skipch = tmp2;
 625 
 626     // str1 length is >=8, so, we can read at least 1 register for cases when
 627     // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
 628     // UL case. We'll re-read last character in inner pre-loop code to have
 629     // single outer pre-loop load
 630     const int firstStep = isL ? 7 : 3;
 631 
 632     const int ASIZE = 256;
 633     const int STORED_BYTES = 32; // amount of bytes stored per instruction
 634     sub(sp, sp, ASIZE);
 635     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
 636     mov(ch1, sp);
 637     BIND(BM_INIT_LOOP);
 638       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
 639       subs(tmp5, tmp5, 1);
 640       br(GT, BM_INIT_LOOP);
 641 
 642       sub(cnt1tmp, cnt1, 1);
 643       mov(tmp5, str2);
 644       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
 645       sub(ch2, cnt1, 1);
 646       mov(tmp3, str1);
 647     BIND(BCLOOP);
 648       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
 649       if (!str1_isL) {
 650         subs(zr, ch1, ASIZE);
 651         br(HS, BCSKIP);
 652       }
 653       strb(ch2, Address(sp, ch1));
 654     BIND(BCSKIP);
 655       subs(ch2, ch2, 1);
 656       br(GT, BCLOOP);
 657 
 658       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
 659       if (str1_isL == str2_isL) {
 660         // load last 8 bytes (8LL/4UU symbols)
 661         ldr(tmp6, Address(tmp6, -wordSize));
 662       } else {
 663         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
 664         // convert Latin1 to UTF. We'll have to wait until load completed, but
 665         // it's still faster than per-character loads+checks
 666         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
 667         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
 668         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
 669         andr(tmp6, tmp6, 0xFF); // str1[N-4]
 670         orr(ch2, ch1, ch2, LSL, 16);
 671         orr(tmp6, tmp6, tmp3, LSL, 48);
 672         orr(tmp6, tmp6, ch2, LSL, 16);
 673       }
 674     BIND(BMLOOPSTR2);
 675       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 676       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
 677       if (str1_isL == str2_isL) {
 678         // re-init tmp3. It's for free because it's executed in parallel with
 679         // load above. Alternative is to initialize it before loop, but it'll
 680         // affect performance on in-order systems with 2 or more ld/st pipelines
 681         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
 682       }
 683       if (!isL) { // UU/UL case
 684         lsl(ch2, cnt1tmp, 1); // offset in bytes
 685       }
 686       cmp(tmp3, skipch);
 687       br(NE, BMSKIP);
 688       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
 689       mov(ch1, tmp6);
 690       if (isL) {
 691         b(BMLOOPSTR1_AFTER_LOAD);
 692       } else {
 693         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
 694         b(BMLOOPSTR1_CMP);
 695       }
 696     BIND(BMLOOPSTR1);
 697       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
 698       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 699     BIND(BMLOOPSTR1_AFTER_LOAD);
 700       subs(cnt1tmp, cnt1tmp, 1);
 701       br(LT, BMLOOPSTR1_LASTCMP);
 702     BIND(BMLOOPSTR1_CMP);
 703       cmp(ch1, ch2);
 704       br(EQ, BMLOOPSTR1);
 705     BIND(BMSKIP);
 706       if (!isL) {
 707         // if we've met UTF symbol while searching Latin1 pattern, then we can
 708         // skip cnt1 symbols
 709         if (str1_isL != str2_isL) {
 710           mov(result_tmp, cnt1);
 711         } else {
 712           mov(result_tmp, 1);
 713         }
 714         subs(zr, skipch, ASIZE);
 715         br(HS, BMADV);
 716       }
 717       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
 718     BIND(BMADV);
 719       sub(cnt1tmp, cnt1, 1);
 720       add(str2, str2, result_tmp, LSL, str2_chr_shift);
 721       cmp(str2, str2end);
 722       br(LE, BMLOOPSTR2);
 723       add(sp, sp, ASIZE);
 724       b(NOMATCH);
 725     BIND(BMLOOPSTR1_LASTCMP);
 726       cmp(ch1, ch2);
 727       br(NE, BMSKIP);
 728     BIND(BMMATCH);
 729       sub(result, str2, tmp5);
 730       if (!str2_isL) lsr(result, result, 1);
 731       add(sp, sp, ASIZE);
 732       b(DONE);
 733 
 734     BIND(LINEARSTUB);
 735     cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
 736     br(LT, LINEAR_MEDIUM);
 737     mov(result, zr);
 738     RuntimeAddress stub = nullptr;
 739     if (isL) {
 740       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
 741       assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
 742     } else if (str1_isL) {
 743       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
 744        assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
 745     } else {
 746       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
 747       assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
 748     }
 749     address call = trampoline_call(stub);
 750     if (call == nullptr) {
 751       DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
 752       ciEnv::current()->record_failure("CodeCache is full");
 753       return;
 754     }
 755     b(DONE);
 756   }
 757 
 758   BIND(LINEARSEARCH);
 759   {
 760     Label DO1, DO2, DO3;
 761 
 762     Register str2tmp = tmp2;
 763     Register first = tmp3;
 764 
 765     if (icnt1 == -1)
 766     {
 767         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
 768 
 769         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
 770         br(LT, DOSHORT);
 771       BIND(LINEAR_MEDIUM);
 772         (this->*str1_load_1chr)(first, Address(str1));
 773         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
 774         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
 775         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 776         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 777 
 778       BIND(FIRST_LOOP);
 779         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 780         cmp(first, ch2);
 781         br(EQ, STR1_LOOP);
 782       BIND(STR2_NEXT);
 783         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 784         br(LE, FIRST_LOOP);
 785         b(NOMATCH);
 786 
 787       BIND(STR1_LOOP);
 788         adds(cnt1tmp, cnt1_neg, str1_chr_size);
 789         add(cnt2tmp, cnt2_neg, str2_chr_size);
 790         br(GE, MATCH);
 791 
 792       BIND(STR1_NEXT);
 793         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
 794         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 795         cmp(ch1, ch2);
 796         br(NE, STR2_NEXT);
 797         adds(cnt1tmp, cnt1tmp, str1_chr_size);
 798         add(cnt2tmp, cnt2tmp, str2_chr_size);
 799         br(LT, STR1_NEXT);
 800         b(MATCH);
 801 
 802       BIND(DOSHORT);
 803       if (str1_isL == str2_isL) {
 804         cmp(cnt1, (u1)2);
 805         br(LT, DO1);
 806         br(GT, DO3);
 807       }
 808     }
 809 
 810     if (icnt1 == 4) {
 811       Label CH1_LOOP;
 812 
 813         (this->*load_4chr)(ch1, str1);
 814         sub(result_tmp, cnt2, 4);
 815         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 816         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 817 
 818       BIND(CH1_LOOP);
 819         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
 820         cmp(ch1, ch2);
 821         br(EQ, MATCH);
 822         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 823         br(LE, CH1_LOOP);
 824         b(NOMATCH);
 825       }
 826 
 827     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
 828       Label CH1_LOOP;
 829 
 830       BIND(DO2);
 831         (this->*load_2chr)(ch1, str1);
 832         if (icnt1 == 2) {
 833           sub(result_tmp, cnt2, 2);
 834         }
 835         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 836         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 837       BIND(CH1_LOOP);
 838         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 839         cmp(ch1, ch2);
 840         br(EQ, MATCH);
 841         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 842         br(LE, CH1_LOOP);
 843         b(NOMATCH);
 844     }
 845 
 846     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
 847       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
 848 
 849       BIND(DO3);
 850         (this->*load_2chr)(first, str1);
 851         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
 852         if (icnt1 == 3) {
 853           sub(result_tmp, cnt2, 3);
 854         }
 855         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 856         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 857       BIND(FIRST_LOOP);
 858         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 859         cmpw(first, ch2);
 860         br(EQ, STR1_LOOP);
 861       BIND(STR2_NEXT);
 862         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 863         br(LE, FIRST_LOOP);
 864         b(NOMATCH);
 865 
 866       BIND(STR1_LOOP);
 867         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
 868         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 869         cmp(ch1, ch2);
 870         br(NE, STR2_NEXT);
 871         b(MATCH);
 872     }
 873 
 874     if (icnt1 == -1 || icnt1 == 1) {
 875       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
 876 
 877       BIND(DO1);
 878         (this->*str1_load_1chr)(ch1, str1);
 879         cmp(cnt2, (u1)8);
 880         br(LT, DO1_SHORT);
 881 
 882         sub(result_tmp, cnt2, 8/str2_chr_size);
 883         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 884         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 885         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 886 
 887         if (str2_isL) {
 888           orr(ch1, ch1, ch1, LSL, 8);
 889         }
 890         orr(ch1, ch1, ch1, LSL, 16);
 891         orr(ch1, ch1, ch1, LSL, 32);
 892       BIND(CH1_LOOP);
 893         ldr(ch2, Address(str2, cnt2_neg));
 894         eor(ch2, ch1, ch2);
 895         sub(tmp1, ch2, tmp3);
 896         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 897         bics(tmp1, tmp1, tmp2);
 898         br(NE, HAS_ZERO);
 899         adds(cnt2_neg, cnt2_neg, 8);
 900         br(LT, CH1_LOOP);
 901 
 902         cmp(cnt2_neg, (u1)8);
 903         mov(cnt2_neg, 0);
 904         br(LT, CH1_LOOP);
 905         b(NOMATCH);
 906 
 907       BIND(HAS_ZERO);
 908         rev(tmp1, tmp1);
 909         clz(tmp1, tmp1);
 910         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
 911         b(MATCH);
 912 
 913       BIND(DO1_SHORT);
 914         mov(result_tmp, cnt2);
 915         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
 916         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
 917       BIND(DO1_LOOP);
 918         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 919         cmpw(ch1, ch2);
 920         br(EQ, MATCH);
 921         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 922         br(LT, DO1_LOOP);
 923     }
 924   }
 925   BIND(NOMATCH);
 926     mov(result, -1);
 927     b(DONE);
 928   BIND(MATCH);
 929     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
 930   BIND(DONE);
 931 }
 932 
// Member-function-pointer types used to select the per-character load
// (ldrb vs. ldrh) and zero-extension (uxtbw vs. uxthw) instruction
// depending on whether a string is Latin1 (L) or UTF-16 (U) encoded.
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
 935 
// Search a UTF-16 string for a single character.
//
// Finds the first occurrence of the halfword `ch` in the string at `str1`
// holding `cnt1` characters. On return, `result` is the character index of
// the first match, or -1 if `ch` does not occur.
//
// Strategy: for strings of >= 4 characters, `ch` is replicated into all
// four halfwords of a 64-bit register and the string is scanned one
// doubleword (4 chars) at a time, using the SWAR zero-in-word trick
// (eor/sub/orr/bics) to detect a matching lane. Shorter strings (and the
// final tail) are scanned one character at a time.
//
// Clobbers: str1, cnt1, ch, tmp1, tmp2, tmp3, rscratch1, rscratch2, rflags
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;     // negative byte offset, counting up towards zero
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  // Empty string: no match.
  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  // Replicate ch into all four 16-bit lanes of the register.
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  // Point str1 at the last whole doubleword and scan with a negative index.
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);                      // a matching lane becomes 0x0000
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7fff7fff7fff7fff);
    bics(tmp1, tmp1, tmp2);                 // non-zero iff some lane was zero
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    // Tail: re-scan the last (possibly overlapping) doubleword once.
    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    // Locate the first zero lane: byte offset = clz(rev(tmp1)) / 8.
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    // Fewer than 4 characters: simple per-character loop.
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1, Address::uxtw(1)));
    sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
    (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
    cmpw(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 2);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // Convert the (possibly updated) byte offset back to a character index.
    add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}
 998 
 999 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
1000                                                 Register ch, Register result,
1001                                                 FloatRegister ztmp1,
1002                                                 FloatRegister ztmp2,
1003                                                 PRegister tmp_pg,
1004                                                 PRegister tmp_pdn, bool isL)
1005 {
1006   // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
1007   assert(tmp_pg->is_governing(),
1008          "this register has to be a governing predicate register");
1009 
1010   Label LOOP, MATCH, DONE, NOMATCH;
1011   Register vec_len = rscratch1;
1012   Register idx = rscratch2;
1013 
1014   SIMD_RegVariant T = (isL == true) ? B : H;
1015 
1016   cbz(cnt1, NOMATCH);
1017 
1018   // Assign the particular char throughout the vector.
1019   sve_dup(ztmp2, T, ch);
1020   if (isL) {
1021     sve_cntb(vec_len);
1022   } else {
1023     sve_cnth(vec_len);
1024   }
1025   mov(idx, 0);
1026 
1027   // Generate a predicate to control the reading of input string.
1028   sve_whilelt(tmp_pg, T, idx, cnt1);
1029 
1030   BIND(LOOP);
1031     // Read a vector of 8- or 16-bit data depending on the string type. Note
1032     // that inactive elements indicated by the predicate register won't cause
1033     // a data read from memory to the destination vector.
1034     if (isL) {
1035       sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
1036     } else {
1037       sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
1038     }
1039     add(idx, idx, vec_len);
1040 
1041     // Perform the comparison. An element of the destination predicate is set
1042     // to active if the particular char is matched.
1043     sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);
1044 
1045     // Branch if the particular char is found.
1046     br(NE, MATCH);
1047 
1048     sve_whilelt(tmp_pg, T, idx, cnt1);
1049 
1050     // Loop back if the particular char not found.
1051     br(MI, LOOP);
1052 
1053   BIND(NOMATCH);
1054     mov(result, -1);
1055     b(DONE);
1056 
1057   BIND(MATCH);
1058     // Undo the index increment.
1059     sub(idx, idx, vec_len);
1060 
1061     // Crop the vector to find its location.
1062     sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
1063     add(result, idx, -1);
1064     sve_incp(result, T, tmp_pdn);
1065   BIND(DONE);
1066 }
1067 
// Search a Latin1 (single-byte) string for a single character.
//
// Finds the first occurrence of the byte `ch` in the string at `str1`
// holding `cnt1` characters. On return, `result` is the index of the first
// match, or -1 if `ch` does not occur.
//
// Strategy: for strings of >= 8 characters, `ch` is replicated into all
// eight bytes of a 64-bit register and the string is scanned one doubleword
// (8 chars) at a time, using the SWAR zero-in-word trick (eor/sub/orr/bics)
// to detect a matching lane. Shorter strings (and the final tail) are
// scanned one byte at a time.
//
// Clobbers: str1, cnt1, ch, tmp1, tmp2, tmp3, rscratch1, rscratch2, rflags
void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;     // negative byte offset, counting up towards zero
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  // Empty string: no match.
  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)8);
  br(LT, DO1_SHORT);

  // Replicate ch into all eight byte lanes of the register.
  orr(ch, ch, ch, LSL, 8);
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 8);
  mov(result_tmp, cnt1);
  // Point str1 at the last whole doubleword and scan with a negative index.
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);

  mov(tmp3, 0x0101010101010101);

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);                      // a matching lane becomes 0x00
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
    bics(tmp1, tmp1, tmp2);                 // non-zero iff some lane was zero
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    // Tail: re-scan the last (possibly overlapping) doubleword once.
    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    // Locate the first zero lane: byte offset = clz(rev(tmp1)) / 8.
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    // Fewer than 8 characters: simple per-byte loop.
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1));
    sub(cnt1_neg, zr, cnt1);
  BIND(DO1_LOOP);
    ldrb(ch1, Address(str1, cnt1_neg));
    cmp(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 1);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg);
  BIND(DONE);
}
1131 
// Compare two strings lexicographically; the usual negative/zero/positive
// comparison value is left in `result`.
//
// `ae` (see StrIntrinsicNode) encodes the encodings of the two strings:
// LL, UU, LU or UL. For mixed encodings the Latin1 side is widened on the
// fly with zip1 against a zero vector. Strings at or above `stub_threshold`
// characters are handed off to the out-of-line compare_long_string stubs.
//
// NOTE(review): pgtmp1/pgtmp2 are not referenced by this implementation;
// they appear to exist for signature compatibility — confirm with callers.
//
// Clobbers: rscratch1, rscratch2, vtmp1-vtmp3, rflags; the count and
// string registers are modified.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      // Identical addresses compare equal; length difference is already
      // in result.
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      // Widen the 4 Latin1 bytes to halfwords by interleaving with zeros.
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      ldr(tmp1, Address(str1));
      ldrs(vtmp, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      // Widen the 4 Latin1 bytes to halfwords by interleaving with zeros.
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFF);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFF);
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
    RuntimeAddress stub = nullptr;
    switch(ae) {
      case StrIntrinsicNode::LL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
        break;
      case StrIntrinsicNode::UU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
        break;
      case StrIntrinsicNode::LU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
        break;
      case StrIntrinsicNode::UL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
        break;
      default:
        ShouldNotReachHere();
     }
    assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // arrange code to do most branches while loading and loading next characters
  // while comparing previous
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}
1367 
1368 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
1369                                      FloatRegister src2, Condition cond, bool isQ) {
1370   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1371   FloatRegister zn = src1, zm = src2;
1372   bool needs_negation = false;
1373   switch (cond) {
1374     case LT: cond = GT; zn = src2; zm = src1; break;
1375     case LE: cond = GE; zn = src2; zm = src1; break;
1376     case LO: cond = HI; zn = src2; zm = src1; break;
1377     case LS: cond = HS; zn = src2; zm = src1; break;
1378     case NE: cond = EQ; needs_negation = true; break;
1379     default:
1380       break;
1381   }
1382 
1383   if (is_floating_point_type(bt)) {
1384     fcm(cond, dst, size, zn, zm);
1385   } else {
1386     cm(cond, dst, size, zn, zm);
1387   }
1388 
1389   if (needs_negation) {
1390     notr(dst, isQ ? T16B : T8B, dst);
1391   }
1392 }
1393 
1394 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1395                                           Condition cond, bool isQ) {
1396   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1397   if (bt == T_FLOAT || bt == T_DOUBLE) {
1398     if (cond == Assembler::NE) {
1399       fcm(Assembler::EQ, dst, size, src);
1400       notr(dst, isQ ? T16B : T8B, dst);
1401     } else {
1402       fcm(cond, dst, size, src);
1403     }
1404   } else {
1405     if (cond == Assembler::NE) {
1406       cm(Assembler::EQ, dst, size, src);
1407       notr(dst, isQ ? T16B : T8B, dst);
1408     } else {
1409       cm(cond, dst, size, src);
1410     }
1411   }
1412 }
1413 
// Compress the least significant bit of each byte to the rightmost and clear
// the higher garbage bits. After the final AND, the 8 collected bits sit in
// the low byte of dst and all other bits are zero.
void C2_MacroAssembler::bytemask_compress(Register dst) {
  // Example input, dst = 0x01 00 00 00 01 01 00 01
  // The "??" bytes are garbage.
  orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
  orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
  orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
  andr(dst, dst, 0xff);                   // dst = 0x8D
}
1424 
// Pack the value of each mask element in "src" into a long value in "dst", at most
// the first 64 lane elements. The input "src" is a vector of boolean represented as
// bytes with 0x00/0x01 as element values. Each lane value from "src" is packed into
// one bit in "dst".
//
// Example:   src = 0x0001010000010001 0100000001010001, lane_cnt = 16
// Expected:  dst = 0x658D
//
// Clobbers: rscratch1
void C2_MacroAssembler::sve_vmask_tolong(Register dst, FloatRegister src,
                                         FloatRegister vtmp, int lane_cnt) {
  assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
  assert_different_registers(dst, rscratch1);
  assert_different_registers(src, vtmp);
  assert(UseSVE > 0, "must be");

  // Compress the lowest 8 bytes.
  fmovd(dst, src);          // move the low 64 bits (8 boolean lanes) to dst
  bytemask_compress(dst);   // -> 8 mask bits in the low byte of dst
  if (lane_cnt <= 8) return;

  // Repeat on higher bytes and join the results.
  // Compress 8 bytes in each iteration.
  for (int idx = 1; idx < (lane_cnt / 8); idx++) {
    // Extract the idx-th 64-bit chunk of the mask vector into rscratch1.
    sve_extract_integral(rscratch1, T_LONG, src, idx, vtmp);
    bytemask_compress(rscratch1);
    // Splice the 8 new bits in at bit position idx*8.
    orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
  }
}
1454 
// The function is same as above "sve_vmask_tolong", but it uses SVE2's BEXT
// instruction which requires the FEAT_BITPERM feature.
//
// Clobbers: vtmp1, vtmp2
void C2_MacroAssembler::sve2_vmask_tolong(Register dst, FloatRegister src,
                                          FloatRegister vtmp1, FloatRegister vtmp2,
                                          int lane_cnt) {
  assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
  assert_different_registers(src, vtmp1, vtmp2);
  assert(UseSVE > 1 && VM_Version::supports_svebitperm(), "must be");

  // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
  // is to compress each significant bit of the byte in a cross-lane way. Due
  // to the lack of a cross-lane bit-compress instruction, we use BEXT
  // (bit-compress in each lane) with the biggest lane size (T = D) then
  // concatenate the results.

  // The second source input of BEXT, initialized with 0x01 in each byte.
  // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
  sve_dup(vtmp2, B, 1);

  // BEXT vtmp1.D, src.D, vtmp2.D
  // src   = 0x0001010000010001 | 0x0100000001010001
  // vtmp2 = 0x0101010101010101 | 0x0101010101010101
  //         ---------------------------------------
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  sve_bext(vtmp1, D, src, vtmp2);

  // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the
  // result to dst.
  // vtmp1 = 0x0000000000000000 | 0x000000000000658D
  // dst   = 0x658D
  if (lane_cnt <= 8) {
    // No need to concatenate.
    umov(dst, vtmp1, B, 0);
  } else if (lane_cnt <= 16) {
    // Move the second chunk's byte next to the first, then read a halfword.
    ins(vtmp1, B, vtmp1, 1, 8);
    umov(dst, vtmp1, H, 0);
  } else {
    // As the lane count is 64 at most, the final expected value must be in
    // the lowest 64 bits after narrowing vtmp1 from D to B.
    sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
    umov(dst, vtmp1, D, 0);
  }
}
1498 
// Unpack the mask, a long value in "src", into a vector register of boolean
// represented as bytes with 0x00/0x01 as element values in "dst".  Each bit in
// "src" is unpacked into one byte lane in "dst". Note that "dst" can support at
// most 64 lanes.
//
// Below example gives the expected dst vector register, with a valid src(0x658D)
// on a 128-bit vector size machine.
// dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
//
// Clobbers: vtmp
void C2_MacroAssembler::sve_vmask_fromlong(FloatRegister dst, Register src,
                                           FloatRegister vtmp, int lane_cnt) {
  assert_different_registers(dst, vtmp);
  assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
         lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");

  // Example:   src = 0x658D, lane_cnt = 16
  // Expected:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01

  // Put long value from general purpose register into the first lane of vector.
  // vtmp = 0x0000000000000000 | 0x000000000000658D
  sve_dup(vtmp, B, 0);
  mov(vtmp, D, 0, src);

  // Transform the value in the first lane which is mask in bit now to the mask in
  // byte, which can be done by SVE2's BDEP instruction.

  // The first source input of BDEP instruction. Deposit each byte in every 8 bytes.
  // vtmp = 0x0000000000000065 | 0x000000000000008D
  if (lane_cnt <= 8) {
    // Nothing. As only one byte exists.
  } else if (lane_cnt <= 16) {
    ins(vtmp, B, vtmp, 8, 1);
  } else {
    sve_vector_extend(vtmp, D, vtmp, B);
  }

  // The second source input of BDEP instruction, initialized with 0x01 for each byte.
  // dst = 0x01010101 0x01010101 0x01010101 0x01010101
  sve_dup(dst, B, 1);

  // BDEP dst.D, vtmp.D, dst.D
  // vtmp = 0x0000000000000065 | 0x000000000000008D
  // dst  = 0x0101010101010101 | 0x0101010101010101
  //        ---------------------------------------
  // dst  = 0x0001010000010001 | 0x0100000001010001
  sve_bdep(dst, D, vtmp, dst);
}
1545 
1546 // Clobbers: rflags
1547 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1548                                     FloatRegister zn, FloatRegister zm, Condition cond) {
1549   assert(pg->is_governing(), "This register has to be a governing predicate register");
1550   FloatRegister z1 = zn, z2 = zm;
1551   switch (cond) {
1552     case LE: z1 = zm; z2 = zn; cond = GE; break;
1553     case LT: z1 = zm; z2 = zn; cond = GT; break;
1554     case LO: z1 = zm; z2 = zn; cond = HI; break;
1555     case LS: z1 = zm; z2 = zn; cond = HS; break;
1556     default:
1557       break;
1558   }
1559 
1560   SIMD_RegVariant size = elemType_to_regVariant(bt);
1561   if (is_floating_point_type(bt)) {
1562     sve_fcm(cond, pd, size, pg, z1, z2);
1563   } else {
1564     assert(is_integral_type(bt), "unsupported element type");
1565     sve_cmp(cond, pd, size, pg, z1, z2);
1566   }
1567 }
1568 
// Get index of the last mask lane that is set. The mask is reversed so the
// last set lane becomes the first; BRKB then keeps only the lanes strictly
// before that first set lane, CNTP counts them, and the count is subtracted
// from (lane count - 1) to recover the original index.
//
// Clobbers: rscratch1, ptmp
void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
  SIMD_RegVariant size = elemType_to_regVariant(bt);
  sve_rev(ptmp, size, src);
  sve_brkb(ptmp, ptrue, ptmp, false);
  sve_cntp(dst, size, ptrue, ptmp);
  movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
  subw(dst, rscratch1, dst);
}
1578 
// Extend integer vector src to dst with the same lane count
// but larger element size, e.g. 4B -> 4I.
//
// `is_unsigned` selects zero- versus sign-extension. The widening is done
// in one _xshll step per element-size doubling, so a two-size jump
// (B -> I, S -> L) takes two steps through dst.
void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
                                           FloatRegister src, BasicType src_bt, bool is_unsigned) {
  if (src_bt == T_BYTE) {
    // 4B to 4S/4I, 8B to 8S
    assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_SHORT || dst_bt == T_INT, "unsupported");
    _xshll(is_unsigned, dst, T8H, src, T8B, 0);
    if (dst_bt == T_INT) {
      _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
    }
  } else if (src_bt == T_SHORT) {
    // 2S to 2I/2L, 4S to 4I
    assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_INT || dst_bt == T_LONG, "unsupported");
    _xshll(is_unsigned, dst, T4S, src, T4H, 0);
    if (dst_bt == T_LONG) {
      _xshll(is_unsigned, dst, T2D, dst, T2S, 0);
    }
  } else if (src_bt == T_INT) {
    // 2I to 2L
    assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
    _xshll(is_unsigned, dst, T2D, src, T2S, 0);
  } else {
    ShouldNotReachHere();
  }
}
1607 
// Narrow integer vector src down to dst with the same lane count
// but smaller element size, e.g. 4I -> 4B.
//
// Uses XTN (truncating narrow: high-order bits of each lane are discarded),
// with one step per element-size halving, so a two-size drop (I -> B,
// L -> S) takes two steps through dst.
void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
                                           FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
  if (src_bt == T_SHORT) {
    // 4S/8S to 4B/8B
    assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_BYTE, "unsupported");
    xtn(dst, T8B, src, T8H);
  } else if (src_bt == T_INT) {
    // 2I to 2S, 4I to 4B/4S
    assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
    xtn(dst, T4H, src, T4S);
    if (dst_bt == T_BYTE) {
      xtn(dst, T8B, dst, T8H);
    }
  } else if (src_bt == T_LONG) {
    // 2L to 2S/2I
    assert(src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_INT || dst_bt == T_SHORT, "unsupported");
    xtn(dst, T2S, src, T2D);
    if (dst_bt == T_SHORT) {
      xtn(dst, T4H, dst, T4S);
    }
  } else {
    ShouldNotReachHere();
  }
}
1637 
1638 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1639                                           FloatRegister src, SIMD_RegVariant src_size,
1640                                           bool is_unsigned) {
1641   assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
1642 
1643   if (src_size == B) {
1644     switch (dst_size) {
1645     case H:
1646       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1647       break;
1648     case S:
1649       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1650       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1651       break;
1652     case D:
1653       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1654       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1655       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1656       break;
1657     default:
1658       ShouldNotReachHere();
1659     }
1660   } else if (src_size == H) {
1661     if (dst_size == S) {
1662       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1663     } else { // D
1664       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1665       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1666     }
1667   } else if (src_size == S) {
1668     _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
1669   }
1670 }
1671 
1672 // Vector narrow from src to dst with specified element sizes.
1673 // High part of dst vector will be filled with zero.
1674 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1675                                           FloatRegister src, SIMD_RegVariant src_size,
1676                                           FloatRegister tmp) {
1677   assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1678   assert_different_registers(src, tmp);
1679   sve_dup(tmp, src_size, 0);
1680   if (src_size == D) {
1681     switch (dst_size) {
1682     case S:
1683       sve_uzp1(dst, S, src, tmp);
1684       break;
1685     case H:
1686       assert_different_registers(dst, tmp);
1687       sve_uzp1(dst, S, src, tmp);
1688       sve_uzp1(dst, H, dst, tmp);
1689       break;
1690     case B:
1691       assert_different_registers(dst, tmp);
1692       sve_uzp1(dst, S, src, tmp);
1693       sve_uzp1(dst, H, dst, tmp);
1694       sve_uzp1(dst, B, dst, tmp);
1695       break;
1696     default:
1697       ShouldNotReachHere();
1698     }
1699   } else if (src_size == S) {
1700     if (dst_size == H) {
1701       sve_uzp1(dst, H, src, tmp);
1702     } else { // B
1703       assert_different_registers(dst, tmp);
1704       sve_uzp1(dst, H, src, tmp);
1705       sve_uzp1(dst, B, dst, tmp);
1706     }
1707   } else if (src_size == H) {
1708     sve_uzp1(dst, B, src, tmp);
1709   }
1710 }
1711 
1712 // Extend src predicate to dst predicate with the same lane count but larger
1713 // element size, e.g. 64Byte -> 512Long
1714 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1715                                              uint dst_element_length_in_bytes,
1716                                              uint src_element_length_in_bytes) {
1717   if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1718     sve_punpklo(dst, src);
1719   } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1720     sve_punpklo(dst, src);
1721     sve_punpklo(dst, dst);
1722   } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1723     sve_punpklo(dst, src);
1724     sve_punpklo(dst, dst);
1725     sve_punpklo(dst, dst);
1726   } else {
1727     assert(false, "unsupported");
1728     ShouldNotReachHere();
1729   }
1730 }
1731 
1732 // Narrow src predicate to dst predicate with the same lane count but
1733 // smaller element size, e.g. 512Long -> 64Byte
1734 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1735                                              uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
1736   // The insignificant bits in src predicate are expected to be zero.
1737   // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
1738   // passed as the second argument. An example narrowing operation with a given mask would be -
1739   // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I
1740   // Mask (for 2 Longs) : TF
1741   // Predicate register for the above mask (16 bits) : 00000001 00000000
1742   // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
1743   // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0)
1744   assert_different_registers(src, ptmp);
1745   assert_different_registers(dst, ptmp);
1746   sve_pfalse(ptmp);
1747   if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1748     sve_uzp1(dst, B, src, ptmp);
1749   } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1750     sve_uzp1(dst, H, src, ptmp);
1751     sve_uzp1(dst, B, dst, ptmp);
1752   } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1753     sve_uzp1(dst, S, src, ptmp);
1754     sve_uzp1(dst, H, dst, ptmp);
1755     sve_uzp1(dst, B, dst, ptmp);
1756   } else {
1757     assert(false, "unsupported");
1758     ShouldNotReachHere();
1759   }
1760 }
1761 
1762 // Vector reduction add for integral type with ASIMD instructions.
1763 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1764                                                  Register isrc, FloatRegister vsrc,
1765                                                  unsigned vector_length_in_bytes,
1766                                                  FloatRegister vtmp) {
1767   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1768   assert_different_registers(dst, isrc);
1769   bool isQ = vector_length_in_bytes == 16;
1770 
1771   BLOCK_COMMENT("neon_reduce_add_integral {");
1772     switch(bt) {
1773       case T_BYTE:
1774         addv(vtmp, isQ ? T16B : T8B, vsrc);
1775         smov(dst, vtmp, B, 0);
1776         addw(dst, dst, isrc, ext::sxtb);
1777         break;
1778       case T_SHORT:
1779         addv(vtmp, isQ ? T8H : T4H, vsrc);
1780         smov(dst, vtmp, H, 0);
1781         addw(dst, dst, isrc, ext::sxth);
1782         break;
1783       case T_INT:
1784         isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1785         umov(dst, vtmp, S, 0);
1786         addw(dst, dst, isrc);
1787         break;
1788       case T_LONG:
1789         assert(isQ, "unsupported");
1790         addpd(vtmp, vsrc);
1791         umov(dst, vtmp, D, 0);
1792         add(dst, dst, isrc);
1793         break;
1794       default:
1795         assert(false, "unsupported");
1796         ShouldNotReachHere();
1797     }
1798   BLOCK_COMMENT("} neon_reduce_add_integral");
1799 }
1800 
1801 // Vector reduction multiply for integral type with ASIMD instructions.
1802 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
1803 // Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
                                                 Register isrc, FloatRegister vsrc,
                                                 unsigned vector_length_in_bytes,
                                                 FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_mul_integral {");
    switch(bt) {
      case T_BYTE:
        if (isQ) {
          // Multiply the lower half and higher half of vector iteratively.
          // vtmp1 = vsrc[8:15]
          ins(vtmp1, D, vsrc, 0, 1);
          // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
          mulv(vtmp1, T8B, vtmp1, vsrc);
          // vtmp2 = vtmp1[4:7]
          ins(vtmp2, S, vtmp1, 0, 1);
          // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
          mulv(vtmp1, T8B, vtmp2, vtmp1);
        } else {
          // vtmp1 = vsrc[4:7]
          ins(vtmp1, S, vsrc, 0, 1);
          // vtmp1[n] = vsrc[n] * vsrc[n + 4], where n=[0, 3]
          mulv(vtmp1, T8B, vtmp1, vsrc);
        }
        // vtmp2 = vtmp1[2:3]
        ins(vtmp2, H, vtmp1, 0, 1);
        // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
        mulv(vtmp2, T8B, vtmp2, vtmp1);
        // dst = vtmp2[0] * isrc * vtmp2[1], sign-extending to byte after
        // each scalar multiply to keep byte semantics.
        umov(rscratch1, vtmp2, B, 0);
        mulw(dst, rscratch1, isrc);
        sxtb(dst, dst);
        umov(rscratch1, vtmp2, B, 1);
        mulw(dst, rscratch1, dst);
        sxtb(dst, dst);
        break;
      case T_SHORT:
        // Same strategy as T_BYTE: halve the number of live lanes with a
        // vector multiply until two lanes remain, then finish with scalar
        // multiplies against isrc, sign-extending to short each time.
        if (isQ) {
          ins(vtmp2, D, vsrc, 0, 1);
          mulv(vtmp2, T4H, vtmp2, vsrc);
          ins(vtmp1, S, vtmp2, 0, 1);
          mulv(vtmp1, T4H, vtmp1, vtmp2);
        } else {
          ins(vtmp1, S, vsrc, 0, 1);
          mulv(vtmp1, T4H, vtmp1, vsrc);
        }
        umov(rscratch1, vtmp1, H, 0);
        mulw(dst, rscratch1, isrc);
        sxth(dst, dst);
        umov(rscratch1, vtmp1, H, 1);
        mulw(dst, rscratch1, dst);
        sxth(dst, dst);
        break;
      case T_INT:
        if (isQ) {
          // Fold the upper two lanes into the lower two lanes.
          ins(vtmp1, D, vsrc, 0, 1);
          mulv(vtmp1, T2S, vtmp1, vsrc);
        } else {
          // Only two lanes to begin with; no vector folding needed.
          // Note: vtmp1 deliberately aliases vsrc in this path.
          vtmp1 = vsrc;
        }
        // dst = vtmp1[0] * isrc * vtmp1[1]
        umov(rscratch1, vtmp1, S, 0);
        mul(dst, rscratch1, isrc);
        umov(rscratch1, vtmp1, S, 1);
        mul(dst, rscratch1, dst);
        break;
      case T_LONG:
        // Two 64-bit lanes: multiply both with isrc using scalar multiplies.
        umov(rscratch1, vsrc, D, 0);
        mul(dst, isrc, rscratch1);
        umov(rscratch1, vsrc, D, 1);
        mul(dst, dst, rscratch1);
        break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  BLOCK_COMMENT("} neon_reduce_mul_integral");
}
1881 
1882 // Vector reduction multiply for floating-point type with ASIMD instructions.
1883 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
1884                                            FloatRegister fsrc, FloatRegister vsrc,
1885                                            unsigned vector_length_in_bytes,
1886                                            FloatRegister vtmp) {
1887   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1888   bool isQ = vector_length_in_bytes == 16;
1889 
1890   BLOCK_COMMENT("neon_reduce_mul_fp {");
1891     switch(bt) {
1892       case T_FLOAT:
1893         fmuls(dst, fsrc, vsrc);
1894         ins(vtmp, S, vsrc, 0, 1);
1895         fmuls(dst, dst, vtmp);
1896         if (isQ) {
1897           ins(vtmp, S, vsrc, 0, 2);
1898           fmuls(dst, dst, vtmp);
1899           ins(vtmp, S, vsrc, 0, 3);
1900           fmuls(dst, dst, vtmp);
1901          }
1902         break;
1903       case T_DOUBLE:
1904         assert(isQ, "unsupported");
1905         fmuld(dst, fsrc, vsrc);
1906         ins(vtmp, D, vsrc, 0, 1);
1907         fmuld(dst, dst, vtmp);
1908         break;
1909       default:
1910         assert(false, "unsupported");
1911         ShouldNotReachHere();
1912     }
1913   BLOCK_COMMENT("} neon_reduce_mul_fp");
1914 }
1915 
1916 // Helper to select logical instruction
1917 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
1918                                                    Register Rn, Register Rm,
1919                                                    enum shift_kind kind, unsigned shift) {
1920   switch(opc) {
1921     case Op_AndReductionV:
1922       is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
1923       break;
1924     case Op_OrReductionV:
1925       is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
1926       break;
1927     case Op_XorReductionV:
1928       is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
1929       break;
1930     default:
1931       assert(false, "unsupported");
1932       ShouldNotReachHere();
1933   }
1934 }
1935 
1936 // Vector reduction logical operations And, Or, Xor
1937 // Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
                                            Register isrc, FloatRegister vsrc,
                                            unsigned vector_length_in_bytes) {
  assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
         "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_logical {");
    // Move the two vector halves (64-bit halves for 128-bit vectors, 32-bit
    // halves otherwise) into general registers and combine them, then keep
    // folding the result in half using the logical op with a shifted operand
    // until a single element of type bt remains.
    umov(rscratch1, vsrc, isQ ? D : S, 0);
    umov(dst, vsrc, isQ ? D : S, 1);
    neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
    switch(bt) {
      case T_BYTE:
        if (isQ) {
          // Fold 64 bits down to 32.
          neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
        }
        // Fold 32 -> 16 -> 8 bits, combine with the scalar input, then
        // sign-extend the final byte.
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
        sxtb(dst, dst);
        break;
      case T_SHORT:
        if (isQ) {
          // Fold 64 bits down to 32.
          neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
        }
        // Fold 32 -> 16 bits, combine with the scalar input, then
        // sign-extend the final halfword.
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
        sxth(dst, dst);
        break;
      case T_INT:
        if (isQ) {
          // Fold 64 bits down to 32.
          neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
        }
        // Combine with the scalar input.
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
        break;
      case T_LONG:
        // The initial combine already produced a single 64-bit value;
        // just fold in the scalar input.
        assert(isQ, "unsupported");
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
        break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  BLOCK_COMMENT("} neon_reduce_logical");
}
1985 
1986 // Helper function to decode min/max reduction operation properties
1987 void C2_MacroAssembler::decode_minmax_reduction_opc(int opc, bool* is_min,
1988                                                     bool* is_unsigned,
1989                                                     Condition* cond) {
1990   switch(opc) {
1991     case Op_MinReductionV:
1992       *is_min = true;  *is_unsigned = false; *cond = LT; break;
1993     case Op_MaxReductionV:
1994       *is_min = false; *is_unsigned = false; *cond = GT; break;
1995     case Op_UMinReductionV:
1996       *is_min = true;  *is_unsigned = true;  *cond = LO; break;
1997     case Op_UMaxReductionV:
1998       *is_min = false; *is_unsigned = true;  *cond = HI; break;
1999     default:
2000       ShouldNotReachHere();
2001   }
2002 }
2003 
2004 // Vector reduction min/max/umin/umax for integral type with ASIMD instructions.
2005 // Note: vtmp is not used and expected to be fnoreg for T_LONG case.
2006 // Clobbers: rscratch1, rflags
void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
                                                    Register isrc, FloatRegister vsrc,
                                                    unsigned vector_length_in_bytes,
                                                    FloatRegister vtmp) {
  assert(opc == Op_MinReductionV || opc == Op_MaxReductionV ||
         opc == Op_UMinReductionV || opc == Op_UMaxReductionV, "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;
  bool is_min;
  bool is_unsigned;
  Condition cond;
  decode_minmax_reduction_opc(opc, &is_min, &is_unsigned, &cond);
  BLOCK_COMMENT("neon_reduce_minmax_integral {");
    if (bt == T_LONG) {
      // Fold the two 64-bit lanes and the scalar input with scalar
      // compare-and-select sequences.
      assert(vtmp == fnoreg, "should be");
      assert(isQ, "should be");
      umov(rscratch1, vsrc, D, 0);
      cmp(isrc, rscratch1);
      csel(dst, isrc, rscratch1, cond);
      umov(rscratch1, vsrc, D, 1);
      cmp(dst, rscratch1);
      csel(dst, dst, rscratch1, cond);
    } else {
      SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
      if (size == T2S) {
        // For T2S (2x32-bit elements), use pairwise instructions because
        // uminv/umaxv/sminv/smaxv don't support arrangement 2S.
        neon_minmaxp(is_unsigned, is_min, vtmp, size, vsrc, vsrc);
      } else {
        // For other sizes, use reduction to scalar instructions.
        neon_minmaxv(is_unsigned, is_min, vtmp, size, vsrc);
      }
      // Move the lane-0 result to a general register: umov zero-extends
      // (full ints and all unsigned results), smov sign-extends signed
      // sub-word results.
      if (bt == T_INT) {
        umov(dst, vtmp, S, 0);
      } else if (is_unsigned) {
        umov(dst, vtmp, elemType_to_regVariant(bt), 0);
      } else {
        smov(dst, vtmp, elemType_to_regVariant(bt), 0);
      }
      // Combine the vector result with the scalar input.
      cmpw(dst, isrc);
      cselw(dst, dst, isrc, cond);
    }
  BLOCK_COMMENT("} neon_reduce_minmax_integral");
}
2053 
2054 // Vector reduction for integral type with SVE instruction.
2055 // Supported operations are Add, And, Or, Xor, Max, Min, UMax, UMin.
2056 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
                                            FloatRegister src2, PRegister pg, FloatRegister tmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  assert(pg->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(src1, dst);
  // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  switch (opc) {
    case Op_AddReductionVI: {
      // Reduce the vector with an unsigned add-across, then add the scalar
      // input, sign-extending sub-word values in the addw itself.
      sve_uaddv(tmp, size, pg, src2);
      if (bt == T_BYTE) {
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxtb);
      } else if (bt == T_SHORT) {
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxth);
      } else {
        umov(dst, tmp, size, 0);
        addw(dst, dst, src1);
      }
      break;
    }
    case Op_AddReductionVL: {
      sve_uaddv(tmp, size, pg, src2);
      umov(dst, tmp, size, 0);
      add(dst, dst, src1);
      break;
    }
    case Op_AndReductionV: {
      // smov sign-extends sub-word results; int/long use umov.
      sve_andv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        andr(dst, dst, src1);
      } else {
        andw(dst, dst, src1);
      }
      break;
    }
    case Op_OrReductionV: {
      sve_orv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        orr(dst, dst, src1);
      } else {
        orrw(dst, dst, src1);
      }
      break;
    }
    case Op_XorReductionV: {
      sve_eorv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        eor(dst, dst, src1);
      } else {
        eorw(dst, dst, src1);
      }
      break;
    }
    case Op_MaxReductionV:
    case Op_MinReductionV:
    case Op_UMaxReductionV:
    case Op_UMinReductionV: {
      bool is_min;
      bool is_unsigned;
      Condition cond;
      decode_minmax_reduction_opc(opc, &is_min, &is_unsigned, &cond);
      sve_minmaxv(is_unsigned, is_min, tmp, size, pg, src2);
      // Move result from vector to general register
      if (is_unsigned || bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      // Combine the vector result with the scalar input using
      // compare-and-select; rflags are clobbered here.
      if (bt == T_LONG) {
        cmp(dst, src1);
        csel(dst, dst, src1, cond);
      } else {
        cmpw(dst, src1);
        cselw(dst, dst, src1, cond);
      }
      break;
    }
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }

  // The logical reductions above combine sub-word values with 32-bit ops,
  // so sign-extend byte/short results to restore element semantics.
  if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
    if (bt == T_BYTE) {
      sxtb(dst, dst);
    } else if (bt == T_SHORT) {
      sxth(dst, dst);
    }
  }
}
2164 
2165 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
2166 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
2167 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
2168 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
2169   uint32_t max_vector_length = Matcher::max_vector_size(bt);
2170   assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
2171 
2172   // Set all elements to false if the input "lane_cnt" is zero.
2173   if (lane_cnt == 0) {
2174     sve_pfalse(dst);
2175     return;
2176   }
2177 
2178   SIMD_RegVariant size = elemType_to_regVariant(bt);
2179   assert(size != Q, "invalid size");
2180 
2181   // Set all true if "lane_cnt" equals to the max lane count.
2182   if (lane_cnt == max_vector_length) {
2183     sve_ptrue(dst, size, /* ALL */ 0b11111);
2184     return;
2185   }
2186 
2187   // Fixed numbers for "ptrue".
2188   switch(lane_cnt) {
2189   case 1: /* VL1 */
2190   case 2: /* VL2 */
2191   case 3: /* VL3 */
2192   case 4: /* VL4 */
2193   case 5: /* VL5 */
2194   case 6: /* VL6 */
2195   case 7: /* VL7 */
2196   case 8: /* VL8 */
2197     sve_ptrue(dst, size, lane_cnt);
2198     return;
2199   case 16:
2200     sve_ptrue(dst, size, /* VL16 */ 0b01001);
2201     return;
2202   case 32:
2203     sve_ptrue(dst, size, /* VL32 */ 0b01010);
2204     return;
2205   case 64:
2206     sve_ptrue(dst, size, /* VL64 */ 0b01011);
2207     return;
2208   case 128:
2209     sve_ptrue(dst, size, /* VL128 */ 0b01100);
2210     return;
2211   case 256:
2212     sve_ptrue(dst, size, /* VL256 */ 0b01101);
2213     return;
2214   default:
2215     break;
2216   }
2217 
2218   // Special patterns for "ptrue".
2219   if (lane_cnt == round_down_power_of_2(max_vector_length)) {
2220     sve_ptrue(dst, size, /* POW2 */ 0b00000);
2221   } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
2222     sve_ptrue(dst, size, /* MUL4 */ 0b11101);
2223   } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
2224     sve_ptrue(dst, size, /* MUL3 */ 0b11110);
2225   } else {
2226     // Encode to "whileltw" for the remaining cases.
2227     mov(rscratch1, lane_cnt);
2228     sve_whileltw(dst, size, zr, rscratch1);
2229   }
2230 }
2231 
2232 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
2233 // Any remaining elements of dst will be filled with zero.
2234 // Clobbers: rscratch1
2235 // Preserves: mask, vzr
void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
                                           FloatRegister vzr, FloatRegister vtmp,
                                           PRegister pgtmp, unsigned vector_length_in_bytes) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  // When called by sve_compress_byte, src and vtmp may be the same register.
  assert_different_registers(dst, src, vzr);
  assert_different_registers(dst, vtmp, vzr);
  assert_different_registers(mask, pgtmp);
  // vzr is expected to hold an all-zero vector (provided by the caller);
  // it supplies the zero elements when narrowing with uzp1.
  // high <-- low
  // Example input:   src   = hh gg ff ee dd cc bb aa, one character is 8 bits.
  //                  mask  = 01 00 00 01 01 00 01 01, one character is 1 bit.
  // Expected result: dst   = 00 00 00 hh ee dd bb aa

  // Extend lowest half to type INT.
  // dst   =  00dd  00cc  00bb  00aa
  sve_uunpklo(dst, S, src);
  // pgtmp =  0001  0000  0001  0001
  sve_punpklo(pgtmp, mask);
  // Pack the active elements in size of type INT to the right,
  // and fill the remainings with zero.
  // dst   =  0000  00dd  00bb  00aa
  sve_compact(dst, S, dst, pgtmp);
  // Narrow the result back to type SHORT.
  // dst   = 00 00 00 00 00 dd bb aa
  sve_uzp1(dst, H, dst, vzr);

  // Return if the vector length is no more than MaxVectorSize/2, since the
  // highest half is invalid.
  if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
    return;
  }

  // Count the active elements of lowest half.
  // rscratch1 = 3
  sve_cntp(rscratch1, S, ptrue, pgtmp);

  // Repeat to the highest half.
  // pgtmp =  0001  0000  0000  0001
  sve_punpkhi(pgtmp, mask);
  // vtmp  =  00hh  00gg  00ff  00ee
  sve_uunpkhi(vtmp, S, src);
  // vtmp  =  0000  0000  00hh  00ee
  sve_compact(vtmp, S, vtmp, pgtmp);
  // vtmp  = 00 00 00 00 00 00 hh ee
  sve_uzp1(vtmp, H, vtmp, vzr);

  // Build a predicate selecting as many low lanes as there were active
  // elements in the low half, so splice can insert the high result right
  // after the compressed low result.
  // pgtmp = 00 00 00 00 00 01 01 01
  sve_whilelt(pgtmp, H, zr, rscratch1);
  // Compressed low:  dst  = 00 00 00 00 00 dd bb aa
  // Compressed high: vtmp = 00 00 00 00 00 00 hh ee
  // Combine the compressed low with the compressed high:
  //                  dst  = 00 00 00 hh ee dd bb aa
  sve_splice(dst, H, pgtmp, vtmp);
}
2290 
2291 // Clobbers: rscratch1, rscratch2
2292 // Preserves: src, mask
void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
                                          FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
                                          PRegister ptmp, PRegister pgtmp, unsigned vector_length_in_bytes) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3);
  assert_different_registers(mask, ptmp, pgtmp);
  // high <-- low
  // Example input:   src   = q p n m l k j i h g f e d c b a, one character is 8 bits.
  //                  mask  = 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 1, one character is 1 bit.
  // Expected result: dst   = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
  FloatRegister vzr = vtmp3;
  sve_dup(vzr, B, 0);

  // Extend lowest half to type SHORT.
  // vtmp1 =  0h  0g  0f  0e  0d  0c  0b  0a
  sve_uunpklo(vtmp1, H, src);
  // ptmp  =  00  01  00  00  00  01  00  01
  sve_punpklo(ptmp, mask);
  // Pack the active elements in size of type SHORT to the right,
  // and fill the remainings with zero.
  // dst   =  00  00  00  00  00  0g  0c  0a
  // Widening to SHORT lanes doubles the effective vector length, so clamp
  // it to the hardware maximum for the recursive compress.
  unsigned extended_size = vector_length_in_bytes << 1;
  sve_compress_short(dst, vtmp1, ptmp, vzr, vtmp2, pgtmp, extended_size > MaxVectorSize ? MaxVectorSize : extended_size);
  // Narrow the result back to type BYTE.
  // dst   = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
  sve_uzp1(dst, B, dst, vzr);

  // Return if the vector length is no more than MaxVectorSize/2, since the
  // highest half is invalid.
  if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
    return;
  }
  // Count the active elements of lowest half.
  // rscratch2 = 3
  sve_cntp(rscratch2, H, ptrue, ptmp);

  // Repeat to the highest half.
  // ptmp  =  00  01  00  00  00  00  00  01
  sve_punpkhi(ptmp, mask);
  // vtmp2 =  0q  0p  0n  0m  0l  0k  0j  0i
  sve_uunpkhi(vtmp2, H, src);
  // vtmp1 =  00  00  00  00  00  00  0p  0i
  // Note: vtmp2 is passed as both src and vtmp, which sve_compress_short
  // explicitly permits.
  sve_compress_short(vtmp1, vtmp2, ptmp, vzr, vtmp2, pgtmp, extended_size - MaxVectorSize);
  // vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
  sve_uzp1(vtmp1, B, vtmp1, vzr);

  // Select as many low lanes as there were active elements in the low half,
  // so splice can append the compressed high part right after them.
  // ptmp  = 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
  sve_whilelt(ptmp, B, zr, rscratch2);
  // Compressed low:  dst   = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
  // Compressed high: vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
  // Combine the compressed low with the compressed high:
  //                  dst   = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
  sve_splice(dst, B, ptmp, vtmp1);
}
2347 
2348 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2349   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2350   SIMD_Arrangement size = isQ ? T16B : T8B;
2351   if (bt == T_BYTE) {
2352     rbit(dst, size, src);
2353   } else {
2354     neon_reverse_bytes(dst, src, bt, isQ);
2355     rbit(dst, size, dst);
2356   }
2357 }
2358 
2359 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2360   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2361   SIMD_Arrangement size = isQ ? T16B : T8B;
2362   switch (bt) {
2363     case T_BYTE:
2364       if (dst != src) {
2365         orr(dst, size, src, src);
2366       }
2367       break;
2368     case T_SHORT:
2369       rev16(dst, size, src);
2370       break;
2371     case T_INT:
2372       rev32(dst, size, src);
2373       break;
2374     case T_LONG:
2375       rev64(dst, size, src);
2376       break;
2377     default:
2378       assert(false, "unsupported");
2379       ShouldNotReachHere();
2380   }
2381 }
2382 
// VectorRearrange implementation for short/int/float/long/double types with NEON
// instructions. For VectorRearrange short/int/float, we use NEON tbl instruction.
// But since it supports bytes table only, we need to lookup 2/4 bytes as a group.
// For VectorRearrange long/double, we compare the shuffle input with iota indices,
// and use bsl to implement the operation.
//
// "shuffle" holds element-sized indices; "tmp" is a scratch vector register.
// Clobbers: rscratch1 (on the T_LONG/T_DOUBLE path only).
void C2_MacroAssembler::neon_rearrange_hsd(FloatRegister dst, FloatRegister src,
                                           FloatRegister shuffle, FloatRegister tmp,
                                           BasicType bt, bool isQ) {
  assert_different_registers(dst, src, shuffle, tmp);
  SIMD_Arrangement size1 = isQ ? T16B : T8B;
  SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);

  // Here is an example that rearranges a NEON vector with 4 ints:
  // Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1]
  //   1. We assume the shuffle input is Vi int[2, 3, 0, 1].
  //   2. Multiply Vi int[2, 3, 0, 1] with constant int vector
  //      [0x04040404, 0x04040404, 0x04040404, 0x04040404], and get
  //      tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404].
  //   3. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100],
  //      and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504]
  //   4. Use Vm as index register, and use V1 as table register.
  //      Then get V2 as the result by tbl NEON instructions.
  switch (bt) {
    case T_SHORT:
      // Scale element indices to byte offsets (idx * 2), then add the
      // per-element byte offsets 0x0100 to form TBL byte indices.
      mov(tmp, size1, 0x02);
      mulv(dst, size2, shuffle, tmp);
      mov(tmp, size2, 0x0100);
      addv(dst, size1, dst, tmp);
      tbl(dst, size1, src, 1, dst);
      break;
    case T_INT:
    case T_FLOAT:
      // Same scheme with 4-byte groups: idx * 4 + 0x03020100.
      mov(tmp, size1, 0x04);
      mulv(dst, size2, shuffle, tmp);
      mov(tmp, size2, 0x03020100);
      addv(dst, size1, dst, tmp);
      tbl(dst, size1, src, 1, dst);
      break;
    case T_LONG:
    case T_DOUBLE:
      // Load the iota indices for Long type. The indices are ordered by
      // type B/S/I/L/F/D, and the offset between two types is 16; Hence
      // the offset for L is 48.
      lea(rscratch1,
          ExternalAddress(StubRoutines::aarch64::vector_iota_indices() + 48));
      ldrq(tmp, rscratch1);
      // Check whether the input "shuffle" is the same with iota indices.
      // Return "src" if true, otherwise swap the two elements of "src".
      cm(EQ, dst, size2, shuffle, tmp);
      ext(tmp, size1, src, src, 8);
      bsl(dst, size1, src, tmp);
      break;
    default:
      assert(false, "unsupported element type");
      ShouldNotReachHere();
  }
}
2440 
2441 // Extract a scalar element from an sve vector at position 'idx'.
2442 // The input elements in src are expected to be of integral type.
2443 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2444                                              int idx, FloatRegister vtmp) {
2445   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2446   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2447   if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
2448     if (bt == T_INT || bt == T_LONG) {
2449       umov(dst, src, size, idx);
2450     } else {
2451       smov(dst, src, size, idx);
2452     }
2453   } else {
2454     sve_orr(vtmp, src, src);
2455     sve_ext(vtmp, vtmp, idx << size);
2456     if (bt == T_INT || bt == T_LONG) {
2457       umov(dst, vtmp, size, 0);
2458     } else {
2459       smov(dst, vtmp, size, 0);
2460     }
2461   }
2462 }
2463 
2464 // java.lang.Math::round intrinsics
2465 
// Clobbers: rscratch1, rflags
// Vectorized java.lang.Math::round for float (T2S/T4S) or double (T2D) lanes.
// Two candidate results are computed per lane — floor(src + 0.5) and
// fcvtas (round to nearest, ties away) — and the appropriate one is selected
// with a bit-pattern comparison below.
void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                          FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
  assert_different_registers(tmp1, tmp2, tmp3, src, dst);
  switch (T) {
    case T2S:
    case T4S:
      fmovs(tmp1, T, 0.5f);
      // 2^23 — the float magnitude threshold used by the lane-select compare
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case T2D:
      fmovd(tmp1, T, 0.5);
      // 2^52 — the double magnitude threshold used by the lane-select compare
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
  }
  fadd(tmp1, T, tmp1, src);
  fcvtms(tmp1, T, tmp1);
  // tmp1 = floor(src + 0.5, ties to even)

  fcvtas(dst, T, src);
  // dst = round(src), ties to away

  // Build a per-lane mask by comparing the bit pattern of -src against the
  // threshold pattern as unsigned integers.
  fneg(tmp3, T, src);
  dup(tmp2, T, rscratch1);
  cm(HS, tmp3, T, tmp3, tmp2);
  // tmp3 is now a set of flags

  // Where the flag is set keep the fcvtas result; elsewhere insert the
  // floor(src + 0.5) result from tmp1.
  bif(dst, T16B, tmp1, tmp3);
  // result in dst
}
2498 
// Clobbers: rscratch1, rflags
// Vectorized java.lang.Math::round for float (S) or double (D) lanes on SVE.
// Starts from frinta (round to nearest, ties away) and patches, under
// predicate pgtmp, the lanes that need the floor(src + 0.5) treatment.
void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                         FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(tmp1, tmp2, src, dst);

  switch (T) {
    case S:
      // 2^23 — the float magnitude threshold used by the lane-select compare
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case D:
      // 2^52 — the double magnitude threshold used by the lane-select compare
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == S || T == D, "invalid register variant");
  }

  sve_frinta(dst, T, ptrue, src);
  // dst = round(src), ties to away

  Label none;

  // pgtmp selects the lanes that still need the floor(src + 0.5) fixup,
  // by comparing the bit pattern of -src against the threshold pattern.
  sve_fneg(tmp1, T, ptrue, src);
  sve_dup(tmp2, T, rscratch1);
  sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
  // Skip the fixup entirely when no lane was selected.
  br(EQ, none);
  {
    sve_cpy(tmp1, T, pgtmp, 0.5);
    sve_fadd(tmp1, T, pgtmp, src);
    sve_frintm(dst, T, pgtmp, tmp1);
    // dst = floor(src + 0.5, ties to even)
  }
  bind(none);

  // Convert the rounded floating-point lanes to integers.
  sve_fcvtzs(dst, T, ptrue, dst, T);
  // result in dst
}
2536 
2537 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2538                                            FloatRegister one, SIMD_Arrangement T) {
2539   assert_different_registers(dst, src, zero, one);
2540   assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2541 
2542   facgt(dst, T, src, zero);
2543   ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
2544   bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2545 }
2546 
// Vectorized signum for float (S) or double (D) lanes on SVE: strictly
// positive/negative lanes become +1.0/-1.0 (carrying the source sign), while
// +-0.0 and NaN lanes pass through from src unchanged. "zero" must hold 0.0
// in every lane. NOTE(review): "one" participates only in the register
// assertion here — the +-1.0 constant is built inline in vtmp.
void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                          FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
    assert_different_registers(dst, src, zero, one, vtmp);
    assert(pgtmp->is_governing(), "This register has to be a governing predicate register");

    // Work on a copy so the sign-bit manipulation below leaves src intact
    // for the final select.
    sve_orr(vtmp, src, src);
    sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise
    switch (T) {
    case S:
      sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
      sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
                                        // on the sign of the float value
      break;
    case D:
      // Same as the float case, with 64-bit constants.
      sve_and(vtmp, T, min_jlong);
      sve_orr(vtmp, T, jlong_cast(1.0));
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
    }
    sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
                                       // Result in dst
}
2571 
2572 bool C2_MacroAssembler::in_scratch_emit_size() {
2573   if (ciEnv::current()->task() != nullptr) {
2574     PhaseOutput* phase_output = Compile::current()->output();
2575     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2576       return true;
2577     }
2578   }
2579   return MacroAssembler::in_scratch_emit_size();
2580 }
2581 
// Out-of-line failure handler for verify_int_in_range: reports the CastII
// node index, the offending value and the expected [lo, hi] bounds via a
// fatal error.
static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
  fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
}
2585 
// Emit a runtime check that the 32-bit value in rval lies within the CastII
// type range [t->_lo, t->_hi]. On failure the generated code calls
// abort_verify_int_in_range and halts. Emits nothing when the type covers
// the full int range. Clobbers: rtmp, rflags.
void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register rval, Register rtmp) {
  assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
  if (t == TypeInt::INT) {
    return;
  }

  BLOCK_COMMENT("verify_int_in_range {");
  Label L_success, L_failure;

  jint lo = t->_lo;
  jint hi = t->_hi;

  // Check each bound only when it is actually constraining.
  if (lo != min_jint) {
    subsw(rtmp, rval, lo);
    br(Assembler::LT, L_failure);
  }
  if (hi != max_jint) {
    subsw(rtmp, rval, hi);
    br(Assembler::GT, L_failure);
  }
  b(L_success);

  bind(L_failure);
  // Marshal (idx, val, lo, hi) into the C calling convention registers.
  movw(c_rarg0, idx);
  mov(c_rarg1, rval);
  movw(c_rarg2, lo);
  movw(c_rarg3, hi);
  // Make sure rfp holds a valid frame pointer before calling into the runtime.
  reconstruct_frame_pointer(rtmp);
  rt_call(CAST_FROM_FN_PTR(address, abort_verify_int_in_range), rtmp);
  hlt(0);

  bind(L_success);
  BLOCK_COMMENT("} verify_int_in_range");
}
2620 
// Out-of-line failure handler for verify_long_in_range: reports the CastLL
// node index, the offending value and the expected [lo, hi] bounds via a
// fatal error.
static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
  fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
}
2624 
// Emit a runtime check that the 64-bit value in rval lies within the CastLL
// type range [t->_lo, t->_hi]. On failure the generated code calls
// abort_verify_long_in_range and halts. Emits nothing when the type covers
// the full long range. Clobbers: rtmp, rflags.
void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register rval, Register rtmp) {
  assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
  if (t == TypeLong::LONG) {
    return;
  }

  BLOCK_COMMENT("verify_long_in_range {");
  Label L_success, L_failure;

  jlong lo = t->_lo;
  jlong hi = t->_hi;

  // Check each bound only when it is actually constraining.
  if (lo != min_jlong) {
    subs(rtmp, rval, lo);
    br(Assembler::LT, L_failure);
  }
  if (hi != max_jlong) {
    subs(rtmp, rval, hi);
    br(Assembler::GT, L_failure);
  }
  b(L_success);

  bind(L_failure);
  // Marshal (idx, val, lo, hi) into the C calling convention registers.
  movw(c_rarg0, idx);
  mov(c_rarg1, rval);
  mov(c_rarg2, lo);
  mov(c_rarg3, hi);
  // Make sure rfp holds a valid frame pointer before calling into the runtime.
  reconstruct_frame_pointer(rtmp);
  rt_call(CAST_FROM_FN_PTR(address, abort_verify_long_in_range), rtmp);
  hlt(0);

  bind(L_success);
  BLOCK_COMMENT("} verify_long_in_range");
}
2659 
// Ensure rfp holds this compiled frame's frame pointer, i.e.
// sp + framesize - 2 * wordSize. With PreserveFramePointer rfp is already
// valid, so debug builds merely verify it; otherwise rfp is recomputed
// from sp. Clobbers: rtmp (debug builds only), rflags (debug builds only).
void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
  const int framesize = Compile::current()->output()->frame_size_in_bytes();
  if (PreserveFramePointer) {
    // frame pointer is valid
#ifdef ASSERT
    // Verify frame pointer value in rfp.
    add(rtmp, sp, framesize - 2 * wordSize);
    Label L_success;
    cmp(rfp, rtmp);
    br(Assembler::EQ, L_success);
    stop("frame pointer mismatch");
    bind(L_success);
#endif // ASSERT
  } else {
    add(rfp, sp, framesize - 2 * wordSize);
  }
}
2677 
// Selects elements from two source vectors (src1, src2) based on index values in the index register
// using Neon instructions and places it in the destination vector element corresponding to the
// index vector element. Each index in the index register must be in the range - [0, 2 * NUM_ELEM),
// where NUM_ELEM is the number of BasicType elements per vector.
// If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
// Otherwise, selects src2[idx - NUM_ELEM]
// "index" is expected to hold byte-granular TBL indices (see the caller,
// select_from_two_vectors). "tmp" is written only on the 8-byte path.
void C2_MacroAssembler::select_from_two_vectors_neon(FloatRegister dst, FloatRegister src1,
                                                     FloatRegister src2, FloatRegister index,
                                                     FloatRegister tmp, unsigned vector_length_in_bytes) {
  assert_different_registers(dst, src1, src2, tmp);
  SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;

  if (vector_length_in_bytes == 16) {
    assert(UseSVE <= 1, "sve must be <= 1");
    assert(src1->successor() == src2, "Source registers must be ordered");
    // If the vector length is 16B, then use the Neon "tbl" instruction with two vector table
    tbl(dst, size, src1, 2, index);
  } else { // vector length == 8
    assert(UseSVE == 0, "must be Neon only");
    // We need to fit both the source vectors (src1, src2) in a 128-bit register because the
    // Neon "tbl" instruction supports only looking up 16B vectors. We then use the Neon "tbl"
    // instruction with one vector lookup
    ins(tmp, D, src1, 0, 0);
    ins(tmp, D, src2, 1, 0);
    tbl(dst, size, tmp, 1, index);
  }
}
2705 
// Selects elements from two source vectors (src1, src2) based on index values in the index register
// using SVE/SVE2 instructions and places it in the destination vector element corresponding to the
// index vector element. Each index in the index register must be in the range - [0, 2 * NUM_ELEM),
// where NUM_ELEM is the number of BasicType elements per vector.
// If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
// Otherwise, selects src2[idx - NUM_ELEM]
// "T" is the element size variant; "tmp" is written only on the 8-byte path.
void C2_MacroAssembler::select_from_two_vectors_sve(FloatRegister dst, FloatRegister src1,
                                                    FloatRegister src2, FloatRegister index,
                                                    FloatRegister tmp, SIMD_RegVariant T,
                                                    unsigned vector_length_in_bytes) {
  assert_different_registers(dst, src1, src2, index, tmp);

  if (vector_length_in_bytes == 8) {
    // We need to fit both the source vectors (src1, src2) in a single vector register because the
    // SVE "tbl" instruction is unpredicated and works on the entire vector which can lead to
    // incorrect results if each source vector is only partially filled. We then use the SVE "tbl"
    // instruction with one vector lookup
    assert(UseSVE >= 1, "sve must be >= 1");
    ins(tmp, D, src1, 0, 0);
    ins(tmp, D, src2, 1, 0);
    sve_tbl(dst, T, tmp, index);
  } else {  // UseSVE == 2 and vector_length_in_bytes > 8
    // If the vector length is > 8, then use the SVE2 "tbl" instruction with the two vector table.
    // The assertion - vector_length_in_bytes == MaxVectorSize ensures that this operation
    // is not executed on machines where vector_length_in_bytes < MaxVectorSize
    // with the only exception of 8B vector length.
    assert(UseSVE == 2 && vector_length_in_bytes == MaxVectorSize, "must be");
    assert(src1->successor() == src2, "Source registers must be ordered");
    sve_tbl(dst, T, src1, src2, index);
  }
}
2737 
// Dispatcher for VectorSelectFrom: selects elements from src1/src2 per the
// element indices in "index", choosing between the SVE and NEON lowering and
// pre-processing the indices into byte offsets for the NEON TBL path.
// Clobbers: dst, tmp.
void C2_MacroAssembler::select_from_two_vectors(FloatRegister dst, FloatRegister src1,
                                                FloatRegister src2, FloatRegister index,
                                                FloatRegister tmp, BasicType bt,
                                                unsigned vector_length_in_bytes) {

  assert_different_registers(dst, src1, src2, index, tmp);

  // The cases that can reach this method are -
  // - UseSVE = 0/1, vector_length_in_bytes = 8 or 16, excluding double and long types
  // - UseSVE = 2, vector_length_in_bytes >= 8, for all types
  //
  // SVE/SVE2 tbl instructions are generated when UseSVE = 1 with vector_length_in_bytes = 8
  // and UseSVE = 2 with vector_length_in_bytes >= 8
  //
  // Neon instructions are generated when UseSVE = 0 with vector_length_in_bytes = 8 or 16 and
  // UseSVE = 1 with vector_length_in_bytes = 16

  if ((UseSVE == 1 && vector_length_in_bytes == 8) || UseSVE == 2) {
    SIMD_RegVariant T = elemType_to_regVariant(bt);
    select_from_two_vectors_sve(dst, src1, src2, index, tmp, T, vector_length_in_bytes);
    return;
  }

  // The only BasicTypes that can reach here are T_SHORT, T_BYTE, T_INT and T_FLOAT
  assert(bt != T_DOUBLE && bt != T_LONG, "unsupported basic type");
  assert(vector_length_in_bytes <= 16, "length_in_bytes must be <= 16");

  bool isQ = vector_length_in_bytes == 16;

  SIMD_Arrangement size1 = isQ ? T16B : T8B;
  SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);

  // Neon "tbl" instruction only supports byte tables, so we need to look at chunks of
  // 2B for selecting shorts or chunks of 4B for selecting ints/floats from the table.
  // The index values in "index" register are in the range of [0, 2 * NUM_ELEM) where NUM_ELEM
  // is the number of elements that can fit in a vector. For ex. for T_SHORT with 64-bit vector length,
  // the indices can range from [0, 8).
  // As an example with 64-bit vector length and T_SHORT type - let index = [2, 5, 1, 0]
  // Move a constant 0x02 in every byte of tmp - tmp = [0x0202, 0x0202, 0x0202, 0x0202]
  // Multiply index vector with tmp to yield - dst = [0x0404, 0x0a0a, 0x0202, 0x0000]
  // Move a constant 0x0100 in every 2B of tmp - tmp = [0x0100, 0x0100, 0x0100, 0x0100]
  // Add the multiplied result to the vector in tmp to obtain the byte level
  // offsets - dst = [0x0504, 0x0b0a, 0x0302, 0x0100]
  // Use these offsets in the "tbl" instruction to select chunks of 2B.

  if (bt == T_BYTE) {
    // Byte indices need no transformation before the byte-granular TBL.
    select_from_two_vectors_neon(dst, src1, src2, index, tmp, vector_length_in_bytes);
  } else {
    int elem_size = (bt == T_SHORT) ? 2 : 4;
    uint64_t tbl_offset = (bt == T_SHORT) ? 0x0100u : 0x03020100u;

    mov(tmp, size1, elem_size);
    mulv(dst, size2, index, tmp);
    mov(tmp, size2, tbl_offset);
    addv(dst, size1, dst, tmp); // "dst" now contains the processed index elements
                                // to select a set of 2B/4B
    select_from_two_vectors_neon(dst, src1, src2, dst, tmp, vector_length_in_bytes);
  }
}
2797 
// Vector expand implementation. Elements from the src vector are expanded into
// the dst vector under the control of the vector mask.
// Since there are no native instructions directly corresponding to expand before
// SVE2p2, the following implementations mainly leverages the TBL instruction to
// implement expand. To compute the index input for TBL, the prefix sum algorithm
// (https://en.wikipedia.org/wiki/Prefix_sum) is used. The same algorithm is used
// for NEON and SVE, but with different instructions where appropriate.

// Vector expand implementation for NEON.
//
// An example of 128-bit Byte vector:
//   Data direction: high <== low
//   Input:
//         src   = g  f  e  d  c  b  a  9  8  7  6  5  4  3  2  1
//         mask  = 0  0 -1 -1  0  0 -1 -1  0  0 -1 -1  0  0 -1 -1
//   Expected result:
//         dst   = 0  0  8  7  0  0  6  5  0  0  4  3  0  0  2  1
//
// Note: "bt" is not referenced here because the TBL indices are computed at
// byte granularity for every element type.
void C2_MacroAssembler::vector_expand_neon(FloatRegister dst, FloatRegister src, FloatRegister mask,
                                           FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
                                           int vector_length_in_bytes) {
  assert(vector_length_in_bytes <= 16, "the vector length in bytes for NEON must be <= 16");
  assert_different_registers(dst, src, mask, tmp1, tmp2);
  // Since the TBL instruction only supports byte table, we need to
  // compute indices in byte type for all types.
  SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
  // tmp1 =  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  dup(tmp1, size, zr);
  // dst  =  0  0  1  1  0  0  1  1  0  0  1  1  0  0  1  1
  negr(dst, size, mask);
  // Calculate vector index for TBL with prefix sum algorithm.
  // dst  =  8  8  8  7  6  6  6  5  4  4  4  3  2  2  2  1
  for (int i = 1; i < vector_length_in_bytes; i <<= 1) {
    ext(tmp2, size, tmp1, dst, vector_length_in_bytes - i);
    addv(dst, size, tmp2, dst);
  }
  // tmp2 =  0  0 -1 -1  0  0 -1 -1  0  0 -1 -1  0  0 -1 -1
  orr(tmp2, size, mask, mask);
  // Zero out the prefix sums of the unselected lanes.
  // tmp2 =  0  0  8  7  0  0  6  5  0  0  4  3  0  0  2  1
  bsl(tmp2, size, dst, tmp1);
  // tmp1 =  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  movi(tmp1, size, 1);
  // Convert 1-based prefix counts to 0-based TBL indices; unselected lanes
  // become -1, an out-of-range index that makes TBL produce zero.
  // dst  = -1 -1  7  6 -1 -1  5  4 -1 -1  3  2 -1 -1  1  0
  subv(dst, size, tmp2, tmp1);
  // dst  =  0  0  8  7  0  0  6  5  0  0  4  3  0  0  2  1
  tbl(dst, size, src, 1, dst);
}
2844 
// Vector expand implementation for SVE.
//
// An example of 128-bit Short vector:
//   Data direction: high <== low
//   Input:
//         src   = gf ed cb a9 87 65 43 21
//         pg    = 00 01 00 01 00 01 00 01
//   Expected result:
//         dst   = 00 87 00 65 00 43 00 21
void C2_MacroAssembler::vector_expand_sve(FloatRegister dst, FloatRegister src, PRegister pg,
                                          FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
                                          int vector_length_in_bytes) {
  assert(UseSVE > 0, "expand implementation only for SVE");
  assert_different_registers(dst, src, tmp1, tmp2);
  SIMD_RegVariant size = elemType_to_regVariant(bt);

  // tmp1 stays all-zero throughout and doubles as the "inactive lane" source.
  // tmp1 = 00 00 00 00 00 00 00 00
  sve_dup(tmp1, size, 0);
  sve_movprfx(tmp2, tmp1);
  // tmp2 = 00 01 00 01 00 01 00 01
  sve_cpy(tmp2, size, pg, 1, true);
  // Calculate vector index for TBL with prefix sum algorithm.
  // tmp2 = 04 04 03 03 02 02 01 01
  for (int i = type2aelembytes(bt); i < vector_length_in_bytes; i <<= 1) {
    sve_movprfx(dst, tmp1);
    // The EXT instruction operates on the full-width sve register. The correct
    // index calculation method is:
    // vector_length_in_bytes - i + MaxVectorSize - vector_length_in_bytes =>
    // MaxVectorSize - i.
    sve_ext(dst, tmp2, MaxVectorSize - i);
    sve_add(tmp2, size, dst, tmp2);
  }
  // Zero out the prefix counts of the inactive lanes.
  // dst  = 00 04 00 03 00 02 00 01
  sve_sel(dst, size, pg, tmp2, tmp1);
  // Convert 1-based prefix counts to 0-based TBL indices; inactive lanes
  // become -1, an out-of-range index that makes SVE TBL produce zero.
  // dst  = -1 03 -1 02 -1 01 -1 00
  sve_sub(dst, size, 1);
  // dst  = 00 87 00 65 00 43 00 21
  sve_tbl(dst, size, src, dst);
}
2884 
2885 // Optimized SVE cpy (imm, zeroing) instruction.
2886 //
2887 // `movi; cpy(imm, merging)` and `cpy(imm, zeroing)` have the same
2888 // functionality, but test results show that `movi; cpy(imm, merging)` has
2889 // higher throughput on some microarchitectures. This would depend on
2890 // microarchitecture and so may vary between implementations.
2891 void C2_MacroAssembler::sve_cpy(FloatRegister dst, SIMD_RegVariant T,
2892                                 PRegister pg, int imm8, bool isMerge) {
2893   if (VM_Version::prefer_sve_merging_mode_cpy() && !isMerge) {
2894     // Generates a NEON instruction `movi V<dst>.2d, #0`.
2895     // On AArch64, Z and V registers alias in the low 128 bits, so V<dst> is
2896     // the low 128 bits of Z<dst>. A write to V<dst> also clears all bits of
2897     // Z<dst> above 128, so this `movi` instruction effectively zeroes the
2898     // entire Z<dst> register. According to the Arm Software Optimization
2899     // Guide, `movi` is zero latency.
2900     movi(dst, T2D, 0);
2901     isMerge = true;
2902   }
2903   Assembler::sve_cpy(dst, T, pg, imm8, isMerge);
2904 }