1 /*
   2  * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright 2026 Arm Limited and/or its affiliates.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "opto/c2_MacroAssembler.hpp"
  29 #include "opto/compile.hpp"
  30 #include "opto/intrinsicnode.hpp"
  31 #include "opto/matcher.hpp"
  32 #include "opto/output.hpp"
  33 #include "opto/subnode.hpp"
  34 #include "runtime/objectMonitorTable.hpp"
  35 #include "runtime/stubRoutines.hpp"
  36 #include "runtime/synchronizer.hpp"
  37 #include "utilities/globalDefinitions.hpp"
  38 #include "utilities/powerOfTwo.hpp"
  39 
  40 #ifdef PRODUCT
  41 #define BLOCK_COMMENT(str) /* nothing */
  42 #define STOP(error) stop(error)
  43 #else
  44 #define BLOCK_COMMENT(str) block_comment(str)
  45 #define STOP(error) block_comment(error); stop(error)
  46 #endif
  47 
  48 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  49 
  50 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
  51 
  52 // jdk.internal.util.ArraysSupport.vectorizedHashCode
// Intrinsic for jdk.internal.util.ArraysSupport.vectorizedHashCode.
//
// Computes the standard Java polynomial hash over the array:
//   result = result * 31^cnt + ary[0] * 31^(cnt-1) + ... + ary[cnt-1]
// The scalar loop below evaluates this as result = result * 31 + ary[i]
// per element; arrays at or above 'large_threshold' are handed to the
// Neon stub StubRoutines::aarch64::large_arrays_hashcode(eltype).
//
// ary    - base address of the array data (advanced by the scalar loop)
// cnt    - element count (clobbered)
// result - incoming hash accumulator / outgoing hash value
// vdata0..vpowm - SIMD&FP registers reserved for the stub's use; they are
//                 pinned by ARRAYS_HASHCODE_REGISTERS and not referenced
//                 directly in this method (the stub's register usage is
//                 defined elsewhere — see the stub generator).
// eltype - element BasicType (T_BOOLEAN/T_BYTE/T_CHAR/T_SHORT/T_INT)
//
// Returns pc() on success, or nullptr if the trampoline call to the stub
// could not be emitted (code cache full). Clobbers rscratch1/rscratch2
// and flags.
address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
                                           FloatRegister vdata0, FloatRegister vdata1,
                                           FloatRegister vdata2, FloatRegister vdata3,
                                           FloatRegister vmul0, FloatRegister vmul1,
                                           FloatRegister vmul2, FloatRegister vmul3,
                                           FloatRegister vpow, FloatRegister vpowm,
                                           BasicType eltype) {
  ARRAYS_HASHCODE_REGISTERS;

  Register tmp1 = rscratch1, tmp2 = rscratch2;

  Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE;

  // Vectorization factor. Number of array elements loaded to one SIMD&FP registers by the stubs. We
  // use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible to
  // use 4H for chars and shorts instead, but using 8H gives better performance.
  const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8
                    : eltype == T_CHAR || eltype == T_SHORT ? 8
                    : eltype == T_INT                       ? 4
                                                            : 0;
  guarantee(vf, "unsupported eltype");

  // Unroll factor for the scalar loop below. The value is chosen based on performance analysis.
  const size_t unroll_factor = 4;

  switch (eltype) {
  case T_BOOLEAN:
    BLOCK_COMMENT("arrays_hashcode(unsigned byte) {");
    break;
  case T_CHAR:
    BLOCK_COMMENT("arrays_hashcode(char) {");
    break;
  case T_BYTE:
    BLOCK_COMMENT("arrays_hashcode(byte) {");
    break;
  case T_SHORT:
    BLOCK_COMMENT("arrays_hashcode(short) {");
    break;
  case T_INT:
    BLOCK_COMMENT("arrays_hashcode(int) {");
    break;
  default:
    ShouldNotReachHere();
  }

  // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop
  // implemented by the stub executes just once. Call the stub only if at least two iterations will
  // be executed.
  const size_t large_threshold = eltype == T_INT ? vf * 2 : vf;
  cmpw(cnt, large_threshold);
  br(Assembler::HS, LARGE);

  bind(TAIL);

  // Computed branch into the unrolled loop body: jump so that exactly
  // (cnt % unroll_factor) load+madd pairs execute before reaching BR_BASE.
  //
  // The andr performs cnt % uf where uf = unroll_factor. The subtract shifted by 3 offsets past
  // uf - (cnt % uf) pairs of load + madd insns i.e. it only executes cnt % uf load + madd pairs.
  // Iteration eats up the remainder, uf elements at a time.
  assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC");
  andr(tmp2, cnt, unroll_factor - 1);
  adr(tmp1, BR_BASE);
  // For Cortex-A53 offset is 4 because 2 nops are generated.
  sub(tmp1, tmp1, tmp2, ext::sxtw, VM_Version::supports_a53mac() ? 4 : 3);
  // 0x1f == 31, the Java hash multiplier.
  movw(tmp2, 0x1f);
  br(tmp1);

  bind(LOOP);
  // Each iteration: result = result * 31 + ary[i], post-incrementing ary.
  for (size_t i = 0; i < unroll_factor; ++i) {
    load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype);
    maddw(result, result, tmp2, tmp1);
    // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
    // Generate 2nd nop to have 4 instructions per iteration.
    if (VM_Version::supports_a53mac()) {
      nop();
    }
  }
  bind(BR_BASE);
  subsw(cnt, cnt, unroll_factor);
  br(Assembler::HS, LOOP);

  b(DONE);

  bind(LARGE);

  RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype));
  assert(stub.target() != nullptr, "array_hashcode stub has not been generated");
  address tpc = trampoline_call(stub);
  if (tpc == nullptr) {
    // Code cache is full; abandon emission and let the caller bail out.
    DEBUG_ONLY(reset_labels(TAIL, BR_BASE));
    postcond(pc() == badAddress);
    return nullptr;
  }

  bind(DONE);

  BLOCK_COMMENT("} // arrays_hashcode");

  postcond(pc() != badAddress);
  return pc();
}
 152 
// C2 fast path for monitorenter under lightweight locking.
//
// obj - object being locked
// box - on-stack BasicLock; when UseObjectMonitorTable it caches the
//       resolved ObjectMonitor* for fast_unlock
// t1-t3 - temporaries; rscratch2 and flags are clobbered as well
//
// Contract with the C2 continuation: falls through with flags == EQ when
// the lock was acquired inline, and with flags == NE when the runtime
// slow path must complete the lock (verified by the ASSERT blocks below).
void C2_MacroAssembler::fast_lock(Register obj, Register box, Register t1,
                                  Register t2, Register t3) {
  assert_different_registers(obj, box, t1, t2, t3, rscratch2);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. MUST branch to with flag == EQ
  Label locked;
  // Finish fast lock unsuccessfully. MUST branch to with flag == NE
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Value-based classes must not be synchronized on; divert to the
    // runtime so it can diagnose/log the attempt.
    load_klass(t1, obj);
    ldrb(t1, Address(t1, Klass::misc_flags_offset()));
    tst(t1, KlassFlags::_misc_is_value_based_class);
    br(Assembler::NE, slow_path);
  }

  const Register t1_mark = t1;
  const Register t3_t = t3;

  { // Fast locking

    // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ
    Label push;

    const Register t2_top = t2;

    // Check if lock-stack is full.
    cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
    br(Assembler::GT, slow_path);

    // Check if recursive: obj already on top of the lock-stack means a
    // nested lock of the same object, which only needs another push.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, push);

    // Relaxed normal load to check for monitor. Optimization for monitor case.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);

    // Not inflated
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");

    // Try to lock. Transition lock-bits 0b01 => 0b00
    // expected = mark with unlocked bit set, new = mark with it cleared.
    orr(t1_mark, t1_mark, markWord::unlocked_value);
    eor(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
    br(Assembler::NE, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    str(obj, Address(rthread, t2_top));
    addw(t2_top, t2_top, oopSize);
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register t1_monitor = t1;

    if (!UseObjectMonitorTable) {
      // Mark word itself carries the (tagged) monitor pointer.
      assert(t1_monitor == t1_mark, "should be the same here");
    } else {
      const Register t1_hash = t1;
      Label monitor_found;

      // Save the mark, we might need it to extract the hash.
      mov(t3, t1_mark);

      // Look for the monitor in the om_cache.

      ByteSize cache_offset   = JavaThread::om_cache_oops_offset();
      ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
      const int num_unrolled  = OMCache::CAPACITY;
      for (int i = 0; i < num_unrolled; i++) {
        // Load the cached monitor before comparing its oop so t1_monitor
        // is already valid when we branch to monitor_found.
        ldr(t1_monitor, Address(rthread, cache_offset + monitor_offset));
        ldr(t2, Address(rthread, cache_offset));
        cmp(obj, t2);
        br(Assembler::EQ, monitor_found);
        cache_offset = cache_offset + OMCache::oop_to_oop_difference();
      }

      // Look for the monitor in the table.

      // Get the hash code.
      ubfx(t1_hash, t3, markWord::hash_shift, markWord::hash_bits);

      // Get the table and calculate the bucket's address
      lea(t3, ExternalAddress(ObjectMonitorTable::current_table_address()));
      ldr(t3, Address(t3));
      ldr(t2, Address(t3, ObjectMonitorTable::table_capacity_mask_offset()));
      ands(t1_hash, t1_hash, t2);
      ldr(t3, Address(t3, ObjectMonitorTable::table_buckets_offset()));

      // Read the monitor from the bucket.
      ldr(t1_monitor, Address(t3, t1_hash, Address::lsl(LogBytesPerWord)));

      // Check if the monitor in the bucket is special (empty, tombstone or removed).
      cmp(t1_monitor, (unsigned char)ObjectMonitorTable::SpecialPointerValues::below_is_special);
      br(Assembler::LO, slow_path);

      // Check if object matches.
      ldr(t3, Address(t1_monitor, ObjectMonitor::object_offset()));
      BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
      bs_asm->try_resolve_weak_handle_in_c2(this, t3, t2, slow_path);
      cmp(t3, obj);
      br(Assembler::NE, slow_path);

      bind(monitor_found);
    }

    const Register t2_owner_addr = t2;
    const Register t3_owner = t3;
    // With UseObjectMonitorTable the monitor pointer is untagged; otherwise
    // it still carries the markWord monitor tag which the offsets subtract.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
    const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);

    Label monitor_locked;

    // Compute owner address.
    lea(t2_owner_addr, owner_address);

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
    cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, /*acquire*/ true,
            /*release*/ false, /*weak*/ false, t3_owner);
    br(Assembler::EQ, monitor_locked);

    // Check if recursive.
    cmp(t3_owner, rscratch2);
    br(Assembler::NE, slow_path);

    // Recursive.
    increment(recursions_address, 1);

    bind(monitor_locked);
    if (UseObjectMonitorTable) {
      // Cache the monitor in the BasicLock so fast_unlock can skip the lookup.
      str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
    }
  }

  bind(locked);

#ifdef ASSERT
  // Check that locked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Lock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Lock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}
 323 
 324 void C2_MacroAssembler::fast_unlock(Register obj, Register box, Register t1,
 325                                     Register t2, Register t3) {
 326   assert_different_registers(obj, box, t1, t2, t3);
 327 
 328   // Handle inflated monitor.
 329   Label inflated, inflated_load_mark;
 330   // Finish fast unlock successfully. MUST branch to with flag == EQ
 331   Label unlocked;
 332   // Finish fast unlock unsuccessfully. MUST branch to with flag == NE
 333   Label slow_path;
 334 
 335   const Register t1_mark = t1;
 336   const Register t2_top = t2;
 337   const Register t3_t = t3;
 338 
 339   { // Fast unlock
 340 
 341     Label push_and_slow_path;
 342 
 343     // Check if obj is top of lock-stack.
 344     ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 345     subw(t2_top, t2_top, oopSize);
 346     ldr(t3_t, Address(rthread, t2_top));
 347     cmp(obj, t3_t);
 348     // Top of lock stack was not obj. Must be monitor.
 349     br(Assembler::NE, inflated_load_mark);
 350 
 351     // Pop lock-stack.
 352     DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
 353     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 354 
 355     // Check if recursive.
 356     subw(t3_t, t2_top, oopSize);
 357     ldr(t3_t, Address(rthread, t3_t));
 358     cmp(obj, t3_t);
 359     br(Assembler::EQ, unlocked);
 360 
 361     // Not recursive.
 362     // Load Mark.
 363     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 364 
 365     // Check header for monitor (0b10).
 366     // Because we got here by popping (meaning we pushed in locked)
 367     // there will be no monitor in the box. So we need to push back the obj
 368     // so that the runtime can fix any potential anonymous owner.
 369     tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);
 370 
 371     // Try to unlock. Transition lock bits 0b00 => 0b01
 372     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
 373     orr(t3_t, t1_mark, markWord::unlocked_value);
 374     cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
 375             /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
 376     br(Assembler::EQ, unlocked);
 377 
 378     bind(push_and_slow_path);
 379     // Compare and exchange failed.
 380     // Restore lock-stack and handle the unlock in runtime.
 381     DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
 382     addw(t2_top, t2_top, oopSize);
 383     str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 384     b(slow_path);
 385   }
 386 
 387 
 388   { // Handle inflated monitor.
 389     bind(inflated_load_mark);
 390     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 391 #ifdef ASSERT
 392     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 393     stop("Fast Unlock not monitor");
 394 #endif
 395 
 396     bind(inflated);
 397 
 398 #ifdef ASSERT
 399     Label check_done;
 400     subw(t2_top, t2_top, oopSize);
 401     cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
 402     br(Assembler::LT, check_done);
 403     ldr(t3_t, Address(rthread, t2_top));
 404     cmp(obj, t3_t);
 405     br(Assembler::NE, inflated);
 406     stop("Fast Unlock lock on stack");
 407     bind(check_done);
 408 #endif
 409 
 410     const Register t1_monitor = t1;
 411 
 412     if (!UseObjectMonitorTable) {
 413       assert(t1_monitor == t1_mark, "should be the same here");
 414 
 415       // Untag the monitor.
 416       add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
 417     } else {
 418       ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 419       // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
 420       cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
 421       br(Assembler::LO, slow_path);
 422     }
 423 
 424     const Register t2_recursions = t2;
 425     Label not_recursive;
 426 
 427     // Check if recursive.
 428     ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
 429     cbz(t2_recursions, not_recursive);
 430 
 431     // Recursive unlock.
 432     sub(t2_recursions, t2_recursions, 1u);
 433     str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
 434     // Set flag == EQ
 435     cmp(t2_recursions, t2_recursions);
 436     b(unlocked);
 437 
 438     bind(not_recursive);
 439 
 440     const Register t2_owner_addr = t2;
 441 
 442     // Compute owner address.
 443     lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));
 444 
 445     // Set owner to null.
 446     // Release to satisfy the JMM
 447     stlr(zr, t2_owner_addr);
 448     // We need a full fence after clearing owner to avoid stranding.
 449     // StoreLoad achieves this.
 450     membar(StoreLoad);
 451 
 452     // Check if the entry_list is empty.
 453     ldr(rscratch1, Address(t1_monitor, ObjectMonitor::entry_list_offset()));
 454     cmp(rscratch1, zr);
 455     br(Assembler::EQ, unlocked);  // If so we are done.
 456 
 457     // Check if there is a successor.
 458     ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
 459     cmp(rscratch1, zr);
 460     br(Assembler::NE, unlocked);  // If so we are done.
 461 
 462     // Save the monitor pointer in the current thread, so we can try to
 463     // reacquire the lock in SharedRuntime::monitor_exit_helper().
 464     str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));
 465 
 466     cmp(zr, rthread); // Set Flag to NE => slow path
 467     b(slow_path);
 468   }
 469 
 470   bind(unlocked);
 471   cmp(zr, zr); // Set Flags to EQ => fast path
 472 
 473 #ifdef ASSERT
 474   // Check that unlocked label is reached with Flags == EQ.
 475   Label flag_correct;
 476   br(Assembler::EQ, flag_correct);
 477   stop("Fast Unlock Flag != EQ");
 478 #endif
 479 
 480   bind(slow_path);
 481 #ifdef ASSERT
 482   // Check that slow_path label is reached with Flags == NE.
 483   br(Assembler::NE, flag_correct);
 484   stop("Fast Unlock Flag != NE");
 485   bind(flag_correct);
 486 #endif
 487   // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
 488 }
 489 
 490 // Search for str1 in str2 and return index or -1
 491 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
 492 void C2_MacroAssembler::string_indexof(Register str2, Register str1,
 493                                        Register cnt2, Register cnt1,
 494                                        Register tmp1, Register tmp2,
 495                                        Register tmp3, Register tmp4,
 496                                        Register tmp5, Register tmp6,
 497                                        int icnt1, Register result, int ae) {
 498   // NOTE: tmp5, tmp6 can be zr depending on specific method version
 499   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
 500 
 501   Register ch1 = rscratch1;
 502   Register ch2 = rscratch2;
 503   Register cnt1tmp = tmp1;
 504   Register cnt2tmp = tmp2;
 505   Register cnt1_neg = cnt1;
 506   Register cnt2_neg = cnt2;
 507   Register result_tmp = tmp4;
 508 
 509   bool isL = ae == StrIntrinsicNode::LL;
 510 
 511   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
 512   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
 513   int str1_chr_shift = str1_isL ? 0:1;
 514   int str2_chr_shift = str2_isL ? 0:1;
 515   int str1_chr_size = str1_isL ? 1:2;
 516   int str2_chr_size = str2_isL ? 1:2;
 517   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
 518                                       (chr_insn)&MacroAssembler::ldrh;
 519   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
 520                                       (chr_insn)&MacroAssembler::ldrh;
 521   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
 522   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
 523 
 524   // Note, inline_string_indexOf() generates checks:
 525   // if (substr.count > string.count) return -1;
 526   // if (substr.count == 0) return 0;
 527 
 528   // We have two strings, a source string in str2, cnt2 and a pattern string
 529   // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
 530 
 531   // For larger pattern and source we use a simplified Boyer Moore algorithm.
 532   // With a small pattern and source we use linear scan.
 533 
 534   if (icnt1 == -1) {
 535     sub(result_tmp, cnt2, cnt1);
 536     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
 537     br(LT, LINEARSEARCH);
 538     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
 539     subs(zr, cnt1, 256);
 540     lsr(tmp1, cnt2, 2);
 541     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
 542     br(GE, LINEARSTUB);
 543   }
 544 
 545 // The Boyer Moore alogorithm is based on the description here:-
 546 //
 547 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
 548 //
 549 // This describes and algorithm with 2 shift rules. The 'Bad Character' rule
 550 // and the 'Good Suffix' rule.
 551 //
 552 // These rules are essentially heuristics for how far we can shift the
 553 // pattern along the search string.
 554 //
 555 // The implementation here uses the 'Bad Character' rule only because of the
 556 // complexity of initialisation for the 'Good Suffix' rule.
 557 //
 558 // This is also known as the Boyer-Moore-Horspool algorithm:-
 559 //
 560 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
 561 //
 562 // This particular implementation has few java-specific optimizations.
 563 //
 564 // #define ASIZE 256
 565 //
 566 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
 567 //       int i, j;
 568 //       unsigned c;
 569 //       unsigned char bc[ASIZE];
 570 //
 571 //       /* Preprocessing */
 572 //       for (i = 0; i < ASIZE; ++i)
 573 //          bc[i] = m;
 574 //       for (i = 0; i < m - 1; ) {
 575 //          c = x[i];
 576 //          ++i;
 577 //          // c < 256 for Latin1 string, so, no need for branch
 578 //          #ifdef PATTERN_STRING_IS_LATIN1
 579 //          bc[c] = m - i;
 580 //          #else
 581 //          if (c < ASIZE) bc[c] = m - i;
 582 //          #endif
 583 //       }
 584 //
 585 //       /* Searching */
 586 //       j = 0;
 587 //       while (j <= n - m) {
 588 //          c = y[i+j];
 589 //          if (x[m-1] == c)
 590 //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
 591 //          if (i < 0) return j;
 592 //          // c < 256 for Latin1 string, so, no need for branch
 593 //          #ifdef SOURCE_STRING_IS_LATIN1
 594 //          // LL case: (c< 256) always true. Remove branch
 595 //          j += bc[y[j+m-1]];
 596 //          #endif
 597 //          #ifndef PATTERN_STRING_IS_UTF
 598 //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
 599 //          if (c < ASIZE)
 600 //            j += bc[y[j+m-1]];
 601 //          else
 602 //            j += 1
 603 //          #endif
 604 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
 605 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
 606 //          if (c < ASIZE)
 607 //            j += bc[y[j+m-1]];
 608 //          else
 609 //            j += m
 610 //          #endif
 611 //       }
 612 //    }
 613 
 614   if (icnt1 == -1) {
 615     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
 616         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
 617     Register cnt1end = tmp2;
 618     Register str2end = cnt2;
 619     Register skipch = tmp2;
 620 
 621     // str1 length is >=8, so, we can read at least 1 register for cases when
 622     // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
 623     // UL case. We'll re-read last character in inner pre-loop code to have
 624     // single outer pre-loop load
 625     const int firstStep = isL ? 7 : 3;
 626 
 627     const int ASIZE = 256;
 628     const int STORED_BYTES = 32; // amount of bytes stored per instruction
 629     sub(sp, sp, ASIZE);
 630     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
 631     mov(ch1, sp);
 632     BIND(BM_INIT_LOOP);
 633       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
 634       subs(tmp5, tmp5, 1);
 635       br(GT, BM_INIT_LOOP);
 636 
 637       sub(cnt1tmp, cnt1, 1);
 638       mov(tmp5, str2);
 639       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
 640       sub(ch2, cnt1, 1);
 641       mov(tmp3, str1);
 642     BIND(BCLOOP);
 643       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
 644       if (!str1_isL) {
 645         subs(zr, ch1, ASIZE);
 646         br(HS, BCSKIP);
 647       }
 648       strb(ch2, Address(sp, ch1));
 649     BIND(BCSKIP);
 650       subs(ch2, ch2, 1);
 651       br(GT, BCLOOP);
 652 
 653       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
 654       if (str1_isL == str2_isL) {
 655         // load last 8 bytes (8LL/4UU symbols)
 656         ldr(tmp6, Address(tmp6, -wordSize));
 657       } else {
 658         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
 659         // convert Latin1 to UTF. We'll have to wait until load completed, but
 660         // it's still faster than per-character loads+checks
 661         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
 662         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
 663         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
 664         andr(tmp6, tmp6, 0xFF); // str1[N-4]
 665         orr(ch2, ch1, ch2, LSL, 16);
 666         orr(tmp6, tmp6, tmp3, LSL, 48);
 667         orr(tmp6, tmp6, ch2, LSL, 16);
 668       }
 669     BIND(BMLOOPSTR2);
 670       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 671       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
 672       if (str1_isL == str2_isL) {
 673         // re-init tmp3. It's for free because it's executed in parallel with
 674         // load above. Alternative is to initialize it before loop, but it'll
 675         // affect performance on in-order systems with 2 or more ld/st pipelines
 676         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
 677       }
 678       if (!isL) { // UU/UL case
 679         lsl(ch2, cnt1tmp, 1); // offset in bytes
 680       }
 681       cmp(tmp3, skipch);
 682       br(NE, BMSKIP);
 683       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
 684       mov(ch1, tmp6);
 685       if (isL) {
 686         b(BMLOOPSTR1_AFTER_LOAD);
 687       } else {
 688         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
 689         b(BMLOOPSTR1_CMP);
 690       }
 691     BIND(BMLOOPSTR1);
 692       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
 693       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 694     BIND(BMLOOPSTR1_AFTER_LOAD);
 695       subs(cnt1tmp, cnt1tmp, 1);
 696       br(LT, BMLOOPSTR1_LASTCMP);
 697     BIND(BMLOOPSTR1_CMP);
 698       cmp(ch1, ch2);
 699       br(EQ, BMLOOPSTR1);
 700     BIND(BMSKIP);
 701       if (!isL) {
 702         // if we've met UTF symbol while searching Latin1 pattern, then we can
 703         // skip cnt1 symbols
 704         if (str1_isL != str2_isL) {
 705           mov(result_tmp, cnt1);
 706         } else {
 707           mov(result_tmp, 1);
 708         }
 709         subs(zr, skipch, ASIZE);
 710         br(HS, BMADV);
 711       }
 712       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
 713     BIND(BMADV);
 714       sub(cnt1tmp, cnt1, 1);
 715       add(str2, str2, result_tmp, LSL, str2_chr_shift);
 716       cmp(str2, str2end);
 717       br(LE, BMLOOPSTR2);
 718       add(sp, sp, ASIZE);
 719       b(NOMATCH);
 720     BIND(BMLOOPSTR1_LASTCMP);
 721       cmp(ch1, ch2);
 722       br(NE, BMSKIP);
 723     BIND(BMMATCH);
 724       sub(result, str2, tmp5);
 725       if (!str2_isL) lsr(result, result, 1);
 726       add(sp, sp, ASIZE);
 727       b(DONE);
 728 
 729     BIND(LINEARSTUB);
 730     cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
 731     br(LT, LINEAR_MEDIUM);
 732     mov(result, zr);
 733     RuntimeAddress stub = nullptr;
 734     if (isL) {
 735       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
 736       assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
 737     } else if (str1_isL) {
 738       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
 739        assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
 740     } else {
 741       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
 742       assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
 743     }
 744     address call = trampoline_call(stub);
 745     if (call == nullptr) {
 746       DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
 747       ciEnv::current()->record_failure("CodeCache is full");
 748       return;
 749     }
 750     b(DONE);
 751   }
 752 
 753   BIND(LINEARSEARCH);
 754   {
 755     Label DO1, DO2, DO3;
 756 
 757     Register str2tmp = tmp2;
 758     Register first = tmp3;
 759 
 760     if (icnt1 == -1)
 761     {
 762         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
 763 
 764         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
 765         br(LT, DOSHORT);
 766       BIND(LINEAR_MEDIUM);
 767         (this->*str1_load_1chr)(first, Address(str1));
 768         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
 769         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
 770         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 771         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 772 
 773       BIND(FIRST_LOOP);
 774         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 775         cmp(first, ch2);
 776         br(EQ, STR1_LOOP);
 777       BIND(STR2_NEXT);
 778         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 779         br(LE, FIRST_LOOP);
 780         b(NOMATCH);
 781 
 782       BIND(STR1_LOOP);
 783         adds(cnt1tmp, cnt1_neg, str1_chr_size);
 784         add(cnt2tmp, cnt2_neg, str2_chr_size);
 785         br(GE, MATCH);
 786 
 787       BIND(STR1_NEXT);
 788         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
 789         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 790         cmp(ch1, ch2);
 791         br(NE, STR2_NEXT);
 792         adds(cnt1tmp, cnt1tmp, str1_chr_size);
 793         add(cnt2tmp, cnt2tmp, str2_chr_size);
 794         br(LT, STR1_NEXT);
 795         b(MATCH);
 796 
 797       BIND(DOSHORT);
 798       if (str1_isL == str2_isL) {
 799         cmp(cnt1, (u1)2);
 800         br(LT, DO1);
 801         br(GT, DO3);
 802       }
 803     }
 804 
 805     if (icnt1 == 4) {
 806       Label CH1_LOOP;
 807 
 808         (this->*load_4chr)(ch1, str1);
 809         sub(result_tmp, cnt2, 4);
 810         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 811         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 812 
 813       BIND(CH1_LOOP);
 814         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
 815         cmp(ch1, ch2);
 816         br(EQ, MATCH);
 817         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 818         br(LE, CH1_LOOP);
 819         b(NOMATCH);
 820       }
 821 
 822     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
 823       Label CH1_LOOP;
 824 
 825       BIND(DO2);
 826         (this->*load_2chr)(ch1, str1);
 827         if (icnt1 == 2) {
 828           sub(result_tmp, cnt2, 2);
 829         }
 830         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 831         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 832       BIND(CH1_LOOP);
 833         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 834         cmp(ch1, ch2);
 835         br(EQ, MATCH);
 836         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 837         br(LE, CH1_LOOP);
 838         b(NOMATCH);
 839     }
 840 
 841     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
 842       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
 843 
 844       BIND(DO3);
 845         (this->*load_2chr)(first, str1);
 846         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
 847         if (icnt1 == 3) {
 848           sub(result_tmp, cnt2, 3);
 849         }
 850         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 851         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 852       BIND(FIRST_LOOP);
 853         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 854         cmpw(first, ch2);
 855         br(EQ, STR1_LOOP);
 856       BIND(STR2_NEXT);
 857         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 858         br(LE, FIRST_LOOP);
 859         b(NOMATCH);
 860 
 861       BIND(STR1_LOOP);
 862         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
 863         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 864         cmp(ch1, ch2);
 865         br(NE, STR2_NEXT);
 866         b(MATCH);
 867     }
 868 
 869     if (icnt1 == -1 || icnt1 == 1) {
 870       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
 871 
 872       BIND(DO1);
 873         (this->*str1_load_1chr)(ch1, str1);
 874         cmp(cnt2, (u1)8);
 875         br(LT, DO1_SHORT);
 876 
 877         sub(result_tmp, cnt2, 8/str2_chr_size);
 878         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 879         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 880         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 881 
 882         if (str2_isL) {
 883           orr(ch1, ch1, ch1, LSL, 8);
 884         }
 885         orr(ch1, ch1, ch1, LSL, 16);
 886         orr(ch1, ch1, ch1, LSL, 32);
 887       BIND(CH1_LOOP);
 888         ldr(ch2, Address(str2, cnt2_neg));
 889         eor(ch2, ch1, ch2);
 890         sub(tmp1, ch2, tmp3);
 891         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 892         bics(tmp1, tmp1, tmp2);
 893         br(NE, HAS_ZERO);
 894         adds(cnt2_neg, cnt2_neg, 8);
 895         br(LT, CH1_LOOP);
 896 
 897         cmp(cnt2_neg, (u1)8);
 898         mov(cnt2_neg, 0);
 899         br(LT, CH1_LOOP);
 900         b(NOMATCH);
 901 
 902       BIND(HAS_ZERO);
 903         rev(tmp1, tmp1);
 904         clz(tmp1, tmp1);
 905         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
 906         b(MATCH);
 907 
 908       BIND(DO1_SHORT);
 909         mov(result_tmp, cnt2);
 910         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
 911         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
 912       BIND(DO1_LOOP);
 913         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 914         cmpw(ch1, ch2);
 915         br(EQ, MATCH);
 916         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 917         br(LT, DO1_LOOP);
 918     }
 919   }
 920   BIND(NOMATCH);
 921     mov(result, -1);
 922     b(DONE);
 923   BIND(MATCH);
 924     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
 925   BIND(DONE);
 926 }
 927 
// Member-function-pointer types used to parameterize the string intrinsics
// over the operand encoding: chr_insn is a single-char load (ldrb for
// Latin1, ldrh for UTF-16) and uxt_insn the matching zero-extension.
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
 930 
// Find the first occurrence of the UTF-16 char "ch" in the string at "str1"
// ("cnt1" chars). On return "result" holds the char index of the first
// match, or -1 if the char does not occur.
// Clobbers: rscratch1, rscratch2, str1, cnt1, ch, tmp1-tmp3, rflags
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;   // reused as a negative byte offset from the string end
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  // An empty string never matches.
  cbz(cnt1, NOMATCH);

  // Strings shorter than 4 chars are searched one char at a time.
  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  // Replicate the 16-bit char across all four halfwords of "ch" so that a
  // single 64-bit XOR zeroes exactly the halfword lanes that match.
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  // Point str1 at (end - 8 bytes) and run cnt1_neg from -(cnt1-4)*2 up
  // towards 0, so each iteration examines 8 bytes (4 chars).
  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);  // per-halfword 1, for the zero-lane trick below

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    // SWAR "find zero halfword": after the XOR a matching lane is 0x0000,
    // and (lane - 1) & ~(lane | 0x7fff) has its top bit set iff lane == 0.
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7fff7fff7fff7fff);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    // Tail: redo the final (possibly overlapping) 8-byte word at offset 0.
    // cnt1_neg == 8 only on that second visit, which ends the search.
    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    // Byte-reverse and count leading zeros to locate the lowest matching
    // lane, then convert the bit count into a byte offset.
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    // Char-at-a-time path for short strings.
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1, Address::uxtw(1)));
    sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
    ldrh(ch1, Address(str1, cnt1_neg));
    cmpw(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 2);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // Char index = chars already consumed (result_tmp) + byte offset / 2.
    add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}
 993 
// SVE variant of string_indexof_char: scan the string at "str1" ("cnt1"
// elements) for the char "ch", one whole vector per iteration, using
// whilelt-generated predicates so no scalar tail loop is needed. "isL"
// selects Latin1 (byte) vs UTF-16 (halfword) elements. "result" receives
// the element index of the first match, or -1.
// Clobbers: rscratch1, rscratch2, rflags
void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
                                                Register ch, Register result,
                                                FloatRegister ztmp1,
                                                FloatRegister ztmp2,
                                                PRegister tmp_pg,
                                                PRegister tmp_pdn, bool isL)
{
  // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
  assert(tmp_pg->is_governing(),
         "this register has to be a governing predicate register");

  Label LOOP, MATCH, DONE, NOMATCH;
  Register vec_len = rscratch1;
  Register idx = rscratch2;

  SIMD_RegVariant T = (isL == true) ? B : H;

  cbz(cnt1, NOMATCH);

  // Assign the particular char throughout the vector.
  sve_dup(ztmp2, T, ch);
  // vec_len = elements handled per iteration (vector length in bytes for
  // Latin1, in halfwords for UTF-16).
  if (isL) {
    sve_cntb(vec_len);
  } else {
    sve_cnth(vec_len);
  }
  mov(idx, 0);

  // Generate a predicate to control the reading of input string.
  sve_whilelt(tmp_pg, T, idx, cnt1);

  BIND(LOOP);
    // Read a vector of 8- or 16-bit data depending on the string type. Note
    // that inactive elements indicated by the predicate register won't cause
    // a data read from memory to the destination vector.
    if (isL) {
      sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
    } else {
      sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
    }
    add(idx, idx, vec_len);

    // Perform the comparison. An element of the destination predicate is set
    // to active if the particular char is matched.
    sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);

    // Branch if the particular char is found.
    br(NE, MATCH);

    sve_whilelt(tmp_pg, T, idx, cnt1);

    // Loop back if the particular char not found.
    // whilelt sets N when the first lane of the new predicate is active,
    // i.e. while elements remain to be scanned.
    br(MI, LOOP);

  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);

  BIND(MATCH);
    // Undo the index increment.
    sub(idx, idx, vec_len);

    // Crop the vector to find its location.
    // brka (break-after) keeps the lanes up to and including the first
    // match active, so incp adds (offset-in-chunk + 1); the -1 below
    // compensates for the extra 1.
    sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
    add(result, idx, -1);
    sve_incp(result, T, tmp_pdn);
  BIND(DONE);
}
1062 
// Latin1 variant of string_indexof_char: find the first occurrence of the
// byte "ch" in the string at "str1" ("cnt1" bytes). "result" receives the
// byte index of the first match, or -1 if not found.
// Clobbers: rscratch1, rscratch2, str1, cnt1, ch, tmp1-tmp3, rflags
void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;   // reused as a negative byte offset from the string end
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  // An empty string never matches.
  cbz(cnt1, NOMATCH);

  // Strings shorter than 8 bytes are searched one byte at a time.
  cmp(cnt1, (u1)8);
  br(LT, DO1_SHORT);

  // Replicate the byte into all 8 lanes of "ch" so that a single 64-bit
  // XOR zeroes exactly the bytes that match.
  orr(ch, ch, ch, LSL, 8);
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  // Point str1 at (end - 8) and run cnt1_neg from -(cnt1-8) up towards 0,
  // so each iteration examines 8 bytes.
  sub(cnt1, cnt1, 8);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);

  mov(tmp3, 0x0101010101010101);  // per-byte 1, for the zero-byte trick below

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    // SWAR "find zero byte": after the XOR a matching byte is 0x00, and
    // (b - 1) & ~(b | 0x7f) has its top bit set iff b == 0.
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    // Tail: redo the final (possibly overlapping) 8-byte word at offset 0.
    // cnt1_neg == 8 only on that second visit, which ends the search.
    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    // Byte-reverse and count leading zeros to locate the lowest matching
    // byte, then convert the bit count into a byte offset.
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    // Byte-at-a-time path for short strings.
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1));
    sub(cnt1_neg, zr, cnt1);
  BIND(DO1_LOOP);
    ldrb(ch1, Address(str1, cnt1_neg));
    cmp(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 1);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // Byte index = bytes already consumed (result_tmp) + offset within the
    // final window.
    add(result, result_tmp, cnt1_neg);
  BIND(DONE);
}
1126 
// Compare strings.
// "ae" selects the operand encodings (StrIntrinsicNode::LL/LU/UL/UU,
// L = Latin1, U = UTF-16). On return "result" is negative, zero or
// positive: the difference of the first pair of differing chars, or, when
// one string is a prefix of the other, the difference of the lengths.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  // Chars delivered by one 8-byte load of the wider-encoded string.
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  // Single-char loads matching each string's encoding (ldrb for Latin1,
  // ldrh for UTF-16); ext_chr zero-extends one char to 32 bits.
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      // Same array base: the common prefix is trivially equal, so the
      // length difference already in "result" is the answer.
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      // Point both strings near their ends and walk with negative offsets.
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      // Widen the Latin1 str1 on the fly: load 4 bytes and interleave them
      // with the zeroed vtmpZ (zip1) to produce 4 UTF-16 chars.
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      // Mirror image of the LU case: str2 is the Latin1 string to widen.
      ldr(tmp1, Address(str1));
      ldrs(vtmp, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFF);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFF);
    // The lowest set bit of tmp1^tmp2 marks the first differing byte
    // (little-endian loads). rev+clz converts it to a bit offset, which
    // andr rounds down to a char boundary (8 or 16 bits).
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    // Shift the differing char of each word to the bottom, zero-extend
    // both, and subtract to produce the signed result.
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
    // Long strings are handed off to the out-of-line compare stubs.
    RuntimeAddress stub = nullptr;
    switch(ae) {
      case StrIntrinsicNode::LL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
        break;
      case StrIntrinsicNode::UU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
        break;
      case StrIntrinsicNode::LU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
        break;
      case StrIntrinsicNode::UL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
        break;
      default:
        ShouldNotReachHere();
     }
    assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // arrange code to do most branches while loading and loading next characters
  // while comparing previous
  // NOTE: cnt1 is reused from here on to hold the current char of str2.
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  // Software-pipelined: the next pair of chars (tmp2/rscratch1) is loaded
  // before the previous pair (tmp1/cnt1) is compared, and vice versa.
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}
1362 
1363 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
1364                                      FloatRegister src2, Condition cond, bool isQ) {
1365   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1366   FloatRegister zn = src1, zm = src2;
1367   bool needs_negation = false;
1368   switch (cond) {
1369     case LT: cond = GT; zn = src2; zm = src1; break;
1370     case LE: cond = GE; zn = src2; zm = src1; break;
1371     case LO: cond = HI; zn = src2; zm = src1; break;
1372     case LS: cond = HS; zn = src2; zm = src1; break;
1373     case NE: cond = EQ; needs_negation = true; break;
1374     default:
1375       break;
1376   }
1377 
1378   if (is_floating_point_type(bt)) {
1379     fcm(cond, dst, size, zn, zm);
1380   } else {
1381     cm(cond, dst, size, zn, zm);
1382   }
1383 
1384   if (needs_negation) {
1385     notr(dst, isQ ? T16B : T8B, dst);
1386   }
1387 }
1388 
1389 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1390                                           Condition cond, bool isQ) {
1391   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1392   if (bt == T_FLOAT || bt == T_DOUBLE) {
1393     if (cond == Assembler::NE) {
1394       fcm(Assembler::EQ, dst, size, src);
1395       notr(dst, isQ ? T16B : T8B, dst);
1396     } else {
1397       fcm(cond, dst, size, src);
1398     }
1399   } else {
1400     if (cond == Assembler::NE) {
1401       cm(Assembler::EQ, dst, size, src);
1402       notr(dst, isQ ? T16B : T8B, dst);
1403     } else {
1404       cm(cond, dst, size, src);
1405     }
1406   }
1407 }
1408 
1409 // Compress the least significant bit of each byte to the rightmost and clear
1410 // the higher garbage bits.
void C2_MacroAssembler::bytemask_compress(Register dst) {
  // Example input, dst = 0x01 00 00 00 01 01 00 01
  // The "??" bytes are garbage.
  // Each orr folds the LSBs of neighboring lanes together, halving the
  // spacing between significant bits at every step; the final mask keeps
  // only the 8 collected bits.
  orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
  orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
  orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
  andr(dst, dst, 0xff);                   // dst = 0x8D
}
1419 
1420 // Pack the value of each mask element in "src" into a long value in "dst", at most
1421 // the first 64 lane elements. The input "src" is a vector of boolean represented as
1422 // bytes with 0x00/0x01 as element values. Each lane value from "src" is packed into
1423 // one bit in "dst".
1424 //
1425 // Example:   src = 0x0001010000010001 0100000001010001, lane_cnt = 16
1426 // Expected:  dst = 0x658D
1427 //
1428 // Clobbers: rscratch1
1429 void C2_MacroAssembler::sve_vmask_tolong(Register dst, FloatRegister src,
1430                                          FloatRegister vtmp, int lane_cnt) {
1431   assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1432   assert_different_registers(dst, rscratch1);
1433   assert_different_registers(src, vtmp);
1434   assert(UseSVE > 0, "must be");
1435 
1436   // Compress the lowest 8 bytes.
1437   fmovd(dst, src);
1438   bytemask_compress(dst);
1439   if (lane_cnt <= 8) return;
1440 
1441   // Repeat on higher bytes and join the results.
1442   // Compress 8 bytes in each iteration.
1443   for (int idx = 1; idx < (lane_cnt / 8); idx++) {
1444     sve_extract_integral(rscratch1, T_LONG, src, idx, vtmp);
1445     bytemask_compress(rscratch1);
1446     orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
1447   }
1448 }
1449 
1450 // The function is same as above "sve_vmask_tolong", but it uses SVE2's BEXT
1451 // instruction which requires the FEAT_BITPERM feature.
void C2_MacroAssembler::sve2_vmask_tolong(Register dst, FloatRegister src,
                                          FloatRegister vtmp1, FloatRegister vtmp2,
                                          int lane_cnt) {
  assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
  assert_different_registers(src, vtmp1, vtmp2);
  assert(UseSVE > 1 && VM_Version::supports_svebitperm(), "must be");

  // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
  // is to compress each significant bit of the byte in a cross-lane way. Due
  // to the lack of a cross-lane bit-compress instruction, we use BEXT
  // (bit-compress in each lane) with the biggest lane size (T = D) then
  // concatenate the results.

  // The second source input of BEXT, initialized with 0x01 in each byte.
  // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
  sve_dup(vtmp2, B, 1);

  // BEXT vtmp1.D, src.D, vtmp2.D
  // src   = 0x0001010000010001 | 0x0100000001010001
  // vtmp2 = 0x0101010101010101 | 0x0101010101010101
  //         ---------------------------------------
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  sve_bext(vtmp1, D, src, vtmp2);

  // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the
  // result to dst.
  // vtmp1 = 0x0000000000000000 | 0x000000000000658D
  // dst   = 0x658D
  if (lane_cnt <= 8) {
    // No need to concatenate.
    umov(dst, vtmp1, B, 0);
  } else if (lane_cnt <= 16) {
    // Move byte 8 (the second group's result) down next to byte 0, then
    // read the combined 16 bits.
    ins(vtmp1, B, vtmp1, 1, 8);
    umov(dst, vtmp1, H, 0);
  } else {
    // As the lane count is 64 at most, the final expected value must be in
    // the lowest 64 bits after narrowing vtmp1 from D to B.
    sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
    umov(dst, vtmp1, D, 0);
  }
}
1493 
1494 // Unpack the mask, a long value in "src", into a vector register of boolean
1495 // represented as bytes with 0x00/0x01 as element values in "dst".  Each bit in
1496 // "src" is unpacked into one byte lane in "dst". Note that "dst" can support at
1497 // most 64 lanes.
1498 //
1499 // Below example gives the expected dst vector register, with a valid src(0x658D)
1500 // on a 128-bit vector size machine.
1501 // dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
void C2_MacroAssembler::sve_vmask_fromlong(FloatRegister dst, Register src,
                                           FloatRegister vtmp, int lane_cnt) {
  assert_different_registers(dst, vtmp);
  assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
         lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");

  // Example:   src = 0x658D, lane_cnt = 16
  // Expected:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01

  // Put long value from general purpose register into the first lane of vector.
  // vtmp = 0x0000000000000000 | 0x000000000000658D
  sve_dup(vtmp, B, 0);
  mov(vtmp, D, 0, src);

  // Transform the value in the first lane which is mask in bit now to the mask in
  // byte, which can be done by SVE2's BDEP instruction.

  // The first source input of the BDEP instruction: distribute the mask
  // bytes so that each 8-byte lane holds exactly one of them.
  // vtmp = 0x0000000000000065 | 0x000000000000008D
  if (lane_cnt <= 8) {
    // Nothing to do, as only one byte exists.
  } else if (lane_cnt <= 16) {
    // Move byte 1 up into byte 8, one mask byte per 64-bit lane.
    ins(vtmp, B, vtmp, 8, 1);
  } else {
    sve_vector_extend(vtmp, D, vtmp, B);
  }

  // The second source input of BDEP instruction, initialized with 0x01 for each byte.
  // dst = 0x01010101 0x01010101 0x01010101 0x01010101
  sve_dup(dst, B, 1);

  // BDEP dst.D, vtmp.D, dst.D
  // vtmp = 0x0000000000000065 | 0x000000000000008D
  // dst  = 0x0101010101010101 | 0x0101010101010101
  //        ---------------------------------------
  // dst  = 0x0001010000010001 | 0x0100000001010001
  sve_bdep(dst, D, vtmp, dst);
}
1540 
1541 // Clobbers: rflags
1542 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1543                                     FloatRegister zn, FloatRegister zm, Condition cond) {
1544   assert(pg->is_governing(), "This register has to be a governing predicate register");
1545   FloatRegister z1 = zn, z2 = zm;
1546   switch (cond) {
1547     case LE: z1 = zm; z2 = zn; cond = GE; break;
1548     case LT: z1 = zm; z2 = zn; cond = GT; break;
1549     case LO: z1 = zm; z2 = zn; cond = HI; break;
1550     case LS: z1 = zm; z2 = zn; cond = HS; break;
1551     default:
1552       break;
1553   }
1554 
1555   SIMD_RegVariant size = elemType_to_regVariant(bt);
1556   if (is_floating_point_type(bt)) {
1557     sve_fcm(cond, pd, size, pg, z1, z2);
1558   } else {
1559     assert(is_integral_type(bt), "unsupported element type");
1560     sve_cmp(cond, pd, size, pg, z1, z2);
1561   }
1562 }
1563 
// Get index of the last mask lane that is set. "bt" gives the lane width.
// NOTE(review): uses MaxVectorSize, so this assumes the mask spans the full
// vector length.
// Clobbers: rscratch1
void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
  SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Reverse the lane order, turning the last set lane into the first.
  sve_rev(ptmp, size, src);
  // brkb (break-before) keeps only the lanes before the first set one, so
  // cntp yields the index of that lane within the reversed mask.
  sve_brkb(ptmp, ptrue, ptmp, false);
  sve_cntp(dst, size, ptrue, ptmp);
  // Map the reversed index back: last_index = (lane_cnt - 1) - reversed_index.
  movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
  subw(dst, rscratch1, dst);
}
1573 
1574 // Extend integer vector src to dst with the same lane count
1575 // but larger element size, e.g. 4B -> 4I
1576 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1577                                            FloatRegister src, BasicType src_bt, bool is_unsigned) {
1578   if (src_bt == T_BYTE) {
1579     // 4B to 4S/4I, 8B to 8S
1580     assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1581     assert(dst_bt == T_SHORT || dst_bt == T_INT, "unsupported");
1582     _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1583     if (dst_bt == T_INT) {
1584       _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1585     }
1586   } else if (src_bt == T_SHORT) {
1587     // 2S to 2I/2L, 4S to 4I
1588     assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1589     assert(dst_bt == T_INT || dst_bt == T_LONG, "unsupported");
1590     _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1591     if (dst_bt == T_LONG) {
1592       _xshll(is_unsigned, dst, T2D, dst, T2S, 0);
1593     }
1594   } else if (src_bt == T_INT) {
1595     // 2I to 2L
1596     assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1597     _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1598   } else {
1599     ShouldNotReachHere();
1600   }
1601 }
1602 
1603 // Narrow integer vector src down to dst with the same lane count
1604 // but smaller element size, e.g. 4I -> 4B
1605 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1606                                            FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1607   if (src_bt == T_SHORT) {
1608     // 4S/8S to 4B/8B
1609     assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1610     assert(dst_bt == T_BYTE, "unsupported");
1611     xtn(dst, T8B, src, T8H);
1612   } else if (src_bt == T_INT) {
1613     // 2I to 2S, 4I to 4B/4S
1614     assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1615     assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1616     xtn(dst, T4H, src, T4S);
1617     if (dst_bt == T_BYTE) {
1618       xtn(dst, T8B, dst, T8H);
1619     }
1620   } else if (src_bt == T_LONG) {
1621     // 2L to 2S/2I
1622     assert(src_vlen_in_bytes == 16, "unsupported");
1623     assert(dst_bt == T_INT || dst_bt == T_SHORT, "unsupported");
1624     xtn(dst, T2S, src, T2D);
1625     if (dst_bt == T_SHORT) {
1626       xtn(dst, T4H, dst, T4S);
1627     }
1628   } else {
1629     ShouldNotReachHere();
1630   }
1631 }
1632 
// Extend integer vector src to dst with a larger element size (dst_size),
// using SVE low-half unpack instructions. Each _sve_xunpk step doubles the
// element size, so the extension is a chain of one to three unpacks.
// Sign- vs zero-extension is selected by "is_unsigned". Intermediate steps
// write into dst, so src is preserved unless dst aliases it.
void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
                                          FloatRegister src, SIMD_RegVariant src_size,
                                          bool is_unsigned) {
  assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");

  if (src_size == B) {
    switch (dst_size) {
    case H:
      // B -> H: a single unpack step.
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
      break;
    case S:
      // B -> H -> S.
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
      break;
    case D:
      // B -> H -> S -> D.
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
      break;
    default:
      ShouldNotReachHere();
    }
  } else if (src_size == H) {
    if (dst_size == S) {
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
    } else { // D
      // H -> S -> D.
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
    }
  } else if (src_size == S) {
    _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
  }
  // No other src_size is possible: the entry assert restricts src_size to B,
  // H or S.
}
1666 
// Vector narrow from src to dst with specified element sizes.
// High part of dst vector will be filled with zero.
// tmp is clobbered: it is zeroed and used as the second uzp1 operand so the
// even-element concatenation fills the upper half of dst with zeros.
void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
                                          FloatRegister src, SIMD_RegVariant src_size,
                                          FloatRegister tmp) {
  assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
  assert_different_registers(src, tmp);
  // Zero tmp so that the bits contributed by tmp in every uzp1 are zero.
  sve_dup(tmp, src_size, 0);
  if (src_size == D) {
    switch (dst_size) {
    case S:
      sve_uzp1(dst, S, src, tmp);
      break;
    case H:
      // Two steps: D -> S -> H. dst is an intermediate here, so it must not
      // alias the zero register tmp.
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, S, src, tmp);
      sve_uzp1(dst, H, dst, tmp);
      break;
    case B:
      // Three steps: D -> S -> H -> B.
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, S, src, tmp);
      sve_uzp1(dst, H, dst, tmp);
      sve_uzp1(dst, B, dst, tmp);
      break;
    default:
      ShouldNotReachHere();
    }
  } else if (src_size == S) {
    if (dst_size == H) {
      sve_uzp1(dst, H, src, tmp);
    } else { // B
      // Two steps: S -> H -> B.
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, H, src, tmp);
      sve_uzp1(dst, B, dst, tmp);
    }
  } else if (src_size == H) {
    sve_uzp1(dst, B, src, tmp);
  }
}
1706 
1707 // Extend src predicate to dst predicate with the same lane count but larger
1708 // element size, e.g. 64Byte -> 512Long
1709 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1710                                              uint dst_element_length_in_bytes,
1711                                              uint src_element_length_in_bytes) {
1712   if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1713     sve_punpklo(dst, src);
1714   } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1715     sve_punpklo(dst, src);
1716     sve_punpklo(dst, dst);
1717   } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1718     sve_punpklo(dst, src);
1719     sve_punpklo(dst, dst);
1720     sve_punpklo(dst, dst);
1721   } else {
1722     assert(false, "unsupported");
1723     ShouldNotReachHere();
1724   }
1725 }
1726 
1727 // Narrow src predicate to dst predicate with the same lane count but
1728 // smaller element size, e.g. 512Long -> 64Byte
1729 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1730                                              uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
1731   // The insignificant bits in src predicate are expected to be zero.
1732   // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
1733   // passed as the second argument. An example narrowing operation with a given mask would be -
1734   // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I
1735   // Mask (for 2 Longs) : TF
1736   // Predicate register for the above mask (16 bits) : 00000001 00000000
1737   // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
1738   // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0)
1739   assert_different_registers(src, ptmp);
1740   assert_different_registers(dst, ptmp);
1741   sve_pfalse(ptmp);
1742   if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1743     sve_uzp1(dst, B, src, ptmp);
1744   } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1745     sve_uzp1(dst, H, src, ptmp);
1746     sve_uzp1(dst, B, dst, ptmp);
1747   } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1748     sve_uzp1(dst, S, src, ptmp);
1749     sve_uzp1(dst, H, dst, ptmp);
1750     sve_uzp1(dst, B, dst, ptmp);
1751   } else {
1752     assert(false, "unsupported");
1753     ShouldNotReachHere();
1754   }
1755 }
1756 
1757 // Vector reduction add for integral type with ASIMD instructions.
1758 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1759                                                  Register isrc, FloatRegister vsrc,
1760                                                  unsigned vector_length_in_bytes,
1761                                                  FloatRegister vtmp) {
1762   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1763   assert_different_registers(dst, isrc);
1764   bool isQ = vector_length_in_bytes == 16;
1765 
1766   BLOCK_COMMENT("neon_reduce_add_integral {");
1767     switch(bt) {
1768       case T_BYTE:
1769         addv(vtmp, isQ ? T16B : T8B, vsrc);
1770         smov(dst, vtmp, B, 0);
1771         addw(dst, dst, isrc, ext::sxtb);
1772         break;
1773       case T_SHORT:
1774         addv(vtmp, isQ ? T8H : T4H, vsrc);
1775         smov(dst, vtmp, H, 0);
1776         addw(dst, dst, isrc, ext::sxth);
1777         break;
1778       case T_INT:
1779         isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1780         umov(dst, vtmp, S, 0);
1781         addw(dst, dst, isrc);
1782         break;
1783       case T_LONG:
1784         assert(isQ, "unsupported");
1785         addpd(vtmp, vsrc);
1786         umov(dst, vtmp, D, 0);
1787         add(dst, dst, isrc);
1788         break;
1789       default:
1790         assert(false, "unsupported");
1791         ShouldNotReachHere();
1792     }
1793   BLOCK_COMMENT("} neon_reduce_add_integral");
1794 }
1795 
// Vector reduction multiply for integral type with ASIMD instructions.
// The vector is repeatedly split in half and the halves multiplied
// element-wise until two lanes remain; these are then multiplied with the
// scalar input "isrc" in general registers.
// Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
// Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
                                                 Register isrc, FloatRegister vsrc,
                                                 unsigned vector_length_in_bytes,
                                                 FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_mul_integral {");
    switch(bt) {
      case T_BYTE:
        if (isQ) {
          // Multiply the lower half and higher half of vector iteratively.
          // vtmp1 = vsrc[8:15]
          ins(vtmp1, D, vsrc, 0, 1);
          // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
          mulv(vtmp1, T8B, vtmp1, vsrc);
          // vtmp2 = vtmp1[4:7]
          ins(vtmp2, S, vtmp1, 0, 1);
          // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
          mulv(vtmp1, T8B, vtmp2, vtmp1);
        } else {
          // vtmp1 = vsrc[4:7]
          ins(vtmp1, S, vsrc, 0, 1);
          // vtmp1[n] = vsrc[n] * vsrc[n + 4], where n=[0, 3]
          mulv(vtmp1, T8B, vtmp1, vsrc);
        }
        // vtmp2 = vtmp1[2:3]
        ins(vtmp2, H, vtmp1, 0, 1);
        // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
        mulv(vtmp2, T8B, vtmp2, vtmp1);
        // dst = vtmp2[0] * isrc * vtmp2[1], sign-extending the byte result
        // after each scalar multiply.
        umov(rscratch1, vtmp2, B, 0);
        mulw(dst, rscratch1, isrc);
        sxtb(dst, dst);
        umov(rscratch1, vtmp2, B, 1);
        mulw(dst, rscratch1, dst);
        sxtb(dst, dst);
        break;
      case T_SHORT:
        // Same halving strategy as T_BYTE, on 16-bit lanes.
        if (isQ) {
          // vtmp2 = vsrc[4:7]
          ins(vtmp2, D, vsrc, 0, 1);
          // vtmp2[n] = vsrc[n] * vsrc[n + 4], where n=[0, 3]
          mulv(vtmp2, T4H, vtmp2, vsrc);
          // vtmp1 = vtmp2[2:3]
          ins(vtmp1, S, vtmp2, 0, 1);
          // vtmp1[n] = vtmp2[n] * vtmp2[n + 2], where n=[0, 1]
          mulv(vtmp1, T4H, vtmp1, vtmp2);
        } else {
          // vtmp1 = vsrc[2:3]
          ins(vtmp1, S, vsrc, 0, 1);
          // vtmp1[n] = vsrc[n] * vsrc[n + 2], where n=[0, 1]
          mulv(vtmp1, T4H, vtmp1, vsrc);
        }
        // dst = vtmp1[0] * isrc * vtmp1[1], sign-extending the short result
        // after each scalar multiply.
        umov(rscratch1, vtmp1, H, 0);
        mulw(dst, rscratch1, isrc);
        sxth(dst, dst);
        umov(rscratch1, vtmp1, H, 1);
        mulw(dst, rscratch1, dst);
        sxth(dst, dst);
        break;
      case T_INT:
        if (isQ) {
          // vtmp1 = vsrc[2:3]
          ins(vtmp1, D, vsrc, 0, 1);
          // vtmp1[n] = vsrc[n] * vsrc[n + 2], where n=[0, 1]
          mulv(vtmp1, T2S, vtmp1, vsrc);
        } else {
          // Only two lanes: read them straight out of vsrc below.
          vtmp1 = vsrc;
        }
        // dst = vtmp1[0] * isrc * vtmp1[1]
        umov(rscratch1, vtmp1, S, 0);
        mul(dst, rscratch1, isrc);
        umov(rscratch1, vtmp1, S, 1);
        mul(dst, rscratch1, dst);
        break;
      case T_LONG:
        // Two 64-bit lanes: multiply both with the scalar input directly.
        umov(rscratch1, vsrc, D, 0);
        mul(dst, isrc, rscratch1);
        umov(rscratch1, vsrc, D, 1);
        mul(dst, dst, rscratch1);
        break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  BLOCK_COMMENT("} neon_reduce_mul_integral");
}
1876 
1877 // Vector reduction multiply for floating-point type with ASIMD instructions.
1878 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
1879                                            FloatRegister fsrc, FloatRegister vsrc,
1880                                            unsigned vector_length_in_bytes,
1881                                            FloatRegister vtmp) {
1882   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1883   bool isQ = vector_length_in_bytes == 16;
1884 
1885   BLOCK_COMMENT("neon_reduce_mul_fp {");
1886     switch(bt) {
1887       // The T_SHORT type below is for Float16 type which also uses floating-point
1888       // instructions.
1889       case T_SHORT:
1890         fmulh(dst, fsrc, vsrc);
1891         ext(vtmp, T8B, vsrc, vsrc, 2);
1892         fmulh(dst, dst, vtmp);
1893         ext(vtmp, T8B, vsrc, vsrc, 4);
1894         fmulh(dst, dst, vtmp);
1895         ext(vtmp, T8B, vsrc, vsrc, 6);
1896         fmulh(dst, dst, vtmp);
1897         if (isQ) {
1898           ext(vtmp, T16B, vsrc, vsrc, 8);
1899           fmulh(dst, dst, vtmp);
1900           ext(vtmp, T16B, vsrc, vsrc, 10);
1901           fmulh(dst, dst, vtmp);
1902           ext(vtmp, T16B, vsrc, vsrc, 12);
1903           fmulh(dst, dst, vtmp);
1904           ext(vtmp, T16B, vsrc, vsrc, 14);
1905           fmulh(dst, dst, vtmp);
1906         }
1907         break;
1908       case T_FLOAT:
1909         fmuls(dst, fsrc, vsrc);
1910         ins(vtmp, S, vsrc, 0, 1);
1911         fmuls(dst, dst, vtmp);
1912         if (isQ) {
1913           ins(vtmp, S, vsrc, 0, 2);
1914           fmuls(dst, dst, vtmp);
1915           ins(vtmp, S, vsrc, 0, 3);
1916           fmuls(dst, dst, vtmp);
1917          }
1918         break;
1919       case T_DOUBLE:
1920         assert(isQ, "unsupported");
1921         fmuld(dst, fsrc, vsrc);
1922         ins(vtmp, D, vsrc, 0, 1);
1923         fmuld(dst, dst, vtmp);
1924         break;
1925       default:
1926         assert(false, "unsupported");
1927         ShouldNotReachHere();
1928     }
1929   BLOCK_COMMENT("} neon_reduce_mul_fp");
1930 }
1931 
1932 // Vector reduction add for half float type with ASIMD instructions.
1933 void C2_MacroAssembler::neon_reduce_add_fp16(FloatRegister dst, FloatRegister fsrc, FloatRegister vsrc,
1934                                              unsigned vector_length_in_bytes, FloatRegister vtmp) {
1935   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1936   bool isQ = vector_length_in_bytes == 16;
1937 
1938   BLOCK_COMMENT("neon_reduce_add_fp16 {");
1939     faddh(dst, fsrc, vsrc);
1940     ext(vtmp, T8B, vsrc, vsrc, 2);
1941     faddh(dst, dst, vtmp);
1942     ext(vtmp, T8B, vsrc, vsrc, 4);
1943     faddh(dst, dst, vtmp);
1944     ext(vtmp, T8B, vsrc, vsrc, 6);
1945     faddh(dst, dst, vtmp);
1946     if (isQ) {
1947       ext(vtmp, T16B, vsrc, vsrc, 8);
1948       faddh(dst, dst, vtmp);
1949       ext(vtmp, T16B, vsrc, vsrc, 10);
1950       faddh(dst, dst, vtmp);
1951       ext(vtmp, T16B, vsrc, vsrc, 12);
1952       faddh(dst, dst, vtmp);
1953       ext(vtmp, T16B, vsrc, vsrc, 14);
1954       faddh(dst, dst, vtmp);
1955     }
1956   BLOCK_COMMENT("} neon_reduce_add_fp16");
1957 }
1958 
1959 // Helper to select logical instruction
1960 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
1961                                                    Register Rn, Register Rm,
1962                                                    enum shift_kind kind, unsigned shift) {
1963   switch(opc) {
1964     case Op_AndReductionV:
1965       is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
1966       break;
1967     case Op_OrReductionV:
1968       is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
1969       break;
1970     case Op_XorReductionV:
1971       is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
1972       break;
1973     default:
1974       assert(false, "unsupported");
1975       ShouldNotReachHere();
1976   }
1977 }
1978 
// Vector reduction logical operations And, Or, Xor
// Strategy: move the two halves of the vector into general registers and
// combine them with one 64-bit logical op, then repeatedly fold the value
// onto itself with shifted logical ops until only one element's width of
// significant bits remains, and finally combine with the scalar input.
// Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
                                            Register isrc, FloatRegister vsrc,
                                            unsigned vector_length_in_bytes) {
  assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
         "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_logical {");
    // Combine the two vector halves (D lanes for 128-bit, S lanes for
    // 64-bit vectors) in general registers.
    umov(rscratch1, vsrc, isQ ? D : S, 0);
    umov(dst, vsrc, isQ ? D : S, 1);
    neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
    switch(bt) {
      case T_BYTE:
        // Fold 64 -> 32 (Q only) -> 16 -> 8 significant bits, combine with
        // isrc, then sign-extend the byte result.
        if (isQ) {
          neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
        }
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
        sxtb(dst, dst);
        break;
      case T_SHORT:
        // Fold down to 16 significant bits, combine with isrc, sign-extend.
        if (isQ) {
          neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
        }
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
        sxth(dst, dst);
        break;
      case T_INT:
        // Fold down to 32 significant bits (Q only), combine with isrc.
        if (isQ) {
          neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
        }
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
        break;
      case T_LONG:
        // Both 64-bit lanes were already combined above; just fold in isrc.
        assert(isQ, "unsupported");
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
        break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  BLOCK_COMMENT("} neon_reduce_logical");
}
2028 
2029 // Helper function to decode min/max reduction operation properties
2030 void C2_MacroAssembler::decode_minmax_reduction_opc(int opc, bool* is_min,
2031                                                     bool* is_unsigned,
2032                                                     Condition* cond) {
2033   switch(opc) {
2034     case Op_MinReductionV:
2035       *is_min = true;  *is_unsigned = false; *cond = LT; break;
2036     case Op_MaxReductionV:
2037       *is_min = false; *is_unsigned = false; *cond = GT; break;
2038     case Op_UMinReductionV:
2039       *is_min = true;  *is_unsigned = true;  *cond = LO; break;
2040     case Op_UMaxReductionV:
2041       *is_min = false; *is_unsigned = true;  *cond = HI; break;
2042     default:
2043       ShouldNotReachHere();
2044   }
2045 }
2046 
// Vector reduction min/max/umin/umax for integral type with ASIMD instructions.
// Note: vtmp is not used and expected to be fnoreg for T_LONG case.
// Clobbers: rscratch1, rflags
void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
                                                    Register isrc, FloatRegister vsrc,
                                                    unsigned vector_length_in_bytes,
                                                    FloatRegister vtmp) {
  assert(opc == Op_MinReductionV || opc == Op_MaxReductionV ||
         opc == Op_UMinReductionV || opc == Op_UMaxReductionV, "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;
  bool is_min;
  bool is_unsigned;
  Condition cond;
  decode_minmax_reduction_opc(opc, &is_min, &is_unsigned, &cond);
  BLOCK_COMMENT("neon_reduce_minmax_integral {");
    if (bt == T_LONG) {
      // Compare the two 64-bit lanes and the scalar input in general
      // registers, selecting with the decoded condition each time.
      assert(vtmp == fnoreg, "should be");
      assert(isQ, "should be");
      umov(rscratch1, vsrc, D, 0);
      cmp(isrc, rscratch1);
      csel(dst, isrc, rscratch1, cond);
      umov(rscratch1, vsrc, D, 1);
      cmp(dst, rscratch1);
      csel(dst, dst, rscratch1, cond);
    } else {
      SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
      if (size == T2S) {
        // For T2S (2x32-bit elements), use pairwise instructions because
        // uminv/umaxv/sminv/smaxv don't support arrangement 2S.
        neon_minmaxp(is_unsigned, is_min, vtmp, size, vsrc, vsrc);
      } else {
        // For other sizes, use reduction to scalar instructions.
        neon_minmaxv(is_unsigned, is_min, vtmp, size, vsrc);
      }
      // Move the reduced lane to a general register: T_INT needs no
      // extension, sub-int types are zero-extended for unsigned ops and
      // sign-extended otherwise.
      if (bt == T_INT) {
        umov(dst, vtmp, S, 0);
      } else if (is_unsigned) {
        umov(dst, vtmp, elemType_to_regVariant(bt), 0);
      } else {
        smov(dst, vtmp, elemType_to_regVariant(bt), 0);
      }
      // Fold the scalar input into the result.
      cmpw(dst, isrc);
      cselw(dst, dst, isrc, cond);
    }
  BLOCK_COMMENT("} neon_reduce_minmax_integral");
}
2096 
// Vector reduction for integral type with SVE instruction.
// Supported operations are Add, And, Or, Xor, Max, Min, UMax, UMin.
// rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
// The pattern for every case is: reduce the active lanes of src2 (under
// predicate pg) into lane 0 of tmp, move that lane to dst with the
// appropriate extension, then fold in the scalar input src1.
void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
                                            FloatRegister src2, PRegister pg, FloatRegister tmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  assert(pg->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(src1, dst);
  // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  switch (opc) {
    case Op_AddReductionVI: {
      sve_uaddv(tmp, size, pg, src2);
      if (bt == T_BYTE) {
        // Sign-extend the byte sum while adding the scalar input.
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxtb);
      } else if (bt == T_SHORT) {
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxth);
      } else {
        umov(dst, tmp, size, 0);
        addw(dst, dst, src1);
      }
      break;
    }
    case Op_AddReductionVL: {
      sve_uaddv(tmp, size, pg, src2);
      umov(dst, tmp, size, 0);
      add(dst, dst, src1);
      break;
    }
    case Op_AndReductionV: {
      sve_andv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        andr(dst, dst, src1);
      } else {
        andw(dst, dst, src1);
      }
      break;
    }
    case Op_OrReductionV: {
      sve_orv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        orr(dst, dst, src1);
      } else {
        orrw(dst, dst, src1);
      }
      break;
    }
    case Op_XorReductionV: {
      sve_eorv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        eor(dst, dst, src1);
      } else {
        eorw(dst, dst, src1);
      }
      break;
    }
    case Op_MaxReductionV:
    case Op_MinReductionV:
    case Op_UMaxReductionV:
    case Op_UMinReductionV: {
      bool is_min;
      bool is_unsigned;
      Condition cond;
      decode_minmax_reduction_opc(opc, &is_min, &is_unsigned, &cond);
      sve_minmaxv(is_unsigned, is_min, tmp, size, pg, src2);
      // Move result from vector to general register
      if (is_unsigned || bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      // Fold in the scalar input using the decoded select condition.
      if (bt == T_LONG) {
        cmp(dst, src1);
        csel(dst, dst, src1, cond);
      } else {
        cmpw(dst, src1);
        cselw(dst, dst, src1, cond);
      }
      break;
    }
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }

  // For logical reductions on sub-int types the combine above was done in
  // 32-bit width; sign-extend the result back to the element type.
  if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
    if (bt == T_BYTE) {
      sxtb(dst, dst);
    } else if (bt == T_SHORT) {
      sxth(dst, dst);
    }
  }
}
2207 
// Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
// to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
// max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
  uint32_t max_vector_length = Matcher::max_vector_size(bt);
  assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");

  // Set all elements to false if the input "lane_cnt" is zero.
  if (lane_cnt == 0) {
    sve_pfalse(dst);
    return;
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  assert(size != Q, "invalid size");

  // Set all true if "lane_cnt" equals to the max lane count.
  if (lane_cnt == max_vector_length) {
    sve_ptrue(dst, size, /* ALL */ 0b11111);
    return;
  }

  // Fixed numbers for "ptrue": lane counts with a dedicated immediate
  // pattern in the instruction encoding.
  switch(lane_cnt) {
  case 1: /* VL1 */
  case 2: /* VL2 */
  case 3: /* VL3 */
  case 4: /* VL4 */
  case 5: /* VL5 */
  case 6: /* VL6 */
  case 7: /* VL7 */
  case 8: /* VL8 */
    // For counts 1-8 the pattern value equals the count itself.
    sve_ptrue(dst, size, lane_cnt);
    return;
  case 16:
    sve_ptrue(dst, size, /* VL16 */ 0b01001);
    return;
  case 32:
    sve_ptrue(dst, size, /* VL32 */ 0b01010);
    return;
  case 64:
    sve_ptrue(dst, size, /* VL64 */ 0b01011);
    return;
  case 128:
    sve_ptrue(dst, size, /* VL128 */ 0b01100);
    return;
  case 256:
    sve_ptrue(dst, size, /* VL256 */ 0b01101);
    return;
  default:
    break;
  }

  // Special patterns for "ptrue": counts expressible as POW2/MUL4/MUL3 of
  // the maximum lane count.
  if (lane_cnt == round_down_power_of_2(max_vector_length)) {
    sve_ptrue(dst, size, /* POW2 */ 0b00000);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
    sve_ptrue(dst, size, /* MUL4 */ 0b11101);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
    sve_ptrue(dst, size, /* MUL3 */ 0b11110);
  } else {
    // Encode to "whileltw" for the remaining cases: lanes [0, lane_cnt)
    // become active. This path needs rscratch1 and clobbers flags.
    mov(rscratch1, lane_cnt);
    sve_whileltw(dst, size, zr, rscratch1);
  }
}
2274 
// Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
// Any remaining elements of dst will be filled with zero.
// The shorts are widened to ints (sve_compact only supports S and D sizes),
// compacted there, and narrowed back; for full-width vectors this is done
// separately for the low and high halves, which are then joined with splice.
// Clobbers: rscratch1
// Preserves: mask, vzr
void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
                                           FloatRegister vzr, FloatRegister vtmp,
                                           PRegister pgtmp, unsigned vector_length_in_bytes) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  // When called by sve_compress_byte, src and vtmp may be the same register.
  assert_different_registers(dst, src, vzr);
  assert_different_registers(dst, vtmp, vzr);
  assert_different_registers(mask, pgtmp);
  // high <-- low
  // Example input:   src   = hh gg ff ee dd cc bb aa, one character is 8 bits.
  //                  mask  = 01 00 00 01 01 00 01 01, one character is 1 bit.
  // Expected result: dst   = 00 00 00 hh ee dd bb aa

  // Extend lowest half to type INT.
  // dst   =  00dd  00cc  00bb  00aa
  sve_uunpklo(dst, S, src);
  // pgtmp =  0001  0000  0001  0001
  sve_punpklo(pgtmp, mask);
  // Pack the active elements in size of type INT to the right,
  // and fill the remainings with zero.
  // dst   =  0000  00dd  00bb  00aa
  sve_compact(dst, S, dst, pgtmp);
  // Narrow the result back to type SHORT.
  // dst   = 00 00 00 00 00 dd bb aa
  sve_uzp1(dst, H, dst, vzr);

  // Return if the vector length is no more than MaxVectorSize/2, since the
  // highest half is invalid.
  if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
    return;
  }

  // Count the active elements of lowest half; this is where the compacted
  // high half must be placed in the final result.
  // rscratch1 = 3
  sve_cntp(rscratch1, S, ptrue, pgtmp);

  // Repeat to the highest half.
  // pgtmp =  0001  0000  0000  0001
  sve_punpkhi(pgtmp, mask);
  // vtmp  =  00hh  00gg  00ff  00ee
  sve_uunpkhi(vtmp, S, src);
  // vtmp  =  0000  0000  00hh  00ee
  sve_compact(vtmp, S, vtmp, pgtmp);
  // vtmp  = 00 00 00 00 00 00 hh ee
  sve_uzp1(vtmp, H, vtmp, vzr);

  // Build a predicate selecting the first rscratch1 short lanes.
  // pgtmp = 00 00 00 00 00 01 01 01
  sve_whilelt(pgtmp, H, zr, rscratch1);
  // Compressed low:  dst  = 00 00 00 00 00 dd bb aa
  // Compressed high: vtmp = 00 00 00 00 00 00 hh ee
  // Combine the compressed low with the compressed high:
  //                  dst  = 00 00 00 hh ee dd bb aa
  sve_splice(dst, H, pgtmp, vtmp);
}
2333 
// Pack active byte elements of src, under the control of mask, into the
// lowest-numbered elements of dst; remaining elements are zero-filled.
// Bytes are widened to shorts and delegated to sve_compress_short for each
// half, then narrowed back and joined with splice.
// Clobbers: rscratch1, rscratch2
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
                                          FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
                                          PRegister ptmp, PRegister pgtmp, unsigned vector_length_in_bytes) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3);
  assert_different_registers(mask, ptmp, pgtmp);
  // high <-- low
  // Example input:   src   = q p n m l k j i h g f e d c b a, one character is 8 bits.
  //                  mask  = 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 1, one character is 1 bit.
  // Expected result: dst   = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
  FloatRegister vzr = vtmp3;
  sve_dup(vzr, B, 0);

  // Extend lowest half to type SHORT.
  // vtmp1 =  0h  0g  0f  0e  0d  0c  0b  0a
  sve_uunpklo(vtmp1, H, src);
  // ptmp  =  00  01  00  00  00  01  00  01
  sve_punpklo(ptmp, mask);
  // Pack the active elements in size of type SHORT to the right,
  // and fill the remainings with zero.
  // dst   =  00  00  00  00  00  0g  0c  0a
  // Each byte widens to a short, so the intermediate vector is twice as
  // long; the low-half compression is capped at MaxVectorSize.
  unsigned extended_size = vector_length_in_bytes << 1;
  sve_compress_short(dst, vtmp1, ptmp, vzr, vtmp2, pgtmp, extended_size > MaxVectorSize ? MaxVectorSize : extended_size);
  // Narrow the result back to type BYTE.
  // dst   = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
  sve_uzp1(dst, B, dst, vzr);

  // Return if the vector length is no more than MaxVectorSize/2, since the
  // highest half is invalid.
  if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
    return;
  }
  // Count the active elements of lowest half; this is where the compacted
  // high half must be placed in the final result.
  // rscratch2 = 3
  sve_cntp(rscratch2, H, ptrue, ptmp);

  // Repeat to the highest half.
  // ptmp  =  00  01  00  00  00  00  00  01
  sve_punpkhi(ptmp, mask);
  // vtmp2 =  0q  0p  0n  0m  0l  0k  0j  0i
  sve_uunpkhi(vtmp2, H, src);
  // vtmp1 =  00  00  00  00  00  00  0p  0i
  // Note: vtmp2 serves as both src and vtmp here, which sve_compress_short
  // explicitly allows.
  sve_compress_short(vtmp1, vtmp2, ptmp, vzr, vtmp2, pgtmp, extended_size - MaxVectorSize);
  // vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
  sve_uzp1(vtmp1, B, vtmp1, vzr);

  // Build a predicate selecting the first rscratch2 byte lanes.
  // ptmp  = 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
  sve_whilelt(ptmp, B, zr, rscratch2);
  // Compressed low:  dst   = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
  // Compressed high: vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
  // Combine the compressed low with the compressed high:
  //                  dst   = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
  sve_splice(dst, B, ptmp, vtmp1);
}
2390 
2391 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2392   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2393   SIMD_Arrangement size = isQ ? T16B : T8B;
2394   if (bt == T_BYTE) {
2395     rbit(dst, size, src);
2396   } else {
2397     neon_reverse_bytes(dst, src, bt, isQ);
2398     rbit(dst, size, dst);
2399   }
2400 }
2401 
2402 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2403   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2404   SIMD_Arrangement size = isQ ? T16B : T8B;
2405   switch (bt) {
2406     case T_BYTE:
2407       if (dst != src) {
2408         orr(dst, size, src, src);
2409       }
2410       break;
2411     case T_SHORT:
2412       rev16(dst, size, src);
2413       break;
2414     case T_INT:
2415       rev32(dst, size, src);
2416       break;
2417     case T_LONG:
2418       rev64(dst, size, src);
2419       break;
2420     default:
2421       assert(false, "unsupported");
2422       ShouldNotReachHere();
2423   }
2424 }
2425 
// VectorRearrange implementation for short/int/float/long/double types with NEON
// instructions. For VectorRearrange short/int/float, we use NEON tbl instruction.
// But since it supports bytes table only, we need to lookup 2/4 bytes as a group.
// For VectorRearrange long/double, we compare the shuffle input with iota indices,
// and use bsl to implement the operation.
void C2_MacroAssembler::neon_rearrange_hsd(FloatRegister dst, FloatRegister src,
                                           FloatRegister shuffle, FloatRegister tmp,
                                           BasicType bt, bool isQ) {
  assert_different_registers(dst, src, shuffle, tmp);
  SIMD_Arrangement size1 = isQ ? T16B : T8B;
  SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);

  // Here is an example that rearranges a NEON vector with 4 ints:
  // Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1]
  //   1. We assume the shuffle input is Vi int[2, 3, 0, 1].
  //   2. Multiply Vi int[2, 3, 0, 1] with constant int vector
  //      [0x04040404, 0x04040404, 0x04040404, 0x04040404], and get
  //      tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404].
  //   3. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100],
  //      and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504]
  //   4. Use Vm as index register, and use V1 as table register.
  //      Then get V2 as the result by tbl NEON instructions.
  switch (bt) {
    case T_SHORT:
      // 0x02 is the element size in bytes: it scales the element indices
      // in 'shuffle' into per-element byte base offsets for tbl.
      mov(tmp, size1, 0x02);
      mulv(dst, size2, shuffle, tmp);
      // 0x0100 adds the within-element byte offsets (0, 1) to each lane.
      mov(tmp, size2, 0x0100);
      addv(dst, size1, dst, tmp);
      tbl(dst, size1, src, 1, dst);
      break;
    case T_INT:
    case T_FLOAT:
      // Same scheme as T_SHORT, with a 4-byte element size.
      mov(tmp, size1, 0x04);
      mulv(dst, size2, shuffle, tmp);
      mov(tmp, size2, 0x03020100);
      addv(dst, size1, dst, tmp);
      tbl(dst, size1, src, 1, dst);
      break;
    case T_LONG:
    case T_DOUBLE:
      {
        // Load the iota index constants for T_LONG from the stub table.
        int idx = vector_iota_entry_index(T_LONG);
        lea(rscratch1,
            ExternalAddress(StubRoutines::aarch64::vector_iota_indices(idx)));
        ldrq(tmp, rscratch1);
        // Check whether the input "shuffle" is the same with iota indices.
        // Return "src" if true, otherwise swap the two elements of "src".
        cm(EQ, dst, size2, shuffle, tmp);
        ext(tmp, size1, src, src, 8);
        bsl(dst, size1, src, tmp);
      }
      break;
    default:
      assert(false, "unsupported element type");
      ShouldNotReachHere();
  }
}
2483 
// Extract a scalar element from an sve vector at position 'idx'.
// The input elements in src are expected to be of integral type.
// Sub-word elements (T_BYTE/T_SHORT) are sign-extended via smov, matching
// signed Java byte/short semantics; T_INT/T_LONG use umov.
void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
                                             int idx, FloatRegister vtmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
    // The element lies within the low 128 bits, which the NEON lane-move
    // instructions can address directly.
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, src, size, idx);
    } else {
      smov(dst, src, size, idx);
    }
  } else {
    // The element is above the NEON-reachable 128 bits: copy src, shift the
    // wanted element down to lane 0 with a vector extract, then move it out.
    sve_orr(vtmp, src, src);
    // 'idx << size' is the element's byte offset within the vector
    // (the SIMD_RegVariant value encodes log2 of the element size in bytes).
    sve_ext(vtmp, vtmp, idx << size);
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, vtmp, size, 0);
    } else {
      smov(dst, vtmp, size, 0);
    }
  }
}
2506 
2507 // java.lang.Math::round intrinsics
2508 
// Clobbers: rscratch1, rflags
//
// Vectorized Math.round for float (T2S/T4S) and double (T2D) lanes.
// fcvtas (round, ties away from zero) gives the Math.round result except
// for inputs where the floor(src + 0.5) definition diverges; those lanes
// are patched with an explicitly computed floor(src + 0.5). The constants
// 0x1.0p23f / 0x1.0p52 are the smallest positive values whose ulp is 1.0,
// i.e. from which on every representable value is already integral.
void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                          FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
  assert_different_registers(tmp1, tmp2, tmp3, src, dst);
  switch (T) {
    case T2S:
    case T4S:
      fmovs(tmp1, T, 0.5f);
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case T2D:
      fmovd(tmp1, T, 0.5);
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
  }
  fadd(tmp1, T, tmp1, src);
  fcvtms(tmp1, T, tmp1);
  // tmp1 = floor(src + 0.5, ties to even)

  fcvtas(dst, T, src);
  // dst = round(src), ties to away

  // Build a per-lane selection mask: compare the integer bit pattern of
  // -src against the threshold in rscratch1 (unsigned HS compare).
  fneg(tmp3, T, src);
  dup(tmp2, T, rscratch1);
  cm(HS, tmp3, T, tmp3, tmp2);
  // tmp3 is now a set of flags

  // bif: keep dst bits where the mask is set, insert tmp1 bits where clear.
  bif(dst, T16B, tmp1, tmp3);
  // result in dst
}
2541 
// Clobbers: rscratch1, rflags
//
// SVE version of the Math.round intrinsic; see vector_round_neon above for
// the algorithm. sve_frinta rounds ties away from zero; lanes where that can
// differ from floor(src + 0.5) are selected into pgtmp and recomputed under
// that predicate, then the whole vector is converted to integers.
void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                         FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(tmp1, tmp2, src, dst);

  switch (T) {
    case S:
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case D:
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == S || T == D, "invalid register variant");
  }

  sve_frinta(dst, T, ptrue, src);
  // dst = round(src), ties to away

  Label none;

  // Select the lanes needing the floor(src + 0.5) fixup: compare (unsigned
  // HS, on integer bit patterns) the threshold against -src per lane.
  sve_fneg(tmp1, T, ptrue, src);
  sve_dup(tmp2, T, rscratch1);
  sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
  // EQ here means no active lane matched, so the fixup can be skipped.
  br(EQ, none);
  {
    sve_cpy(tmp1, T, pgtmp, 0.5);
    sve_fadd(tmp1, T, pgtmp, src);
    sve_frintm(dst, T, pgtmp, tmp1);
    // dst = floor(src + 0.5, ties to even)
  }
  bind(none);

  // Convert the rounded floating-point values to integers.
  sve_fcvtzs(dst, T, ptrue, dst, T);
  // result in dst
}
2579 
// Vectorized Math.signum with NEON: lanes with |src| > 0 produce +/-1.0
// (sign taken from src); +/-0.0 and NaN lanes pass src through unchanged.
// 'zero' must hold 0.0 and 'one' must hold 1.0 in every lane.
void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                           FloatRegister one, SIMD_Arrangement T) {
  assert_different_registers(dst, src, zero, one);
  assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");

  facgt(dst, T, src, zero);
  ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
  // Bit select: where the mask is set take the magnitude bits from 'one',
  // everywhere else (sign bit, and all bits of zero/NaN lanes) from 'src'.
  bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
}
2589 
// SVE counterpart of vector_signum_neon: +/-1.0 for lanes with |src| > 0,
// src passed through for +/-0.0 and NaN lanes.
// NOTE(review): the 'one' register is asserted distinct but not referenced
// below; the +/-1.0 value is constructed in 'vtmp' instead.
void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                          FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
    assert_different_registers(dst, src, zero, one, vtmp);
    assert(pgtmp->is_governing(), "This register has to be a governing predicate register");

    sve_orr(vtmp, src, src);
    sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pmtp=0 for +-0.0 and NaN. 0x1 otherwise
    switch (T) {
    case S:
      sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
      sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
                                        // on the sign of the float value
      break;
    case D:
      // Same as the S case, with the double-precision sign bit and 1.0.
      sve_and(vtmp, T, min_jlong);
      sve_orr(vtmp, T, jlong_cast(1.0));
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
    }
    sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
                                       // Result in dst
}
2614 
2615 bool C2_MacroAssembler::in_scratch_emit_size() {
2616   if (ciEnv::current()->task() != nullptr) {
2617     PhaseOutput* phase_output = Compile::current()->output();
2618     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2619       return true;
2620     }
2621   }
2622   return MacroAssembler::in_scratch_emit_size();
2623 }
2624 
// Called from generated code when a CastII range check fails; aborts the VM
// reporting the node index, the offending value and the violated [lo, hi]
// bounds. Does not return.
static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
  fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
}
2628 
// Emits a runtime check that the 32-bit value in 'rval' lies within the
// range [t->_lo, t->_hi] of the CastII node identified by 'idx'. Falls
// through on success; on failure calls abort_verify_int_in_range, which
// aborts the VM. Clobbers rtmp, rflags and, on the failure path,
// c_rarg0..c_rarg3.
void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register rval, Register rtmp) {
  assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
  if (t == TypeInt::INT) {
    // The full int range can never fail; emit nothing.
    return;
  }

  BLOCK_COMMENT("verify_int_in_range {");
  Label L_success, L_failure;

  jint lo = t->_lo;
  jint hi = t->_hi;

  // Emit only the bound checks that can actually fail.
  if (lo != min_jint) {
    subsw(rtmp, rval, lo);
    br(Assembler::LT, L_failure); // rval < lo (signed)
  }
  if (hi != max_jint) {
    subsw(rtmp, rval, hi);
    br(Assembler::GT, L_failure); // rval > hi (signed)
  }
  b(L_success);

  bind(L_failure);
  // Pass (idx, val, lo, hi) to the abort helper and call into the VM.
  movw(c_rarg0, idx);
  mov(c_rarg1, rval);
  movw(c_rarg2, lo);
  movw(c_rarg3, hi);
  reconstruct_frame_pointer(rtmp);
  rt_call(CAST_FROM_FN_PTR(address, abort_verify_int_in_range), rtmp);
  hlt(0); // should never be reached; the helper calls fatal()

  bind(L_success);
  BLOCK_COMMENT("} verify_int_in_range");
}
2663 
// Called from generated code when a CastLL range check fails; aborts the VM
// reporting the node index, the offending value and the violated [lo, hi]
// bounds. Does not return.
static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
  fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
}
2667 
// Emits a runtime check that the 64-bit value in 'rval' lies within the
// range [t->_lo, t->_hi] of the CastLL node identified by 'idx'. Falls
// through on success; on failure calls abort_verify_long_in_range, which
// aborts the VM. Clobbers rtmp, rflags and, on the failure path,
// c_rarg0..c_rarg3.
void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register rval, Register rtmp) {
  assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
  if (t == TypeLong::LONG) {
    // The full long range can never fail; emit nothing.
    return;
  }

  BLOCK_COMMENT("verify_long_in_range {");
  Label L_success, L_failure;

  jlong lo = t->_lo;
  jlong hi = t->_hi;

  // Emit only the bound checks that can actually fail.
  if (lo != min_jlong) {
    subs(rtmp, rval, lo);
    br(Assembler::LT, L_failure); // rval < lo (signed)
  }
  if (hi != max_jlong) {
    subs(rtmp, rval, hi);
    br(Assembler::GT, L_failure); // rval > hi (signed)
  }
  b(L_success);

  bind(L_failure);
  // Pass (idx, val, lo, hi) to the abort helper and call into the VM.
  movw(c_rarg0, idx);
  mov(c_rarg1, rval);
  mov(c_rarg2, lo);
  mov(c_rarg3, hi);
  reconstruct_frame_pointer(rtmp);
  rt_call(CAST_FROM_FN_PTR(address, abort_verify_long_in_range), rtmp);
  hlt(0); // should never be reached; the helper calls fatal()

  bind(L_success);
  BLOCK_COMMENT("} verify_long_in_range");
}
2702 
// Ensures rfp holds a valid frame pointer for the current compiled frame.
// With PreserveFramePointer, rfp is trusted (and verified against the
// expected value in debug builds); otherwise rfp is recomputed from sp and
// the compiled frame size. Clobbers rtmp and rflags in the debug-build
// verification path only.
void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
  const int framesize = Compile::current()->output()->frame_size_in_bytes();
  if (PreserveFramePointer) {
    // frame pointer is valid
#ifdef ASSERT
    // Verify frame pointer value in rfp.
    // Expected value: sp + framesize - 2 * wordSize, i.e. the frame top
    // minus two saved words (presumably the saved rfp/lr pair - confirm
    // against the frame layout in aarch64.ad).
    add(rtmp, sp, framesize - 2 * wordSize);
    Label L_success;
    cmp(rfp, rtmp);
    br(Assembler::EQ, L_success);
    stop("frame pointer mismatch");
    bind(L_success);
#endif // ASSERT
  } else {
    add(rfp, sp, framesize - 2 * wordSize);
  }
}
2720 
// Selects elements from two source vectors (src1, src2) based on index values in the index register
// using Neon instructions and places it in the destination vector element corresponding to the
// index vector element. Each index in the index register must be in the range - [0, 2 * NUM_ELEM),
// where NUM_ELEM is the number of BasicType elements per vector.
// If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
// Otherwise, selects src2[idx - NUM_ELEM]
void C2_MacroAssembler::select_from_two_vectors_neon(FloatRegister dst, FloatRegister src1,
                                                     FloatRegister src2, FloatRegister index,
                                                     FloatRegister tmp, unsigned vector_length_in_bytes) {
  assert_different_registers(dst, src1, src2, tmp);
  SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;

  if (vector_length_in_bytes == 16) {
    assert(UseSVE <= 1, "sve must be <= 1");
    assert(src1->successor() == src2, "Source registers must be ordered");
    // If the vector length is 16B, then use the Neon "tbl" instruction with two vector table
    tbl(dst, size, src1, 2, index);
  } else { // vector length == 8
    assert(UseSVE == 0, "must be Neon only");
    // We need to fit both the source vectors (src1, src2) in a 128-bit register because the
    // Neon "tbl" instruction supports only looking up 16B vectors. We then use the Neon "tbl"
    // instruction with one vector lookup
    // tmp = src2:src1 (src1 in the low 64 bits, src2 in the high 64 bits)
    ins(tmp, D, src1, 0, 0);
    ins(tmp, D, src2, 1, 0);
    tbl(dst, size, tmp, 1, index);
  }
}
2748 
// Selects elements from two source vectors (src1, src2) based on index values in the index register
// using SVE/SVE2 instructions and places it in the destination vector element corresponding to the
// index vector element. Each index in the index register must be in the range - [0, 2 * NUM_ELEM),
// where NUM_ELEM is the number of BasicType elements per vector.
// If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
// Otherwise, selects src2[idx - NUM_ELEM]
void C2_MacroAssembler::select_from_two_vectors_sve(FloatRegister dst, FloatRegister src1,
                                                    FloatRegister src2, FloatRegister index,
                                                    FloatRegister tmp, SIMD_RegVariant T,
                                                    unsigned vector_length_in_bytes) {
  assert_different_registers(dst, src1, src2, index, tmp);

  if (vector_length_in_bytes == 8) {
    // We need to fit both the source vectors (src1, src2) in a single vector register because the
    // SVE "tbl" instruction is unpredicated and works on the entire vector which can lead to
    // incorrect results if each source vector is only partially filled. We then use the SVE "tbl"
    // instruction with one vector lookup
    assert(UseSVE >= 1, "sve must be >= 1");
    // tmp = src2:src1 (src1 in the low 64 bits, src2 in the high 64 bits)
    ins(tmp, D, src1, 0, 0);
    ins(tmp, D, src2, 1, 0);
    sve_tbl(dst, T, tmp, index);
  } else {  // UseSVE == 2 and vector_length_in_bytes > 8
    // If the vector length is > 8, then use the SVE2 "tbl" instruction with the two vector table.
    // The assertion - vector_length_in_bytes == MaxVectorSize ensures that this operation
    // is not executed on machines where vector_length_in_bytes < MaxVectorSize
    // with the only exception of 8B vector length.
    assert(UseSVE == 2 && vector_length_in_bytes == MaxVectorSize, "must be");
    assert(src1->successor() == src2, "Source registers must be ordered");
    sve_tbl(dst, T, src1, src2, index);
  }
}
2780 
// Dispatcher for selecting elements from the concatenation of two vectors by
// per-lane indices: routes to the SVE or NEON implementation based on UseSVE
// and the vector length, and for non-byte NEON cases first converts the
// per-element indices into per-byte tbl offsets.
void C2_MacroAssembler::select_from_two_vectors(FloatRegister dst, FloatRegister src1,
                                                FloatRegister src2, FloatRegister index,
                                                FloatRegister tmp, BasicType bt,
                                                unsigned vector_length_in_bytes) {

  assert_different_registers(dst, src1, src2, index, tmp);

  // The cases that can reach this method are -
  // - UseSVE = 0/1, vector_length_in_bytes = 8 or 16, excluding double and long types
  // - UseSVE = 2, vector_length_in_bytes >= 8, for all types
  //
  // SVE/SVE2 tbl instructions are generated when UseSVE = 1 with vector_length_in_bytes = 8
  // and UseSVE = 2 with vector_length_in_bytes >= 8
  //
  // Neon instructions are generated when UseSVE = 0 with vector_length_in_bytes = 8 or 16 and
  // UseSVE = 1 with vector_length_in_bytes = 16

  if ((UseSVE == 1 && vector_length_in_bytes == 8) || UseSVE == 2) {
    SIMD_RegVariant T = elemType_to_regVariant(bt);
    select_from_two_vectors_sve(dst, src1, src2, index, tmp, T, vector_length_in_bytes);
    return;
  }

  // The only BasicTypes that can reach here are T_SHORT, T_BYTE, T_INT and T_FLOAT
  assert(bt != T_DOUBLE && bt != T_LONG, "unsupported basic type");
  assert(vector_length_in_bytes <= 16, "length_in_bytes must be <= 16");

  bool isQ = vector_length_in_bytes == 16;

  SIMD_Arrangement size1 = isQ ? T16B : T8B;
  SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);

  // Neon "tbl" instruction only supports byte tables, so we need to look at chunks of
  // 2B for selecting shorts or chunks of 4B for selecting ints/floats from the table.
  // The index values in "index" register are in the range of [0, 2 * NUM_ELEM) where NUM_ELEM
  // is the number of elements that can fit in a vector. For ex. for T_SHORT with 64-bit vector length,
  // the indices can range from [0, 8).
  // As an example with 64-bit vector length and T_SHORT type - let index = [2, 5, 1, 0]
  // Move a constant 0x02 in every byte of tmp - tmp = [0x0202, 0x0202, 0x0202, 0x0202]
  // Multiply index vector with tmp to yield - dst = [0x0404, 0x0a0a, 0x0202, 0x0000]
  // Move a constant 0x0100 in every 2B of tmp - tmp = [0x0100, 0x0100, 0x0100, 0x0100]
  // Add the multiplied result to the vector in tmp to obtain the byte level
  // offsets - dst = [0x0504, 0x0b0a, 0x0302, 0x0100]
  // Use these offsets in the "tbl" instruction to select chunks of 2B.

  if (bt == T_BYTE) {
    // Byte indices are already byte offsets; no pre-processing needed.
    select_from_two_vectors_neon(dst, src1, src2, index, tmp, vector_length_in_bytes);
  } else {
    int elem_size = (bt == T_SHORT) ? 2 : 4;
    uint64_t tbl_offset = (bt == T_SHORT) ? 0x0100u : 0x03020100u;

    mov(tmp, size1, elem_size);
    mulv(dst, size2, index, tmp);
    mov(tmp, size2, tbl_offset);
    addv(dst, size1, dst, tmp); // "dst" now contains the processed index elements
                                // to select a set of 2B/4B
    select_from_two_vectors_neon(dst, src1, src2, dst, tmp, vector_length_in_bytes);
  }
}
2840 
2841 // Vector expand implementation. Elements from the src vector are expanded into
2842 // the dst vector under the control of the vector mask.
2843 // Since there are no native instructions directly corresponding to expand before
2844 // SVE2p2, the following implementations mainly leverages the TBL instruction to
2845 // implement expand. To compute the index input for TBL, the prefix sum algorithm
2846 // (https://en.wikipedia.org/wiki/Prefix_sum) is used. The same algorithm is used
2847 // for NEON and SVE, but with different instructions where appropriate.
2848 
// Vector expand implementation for NEON.
//
// An example of 128-bit Byte vector:
//   Data direction: high <== low
//   Input:
//         src   = g  f  e  d  c  b  a  9  8  7  6  5  4  3  2  1
//         mask  = 0  0 -1 -1  0  0 -1 -1  0  0 -1 -1  0  0 -1 -1
//   Expected result:
//         dst   = 0  0  8  7  0  0  6  5  0  0  4  3  0  0  2  1
void C2_MacroAssembler::vector_expand_neon(FloatRegister dst, FloatRegister src, FloatRegister mask,
                                           FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
                                           int vector_length_in_bytes) {
  assert(vector_length_in_bytes <= 16, "the vector length in bytes for NEON must be <= 16");
  assert_different_registers(dst, src, mask, tmp1, tmp2);
  // Since the TBL instruction only supports byte table, we need to
  // compute indices in byte type for all types.
  SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
  // tmp1 =  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  dup(tmp1, size, zr);
  // dst  =  0  0  1  1  0  0  1  1  0  0  1  1  0  0  1  1
  negr(dst, size, mask);
  // Calculate vector index for TBL with prefix sum algorithm.
  // Each round shifts the partial sums up by i bytes (zero-filling from
  // tmp1) and adds; the shift distance doubles every round, so log2(n)
  // rounds produce the full inclusive prefix sum.
  // dst  =  8  8  8  7  6  6  6  5  4  4  4  3  2  2  2  1
  for (int i = 1; i < vector_length_in_bytes; i <<= 1) {
    ext(tmp2, size, tmp1, dst, vector_length_in_bytes - i);
    addv(dst, size, tmp2, dst);
  }
  // tmp2 =  0  0 -1 -1  0  0 -1 -1  0  0 -1 -1  0  0 -1 -1
  orr(tmp2, size, mask, mask);
  // tmp2 =  0  0  8  7  0  0  6  5  0  0  4  3  0  0  2  1
  bsl(tmp2, size, dst, tmp1);
  // tmp1 =  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  movi(tmp1, size, 1);
  // Turn the 1-based prefix sums into 0-based tbl indices; inactive lanes
  // become -1, which is out of range and makes tbl write a zero byte.
  // dst  = -1 -1  7  6 -1 -1  5  4 -1 -1  3  2 -1 -1  1  0
  subv(dst, size, tmp2, tmp1);
  // dst  =  0  0  8  7  0  0  6  5  0  0  4  3  0  0  2  1
  tbl(dst, size, src, 1, dst);
}
2887 
// Vector expand implementation for SVE.
//
// An example of 128-bit Short vector:
//   Data direction: high <== low
//   Input:
//         src   = gf ed cb a9 87 65 43 21
//         pg    = 00 01 00 01 00 01 00 01
//   Expected result:
//         dst   = 00 87 00 65 00 43 00 21
void C2_MacroAssembler::vector_expand_sve(FloatRegister dst, FloatRegister src, PRegister pg,
                                          FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
                                          int vector_length_in_bytes) {
  assert(UseSVE > 0, "expand implementation only for SVE");
  assert_different_registers(dst, src, tmp1, tmp2);
  SIMD_RegVariant size = elemType_to_regVariant(bt);

  // tmp1 = 00 00 00 00 00 00 00 00
  sve_dup(tmp1, size, 0);
  sve_movprfx(tmp2, tmp1);
  // tmp2 = 00 01 00 01 00 01 00 01  (1 in active lanes, 0 elsewhere)
  sve_cpy(tmp2, size, pg, 1, true);
  // Calculate vector index for TBL with prefix sum algorithm.
  // Each round shifts the partial sums up (zero-filled from tmp1) and adds,
  // doubling the shift distance per round.
  // tmp2 = 04 04 03 03 02 02 01 01
  for (int i = type2aelembytes(bt); i < vector_length_in_bytes; i <<= 1) {
    sve_movprfx(dst, tmp1);
    // The EXT instruction operates on the full-width sve register. The correct
    // index calculation method is:
    // vector_length_in_bytes - i + MaxVectorSize - vector_length_in_bytes =>
    // MaxVectorSize - i.
    sve_ext(dst, tmp2, MaxVectorSize - i);
    sve_add(tmp2, size, dst, tmp2);
  }
  // Keep the prefix sums in active lanes, zero in inactive lanes.
  // dst  = 00 04 00 03 00 02 00 01
  sve_sel(dst, size, pg, tmp2, tmp1);
  // Make the indices 0-based; inactive lanes underflow to -1.
  // dst  = -1 03 -1 02 -1 01 -1 00
  sve_sub(dst, size, 1);
  // dst  = 00 87 00 65 00 43 00 21
  sve_tbl(dst, size, src, dst);
}
2927 
2928 // Optimized SVE cpy (imm, zeroing) instruction.
2929 //
2930 // `movi; cpy(imm, merging)` and `cpy(imm, zeroing)` have the same
2931 // functionality, but test results show that `movi; cpy(imm, merging)` has
2932 // higher throughput on some microarchitectures. This would depend on
2933 // microarchitecture and so may vary between implementations.
2934 void C2_MacroAssembler::sve_cpy(FloatRegister dst, SIMD_RegVariant T,
2935                                 PRegister pg, int imm8, bool isMerge) {
2936   if (VM_Version::prefer_sve_merging_mode_cpy() && !isMerge) {
2937     // Generates a NEON instruction `movi V<dst>.2d, #0`.
2938     // On AArch64, Z and V registers alias in the low 128 bits, so V<dst> is
2939     // the low 128 bits of Z<dst>. A write to V<dst> also clears all bits of
2940     // Z<dst> above 128, so this `movi` instruction effectively zeroes the
2941     // entire Z<dst> register. According to the Arm Software Optimization
2942     // Guide, `movi` is zero latency.
2943     movi(dst, T2D, 0);
2944     isMerge = true;
2945   }
2946   Assembler::sve_cpy(dst, T, pg, imm8, isMerge);
2947 }
2948 
2949 int C2_MacroAssembler::vector_iota_entry_index(BasicType bt) {
2950   // The vector iota entries array is ordered by type B/S/I/L/F/D, and
2951   // the offset between two types is 16.
2952   switch(bt) {
2953   case T_BYTE:
2954     return 0;
2955   case T_SHORT:
2956     return 1;
2957   case T_INT:
2958     return 2;
2959   case T_LONG:
2960     return 3;
2961   case T_FLOAT:
2962     return 4;
2963   case T_DOUBLE:
2964     return 5;
2965   default:
2966     ShouldNotReachHere();
2967   }
2968 }