1 /*
   2  * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright 2026 Arm Limited and/or its affiliates.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "opto/c2_MacroAssembler.hpp"
  29 #include "opto/compile.hpp"
  30 #include "opto/intrinsicnode.hpp"
  31 #include "opto/matcher.hpp"
  32 #include "opto/output.hpp"
  33 #include "opto/subnode.hpp"
  34 #include "runtime/objectMonitorTable.hpp"
  35 #include "runtime/stubRoutines.hpp"
  36 #include "runtime/synchronizer.hpp"
  37 #include "utilities/globalDefinitions.hpp"
  38 #include "utilities/powerOfTwo.hpp"
  39 
// Helper macros for annotating the generated code.
//
// BLOCK_COMMENT(str): expands to nothing in PRODUCT builds; otherwise forwards
//                     to block_comment(str).
// STOP(error):        emits stop(error); in non-PRODUCT builds the message is
//                     first passed to block_comment() as well.
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
// NOTE: expands to two statements, so it must not be used as the unbraced body
// of an if/else/loop.
#define STOP(error) block_comment(error); stop(error)
#endif

// Binds the label and then emits a "<label>:" BLOCK_COMMENT (a no-op in PRODUCT
// builds). NOTE: expands to more than one statement; do not use it as the
// unbraced body of an if/else/loop.
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  49 
  50 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
  51 
  52 // jdk.internal.util.ArraysSupport.vectorizedHashCode
  53 address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
  54                                            FloatRegister vdata0, FloatRegister vdata1,
  55                                            FloatRegister vdata2, FloatRegister vdata3,
  56                                            FloatRegister vmul0, FloatRegister vmul1,
  57                                            FloatRegister vmul2, FloatRegister vmul3,
  58                                            FloatRegister vpow, FloatRegister vpowm,
  59                                            BasicType eltype) {
  60   ARRAYS_HASHCODE_REGISTERS;
  61 
  62   Register tmp1 = rscratch1, tmp2 = rscratch2;
  63 
  64   Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE;
  65 
  66   // Vectorization factor. Number of array elements loaded to one SIMD&FP registers by the stubs. We
  67   // use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible to
  68   // use 4H for chars and shorts instead, but using 8H gives better performance.
  69   const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8
  70                     : eltype == T_CHAR || eltype == T_SHORT ? 8
  71                     : eltype == T_INT                       ? 4
  72                                                             : 0;
  73   guarantee(vf, "unsupported eltype");
  74 
  75   // Unroll factor for the scalar loop below. The value is chosen based on performance analysis.
  76   const size_t unroll_factor = 4;
  77 
  78   switch (eltype) {
  79   case T_BOOLEAN:
  80     BLOCK_COMMENT("arrays_hashcode(unsigned byte) {");
  81     break;
  82   case T_CHAR:
  83     BLOCK_COMMENT("arrays_hashcode(char) {");
  84     break;
  85   case T_BYTE:
  86     BLOCK_COMMENT("arrays_hashcode(byte) {");
  87     break;
  88   case T_SHORT:
  89     BLOCK_COMMENT("arrays_hashcode(short) {");
  90     break;
  91   case T_INT:
  92     BLOCK_COMMENT("arrays_hashcode(int) {");
  93     break;
  94   default:
  95     ShouldNotReachHere();
  96   }
  97 
  98   // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop
  99   // implemented by the stub executes just once. Call the stub only if at least two iterations will
 100   // be executed.
 101   const size_t large_threshold = eltype == T_INT ? vf * 2 : vf;
 102   cmpw(cnt, large_threshold);
 103   br(Assembler::HS, LARGE);
 104 
 105   bind(TAIL);
 106 
 107   // The andr performs cnt % uf where uf = unroll_factor. The subtract shifted by 3 offsets past
 108   // uf - (cnt % uf) pairs of load + madd insns i.e. it only executes cnt % uf load + madd pairs.
 109   // Iteration eats up the remainder, uf elements at a time.
 110   assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC");
 111   andr(tmp2, cnt, unroll_factor - 1);
 112   adr(tmp1, BR_BASE);
 113   // For Cortex-A53 offset is 4 because 2 nops are generated.
 114   sub(tmp1, tmp1, tmp2, ext::sxtw, VM_Version::supports_a53mac() ? 4 : 3);
 115   movw(tmp2, 0x1f);
 116   br(tmp1);
 117 
 118   bind(LOOP);
 119   for (size_t i = 0; i < unroll_factor; ++i) {
 120     load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype);
 121     maddw(result, result, tmp2, tmp1);
 122     // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
 123     // Generate 2nd nop to have 4 instructions per iteration.
 124     if (VM_Version::supports_a53mac()) {
 125       nop();
 126     }
 127   }
 128   bind(BR_BASE);
 129   subsw(cnt, cnt, unroll_factor);
 130   br(Assembler::HS, LOOP);
 131 
 132   b(DONE);
 133 
 134   bind(LARGE);
 135 
 136   RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype));
 137   assert(stub.target() != nullptr, "array_hashcode stub has not been generated");
 138   address tpc = trampoline_call(stub);
 139   if (tpc == nullptr) {
 140     DEBUG_ONLY(reset_labels(TAIL, BR_BASE));
 141     postcond(pc() == badAddress);
 142     return nullptr;
 143   }
 144 
 145   bind(DONE);
 146 
 147   BLOCK_COMMENT("} // arrays_hashcode");
 148 
 149   postcond(pc() != badAddress);
 150   return pc();
 151 }
 152 
// Emits the C2 fast path for locking obj.
//
// Parameters:
//   obj        - register holding the object to lock
//   box        - register holding the address of the BasicLock; when
//                UseObjectMonitorTable is set, its object-monitor cache slot is
//                cleared on entry and filled with the monitor if an inflated
//                monitor is acquired here
//   t1, t2, t3 - temporary registers, clobbered
//
// rscratch2 is used as an additional temporary and must differ from all of the
// registers above.
//
// The result is reported through the condition flags: EQ when the lock was
// acquired on the fast path, NE when the caller must take the slow path.
void C2_MacroAssembler::fast_lock(Register obj, Register box, Register t1,
                                  Register t2, Register t3) {
  assert_different_registers(obj, box, t1, t2, t3, rscratch2);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. MUST branch to with flag == EQ
  Label locked;
  // Finish fast lock unsuccessfully. MUST branch to with flag == NE
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Objects of value-based classes always go to the slow path. The tst leaves
    // flag == NE when the value-based bit is set.
    load_klass(t1, obj);
    ldrb(t1, Address(t1, Klass::misc_flags_offset()));
    tst(t1, KlassFlags::_misc_is_value_based_class);
    br(Assembler::NE, slow_path);
  }

  const Register t1_mark = t1;
  const Register t3_t = t3;

  { // Fast locking

    // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ
    Label push;

    const Register t2_top = t2;

    // Check if lock-stack is full.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
    br(Assembler::GT, slow_path);

    // Check if recursive.
    // Compare obj with the entry just below the current top of the lock-stack.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, push);

    // Relaxed normal load to check for monitor. Optimization for monitor case.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);

    // Not inflated
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");

    // Try to lock. Transition lock-bits 0b01 => 0b00
    // t1_mark becomes the expected (unlocked) mark, t3_t the new mark with the
    // unlocked bit cleared.
    orr(t1_mark, t1_mark, markWord::unlocked_value);
    eor(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
    br(Assembler::NE, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    str(obj, Address(rthread, t2_top));
    addw(t2_top, t2_top, oopSize);
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register t1_monitor = t1;

    if (!UseObjectMonitorTable) {
      // The mark word loaded above is used as the (tagged) monitor pointer; the
      // tag is compensated for via monitor_tag below.
      assert(t1_monitor == t1_mark, "should be the same here");
    } else {
      const Register t1_hash = t1;
      Label monitor_found;

      // Save the mark, we might need it to extract the hash.
      mov(t3, t1_mark);

      // Look for the monitor in the om_cache.

      // Unrolled probe of every cache entry: load the cached monitor and the
      // cached oop, and stop at the first entry whose oop equals obj.
      ByteSize cache_offset   = JavaThread::om_cache_oops_offset();
      ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
      const int num_unrolled  = OMCache::CAPACITY;
      for (int i = 0; i < num_unrolled; i++) {
        ldr(t1_monitor, Address(rthread, cache_offset + monitor_offset));
        ldr(t2, Address(rthread, cache_offset));
        cmp(obj, t2);
        br(Assembler::EQ, monitor_found);
        cache_offset = cache_offset + OMCache::oop_to_oop_difference();
      }

      if (UseCompactObjectHeaders) {
        // TODO: The fast-path table lookup currently doesn't work with Lilliput's
        // compact identity-hashcode implementation.
        // See: https://bugs.openjdk.org/browse/JDK-8380981
        // NOTE(review): flag == NE here relies on the last failed compare in the
        // loop above, i.e. on OMCache::CAPACITY being at least 1 - confirm.
        b(slow_path);
      } else {
        // Look for the monitor in the table.

        // Get the hash code.
        ubfx(t1_hash, t3, markWord::hash_shift, markWord::hash_bits);

        // Get the table and calculate the bucket's address
        lea(t3, ExternalAddress(ObjectMonitorTable::current_table_address()));
        ldr(t3, Address(t3));
        ldr(t2, Address(t3, ObjectMonitorTable::table_capacity_mask_offset()));
        ands(t1_hash, t1_hash, t2);
        ldr(t3, Address(t3, ObjectMonitorTable::table_buckets_offset()));

        // Read the monitor from the bucket.
        ldr(t1_monitor, Address(t3, t1_hash, Address::lsl(LogBytesPerWord)));

        // Check if the monitor in the bucket is special (empty, tombstone or removed).
        // LO implies the compared values differ, so flag == NE on this branch.
        cmp(t1_monitor, (unsigned char)ObjectMonitorTable::SpecialPointerValues::below_is_special);
        br(Assembler::LO, slow_path);

        // Check if object matches.
        ldr(t3, Address(t1_monitor, ObjectMonitor::object_offset()));
        BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
        bs_asm->try_peek_weak_handle_in_nmethod(this, t3, t3, t2, slow_path);
        cmp(t3, obj);
        br(Assembler::NE, slow_path);
      }
      bind(monitor_found);
    }

    const Register t2_owner_addr = t2;
    const Register t3_owner = t3;
    // Without the monitor table t1_monitor still carries the monitor tag, so the
    // tag value is subtracted from the field offsets instead of untagging.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
    const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);

    Label monitor_locked;

    // Compute owner address.
    lea(t2_owner_addr, owner_address);

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
    cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, /*acquire*/ true,
            /*release*/ false, /*weak*/ false, t3_owner);
    br(Assembler::EQ, monitor_locked);

    // Check if recursive.
    // t3_owner receives the owner value observed by the failed CAS.
    cmp(t3_owner, rscratch2);
    br(Assembler::NE, slow_path);

    // Recursive.
    increment(recursions_address, 1);

    bind(monitor_locked);
    if (UseObjectMonitorTable) {
      // Remember the monitor in the box's object-monitor cache slot.
      str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
    }
  }

  bind(locked);

#ifdef ASSERT
  // Check that locked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Lock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Lock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}
 329 
 330 void C2_MacroAssembler::fast_unlock(Register obj, Register box, Register t1,
 331                                     Register t2, Register t3) {
 332   assert_different_registers(obj, box, t1, t2, t3);
 333 
 334   // Handle inflated monitor.
 335   Label inflated, inflated_load_mark;
 336   // Finish fast unlock successfully. MUST branch to with flag == EQ
 337   Label unlocked;
 338   // Finish fast unlock unsuccessfully. MUST branch to with flag == NE
 339   Label slow_path;
 340 
 341   const Register t1_mark = t1;
 342   const Register t2_top = t2;
 343   const Register t3_t = t3;
 344 
 345   { // Fast unlock
 346 
 347     Label push_and_slow_path;
 348 
 349     // Check if obj is top of lock-stack.
 350     ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 351     subw(t2_top, t2_top, oopSize);
 352     ldr(t3_t, Address(rthread, t2_top));
 353     cmp(obj, t3_t);
 354     // Top of lock stack was not obj. Must be monitor.
 355     br(Assembler::NE, inflated_load_mark);
 356 
 357     // Pop lock-stack.
 358     DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
 359     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 360 
 361     // Check if recursive.
 362     subw(t3_t, t2_top, oopSize);
 363     ldr(t3_t, Address(rthread, t3_t));
 364     cmp(obj, t3_t);
 365     br(Assembler::EQ, unlocked);
 366 
 367     // Not recursive.
 368     // Load Mark.
 369     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 370 
 371     // Check header for monitor (0b10).
 372     // Because we got here by popping (meaning we pushed in locked)
 373     // there will be no monitor in the box. So we need to push back the obj
 374     // so that the runtime can fix any potential anonymous owner.
 375     tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);
 376 
 377     // Try to unlock. Transition lock bits 0b00 => 0b01
 378     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
 379     orr(t3_t, t1_mark, markWord::unlocked_value);
 380     cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
 381             /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
 382     br(Assembler::EQ, unlocked);
 383 
 384     bind(push_and_slow_path);
 385     // Compare and exchange failed.
 386     // Restore lock-stack and handle the unlock in runtime.
 387     DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
 388     addw(t2_top, t2_top, oopSize);
 389     str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 390     b(slow_path);
 391   }
 392 
 393 
 394   { // Handle inflated monitor.
 395     bind(inflated_load_mark);
 396     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 397 #ifdef ASSERT
 398     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 399     stop("Fast Unlock not monitor");
 400 #endif
 401 
 402     bind(inflated);
 403 
 404 #ifdef ASSERT
 405     Label check_done;
 406     subw(t2_top, t2_top, oopSize);
 407     cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
 408     br(Assembler::LT, check_done);
 409     ldr(t3_t, Address(rthread, t2_top));
 410     cmp(obj, t3_t);
 411     br(Assembler::NE, inflated);
 412     stop("Fast Unlock lock on stack");
 413     bind(check_done);
 414 #endif
 415 
 416     const Register t1_monitor = t1;
 417 
 418     if (!UseObjectMonitorTable) {
 419       assert(t1_monitor == t1_mark, "should be the same here");
 420 
 421       // Untag the monitor.
 422       add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
 423     } else {
 424       ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 425       // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
 426       cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
 427       br(Assembler::LO, slow_path);
 428     }
 429 
 430     const Register t2_recursions = t2;
 431     Label not_recursive;
 432 
 433     // Check if recursive.
 434     ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
 435     cbz(t2_recursions, not_recursive);
 436 
 437     // Recursive unlock.
 438     sub(t2_recursions, t2_recursions, 1u);
 439     str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
 440     // Set flag == EQ
 441     cmp(t2_recursions, t2_recursions);
 442     b(unlocked);
 443 
 444     bind(not_recursive);
 445 
 446     const Register t2_owner_addr = t2;
 447 
 448     // Compute owner address.
 449     lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));
 450 
 451     // Set owner to null.
 452     // Release to satisfy the JMM
 453     stlr(zr, t2_owner_addr);
 454     // We need a full fence after clearing owner to avoid stranding.
 455     // StoreLoad achieves this.
 456     membar(StoreLoad);
 457 
 458     // Check if the entry_list is empty.
 459     ldr(rscratch1, Address(t1_monitor, ObjectMonitor::entry_list_offset()));
 460     cmp(rscratch1, zr);
 461     br(Assembler::EQ, unlocked);  // If so we are done.
 462 
 463     // Check if there is a successor.
 464     ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
 465     cmp(rscratch1, zr);
 466     br(Assembler::NE, unlocked);  // If so we are done.
 467 
 468     // Save the monitor pointer in the current thread, so we can try to
 469     // reacquire the lock in SharedRuntime::monitor_exit_helper().
 470     str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));
 471 
 472     cmp(zr, rthread); // Set Flag to NE => slow path
 473     b(slow_path);
 474   }
 475 
 476   bind(unlocked);
 477   cmp(zr, zr); // Set Flags to EQ => fast path
 478 
 479 #ifdef ASSERT
 480   // Check that unlocked label is reached with Flags == EQ.
 481   Label flag_correct;
 482   br(Assembler::EQ, flag_correct);
 483   stop("Fast Unlock Flag != EQ");
 484 #endif
 485 
 486   bind(slow_path);
 487 #ifdef ASSERT
 488   // Check that slow_path label is reached with Flags == NE.
 489   br(Assembler::NE, flag_correct);
 490   stop("Fast Unlock Flag != NE");
 491   bind(flag_correct);
 492 #endif
 493   // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
 494 }
 495 
 496 // Search for str1 in str2 and return index or -1
 497 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
 498 void C2_MacroAssembler::string_indexof(Register str2, Register str1,
 499                                        Register cnt2, Register cnt1,
 500                                        Register tmp1, Register tmp2,
 501                                        Register tmp3, Register tmp4,
 502                                        Register tmp5, Register tmp6,
 503                                        int icnt1, Register result, int ae) {
 504   // NOTE: tmp5, tmp6 can be zr depending on specific method version
 505   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
 506 
 507   Register ch1 = rscratch1;
 508   Register ch2 = rscratch2;
 509   Register cnt1tmp = tmp1;
 510   Register cnt2tmp = tmp2;
 511   Register cnt1_neg = cnt1;
 512   Register cnt2_neg = cnt2;
 513   Register result_tmp = tmp4;
 514 
 515   bool isL = ae == StrIntrinsicNode::LL;
 516 
 517   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
 518   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
 519   int str1_chr_shift = str1_isL ? 0:1;
 520   int str2_chr_shift = str2_isL ? 0:1;
 521   int str1_chr_size = str1_isL ? 1:2;
 522   int str2_chr_size = str2_isL ? 1:2;
 523   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
 524                                       (chr_insn)&MacroAssembler::ldrh;
 525   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
 526                                       (chr_insn)&MacroAssembler::ldrh;
 527   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
 528   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
 529 
 530   // Note, inline_string_indexOf() generates checks:
 531   // if (substr.count > string.count) return -1;
 532   // if (substr.count == 0) return 0;
 533 
 534   // We have two strings, a source string in str2, cnt2 and a pattern string
 535   // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
 536 
 537   // For larger pattern and source we use a simplified Boyer Moore algorithm.
 538   // With a small pattern and source we use linear scan.
 539 
 540   if (icnt1 == -1) {
 541     sub(result_tmp, cnt2, cnt1);
 542     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
 543     br(LT, LINEARSEARCH);
 544     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
 545     subs(zr, cnt1, 256);
 546     lsr(tmp1, cnt2, 2);
 547     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
 548     br(GE, LINEARSTUB);
 549   }
 550 
// The Boyer Moore algorithm is based on the description here:-
 552 //
 553 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
 554 //
// This describes an algorithm with 2 shift rules. The 'Bad Character' rule
 556 // and the 'Good Suffix' rule.
 557 //
 558 // These rules are essentially heuristics for how far we can shift the
 559 // pattern along the search string.
 560 //
 561 // The implementation here uses the 'Bad Character' rule only because of the
 562 // complexity of initialisation for the 'Good Suffix' rule.
 563 //
 564 // This is also known as the Boyer-Moore-Horspool algorithm:-
 565 //
 566 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
 567 //
 568 // This particular implementation has few java-specific optimizations.
 569 //
 570 // #define ASIZE 256
 571 //
 572 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
 573 //       int i, j;
 574 //       unsigned c;
 575 //       unsigned char bc[ASIZE];
 576 //
 577 //       /* Preprocessing */
 578 //       for (i = 0; i < ASIZE; ++i)
 579 //          bc[i] = m;
 580 //       for (i = 0; i < m - 1; ) {
 581 //          c = x[i];
 582 //          ++i;
 583 //          // c < 256 for Latin1 string, so, no need for branch
 584 //          #ifdef PATTERN_STRING_IS_LATIN1
 585 //          bc[c] = m - i;
 586 //          #else
 587 //          if (c < ASIZE) bc[c] = m - i;
 588 //          #endif
 589 //       }
 590 //
 591 //       /* Searching */
 592 //       j = 0;
 593 //       while (j <= n - m) {
 594 //          c = y[i+j];
 595 //          if (x[m-1] == c)
 596 //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
 597 //          if (i < 0) return j;
 598 //          // c < 256 for Latin1 string, so, no need for branch
 599 //          #ifdef SOURCE_STRING_IS_LATIN1
 600 //          // LL case: (c< 256) always true. Remove branch
 601 //          j += bc[y[j+m-1]];
 602 //          #endif
 603 //          #ifndef PATTERN_STRING_IS_UTF
 604 //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
 605 //          if (c < ASIZE)
 606 //            j += bc[y[j+m-1]];
 607 //          else
 608 //            j += 1
 609 //          #endif
 610 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
 611 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
 612 //          if (c < ASIZE)
 613 //            j += bc[y[j+m-1]];
 614 //          else
 615 //            j += m
 616 //          #endif
 617 //       }
 618 //    }
 619 
 620   if (icnt1 == -1) {
 621     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
 622         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
 623     Register cnt1end = tmp2;
 624     Register str2end = cnt2;
 625     Register skipch = tmp2;
 626 
 627     // str1 length is >=8, so, we can read at least 1 register for cases when
 628     // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
 629     // UL case. We'll re-read last character in inner pre-loop code to have
 630     // single outer pre-loop load
 631     const int firstStep = isL ? 7 : 3;
 632 
 633     const int ASIZE = 256;
 634     const int STORED_BYTES = 32; // amount of bytes stored per instruction
 635     sub(sp, sp, ASIZE);
 636     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
 637     mov(ch1, sp);
 638     BIND(BM_INIT_LOOP);
 639       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
 640       subs(tmp5, tmp5, 1);
 641       br(GT, BM_INIT_LOOP);
 642 
 643       sub(cnt1tmp, cnt1, 1);
 644       mov(tmp5, str2);
 645       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
 646       sub(ch2, cnt1, 1);
 647       mov(tmp3, str1);
 648     BIND(BCLOOP);
 649       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
 650       if (!str1_isL) {
 651         subs(zr, ch1, ASIZE);
 652         br(HS, BCSKIP);
 653       }
 654       strb(ch2, Address(sp, ch1));
 655     BIND(BCSKIP);
 656       subs(ch2, ch2, 1);
 657       br(GT, BCLOOP);
 658 
 659       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
 660       if (str1_isL == str2_isL) {
 661         // load last 8 bytes (8LL/4UU symbols)
 662         ldr(tmp6, Address(tmp6, -wordSize));
 663       } else {
 664         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
 665         // convert Latin1 to UTF. We'll have to wait until load completed, but
 666         // it's still faster than per-character loads+checks
 667         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
 668         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
 669         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
 670         andr(tmp6, tmp6, 0xFF); // str1[N-4]
 671         orr(ch2, ch1, ch2, LSL, 16);
 672         orr(tmp6, tmp6, tmp3, LSL, 48);
 673         orr(tmp6, tmp6, ch2, LSL, 16);
 674       }
 675     BIND(BMLOOPSTR2);
 676       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 677       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
 678       if (str1_isL == str2_isL) {
 679         // re-init tmp3. It's for free because it's executed in parallel with
 680         // load above. Alternative is to initialize it before loop, but it'll
 681         // affect performance on in-order systems with 2 or more ld/st pipelines
 682         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
 683       }
 684       if (!isL) { // UU/UL case
 685         lsl(ch2, cnt1tmp, 1); // offset in bytes
 686       }
 687       cmp(tmp3, skipch);
 688       br(NE, BMSKIP);
 689       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
 690       mov(ch1, tmp6);
 691       if (isL) {
 692         b(BMLOOPSTR1_AFTER_LOAD);
 693       } else {
 694         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
 695         b(BMLOOPSTR1_CMP);
 696       }
 697     BIND(BMLOOPSTR1);
 698       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
 699       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 700     BIND(BMLOOPSTR1_AFTER_LOAD);
 701       subs(cnt1tmp, cnt1tmp, 1);
 702       br(LT, BMLOOPSTR1_LASTCMP);
 703     BIND(BMLOOPSTR1_CMP);
 704       cmp(ch1, ch2);
 705       br(EQ, BMLOOPSTR1);
 706     BIND(BMSKIP);
 707       if (!isL) {
 708         // if we've met UTF symbol while searching Latin1 pattern, then we can
 709         // skip cnt1 symbols
 710         if (str1_isL != str2_isL) {
 711           mov(result_tmp, cnt1);
 712         } else {
 713           mov(result_tmp, 1);
 714         }
 715         subs(zr, skipch, ASIZE);
 716         br(HS, BMADV);
 717       }
 718       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
 719     BIND(BMADV);
 720       sub(cnt1tmp, cnt1, 1);
 721       add(str2, str2, result_tmp, LSL, str2_chr_shift);
 722       cmp(str2, str2end);
 723       br(LE, BMLOOPSTR2);
 724       add(sp, sp, ASIZE);
 725       b(NOMATCH);
 726     BIND(BMLOOPSTR1_LASTCMP);
 727       cmp(ch1, ch2);
 728       br(NE, BMSKIP);
 729     BIND(BMMATCH);
 730       sub(result, str2, tmp5);
 731       if (!str2_isL) lsr(result, result, 1);
 732       add(sp, sp, ASIZE);
 733       b(DONE);
 734 
 735     BIND(LINEARSTUB);
 736     cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
 737     br(LT, LINEAR_MEDIUM);
 738     mov(result, zr);
 739     RuntimeAddress stub = nullptr;
 740     if (isL) {
 741       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
 742       assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
 743     } else if (str1_isL) {
 744       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
 745        assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
 746     } else {
 747       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
 748       assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
 749     }
 750     address call = trampoline_call(stub);
 751     if (call == nullptr) {
 752       DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
 753       ciEnv::current()->record_failure("CodeCache is full");
 754       return;
 755     }
 756     b(DONE);
 757   }
 758 
 759   BIND(LINEARSEARCH);
 760   {
 761     Label DO1, DO2, DO3;
 762 
 763     Register str2tmp = tmp2;
 764     Register first = tmp3;
 765 
 766     if (icnt1 == -1)
 767     {
 768         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
 769 
 770         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
 771         br(LT, DOSHORT);
 772       BIND(LINEAR_MEDIUM);
 773         (this->*str1_load_1chr)(first, Address(str1));
 774         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
 775         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
 776         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 777         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 778 
 779       BIND(FIRST_LOOP);
 780         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 781         cmp(first, ch2);
 782         br(EQ, STR1_LOOP);
 783       BIND(STR2_NEXT);
 784         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 785         br(LE, FIRST_LOOP);
 786         b(NOMATCH);
 787 
 788       BIND(STR1_LOOP);
 789         adds(cnt1tmp, cnt1_neg, str1_chr_size);
 790         add(cnt2tmp, cnt2_neg, str2_chr_size);
 791         br(GE, MATCH);
 792 
 793       BIND(STR1_NEXT);
 794         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
 795         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 796         cmp(ch1, ch2);
 797         br(NE, STR2_NEXT);
 798         adds(cnt1tmp, cnt1tmp, str1_chr_size);
 799         add(cnt2tmp, cnt2tmp, str2_chr_size);
 800         br(LT, STR1_NEXT);
 801         b(MATCH);
 802 
 803       BIND(DOSHORT);
 804       if (str1_isL == str2_isL) {
 805         cmp(cnt1, (u1)2);
 806         br(LT, DO1);
 807         br(GT, DO3);
 808       }
 809     }
 810 
 811     if (icnt1 == 4) {
 812       Label CH1_LOOP;
 813 
 814         (this->*load_4chr)(ch1, str1);
 815         sub(result_tmp, cnt2, 4);
 816         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 817         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 818 
 819       BIND(CH1_LOOP);
 820         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
 821         cmp(ch1, ch2);
 822         br(EQ, MATCH);
 823         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 824         br(LE, CH1_LOOP);
 825         b(NOMATCH);
 826       }
 827 
 828     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
 829       Label CH1_LOOP;
 830 
 831       BIND(DO2);
 832         (this->*load_2chr)(ch1, str1);
 833         if (icnt1 == 2) {
 834           sub(result_tmp, cnt2, 2);
 835         }
 836         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 837         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 838       BIND(CH1_LOOP);
 839         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 840         cmp(ch1, ch2);
 841         br(EQ, MATCH);
 842         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 843         br(LE, CH1_LOOP);
 844         b(NOMATCH);
 845     }
 846 
 847     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
 848       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
 849 
 850       BIND(DO3);
 851         (this->*load_2chr)(first, str1);
 852         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
 853         if (icnt1 == 3) {
 854           sub(result_tmp, cnt2, 3);
 855         }
 856         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 857         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 858       BIND(FIRST_LOOP);
 859         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 860         cmpw(first, ch2);
 861         br(EQ, STR1_LOOP);
 862       BIND(STR2_NEXT);
 863         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 864         br(LE, FIRST_LOOP);
 865         b(NOMATCH);
 866 
 867       BIND(STR1_LOOP);
 868         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
 869         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 870         cmp(ch1, ch2);
 871         br(NE, STR2_NEXT);
 872         b(MATCH);
 873     }
 874 
 875     if (icnt1 == -1 || icnt1 == 1) {
 876       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
 877 
 878       BIND(DO1);
 879         (this->*str1_load_1chr)(ch1, str1);
 880         cmp(cnt2, (u1)8);
 881         br(LT, DO1_SHORT);
 882 
 883         sub(result_tmp, cnt2, 8/str2_chr_size);
 884         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 885         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 886         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 887 
 888         if (str2_isL) {
 889           orr(ch1, ch1, ch1, LSL, 8);
 890         }
 891         orr(ch1, ch1, ch1, LSL, 16);
 892         orr(ch1, ch1, ch1, LSL, 32);
 893       BIND(CH1_LOOP);
 894         ldr(ch2, Address(str2, cnt2_neg));
 895         eor(ch2, ch1, ch2);
 896         sub(tmp1, ch2, tmp3);
 897         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 898         bics(tmp1, tmp1, tmp2);
 899         br(NE, HAS_ZERO);
 900         adds(cnt2_neg, cnt2_neg, 8);
 901         br(LT, CH1_LOOP);
 902 
 903         cmp(cnt2_neg, (u1)8);
 904         mov(cnt2_neg, 0);
 905         br(LT, CH1_LOOP);
 906         b(NOMATCH);
 907 
 908       BIND(HAS_ZERO);
 909         rev(tmp1, tmp1);
 910         clz(tmp1, tmp1);
 911         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
 912         b(MATCH);
 913 
 914       BIND(DO1_SHORT);
 915         mov(result_tmp, cnt2);
 916         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
 917         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
 918       BIND(DO1_LOOP);
 919         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 920         cmpw(ch1, ch2);
 921         br(EQ, MATCH);
 922         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 923         br(LT, DO1_LOOP);
 924     }
 925   }
 926   BIND(NOMATCH);
 927     mov(result, -1);
 928     b(DONE);
 929   BIND(MATCH);
 930     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
 931   BIND(DONE);
 932 }
 933 
 934 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); // character load emitter; string_compare binds it to ldrb or ldrh
 935 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);         // character extend emitter; string_compare binds it to uxtbw or uxthw
 936 
 937 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
 938                                             Register ch, Register result,
 939                                             Register tmp1, Register tmp2, Register tmp3)
 940 {
       // Emits code that scans the cnt1 16-bit characters starting at str1 for the
       // character in ch. result receives the index of the first match, or -1 if
       // there is none (including cnt1 == 0).
       //
       // The generated code overwrites str1, cnt1 (reused as cnt1_neg), tmp1, tmp2,
       // tmp3, rscratch1, rscratch2 and the condition flags; ch is overwritten on
       // the word-at-a-time path.
 941   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
 942   Register cnt1_neg = cnt1;
 943   Register ch1 = rscratch1;
 944   Register result_tmp = rscratch2;
 945
 946   cbz(cnt1, NOMATCH);
 947
       // Fewer than 4 characters: compare one character at a time.
 948   cmp(cnt1, (u1)4);
 949   br(LT, DO1_SHORT);
 950
       // Replicate the character into all four halfwords of ch.
       // NOTE(review): assumes the bits of ch above bit 15 are zero on entry -
       // confirm with the callers.
 951   orr(ch, ch, ch, LSL, 16);
 952   orr(ch, ch, ch, LSL, 32);
 953
       // Point str1 at the last 4 characters and walk towards them with a negative
       // byte offset (cnt1_neg). result_tmp keeps the character index that byte
       // offset 0 corresponds to.
 954   sub(cnt1, cnt1, 4);
 955   mov(result_tmp, cnt1);
 956   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
 957   sub(cnt1_neg, zr, cnt1, LSL, 1);
 958
       // Constant used by the zero-halfword test in the loop.
 959   mov(tmp3, 0x0001000100010001);
 960
 961   BIND(CH1_LOOP);
         // Load 4 characters. After the eor, a halfword is zero exactly where the
         // loaded character equals ch. (x - 0x0001..) & ~(x | 0x7fff..) is non-zero
         // when x contains a zero halfword; bics also sets the flags for the branch.
 962     ldr(ch1, Address(str1, cnt1_neg));
 963     eor(ch1, ch, ch1);
 964     sub(tmp1, ch1, tmp3);
 965     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
 966     bics(tmp1, tmp1, tmp2);
 967     br(NE, HAS_ZERO);
 968     adds(cnt1_neg, cnt1_neg, 8);
 969     br(LT, CH1_LOOP);
 970
         // The offset is now >= 0. If it is below 8, the final 8 bytes (offset 0)
         // have not been examined yet: run the loop body once more at offset 0,
         // possibly overlapping the previous load. The branch consumes the flags
         // of the cmp; the mov in between only resets the offset.
 971     cmp(cnt1_neg, (u1)8);
 972     mov(cnt1_neg, 0);
 973     br(LT, CH1_LOOP);
 974     b(NOMATCH);
 975
 976   BIND(HAS_ZERO);
         // Byte-reverse and count leading zeros to locate the lowest flagged byte,
         // convert the bit count to bytes and add it to the byte offset. Any odd
         // low bit is dropped by the ASR at MATCH.
 977     rev(tmp1, tmp1);
 978     clz(tmp1, tmp1);
 979     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
 980     b(MATCH);
 981
 982   BIND(DO1_SHORT);
         // Short path: str1 is moved to the end of the string and cnt1_neg runs from
         // -2 * cnt1 up to 0, one halfword per iteration.
 983     mov(result_tmp, cnt1);
 984     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
 985     sub(cnt1_neg, zr, cnt1, LSL, 1);
 986   BIND(DO1_LOOP);
 987     ldrh(ch1, Address(str1, cnt1_neg));
 988     cmpw(ch, ch1);
 989     br(EQ, MATCH);
 990     adds(cnt1_neg, cnt1_neg, 2);
 991     br(LT, DO1_LOOP);
         // Falling out of DO1_LOOP means the short string held no match.
 992   BIND(NOMATCH);
 993     mov(result, -1);
 994     b(DONE);
 995   BIND(MATCH);
         // result = result_tmp + cnt1_neg / 2 (byte offset converted to characters).
 996     add(result, result_tmp, cnt1_neg, ASR, 1);
 997   BIND(DONE);
 998 }
 999 
1000 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
1001                                                 Register ch, Register result,
1002                                                 FloatRegister ztmp1,
1003                                                 FloatRegister ztmp2,
1004                                                 PRegister tmp_pg,
1005                                                 PRegister tmp_pdn, bool isL)
1006 {
       // Emits an SVE loop that searches the cnt1 characters at str1 for ch and
       // sets result to the index of the first match, or to -1 if there is none
       // (including cnt1 == 0). Elements are bytes when isL is true and halfwords
       // otherwise.
       //
       // The generated code writes ztmp1, ztmp2, tmp_pg, tmp_pdn, rscratch1,
       // rscratch2 and the condition flags; str1, cnt1 and ch are only read.
       //
1007   // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
1008   assert(tmp_pg->is_governing(),
1009          "this register has to be a governing predicate register");
1010
1011   Label LOOP, MATCH, DONE, NOMATCH;
1012   Register vec_len = rscratch1; // per-iteration step of idx, in elements
1013   Register idx = rscratch2;     // element index of the vector being examined
1014
1015   SIMD_RegVariant T = (isL == true) ? B : H;
1016
1017   cbz(cnt1, NOMATCH);
1018
1019   // Assign the particular char throughout the vector.
1020   sve_dup(ztmp2, T, ch);
1021   if (isL) {
1022     sve_cntb(vec_len);
1023   } else {
1024     sve_cnth(vec_len);
1025   }
1026   mov(idx, 0);
1027
1028   // Generate a predicate to control the reading of input string.
1029   sve_whilelt(tmp_pg, T, idx, cnt1);
1030
1031   BIND(LOOP);
1032     // Read a vector of 8- or 16-bit data depending on the string type. Note
1033     // that inactive elements indicated by the predicate register won't cause
1034     // a data read from memory to the destination vector.
1035     if (isL) {
1036       sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
1037     } else {
1038       sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
1039     }
         // Advance idx before the compare so that the branch below directly
         // follows the instruction that produces its flags.
1040     add(idx, idx, vec_len);
1041
1042     // Perform the comparison. An element of the destination predicate is set
1043     // to active if the particular char is matched.
1044     sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);
1045
1046     // Branch if the particular char is found.
1047     br(NE, MATCH);
1048
         // Build the predicate for the next vector; the branch below uses the
         // flags of this whilelt.
1049     sve_whilelt(tmp_pg, T, idx, cnt1);
1050
1051     // Loop back if the particular char not found.
1052     br(MI, LOOP);
1053
1054   BIND(NOMATCH);
1055     mov(result, -1);
1056     b(DONE);
1057
1058   BIND(MATCH);
1059     // Undo the index increment.
1060     sub(idx, idx, vec_len);
1061
1062     // Crop the vector to find its location.
         // After brka, tmp_pdn is active up to and including the first match, so
         // result = idx - 1 + (number of active elements in tmp_pdn).
1063     sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
1064     add(result, idx, -1);
1065     sve_incp(result, T, tmp_pdn);
1066   BIND(DONE);
1067 }
1068 
1069 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
1070                                             Register ch, Register result,
1071                                             Register tmp1, Register tmp2, Register tmp3)
1072 {
       // Emits code that scans the cnt1 bytes starting at str1 for the byte in ch.
       // result receives the index of the first match, or -1 if there is none
       // (including cnt1 == 0).
       //
       // The generated code overwrites str1, cnt1 (reused as cnt1_neg), tmp1, tmp2,
       // tmp3, rscratch1, rscratch2 and the condition flags; ch is overwritten on
       // the word-at-a-time path.
1073   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
1074   Register cnt1_neg = cnt1;
1075   Register ch1 = rscratch1;
1076   Register result_tmp = rscratch2;
1077
1078   cbz(cnt1, NOMATCH);
1079
       // Fewer than 8 bytes: compare one byte at a time.
1080   cmp(cnt1, (u1)8);
1081   br(LT, DO1_SHORT);
1082
       // Replicate the byte into all eight bytes of ch.
       // NOTE(review): assumes the bits of ch above bit 7 are zero on entry -
       // confirm with the callers.
1083   orr(ch, ch, ch, LSL, 8);
1084   orr(ch, ch, ch, LSL, 16);
1085   orr(ch, ch, ch, LSL, 32);
1086
       // Point str1 at the last 8 bytes and walk towards them with a negative byte
       // offset (cnt1_neg). result_tmp keeps the index that offset 0 corresponds to.
1087   sub(cnt1, cnt1, 8);
1088   mov(result_tmp, cnt1);
1089   lea(str1, Address(str1, cnt1));
1090   sub(cnt1_neg, zr, cnt1);
1091
       // Constant used by the zero-byte test in the loop.
1092   mov(tmp3, 0x0101010101010101);
1093
1094   BIND(CH1_LOOP);
         // Load 8 bytes. After the eor, a byte is zero exactly where the loaded
         // byte equals ch. (x - 0x0101..) & ~(x | 0x7f7f..) is non-zero when x
         // contains a zero byte; bics also sets the flags for the branch.
1095     ldr(ch1, Address(str1, cnt1_neg));
1096     eor(ch1, ch, ch1);
1097     sub(tmp1, ch1, tmp3);
1098     orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
1099     bics(tmp1, tmp1, tmp2);
1100     br(NE, HAS_ZERO);
1101     adds(cnt1_neg, cnt1_neg, 8);
1102     br(LT, CH1_LOOP);
1103
         // The offset is now >= 0. If it is below 8, the final 8 bytes (offset 0)
         // have not been examined yet: run the loop body once more at offset 0,
         // possibly overlapping the previous load. The branch consumes the flags
         // of the cmp; the mov in between only resets the offset.
1104     cmp(cnt1_neg, (u1)8);
1105     mov(cnt1_neg, 0);
1106     br(LT, CH1_LOOP);
1107     b(NOMATCH);
1108
1109   BIND(HAS_ZERO);
         // Byte-reverse and count leading zeros to locate the lowest flagged byte,
         // convert the bit count to bytes and add it to the byte offset.
1110     rev(tmp1, tmp1);
1111     clz(tmp1, tmp1);
1112     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
1113     b(MATCH);
1114
1115   BIND(DO1_SHORT);
         // Short path: str1 is moved to the end of the string and cnt1_neg runs from
         // -cnt1 up to 0, one byte per iteration.
1116     mov(result_tmp, cnt1);
1117     lea(str1, Address(str1, cnt1));
1118     sub(cnt1_neg, zr, cnt1);
1119   BIND(DO1_LOOP);
1120     ldrb(ch1, Address(str1, cnt1_neg));
         // NOTE(review): this is cmp where string_indexof_char uses cmpw at the
         // same point - confirm the difference is intended.
1121     cmp(ch, ch1);
1122     br(EQ, MATCH);
1123     adds(cnt1_neg, cnt1_neg, 1);
1124     br(LT, DO1_LOOP);
         // Falling out of DO1_LOOP means the short string held no match.
1125   BIND(NOMATCH);
1126     mov(result, -1);
1127     b(DONE);
1128   BIND(MATCH);
         // result = result_tmp + cnt1_neg (cnt1_neg is a byte, hence element, offset).
1129     add(result, result_tmp, cnt1_neg);
1130   BIND(DONE);
1131 }
1132 
1133 // Compare strings.
     //
     // Emits code that compares the strings at str1 and str2. ae selects the pair
     // of encodings: LL, LU and UL are tested explicitly below and any other value
     // is laid out as UU (the stub selection additionally insists on ae being one
     // of LL/UU/LU/UL).
     //
     // cnt1 and cnt2 arrive as byte counts. result receives cnt1 - cnt2 in
     // characters, unless the inline code finds a differing character within the
     // shorter length, in which case it receives that character of str1 minus the
     // corresponding character of str2. When the shorter length reaches
     // stub_threshold the comparison is handed to a compare_long_string_* stub.
     //
     // The emitted code may write str1, str2, cnt1, cnt2, tmp1, tmp2, rscratch1,
     // rscratch2, vtmp1, vtmp2 and the condition flags. vtmp3, pgtmp1 and pgtmp2
     // are not referenced in this body.
1134 void C2_MacroAssembler::string_compare(Register str1, Register str2,
1135     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
1136     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
1137     PRegister pgtmp1, PRegister pgtmp2, int ae) {
1138   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
1139       DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
1140       SHORT_LOOP_START, TAIL_CHECK;
1141
1142   bool isLL = ae == StrIntrinsicNode::LL;
1143   bool isLU = ae == StrIntrinsicNode::LU;
1144   bool isUL = ae == StrIntrinsicNode::UL;
1145
1146   // The stub threshold for LL strings is: 72 (64 + 8) chars
1147   // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
1148   // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
1149   const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);
1150
1151   bool str1_isL = isLL || isLU;
1152   bool str2_isL = isLL || isUL;
1153
1154   int str1_chr_shift = str1_isL ? 0 : 1;
1155   int str2_chr_shift = str2_isL ? 0 : 1;
1156   int str1_chr_size = str1_isL ? 1 : 2;
1157   int str2_chr_size = str2_isL ? 1 : 2;
       // Characters compared per longword step: wordSize for LL, wordSize/2 otherwise.
1158   int minCharsInWord = isLL ? wordSize : wordSize/2;
1159
       // vtmpZ is zeroed and interleaved (zip1) with Latin1 data held in vtmp to
       // widen it to 16-bit characters in the mixed-encoding paths.
1160   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
1161   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
1162                                       (chr_insn)&MacroAssembler::ldrh;
1163   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
1164                                       (chr_insn)&MacroAssembler::ldrh;
1165   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
1166                             (uxt_insn)&MacroAssembler::uxthw;
1167
1168   BLOCK_COMMENT("string_compare {");
1169
1170   // Bizarrely, the counts are passed in bytes, regardless of whether they
1171   // are L or U strings, however the result is always in characters.
1172   if (!str1_isL) asrw(cnt1, cnt1, 1);
1173   if (!str2_isL) asrw(cnt2, cnt2, 1);
1174
1175   // Compute the minimum of the string lengths and save the difference.
       // result keeps this difference on every path that reaches DONE without
       // finding a differing character.
1176   subsw(result, cnt1, cnt2);
1177   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
1178
1179   // A very short string
1180   cmpw(cnt2, minCharsInWord);
1181   br(Assembler::LE, SHORT_STRING);
1182
1183   // Compare longwords
1184   // load first parts of strings and finish initialization while loading
1185   {
1186     if (str1_isL == str2_isL) { // LL or UU
1187       ldr(tmp1, Address(str1));
           // Same address: nothing to compare, result already holds the length
           // difference.
1188       cmp(str1, str2);
1189       br(Assembler::EQ, DONE);
1190       ldr(tmp2, Address(str2));
1191       cmp(cnt2, stub_threshold);
1192       br(GE, STUB);
           // NOTE(review): cnt2 > minCharsInWord on this path (see the LE branch to
           // SHORT_STRING above), so the EQ branch below looks unreachable - confirm.
1193       subsw(cnt2, cnt2, minCharsInWord);
1194       br(EQ, TAIL_CHECK);
           // Point both strings at their last longword and turn cnt2 into the
           // negative byte offset of the longword that was just loaded.
1195       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1196       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1197       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1198     } else if (isLU) {
           // str1 is Latin1: load 4 bytes into vtmp and widen them to 4 halfwords
           // by interleaving with zero. cnt1 / cnt2 become the negative byte
           // offsets into str1 / str2; cnt1 is stepped past the 4 bytes just loaded.
1199       ldrs(vtmp, Address(str1));
1200       ldr(tmp2, Address(str2));
1201       cmp(cnt2, stub_threshold);
1202       br(GE, STUB);
1203       subw(cnt2, cnt2, 4);
1204       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1205       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1206       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1207       zip1(vtmp, T8B, vtmp, vtmpZ);
1208       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1209       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1210       add(cnt1, cnt1, 4);
1211       fmovd(tmp1, vtmp);
1212     } else { // UL case
           // Mirror image of the LU case: str2 is the Latin1 side and cnt1 is
           // stepped past the 8 bytes of str1 just loaded.
1213       ldr(tmp1, Address(str1));
1214       ldrs(vtmp, Address(str2));
1215       cmp(cnt2, stub_threshold);
1216       br(GE, STUB);
1217       subw(cnt2, cnt2, 4);
1218       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1219       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1220       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1221       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1222       zip1(vtmp, T8B, vtmp, vtmpZ);
1223       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1224       add(cnt1, cnt1, 8);
1225       fmovd(tmp2, vtmp);
1226     }
         // Step str2's offset past the data already loaded (4 bytes when str2 is
         // the Latin1 side of UL, 8 bytes otherwise). A non-negative offset means
         // only the last longword remains.
1227     adds(cnt2, cnt2, isUL ? 4 : 8);
1228     br(GE, TAIL);
1229     eor(rscratch2, tmp1, tmp2);
1230     cbnz(rscratch2, DIFF);
1231     // main loop
1232     bind(NEXT_WORD);
1233     if (str1_isL == str2_isL) {
1234       ldr(tmp1, Address(str1, cnt2));
1235       ldr(tmp2, Address(str2, cnt2));
1236       adds(cnt2, cnt2, 8);
1237     } else if (isLU) {
1238       ldrs(vtmp, Address(str1, cnt1));
1239       ldr(tmp2, Address(str2, cnt2));
1240       add(cnt1, cnt1, 4);
1241       zip1(vtmp, T8B, vtmp, vtmpZ);
1242       fmovd(tmp1, vtmp);
1243       adds(cnt2, cnt2, 8);
1244     } else { // UL
1245       ldrs(vtmp, Address(str2, cnt2));
1246       ldr(tmp1, Address(str1, cnt1));
1247       zip1(vtmp, T8B, vtmp, vtmpZ);
1248       add(cnt1, cnt1, 8);
1249       fmovd(tmp2, vtmp);
1250       adds(cnt2, cnt2, 4);
1251     }
         // The flags tested here come from the adds that closes each variant above.
1252     br(GE, TAIL);
1253
1254     eor(rscratch2, tmp1, tmp2);
1255     cbz(rscratch2, NEXT_WORD);
1256     b(DIFF);
1257     bind(TAIL);
         // Check the longwords loaded most recently before loading the last ones.
1258     eor(rscratch2, tmp1, tmp2);
1259     cbnz(rscratch2, DIFF);
1260     // Last longword.  In the case where length == 4 we compare the
1261     // same longword twice, but that's still faster than another
1262     // conditional branch.
1263     if (str1_isL == str2_isL) {
1264       ldr(tmp1, Address(str1));
1265       ldr(tmp2, Address(str2));
1266     } else if (isLU) {
1267       ldrs(vtmp, Address(str1));
1268       ldr(tmp2, Address(str2));
1269       zip1(vtmp, T8B, vtmp, vtmpZ);
1270       fmovd(tmp1, vtmp);
1271     } else { // UL
1272       ldrs(vtmp, Address(str2));
1273       ldr(tmp1, Address(str1));
1274       zip1(vtmp, T8B, vtmp, vtmpZ);
1275       fmovd(tmp2, vtmp);
1276     }
1277     bind(TAIL_CHECK);
1278     eor(rscratch2, tmp1, tmp2);
1279     cbz(rscratch2, DONE);
1280
1281     // Find the first different characters in the longwords and
1282     // compute their difference.
         // rscratch2 holds tmp1 ^ tmp2. rev + clz gives the bit position of the
         // lowest differing byte, the andr rounds it down to a character boundary
         // (8 bits for LL, 16 otherwise), and both longwords are shifted right by
         // that amount so that ext_chr isolates the differing characters.
1283     bind(DIFF);
1284     rev(rscratch2, rscratch2);
1285     clz(rscratch2, rscratch2);
1286     andr(rscratch2, rscratch2, isLL ? -8 : -16);
1287     lsrv(tmp1, tmp1, rscratch2);
1288     (this->*ext_chr)(tmp1, tmp1);
1289     lsrv(tmp2, tmp2, rscratch2);
1290     (this->*ext_chr)(tmp2, tmp2);
1291     subw(result, tmp1, tmp2);
1292     b(DONE);
1293   }
1294
       // Long strings: call the out-of-line stub that matches the encoding pair.
1295   bind(STUB);
1296     RuntimeAddress stub = nullptr;
1297     switch(ae) {
1298       case StrIntrinsicNode::LL:
1299         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
1300         break;
1301       case StrIntrinsicNode::UU:
1302         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
1303         break;
1304       case StrIntrinsicNode::LU:
1305         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
1306         break;
1307       case StrIntrinsicNode::UL:
1308         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
1309         break;
1310       default:
1311         ShouldNotReachHere();
1312      }
1313     assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
1314     address call = trampoline_call(stub);
1315     if (call == nullptr) {
           // No trampoline could be emitted: give up on this compilation. The
           // labels that are still unbound are reset in debug builds before the
           // early return.
1316       DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
1317       ciEnv::current()->record_failure("CodeCache is full");
1318       return;
1319     }
1320     b(DONE);
1321
1322   bind(SHORT_STRING);
1323   // Is the minimum length zero?
1324   cbz(cnt2, DONE);
1325   // arrange code to do most branches while loading and loading next characters
1326   // while comparing previous
       // Two character pairs are kept in flight: (tmp1, cnt1) and (tmp2, rscratch1),
       // the first register of each pair holding str1's character. cnt1 is free
       // for this because the length difference already sits in result.
1327   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1328   subs(cnt2, cnt2, 1);
1329   br(EQ, SHORT_LAST_INIT);
1330   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1331   b(SHORT_LOOP_START);
1332   bind(SHORT_LOOP);
1333   subs(cnt2, cnt2, 1);
1334   br(EQ, SHORT_LAST);
1335   bind(SHORT_LOOP_START);
1336   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
1337   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
1338   cmp(tmp1, cnt1);
1339   br(NE, SHORT_LOOP_TAIL);
1340   subs(cnt2, cnt2, 1);
1341   br(EQ, SHORT_LAST2);
1342   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1343   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1344   cmp(tmp2, rscratch1);
1345   br(EQ, SHORT_LOOP);
1346   sub(result, tmp2, rscratch1);
1347   b(DONE);
1348   bind(SHORT_LOOP_TAIL);
1349   sub(result, tmp1, cnt1);
1350   b(DONE);
1351   bind(SHORT_LAST2);
1352   cmp(tmp2, rscratch1);
1353   br(EQ, DONE);
1354   sub(result, tmp2, rscratch1);
1355
1356   b(DONE);
1357   bind(SHORT_LAST_INIT);
1358   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1359   bind(SHORT_LAST);
1360   cmp(tmp1, cnt1);
1361   br(EQ, DONE);
1362   sub(result, tmp1, cnt1);
1363
1364   bind(DONE);
1365
1366   BLOCK_COMMENT("} string_compare");
1367 }
1368 
1369 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
1370                                      FloatRegister src2, Condition cond, bool isQ) {
1371   // Lane-wise NEON compare of src1 and src2 under cond, result written to dst.
1372   // LT/LE/LO/LS are emitted as the mirrored GT/GE/HI/HS with the operands
1373   // exchanged; NE is emitted as EQ followed by a bitwise NOT of dst.
1374   const bool invert = (cond == NE);
1375   const bool mirror = (cond == LT || cond == LE || cond == LO || cond == LS);
1376   Condition emitted = cond;
1377   if (invert) {
1378     emitted = EQ;
1379   } else if (mirror) {
1380     emitted = (cond == LT) ? GT : (cond == LE) ? GE : (cond == LO) ? HI : HS;
1381   }
1382   FloatRegister lhs = mirror ? src2 : src1;
1383   FloatRegister rhs = mirror ? src1 : src2;
1384   SIMD_Arrangement arrangement = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1385   if (is_floating_point_type(bt)) {
1386     fcm(emitted, dst, arrangement, lhs, rhs);
1387   } else {
1388     cm(emitted, dst, arrangement, lhs, rhs);
1389   }
1390   if (invert) {
1391     notr(dst, isQ ? T16B : T8B, dst);
1392   }
1393 }
1394 
1395 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1396                                           Condition cond, bool isQ) {
1397   // Lane-wise NEON compare of src against zero, result written to dst.
1398   // NE is emitted as the EQ-with-zero compare followed by a bitwise NOT of dst.
1399   const bool invert = (cond == Assembler::NE);
1400   const Condition emitted = invert ? Assembler::EQ : cond;
1401   const bool is_fp = (bt == T_FLOAT || bt == T_DOUBLE);
1402
1403   SIMD_Arrangement arrangement = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1404   if (is_fp) {
1405     fcm(emitted, dst, arrangement, src);
1406   } else {
1407     cm(emitted, dst, arrangement, src);
1408   }
1409
1410   if (invert) {
1411     notr(dst, isQ ? T16B : T8B, dst);
1412   }
1413 }
1414 
1415 // Gather bit 0 of every byte of dst into the lowest byte and zero the rest.
1416 // Example: dst = 0x01 00 00 00 01 01 00 01 yields 0x8D ("??" = garbage bytes).
1417 void C2_MacroAssembler::bytemask_compress(Register dst) {
1418   // Fold by 7, 14, then 28 bits: 0x?? 02 ?? 00 ?? 03 ?? 01,
1419   // then 0x????????08 ??????0D, then 0x????????????????8D.
1420   for (int shift = 7; shift <= 28; shift <<= 1) {
1421     orr(dst, dst, dst, Assembler::LSR, shift);
1422   }
1423   andr(dst, dst, 0xff); // keep only the packed low byte: 0x8D
1424 }
1425 
1426 // Collapse a byte-per-lane boolean vector into a bitmask held in a general
1427 // register. Each lane of "src" is a byte equal to 0x00 or 0x01; lane i supplies
1428 // bit i of "dst". At most the first 64 lanes are packed.
1429 //
1430 // Example:   src = 0x0001010000010001 0100000001010001, lane_cnt = 16
1431 // Expected:  dst = 0x658D
1432 //
1433 // Clobbers: rscratch1
1434 void C2_MacroAssembler::sve_vmask_tolong(Register dst, FloatRegister src,
1435                                          FloatRegister vtmp, int lane_cnt) {
1436   assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1437   assert_different_registers(dst, rscratch1);
1438   assert_different_registers(src, vtmp);
1439   assert(UseSVE > 0, "must be");
1440
1441   // Lanes 0..7 sit in the low 64 bits of src: move them out and pack them.
1442   fmovd(dst, src);
1443   bytemask_compress(dst);
1444
1445   // Every further group of 8 lanes is extracted as one 64-bit element, packed
1446   // the same way in rscratch1, and merged into dst at bit offset 8 * group.
1447   // The loop body never runs when lane_cnt <= 8.
1448   const int groups = lane_cnt / 8;
1449   for (int group = 1; group < groups; ++group) {
1450     sve_extract_integral(rscratch1, T_LONG, src, group, vtmp);
1451     bytemask_compress(rscratch1);
1452     orr(dst, dst, rscratch1, Assembler::LSL, group * 8);
1453   }
1454 }
1455 
1456 // Same contract as "sve_vmask_tolong" above, implemented with SVE2's BEXT
1457 // instruction, which requires the FEAT_BITPERM feature.
1458 void C2_MacroAssembler::sve2_vmask_tolong(Register dst, FloatRegister src,
1459                                           FloatRegister vtmp1, FloatRegister vtmp2,
1460                                           int lane_cnt) {
1461   assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1462   assert_different_registers(src, vtmp1, vtmp2);
1463   assert(UseSVE > 1 && VM_Version::supports_svebitperm(), "must be");
1464
1465   // Every byte of src is 0x00 or 0x01, so only bit 0 of each byte carries
1466   // information. There is no bit-compress that works across lanes, so BEXT is
1467   // applied per 64-bit lane (T = D), packing 8 mask bytes into the low byte of
1468   // each lane; those low bytes are then brought together.
1469
1470   // BEXT's second source operand: 0x01 in every byte.
1471   // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1472   sve_dup(vtmp2, B, 1);
1473
1474   // BEXT vtmp1.D, src.D, vtmp2.D
1475   // src   = 0x0001010000010001 | 0x0100000001010001
1476   // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1477   //         ---------------------------------------
1478   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1479   sve_bext(vtmp1, D, src, vtmp2);
1480
1481   // Gather the low byte of every 64-bit lane and move the result to dst.
1482   // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1483   // dst   = 0x658D
1484   if (lane_cnt <= 8) {
1485     // A single 64-bit lane: its low byte already is the answer.
1486     umov(dst, vtmp1, B, 0);
1487     return;
1488   }
1489   if (lane_cnt <= 16) {
1490     // Two lanes: copy byte 8 next to byte 0 and read them as one halfword.
1491     ins(vtmp1, B, vtmp1, 1, 8);
1492     umov(dst, vtmp1, H, 0);
1493     return;
1494   }
1495   // At most 64 lanes: after narrowing D to B the result fits the low 64 bits.
1496   sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
1497   umov(dst, vtmp1, D, 0);
1498 }
1499 
1500 // Expand a bitmask held in the general register "src" into a vector of
1501 // booleans in "dst", one byte lane per bit with lane values 0x00/0x01: bit i
1502 // of "src" becomes byte lane i of "dst". "dst" can hold at most 64 lanes.
1503 //
1504 // For a valid src (0x658D) on a machine with 128-bit vectors, the expected
1505 // content of the dst vector register is shown below.
1506 //
1507 // dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1508 void C2_MacroAssembler::sve_vmask_fromlong(FloatRegister dst, Register src,
1509                                            FloatRegister vtmp, int lane_cnt) {
1510   assert_different_registers(dst, vtmp);
1511   assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
1512          lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
1513
1514   // Example:   src = 0x658D, lane_cnt = 16
1515   // Expected:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1516
1517   // Zero vtmp, then place the long value from src in its first 64-bit lane.
1518   // vtmp = 0x0000000000000000 | 0x000000000000658D
1519   sve_dup(vtmp, B, 0);
1520   mov(vtmp, D, 0, src);
1521
1522   // The mask is now one bit per lane in that first 64-bit lane; SVE2's BDEP
1523   // instruction turns it into one byte per lane.
1524
1525   // First BDEP source: spread the mask so that every 64-bit lane holds one
1526   // mask byte in its lowest byte. With a single mask byte (lane_cnt <= 8)
1527   // there is nothing to move.
1528   // vtmp = 0x0000000000000065 | 0x000000000000008D
1529   if (lane_cnt > 16) {
1530     sve_vector_extend(vtmp, D, vtmp, B);
1531   } else if (lane_cnt > 8) {
1532     ins(vtmp, B, vtmp, 8, 1);
1533   }
1534
1535   // Second BDEP source: 0x01 in every byte, built directly in dst.
1536   // dst = 0x01010101 0x01010101 0x01010101 0x01010101
1537   sve_dup(dst, B, 1);
1538
1539   // BDEP dst.D, vtmp.D, dst.D
1540   // vtmp = 0x0000000000000065 | 0x000000000000008D
1541   // dst  = 0x0101010101010101 | 0x0101010101010101
1542   //        ---------------------------------------
1543   // dst  = 0x0001010000010001 | 0x0100000001010001
1544   sve_bdep(dst, D, vtmp, dst);
1545 }
1546 
1547 // Predicated SVE compare of zn and zm under cond, result in pd. Clobbers: rflags
1548 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1549                                     FloatRegister zn, FloatRegister zm, Condition cond) {
1550   assert(pg->is_governing(), "This register has to be a governing predicate register");
1551
1552   // LE/LT/LO/LS are emitted as the mirrored GE/GT/HI/HS with zn and zm
1553   // exchanged; every other condition is passed through untouched.
1554   const bool mirror = (cond == LE || cond == LT || cond == LO || cond == LS);
1555   if (mirror) {
1556     cond = (cond == LE) ? GE : (cond == LT) ? GT : (cond == LO) ? HI : HS;
1557   }
1558   FloatRegister lhs = mirror ? zm : zn;
1559   FloatRegister rhs = mirror ? zn : zm;
1560
1561   SIMD_RegVariant variant = elemType_to_regVariant(bt);
1562   if (!is_floating_point_type(bt)) {
1563     assert(is_integral_type(bt), "unsupported element type");
1564     sve_cmp(cond, pd, variant, pg, lhs, rhs);
1565   } else {
1566     sve_fcm(cond, pd, variant, pg, lhs, rhs);
1567   }
1568 }
1569 
1570 // Compute in dst the index of the highest set lane of the mask src. Writes ptmp and rscratch1.
1571 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
1572   const SIMD_RegVariant variant = elemType_to_regVariant(bt);
1573   sve_rev(ptmp, variant, src);                              // reversed mask: the last set lane now comes first
1574   sve_brkb(ptmp, ptrue, ptmp, false);                       // keep only the lanes in front of that first set lane
1575   sve_cntp(dst, variant, ptrue, ptmp);                      // dst = number of lanes kept
1576   movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1); // index of the top lane for this element size
1577   subw(dst, rscratch1, dst);                                // dst = top lane index - lanes kept
1578 }
1579 
1580 // Widen each lane of the integer vector src into dst (same lane count, larger element size), e.g. 4B -> 4I
1581 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1582                                            FloatRegister src, BasicType src_bt, bool is_unsigned) {
1583   switch (src_bt) {
1584     case T_BYTE: // 4B to 4S/4I, 8B to 8S
1585       assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1586       assert(dst_bt == T_SHORT || dst_bt == T_INT, "unsupported");
1587       _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1588       if (dst_bt == T_INT) {
1589         _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1590       }
1591       break;
1592     case T_SHORT: // 2S to 2I/2L, 4S to 4I
1593       assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1594       assert(dst_bt == T_INT || dst_bt == T_LONG, "unsupported");
1595       _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1596       if (dst_bt == T_LONG) {
1597         _xshll(is_unsigned, dst, T2D, dst, T2S, 0);
1598       }
1599       break;
1600     case T_INT: // 2I to 2L
1601       assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1602       _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1603       break;
1604     default:
1605       ShouldNotReachHere();
1606   }
1607 }
1608 
1609 // Narrow each lane of the integer vector src into dst (same lane count, smaller element size), e.g. 4I -> 4B
1610 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1611                                            FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1612   switch (src_bt) {
1613     case T_SHORT: // 4S/8S to 4B/8B
1614       assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1615       assert(dst_bt == T_BYTE, "unsupported");
1616       xtn(dst, T8B, src, T8H);
1617       break;
1618     case T_INT: // 2I to 2S, 4I to 4B/4S
1619       assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1620       assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1621       xtn(dst, T4H, src, T4S);
1622       if (dst_bt == T_BYTE) {
1623         xtn(dst, T8B, dst, T8H);
1624       }
1625       break;
1626     case T_LONG: // 2L to 2S/2I
1627       assert(src_vlen_in_bytes == 16, "unsupported");
1628       assert(dst_bt == T_INT || dst_bt == T_SHORT, "unsupported");
1629       xtn(dst, T2S, src, T2D);
1630       if (dst_bt == T_SHORT) {
1631         xtn(dst, T4H, dst, T4S);
1632       }
1633       break;
1634     default:
1635       ShouldNotReachHere();
1636   }
1637 }
1638 
1639 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1640                                           FloatRegister src, SIMD_RegVariant src_size,
1641                                           bool is_unsigned) {
1642   assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
1643 
1644   if (src_size == B) {
1645     switch (dst_size) {
1646     case H:
1647       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1648       break;
1649     case S:
1650       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1651       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1652       break;
1653     case D:
1654       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1655       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1656       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1657       break;
1658     default:
1659       ShouldNotReachHere();
1660     }
1661   } else if (src_size == H) {
1662     if (dst_size == S) {
1663       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1664     } else { // D
1665       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1666       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1667     }
1668   } else if (src_size == S) {
1669     _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
1670   }
1671 }
1672 
1673 // Vector narrow from src to dst with specified element sizes.
1674 // High part of dst vector will be filled with zero.
1675 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1676                                           FloatRegister src, SIMD_RegVariant src_size,
1677                                           FloatRegister tmp) {
1678   assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1679   assert_different_registers(src, tmp);
1680   sve_dup(tmp, src_size, 0);
1681   if (src_size == D) {
1682     switch (dst_size) {
1683     case S:
1684       sve_uzp1(dst, S, src, tmp);
1685       break;
1686     case H:
1687       assert_different_registers(dst, tmp);
1688       sve_uzp1(dst, S, src, tmp);
1689       sve_uzp1(dst, H, dst, tmp);
1690       break;
1691     case B:
1692       assert_different_registers(dst, tmp);
1693       sve_uzp1(dst, S, src, tmp);
1694       sve_uzp1(dst, H, dst, tmp);
1695       sve_uzp1(dst, B, dst, tmp);
1696       break;
1697     default:
1698       ShouldNotReachHere();
1699     }
1700   } else if (src_size == S) {
1701     if (dst_size == H) {
1702       sve_uzp1(dst, H, src, tmp);
1703     } else { // B
1704       assert_different_registers(dst, tmp);
1705       sve_uzp1(dst, H, src, tmp);
1706       sve_uzp1(dst, B, dst, tmp);
1707     }
1708   } else if (src_size == H) {
1709     sve_uzp1(dst, B, src, tmp);
1710   }
1711 }
1712 
1713 // Extend src predicate to dst predicate with the same lane count but larger
1714 // element size, e.g. 64Byte -> 512Long
1715 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1716                                              uint dst_element_length_in_bytes,
1717                                              uint src_element_length_in_bytes) {
1718   if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1719     sve_punpklo(dst, src);
1720   } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1721     sve_punpklo(dst, src);
1722     sve_punpklo(dst, dst);
1723   } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1724     sve_punpklo(dst, src);
1725     sve_punpklo(dst, dst);
1726     sve_punpklo(dst, dst);
1727   } else {
1728     assert(false, "unsupported");
1729     ShouldNotReachHere();
1730   }
1731 }
1732 
1733 // Narrow src predicate to dst predicate with the same lane count but
1734 // smaller element size, e.g. 512Long -> 64Byte
1735 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1736                                              uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
1737   // The insignificant bits in src predicate are expected to be zero.
1738   // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
1739   // passed as the second argument. An example narrowing operation with a given mask would be -
1740   // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I
1741   // Mask (for 2 Longs) : TF
1742   // Predicate register for the above mask (16 bits) : 00000001 00000000
1743   // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
1744   // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0)
1745   assert_different_registers(src, ptmp);
1746   assert_different_registers(dst, ptmp);
1747   sve_pfalse(ptmp);
1748   if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1749     sve_uzp1(dst, B, src, ptmp);
1750   } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1751     sve_uzp1(dst, H, src, ptmp);
1752     sve_uzp1(dst, B, dst, ptmp);
1753   } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1754     sve_uzp1(dst, S, src, ptmp);
1755     sve_uzp1(dst, H, dst, ptmp);
1756     sve_uzp1(dst, B, dst, ptmp);
1757   } else {
1758     assert(false, "unsupported");
1759     ShouldNotReachHere();
1760   }
1761 }
1762 
1763 // Vector reduction add for integral type with ASIMD instructions.
1764 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1765                                                  Register isrc, FloatRegister vsrc,
1766                                                  unsigned vector_length_in_bytes,
1767                                                  FloatRegister vtmp) {
1768   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1769   assert_different_registers(dst, isrc);
1770   bool isQ = vector_length_in_bytes == 16;
1771 
1772   BLOCK_COMMENT("neon_reduce_add_integral {");
1773     switch(bt) {
1774       case T_BYTE:
1775         addv(vtmp, isQ ? T16B : T8B, vsrc);
1776         smov(dst, vtmp, B, 0);
1777         addw(dst, dst, isrc, ext::sxtb);
1778         break;
1779       case T_SHORT:
1780         addv(vtmp, isQ ? T8H : T4H, vsrc);
1781         smov(dst, vtmp, H, 0);
1782         addw(dst, dst, isrc, ext::sxth);
1783         break;
1784       case T_INT:
1785         isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1786         umov(dst, vtmp, S, 0);
1787         addw(dst, dst, isrc);
1788         break;
1789       case T_LONG:
1790         assert(isQ, "unsupported");
1791         addpd(vtmp, vsrc);
1792         umov(dst, vtmp, D, 0);
1793         add(dst, dst, isrc);
1794         break;
1795       default:
1796         assert(false, "unsupported");
1797         ShouldNotReachHere();
1798     }
1799   BLOCK_COMMENT("} neon_reduce_add_integral");
1800 }
1801 
// Vector reduction multiply for integral type with ASIMD instructions.
// Computes dst = isrc * (product of all lanes of vsrc) for element type bt.
// For T_BYTE and T_SHORT the vector is repeatedly halved: the upper half is
// copied down into a temporary and multiplied lane-wise with the lower half,
// until two lanes remain; those two lanes are then multiplied into isrc with
// scalar instructions, with an sxtb/sxth after each scalar multiply.
// Only 8- and 16-byte vectors are supported.
// Note: temporary registers vtmp1 and vtmp2 are not used in some cases
// (T_LONG uses neither; the 8-byte T_INT case emits nothing into them).
// Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
                                                 Register isrc, FloatRegister vsrc,
                                                 unsigned vector_length_in_bytes,
                                                 FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_mul_integral {");
    switch(bt) {
      case T_BYTE:
        if (isQ) {
          // Multiply the lower half and higher half of vector iteratively.
          // vtmp1 = vsrc[8:15]
          ins(vtmp1, D, vsrc, 0, 1);
          // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
          mulv(vtmp1, T8B, vtmp1, vsrc);
          // vtmp2 = vtmp1[4:7]
          ins(vtmp2, S, vtmp1, 0, 1);
          // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
          mulv(vtmp1, T8B, vtmp2, vtmp1);
        } else {
          // 8-byte vector: only one halving step is needed to get down to 4 lanes.
          // vtmp1 = vsrc[4:7]
          ins(vtmp1, S, vsrc, 0, 1);
          // vtmp1[n] = vsrc[n] * vsrc[n + 4], where n=[0, 3]
          mulv(vtmp1, T8B, vtmp1, vsrc);
        }
        // vtmp2 = vtmp1[2:3]
        ins(vtmp2, H, vtmp1, 0, 1);
        // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
        mulv(vtmp2, T8B, vtmp2, vtmp1);
        // dst = vtmp2[0] * isrc * vtmp2[1]
        umov(rscratch1, vtmp2, B, 0);
        mulw(dst, rscratch1, isrc);
        sxtb(dst, dst);
        umov(rscratch1, vtmp2, B, 1);
        mulw(dst, rscratch1, dst);
        sxtb(dst, dst);
        break;
      case T_SHORT:
        if (isQ) {
          // vtmp2 = vsrc[4:7]
          ins(vtmp2, D, vsrc, 0, 1);
          // vtmp2[n] = vsrc[n] * vsrc[n + 4], where n=[0, 3]
          mulv(vtmp2, T4H, vtmp2, vsrc);
          // vtmp1 = vtmp2[2:3]
          ins(vtmp1, S, vtmp2, 0, 1);
          // vtmp1[n] = vtmp2[n] * vtmp2[n + 2], where n=[0, 1]
          mulv(vtmp1, T4H, vtmp1, vtmp2);
        } else {
          // vtmp1 = vsrc[2:3]
          ins(vtmp1, S, vsrc, 0, 1);
          // vtmp1[n] = vsrc[n] * vsrc[n + 2], where n=[0, 1]
          mulv(vtmp1, T4H, vtmp1, vsrc);
        }
        // dst = vtmp1[0] * isrc * vtmp1[1]
        umov(rscratch1, vtmp1, H, 0);
        mulw(dst, rscratch1, isrc);
        sxth(dst, dst);
        umov(rscratch1, vtmp1, H, 1);
        mulw(dst, rscratch1, dst);
        sxth(dst, dst);
        break;
      case T_INT:
        if (isQ) {
          // vtmp1 = vsrc[2:3]
          ins(vtmp1, D, vsrc, 0, 1);
          // vtmp1[n] = vsrc[n] * vsrc[n + 2], where n=[0, 1]
          mulv(vtmp1, T2S, vtmp1, vsrc);
        } else {
          // No instruction is emitted here: the local name vtmp1 is simply
          // rebound to vsrc so that the umov calls below read the two lanes
          // straight from the source vector.
          vtmp1 = vsrc;
        }
        // dst = vtmp1[0] * isrc * vtmp1[1]
        // Note: this case uses mul (not mulw) and applies no extension afterwards.
        umov(rscratch1, vtmp1, S, 0);
        mul(dst, rscratch1, isrc);
        umov(rscratch1, vtmp1, S, 1);
        mul(dst, rscratch1, dst);
        break;
      case T_LONG:
        // Both D lanes are read directly from vsrc; no vector temporary is used.
        // dst = isrc * vsrc[0] * vsrc[1]
        umov(rscratch1, vsrc, D, 0);
        mul(dst, isrc, rscratch1);
        umov(rscratch1, vsrc, D, 1);
        mul(dst, dst, rscratch1);
        break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  BLOCK_COMMENT("} neon_reduce_mul_integral");
}
1882 
1883 // Vector reduction multiply for floating-point type with ASIMD instructions.
1884 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
1885                                            FloatRegister fsrc, FloatRegister vsrc,
1886                                            unsigned vector_length_in_bytes,
1887                                            FloatRegister vtmp) {
1888   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1889   bool isQ = vector_length_in_bytes == 16;
1890 
1891   BLOCK_COMMENT("neon_reduce_mul_fp {");
1892     switch(bt) {
1893       // The T_SHORT type below is for Float16 type which also uses floating-point
1894       // instructions.
1895       case T_SHORT:
1896         fmulh(dst, fsrc, vsrc);
1897         ext(vtmp, T8B, vsrc, vsrc, 2);
1898         fmulh(dst, dst, vtmp);
1899         ext(vtmp, T8B, vsrc, vsrc, 4);
1900         fmulh(dst, dst, vtmp);
1901         ext(vtmp, T8B, vsrc, vsrc, 6);
1902         fmulh(dst, dst, vtmp);
1903         if (isQ) {
1904           ext(vtmp, T16B, vsrc, vsrc, 8);
1905           fmulh(dst, dst, vtmp);
1906           ext(vtmp, T16B, vsrc, vsrc, 10);
1907           fmulh(dst, dst, vtmp);
1908           ext(vtmp, T16B, vsrc, vsrc, 12);
1909           fmulh(dst, dst, vtmp);
1910           ext(vtmp, T16B, vsrc, vsrc, 14);
1911           fmulh(dst, dst, vtmp);
1912         }
1913         break;
1914       case T_FLOAT:
1915         fmuls(dst, fsrc, vsrc);
1916         ins(vtmp, S, vsrc, 0, 1);
1917         fmuls(dst, dst, vtmp);
1918         if (isQ) {
1919           ins(vtmp, S, vsrc, 0, 2);
1920           fmuls(dst, dst, vtmp);
1921           ins(vtmp, S, vsrc, 0, 3);
1922           fmuls(dst, dst, vtmp);
1923          }
1924         break;
1925       case T_DOUBLE:
1926         assert(isQ, "unsupported");
1927         fmuld(dst, fsrc, vsrc);
1928         ins(vtmp, D, vsrc, 0, 1);
1929         fmuld(dst, dst, vtmp);
1930         break;
1931       default:
1932         assert(false, "unsupported");
1933         ShouldNotReachHere();
1934     }
1935   BLOCK_COMMENT("} neon_reduce_mul_fp");
1936 }
1937 
1938 // Vector reduction add for half float type with ASIMD instructions.
1939 void C2_MacroAssembler::neon_reduce_add_fp16(FloatRegister dst, FloatRegister fsrc, FloatRegister vsrc,
1940                                              unsigned vector_length_in_bytes, FloatRegister vtmp) {
1941   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1942   bool isQ = vector_length_in_bytes == 16;
1943 
1944   BLOCK_COMMENT("neon_reduce_add_fp16 {");
1945     faddh(dst, fsrc, vsrc);
1946     ext(vtmp, T8B, vsrc, vsrc, 2);
1947     faddh(dst, dst, vtmp);
1948     ext(vtmp, T8B, vsrc, vsrc, 4);
1949     faddh(dst, dst, vtmp);
1950     ext(vtmp, T8B, vsrc, vsrc, 6);
1951     faddh(dst, dst, vtmp);
1952     if (isQ) {
1953       ext(vtmp, T16B, vsrc, vsrc, 8);
1954       faddh(dst, dst, vtmp);
1955       ext(vtmp, T16B, vsrc, vsrc, 10);
1956       faddh(dst, dst, vtmp);
1957       ext(vtmp, T16B, vsrc, vsrc, 12);
1958       faddh(dst, dst, vtmp);
1959       ext(vtmp, T16B, vsrc, vsrc, 14);
1960       faddh(dst, dst, vtmp);
1961     }
1962   BLOCK_COMMENT("} neon_reduce_add_fp16");
1963 }
1964 
1965 // Helper to select logical instruction
1966 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
1967                                                    Register Rn, Register Rm,
1968                                                    enum shift_kind kind, unsigned shift) {
1969   switch(opc) {
1970     case Op_AndReductionV:
1971       is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
1972       break;
1973     case Op_OrReductionV:
1974       is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
1975       break;
1976     case Op_XorReductionV:
1977       is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
1978       break;
1979     default:
1980       assert(false, "unsupported");
1981       ShouldNotReachHere();
1982   }
1983 }
1984 
1985 // Vector reduction logical operations And, Or, Xor
1986 // Clobbers: rscratch1
1987 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
1988                                             Register isrc, FloatRegister vsrc,
1989                                             unsigned vector_length_in_bytes) {
1990   assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
1991          "unsupported");
1992   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1993   assert_different_registers(dst, isrc);
1994   bool isQ = vector_length_in_bytes == 16;
1995 
1996   BLOCK_COMMENT("neon_reduce_logical {");
1997     umov(rscratch1, vsrc, isQ ? D : S, 0);
1998     umov(dst, vsrc, isQ ? D : S, 1);
1999     neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
2000     switch(bt) {
2001       case T_BYTE:
2002         if (isQ) {
2003           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2004         }
2005         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
2006         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
2007         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2008         sxtb(dst, dst);
2009         break;
2010       case T_SHORT:
2011         if (isQ) {
2012           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2013         }
2014         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
2015         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2016         sxth(dst, dst);
2017         break;
2018       case T_INT:
2019         if (isQ) {
2020           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2021         }
2022         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2023         break;
2024       case T_LONG:
2025         assert(isQ, "unsupported");
2026         neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
2027         break;
2028       default:
2029         assert(false, "unsupported");
2030         ShouldNotReachHere();
2031     }
2032   BLOCK_COMMENT("} neon_reduce_logical");
2033 }
2034 
2035 // Helper function to decode min/max reduction operation properties
2036 void C2_MacroAssembler::decode_minmax_reduction_opc(int opc, bool* is_min,
2037                                                     bool* is_unsigned,
2038                                                     Condition* cond) {
2039   switch(opc) {
2040     case Op_MinReductionV:
2041       *is_min = true;  *is_unsigned = false; *cond = LT; break;
2042     case Op_MaxReductionV:
2043       *is_min = false; *is_unsigned = false; *cond = GT; break;
2044     case Op_UMinReductionV:
2045       *is_min = true;  *is_unsigned = true;  *cond = LO; break;
2046     case Op_UMaxReductionV:
2047       *is_min = false; *is_unsigned = true;  *cond = HI; break;
2048     default:
2049       ShouldNotReachHere();
2050   }
2051 }
2052 
2053 // Vector reduction min/max/umin/umax for integral type with ASIMD instructions.
2054 // Note: vtmp is not used and expected to be fnoreg for T_LONG case.
2055 // Clobbers: rscratch1, rflags
2056 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
2057                                                     Register isrc, FloatRegister vsrc,
2058                                                     unsigned vector_length_in_bytes,
2059                                                     FloatRegister vtmp) {
2060   assert(opc == Op_MinReductionV || opc == Op_MaxReductionV ||
2061          opc == Op_UMinReductionV || opc == Op_UMaxReductionV, "unsupported");
2062   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2063   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
2064   assert_different_registers(dst, isrc);
2065   bool isQ = vector_length_in_bytes == 16;
2066   bool is_min;
2067   bool is_unsigned;
2068   Condition cond;
2069   decode_minmax_reduction_opc(opc, &is_min, &is_unsigned, &cond);
2070   BLOCK_COMMENT("neon_reduce_minmax_integral {");
2071     if (bt == T_LONG) {
2072       assert(vtmp == fnoreg, "should be");
2073       assert(isQ, "should be");
2074       umov(rscratch1, vsrc, D, 0);
2075       cmp(isrc, rscratch1);
2076       csel(dst, isrc, rscratch1, cond);
2077       umov(rscratch1, vsrc, D, 1);
2078       cmp(dst, rscratch1);
2079       csel(dst, dst, rscratch1, cond);
2080     } else {
2081       SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
2082       if (size == T2S) {
2083         // For T2S (2x32-bit elements), use pairwise instructions because
2084         // uminv/umaxv/sminv/smaxv don't support arrangement 2S.
2085         neon_minmaxp(is_unsigned, is_min, vtmp, size, vsrc, vsrc);
2086       } else {
2087         // For other sizes, use reduction to scalar instructions.
2088         neon_minmaxv(is_unsigned, is_min, vtmp, size, vsrc);
2089       }
2090       if (bt == T_INT) {
2091         umov(dst, vtmp, S, 0);
2092       } else if (is_unsigned) {
2093         umov(dst, vtmp, elemType_to_regVariant(bt), 0);
2094       } else {
2095         smov(dst, vtmp, elemType_to_regVariant(bt), 0);
2096       }
2097       cmpw(dst, isrc);
2098       cselw(dst, dst, isrc, cond);
2099     }
2100   BLOCK_COMMENT("} neon_reduce_minmax_integral");
2101 }
2102 
2103 // Vector reduction for integral type with SVE instruction.
2104 // Supported operations are Add, And, Or, Xor, Max, Min, UMax, UMin.
2105 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
2106 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
2107                                             FloatRegister src2, PRegister pg, FloatRegister tmp) {
2108   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2109   assert(pg->is_governing(), "This register has to be a governing predicate register");
2110   assert_different_registers(src1, dst);
2111   // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
2112   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2113   switch (opc) {
2114     case Op_AddReductionVI: {
2115       sve_uaddv(tmp, size, pg, src2);
2116       if (bt == T_BYTE) {
2117         smov(dst, tmp, size, 0);
2118         addw(dst, src1, dst, ext::sxtb);
2119       } else if (bt == T_SHORT) {
2120         smov(dst, tmp, size, 0);
2121         addw(dst, src1, dst, ext::sxth);
2122       } else {
2123         umov(dst, tmp, size, 0);
2124         addw(dst, dst, src1);
2125       }
2126       break;
2127     }
2128     case Op_AddReductionVL: {
2129       sve_uaddv(tmp, size, pg, src2);
2130       umov(dst, tmp, size, 0);
2131       add(dst, dst, src1);
2132       break;
2133     }
2134     case Op_AndReductionV: {
2135       sve_andv(tmp, size, pg, src2);
2136       if (bt == T_INT || bt == T_LONG) {
2137         umov(dst, tmp, size, 0);
2138       } else {
2139         smov(dst, tmp, size, 0);
2140       }
2141       if (bt == T_LONG) {
2142         andr(dst, dst, src1);
2143       } else {
2144         andw(dst, dst, src1);
2145       }
2146       break;
2147     }
2148     case Op_OrReductionV: {
2149       sve_orv(tmp, size, pg, src2);
2150       if (bt == T_INT || bt == T_LONG) {
2151         umov(dst, tmp, size, 0);
2152       } else {
2153         smov(dst, tmp, size, 0);
2154       }
2155       if (bt == T_LONG) {
2156         orr(dst, dst, src1);
2157       } else {
2158         orrw(dst, dst, src1);
2159       }
2160       break;
2161     }
2162     case Op_XorReductionV: {
2163       sve_eorv(tmp, size, pg, src2);
2164       if (bt == T_INT || bt == T_LONG) {
2165         umov(dst, tmp, size, 0);
2166       } else {
2167         smov(dst, tmp, size, 0);
2168       }
2169       if (bt == T_LONG) {
2170         eor(dst, dst, src1);
2171       } else {
2172         eorw(dst, dst, src1);
2173       }
2174       break;
2175     }
2176     case Op_MaxReductionV:
2177     case Op_MinReductionV:
2178     case Op_UMaxReductionV:
2179     case Op_UMinReductionV: {
2180       bool is_min;
2181       bool is_unsigned;
2182       Condition cond;
2183       decode_minmax_reduction_opc(opc, &is_min, &is_unsigned, &cond);
2184       sve_minmaxv(is_unsigned, is_min, tmp, size, pg, src2);
2185       // Move result from vector to general register
2186       if (is_unsigned || bt == T_INT || bt == T_LONG) {
2187         umov(dst, tmp, size, 0);
2188       } else {
2189         smov(dst, tmp, size, 0);
2190       }
2191       if (bt == T_LONG) {
2192         cmp(dst, src1);
2193         csel(dst, dst, src1, cond);
2194       } else {
2195         cmpw(dst, src1);
2196         cselw(dst, dst, src1, cond);
2197       }
2198       break;
2199     }
2200     default:
2201       assert(false, "unsupported");
2202       ShouldNotReachHere();
2203   }
2204 
2205   if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
2206     if (bt == T_BYTE) {
2207       sxtb(dst, dst);
2208     } else if (bt == T_SHORT) {
2209       sxth(dst, dst);
2210     }
2211   }
2212 }
2213 
2214 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
2215 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
2216 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
2217 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
2218   uint32_t max_vector_length = Matcher::max_vector_size(bt);
2219   assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
2220 
2221   // Set all elements to false if the input "lane_cnt" is zero.
2222   if (lane_cnt == 0) {
2223     sve_pfalse(dst);
2224     return;
2225   }
2226 
2227   SIMD_RegVariant size = elemType_to_regVariant(bt);
2228   assert(size != Q, "invalid size");
2229 
2230   // Set all true if "lane_cnt" equals to the max lane count.
2231   if (lane_cnt == max_vector_length) {
2232     sve_ptrue(dst, size, /* ALL */ 0b11111);
2233     return;
2234   }
2235 
2236   // Fixed numbers for "ptrue".
2237   switch(lane_cnt) {
2238   case 1: /* VL1 */
2239   case 2: /* VL2 */
2240   case 3: /* VL3 */
2241   case 4: /* VL4 */
2242   case 5: /* VL5 */
2243   case 6: /* VL6 */
2244   case 7: /* VL7 */
2245   case 8: /* VL8 */
2246     sve_ptrue(dst, size, lane_cnt);
2247     return;
2248   case 16:
2249     sve_ptrue(dst, size, /* VL16 */ 0b01001);
2250     return;
2251   case 32:
2252     sve_ptrue(dst, size, /* VL32 */ 0b01010);
2253     return;
2254   case 64:
2255     sve_ptrue(dst, size, /* VL64 */ 0b01011);
2256     return;
2257   case 128:
2258     sve_ptrue(dst, size, /* VL128 */ 0b01100);
2259     return;
2260   case 256:
2261     sve_ptrue(dst, size, /* VL256 */ 0b01101);
2262     return;
2263   default:
2264     break;
2265   }
2266 
2267   // Special patterns for "ptrue".
2268   if (lane_cnt == round_down_power_of_2(max_vector_length)) {
2269     sve_ptrue(dst, size, /* POW2 */ 0b00000);
2270   } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
2271     sve_ptrue(dst, size, /* MUL4 */ 0b11101);
2272   } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
2273     sve_ptrue(dst, size, /* MUL3 */ 0b11110);
2274   } else {
2275     // Encode to "whileltw" for the remaining cases.
2276     mov(rscratch1, lane_cnt);
2277     sve_whileltw(dst, size, zr, rscratch1);
2278   }
2279 }
2280 
// Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
// Any remaining elements of dst will be filled with zero.
//
// The elements are of type SHORT. Each half of src is widened to INT,
// compacted with sve_compact at INT granularity, and narrowed back to SHORT;
// the two compacted halves are then joined with sve_splice.
//
// Parameters:
//   dst   - result vector; must differ from src, vtmp and vzr.
//   src   - input vector.
//   mask  - predicate selecting the active elements; must differ from pgtmp.
//   vzr   - vector used as the second uzp1 operand when narrowing; named and
//           used as an all-zero vector (the caller is expected to have zeroed it).
//   vtmp  - vector temporary, written only when the highest half is processed.
//   pgtmp - governing predicate temporary, overwritten.
//   vector_length_in_bytes - length of the vector being compressed; when it is
//           no more than MaxVectorSize/2 only the lowest half is processed.
// Clobbers: rscratch1
// Preserves: mask, vzr
void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
                                           FloatRegister vzr, FloatRegister vtmp,
                                           PRegister pgtmp, unsigned vector_length_in_bytes) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  // When called by sve_compress_byte, src and vtmp may be the same register.
  // That is why src and vtmp are checked in two separate groups rather than
  // in a single assert_different_registers call.
  assert_different_registers(dst, src, vzr);
  assert_different_registers(dst, vtmp, vzr);
  assert_different_registers(mask, pgtmp);
  // high <-- low
  // Example input:   src   = hh gg ff ee dd cc bb aa, one character is 8 bits.
  //                  mask  = 01 00 00 01 01 00 01 01, one character is 1 bit.
  // Expected result: dst   = 00 00 00 hh ee dd bb aa

  // Extend lowest half to type INT.
  // dst   =  00dd  00cc  00bb  00aa
  sve_uunpklo(dst, S, src);
  // pgtmp =  0001  0000  0001  0001
  sve_punpklo(pgtmp, mask);
  // Pack the active elements in size of type INT to the right,
  // and fill the remainings with zero.
  // dst   =  0000  00dd  00bb  00aa
  sve_compact(dst, S, dst, pgtmp);
  // Narrow the result back to type SHORT.
  // dst   = 00 00 00 00 00 dd bb aa
  sve_uzp1(dst, H, dst, vzr);

  // Return if the vector length is no more than MaxVectorSize/2, since the
  // highest half is invalid.
  if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
    return;
  }

  // Count the active elements of lowest half.
  // This must happen before pgtmp is overwritten for the highest half below.
  // rscratch1 = 3
  sve_cntp(rscratch1, S, ptrue, pgtmp);

  // Repeat to the highest half.
  // pgtmp =  0001  0000  0000  0001
  sve_punpkhi(pgtmp, mask);
  // src is read here for the last time, so vtmp may alias src.
  // vtmp  =  00hh  00gg  00ff  00ee
  sve_uunpkhi(vtmp, S, src);
  // vtmp  =  0000  0000  00hh  00ee
  sve_compact(vtmp, S, vtmp, pgtmp);
  // vtmp  = 00 00 00 00 00 00 hh ee
  sve_uzp1(vtmp, H, vtmp, vzr);

  // Build a predicate covering as many SHORT elements as were active in the
  // lowest half (the count saved in rscratch1).
  // pgtmp = 00 00 00 00 00 01 01 01
  sve_whilelt(pgtmp, H, zr, rscratch1);
  // Compressed low:  dst  = 00 00 00 00 00 dd bb aa
  // Compressed high: vtmp = 00 00 00 00 00 00 hh ee
  // Combine the compressed low with the compressed high:
  //                  dst  = 00 00 00 hh ee dd bb aa
  sve_splice(dst, H, pgtmp, vtmp);
}
2339 
// Pack active BYTE elements of src, under the control of mask, into the
// lowest-numbered elements of dst; the remaining elements of dst are zero
// (see the example below).
//
// Each half of src is widened to SHORT, compressed with sve_compress_short,
// and narrowed back to BYTE; the two compressed halves are then joined with
// sve_splice.
//
// Parameters:
//   dst   - result vector.
//   src   - input vector.
//   mask  - predicate selecting the active elements.
//   vtmp1, vtmp2 - vector temporaries, overwritten.
//   vtmp3 - vector temporary, set to all zero here and used as the zero vector.
//   ptmp  - predicate temporary, overwritten.
//   pgtmp - governing predicate temporary, handed to sve_compress_short.
//   vector_length_in_bytes - length of the vector being compressed; when it is
//           no more than MaxVectorSize/2 only the lowest half is processed.
// dst, src and the three vector temporaries must all be different registers,
// as must mask, ptmp and pgtmp.
// Clobbers: rscratch1, rscratch2
//   (rscratch2 is written here; rscratch1 is listed as clobbered by
//   sve_compress_short.)
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
                                          FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
                                          PRegister ptmp, PRegister pgtmp, unsigned vector_length_in_bytes) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3);
  assert_different_registers(mask, ptmp, pgtmp);
  // high <-- low
  // Example input:   src   = q p n m l k j i h g f e d c b a, one character is 8 bits.
  //                  mask  = 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 1, one character is 1 bit.
  // Expected result: dst   = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
  // vtmp3 serves as the all-zero vector for every narrowing step below and
  // inside sve_compress_short.
  FloatRegister vzr = vtmp3;
  sve_dup(vzr, B, 0);

  // Extend lowest half to type SHORT.
  // vtmp1 =  0h  0g  0f  0e  0d  0c  0b  0a
  sve_uunpklo(vtmp1, H, src);
  // ptmp  =  00  01  00  00  00  01  00  01
  sve_punpklo(ptmp, mask);
  // Pack the active elements in size of type SHORT to the right,
  // and fill the remainings with zero.
  // The widened data is twice as long as the input, so the length passed on
  // is doubled and capped at MaxVectorSize.
  // dst   =  00  00  00  00  00  0g  0c  0a
  unsigned extended_size = vector_length_in_bytes << 1;
  sve_compress_short(dst, vtmp1, ptmp, vzr, vtmp2, pgtmp, extended_size > MaxVectorSize ? MaxVectorSize : extended_size);
  // Narrow the result back to type BYTE.
  // dst   = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
  sve_uzp1(dst, B, dst, vzr);

  // Return if the vector length is no more than MaxVectorSize/2, since the
  // highest half is invalid.
  if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
    return;
  }
  // Count the active elements of lowest half.
  // This must happen before ptmp is overwritten for the highest half below.
  // rscratch2 = 3
  sve_cntp(rscratch2, H, ptrue, ptmp);

  // Repeat to the highest half.
  // ptmp  =  00  01  00  00  00  00  00  01
  sve_punpkhi(ptmp, mask);
  // vtmp2 =  0q  0p  0n  0m  0l  0k  0j  0i
  sve_uunpkhi(vtmp2, H, src);
  // vtmp2 is passed as both the source and the vector temporary of
  // sve_compress_short here. The length passed is the part of the widened
  // data that lies beyond the first MaxVectorSize bytes.
  // vtmp1 =  00  00  00  00  00  00  0p  0i
  sve_compress_short(vtmp1, vtmp2, ptmp, vzr, vtmp2, pgtmp, extended_size - MaxVectorSize);
  // vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
  sve_uzp1(vtmp1, B, vtmp1, vzr);

  // Build a predicate covering as many BYTE elements as were active in the
  // lowest half (the count saved in rscratch2).
  // ptmp  = 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
  sve_whilelt(ptmp, B, zr, rscratch2);
  // Compressed low:  dst   = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
  // Compressed high: vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
  // Combine the compressed low with the compressed high:
  //                  dst   = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
  sve_splice(dst, B, ptmp, vtmp1);
}
2396 
2397 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2398   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2399   SIMD_Arrangement size = isQ ? T16B : T8B;
2400   if (bt == T_BYTE) {
2401     rbit(dst, size, src);
2402   } else {
2403     neon_reverse_bytes(dst, src, bt, isQ);
2404     rbit(dst, size, dst);
2405   }
2406 }
2407 
2408 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2409   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2410   SIMD_Arrangement size = isQ ? T16B : T8B;
2411   switch (bt) {
2412     case T_BYTE:
2413       if (dst != src) {
2414         orr(dst, size, src, src);
2415       }
2416       break;
2417     case T_SHORT:
2418       rev16(dst, size, src);
2419       break;
2420     case T_INT:
2421       rev32(dst, size, src);
2422       break;
2423     case T_LONG:
2424       rev64(dst, size, src);
2425       break;
2426     default:
2427       assert(false, "unsupported");
2428       ShouldNotReachHere();
2429   }
2430 }
2431 
2432 // VectorRearrange implementation for short/int/float/long/double types with NEON
2433 // instructions. For VectorRearrange short/int/float, we use NEON tbl instruction.
2434 // But since it supports bytes table only, we need to lookup 2/4 bytes as a group.
2435 // For VectorRearrange long/double, we compare the shuffle input with iota indices,
2436 // and use bsl to implement the operation.
2437 void C2_MacroAssembler::neon_rearrange_hsd(FloatRegister dst, FloatRegister src,
2438                                            FloatRegister shuffle, FloatRegister tmp,
2439                                            BasicType bt, bool isQ) {
2440   assert_different_registers(dst, src, shuffle, tmp);
2441   SIMD_Arrangement size1 = isQ ? T16B : T8B;
2442   SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2443 
2444   // Here is an example that rearranges a NEON vector with 4 ints:
2445   // Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1]
2446   //   1. We assume the shuffle input is Vi int[2, 3, 0, 1].
2447   //   2. Multiply Vi int[2, 3, 0, 1] with constant int vector
2448   //      [0x04040404, 0x04040404, 0x04040404, 0x04040404], and get
2449   //      tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404].
2450   //   3. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100],
2451   //      and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504]
2452   //   4. Use Vm as index register, and use V1 as table register.
2453   //      Then get V2 as the result by tbl NEON instructions.
2454   switch (bt) {
2455     case T_SHORT:
2456       mov(tmp, size1, 0x02);
2457       mulv(dst, size2, shuffle, tmp);
2458       mov(tmp, size2, 0x0100);
2459       addv(dst, size1, dst, tmp);
2460       tbl(dst, size1, src, 1, dst);
2461       break;
2462     case T_INT:
2463     case T_FLOAT:
2464       mov(tmp, size1, 0x04);
2465       mulv(dst, size2, shuffle, tmp);
2466       mov(tmp, size2, 0x03020100);
2467       addv(dst, size1, dst, tmp);
2468       tbl(dst, size1, src, 1, dst);
2469       break;
2470     case T_LONG:
2471     case T_DOUBLE:
2472       {
2473         int idx = vector_iota_entry_index(T_LONG);
2474         lea(rscratch1,
2475             ExternalAddress(StubRoutines::aarch64::vector_iota_indices(idx)));
2476         ldrq(tmp, rscratch1);
2477         // Check whether the input "shuffle" is the same with iota indices.
2478         // Return "src" if true, otherwise swap the two elements of "src".
2479         cm(EQ, dst, size2, shuffle, tmp);
2480         ext(tmp, size1, src, src, 8);
2481         bsl(dst, size1, src, tmp);
2482       }
2483       break;
2484     default:
2485       assert(false, "unsupported element type");
2486       ShouldNotReachHere();
2487   }
2488 }
2489 
2490 // Extract a scalar element from an sve vector at position 'idx'.
2491 // The input elements in src are expected to be of integral type.
2492 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2493                                              int idx, FloatRegister vtmp) {
2494   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2495   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2496   if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
2497     if (bt == T_INT || bt == T_LONG) {
2498       umov(dst, src, size, idx);
2499     } else {
2500       smov(dst, src, size, idx);
2501     }
2502   } else {
2503     sve_orr(vtmp, src, src);
2504     sve_ext(vtmp, vtmp, idx << size);
2505     if (bt == T_INT || bt == T_LONG) {
2506       umov(dst, vtmp, size, 0);
2507     } else {
2508       smov(dst, vtmp, size, 0);
2509     }
2510   }
2511 }
2512 
2513 // java.lang.Math::round intrinsics
2514 
// Clobbers: rscratch1, rflags
//
// NEON code sequence for the vector java.lang.Math::round intrinsic.
//   dst        - receives the converted integer lanes
//   src        - floating-point input lanes
//   tmp1..tmp3 - vector temporaries; must differ from each other and from src/dst
//   T          - T2S/T4S for float lanes, T2D for double lanes
// Two candidate results are computed for every lane and one of them is
// selected per lane by a mask derived from -src.
void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                          FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
  assert_different_registers(tmp1, tmp2, tmp3, src, dst);
  switch (T) {
    case T2S:
    case T4S:
      fmovs(tmp1, T, 0.5f);
      // rscratch1 = raw bit pattern of the float 2^23
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case T2D:
      fmovd(tmp1, T, 0.5);
      // rscratch1 = raw bit pattern of the double 2^52
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      // Only reached for an unsupported arrangement; the condition is false here.
      assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
  }
  fadd(tmp1, T, tmp1, src);
  fcvtms(tmp1, T, tmp1);
  // tmp1 = floor(src + 0.5, ties to even)

  fcvtas(dst, T, src);
  // dst = round(src), ties to away

  // Unsigned (HS) compare of the raw bits of -src against the bits of the
  // constant broadcast from rscratch1.
  fneg(tmp3, T, src);
  dup(tmp2, T, rscratch1);
  cm(HS, tmp3, T, tmp3, tmp2);
  // tmp3 is now a set of flags

  // bif inserts tmp1 into dst where tmp3 is clear; lanes whose flag is set
  // keep the fcvtas result.
  bif(dst, T16B, tmp1, tmp3);
  // result in dst
}
2547 
// Clobbers: rscratch1, rflags
//
// SVE code sequence for the vector java.lang.Math::round intrinsic.
//   dst        - receives the converted integer lanes
//   src        - floating-point input lanes
//   tmp1, tmp2 - vector temporaries; must differ from each other and from src/dst
//   pgtmp      - governing predicate temporary, written by the compare below
//   T          - S for float lanes, D for double lanes
void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                         FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(tmp1, tmp2, src, dst);

  switch (T) {
    case S:
      // rscratch1 = raw bit pattern of the float 2^23
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case D:
      // rscratch1 = raw bit pattern of the double 2^52
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      // Only reached for an unsupported variant; the condition is false here.
      assert(T == S || T == D, "invalid register variant");
  }

  sve_frinta(dst, T, ptrue, src);
  // dst = round(src), ties to away

  Label none;

  // pgtmp selects the lanes that need the floor(src + 0.5) result instead:
  // unsigned (HS) compare of the broadcast constant against the raw bits of -src.
  sve_fneg(tmp1, T, ptrue, src);
  sve_dup(tmp2, T, rscratch1);
  sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
  // Skip the fix-up when the compare left no lane selected.
  br(EQ, none);
  {
    // Only the lanes governed by pgtmp are recomputed here.
    sve_cpy(tmp1, T, pgtmp, 0.5);
    sve_fadd(tmp1, T, pgtmp, src);
    sve_frintm(dst, T, pgtmp, tmp1);
    // dst = floor(src + 0.5, ties to even)
  }
  bind(none);

  // dst holds rounded floating-point values at this point; convert all lanes
  // to integers.
  sve_fcvtzs(dst, T, ptrue, dst, T);
  // result in dst
}
2585 
2586 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2587                                            FloatRegister one, SIMD_Arrangement T) {
2588   assert_different_registers(dst, src, zero, one);
2589   assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2590 
2591   facgt(dst, T, src, zero);
2592   ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
2593   bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2594 }
2595 
2596 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2597                                           FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
2598     assert_different_registers(dst, src, zero, one, vtmp);
2599     assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2600 
2601     sve_orr(vtmp, src, src);
2602     sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pmtp=0 for +-0.0 and NaN. 0x1 otherwise
2603     switch (T) {
2604     case S:
2605       sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
2606       sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
2607                                         // on the sign of the float value
2608       break;
2609     case D:
2610       sve_and(vtmp, T, min_jlong);
2611       sve_orr(vtmp, T, jlong_cast(1.0));
2612       break;
2613     default:
2614       assert(false, "unsupported");
2615       ShouldNotReachHere();
2616     }
2617     sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
2618                                        // Result in dst
2619 }
2620 
2621 bool C2_MacroAssembler::in_scratch_emit_size() {
2622   if (ciEnv::current()->task() != nullptr) {
2623     PhaseOutput* phase_output = Compile::current()->output();
2624     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2625       return true;
2626     }
2627   }
2628   return MacroAssembler::in_scratch_emit_size();
2629 }
2630 
2631 static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
2632   fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
2633 }
2634 
2635 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register rval, Register rtmp) {
2636   assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
2637   if (t == TypeInt::INT) {
2638     return;
2639   }
2640 
2641   BLOCK_COMMENT("verify_int_in_range {");
2642   Label L_success, L_failure;
2643 
2644   jint lo = t->_lo;
2645   jint hi = t->_hi;
2646 
2647   if (lo != min_jint) {
2648     subsw(rtmp, rval, lo);
2649     br(Assembler::LT, L_failure);
2650   }
2651   if (hi != max_jint) {
2652     subsw(rtmp, rval, hi);
2653     br(Assembler::GT, L_failure);
2654   }
2655   b(L_success);
2656 
2657   bind(L_failure);
2658   movw(c_rarg0, idx);
2659   mov(c_rarg1, rval);
2660   movw(c_rarg2, lo);
2661   movw(c_rarg3, hi);
2662   reconstruct_frame_pointer(rtmp);
2663   rt_call(CAST_FROM_FN_PTR(address, abort_verify_int_in_range), rtmp);
2664   hlt(0);
2665 
2666   bind(L_success);
2667   BLOCK_COMMENT("} verify_int_in_range");
2668 }
2669 
2670 static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
2671   fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
2672 }
2673 
2674 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register rval, Register rtmp) {
2675   assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
2676   if (t == TypeLong::LONG) {
2677     return;
2678   }
2679 
2680   BLOCK_COMMENT("verify_long_in_range {");
2681   Label L_success, L_failure;
2682 
2683   jlong lo = t->_lo;
2684   jlong hi = t->_hi;
2685 
2686   if (lo != min_jlong) {
2687     subs(rtmp, rval, lo);
2688     br(Assembler::LT, L_failure);
2689   }
2690   if (hi != max_jlong) {
2691     subs(rtmp, rval, hi);
2692     br(Assembler::GT, L_failure);
2693   }
2694   b(L_success);
2695 
2696   bind(L_failure);
2697   movw(c_rarg0, idx);
2698   mov(c_rarg1, rval);
2699   mov(c_rarg2, lo);
2700   mov(c_rarg3, hi);
2701   reconstruct_frame_pointer(rtmp);
2702   rt_call(CAST_FROM_FN_PTR(address, abort_verify_long_in_range), rtmp);
2703   hlt(0);
2704 
2705   bind(L_success);
2706   BLOCK_COMMENT("} verify_long_in_range");
2707 }
2708 
2709 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
2710   const int framesize = Compile::current()->output()->frame_size_in_bytes();
2711   if (PreserveFramePointer) {
2712     // frame pointer is valid
2713 #ifdef ASSERT
2714     // Verify frame pointer value in rfp.
2715     add(rtmp, sp, framesize - 2 * wordSize);
2716     Label L_success;
2717     cmp(rfp, rtmp);
2718     br(Assembler::EQ, L_success);
2719     stop("frame pointer mismatch");
2720     bind(L_success);
2721 #endif // ASSERT
2722   } else {
2723     add(rfp, sp, framesize - 2 * wordSize);
2724   }
2725 }
2726 
2727 // Selects elements from two source vectors (src1, src2) based on index values in the index register
2728 // using Neon instructions and places it in the destination vector element corresponding to the
2729 // index vector element. Each index in the index register must be in the range - [0, 2 * NUM_ELEM),
2730 // where NUM_ELEM is the number of BasicType elements per vector.
2731 // If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
2732 // Otherwise, selects src2[idx – NUM_ELEM]
2733 void C2_MacroAssembler::select_from_two_vectors_neon(FloatRegister dst, FloatRegister src1,
2734                                                      FloatRegister src2, FloatRegister index,
2735                                                      FloatRegister tmp, unsigned vector_length_in_bytes) {
2736   assert_different_registers(dst, src1, src2, tmp);
2737   SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
2738 
2739   if (vector_length_in_bytes == 16) {
2740     assert(UseSVE <= 1, "sve must be <= 1");
2741     assert(src1->successor() == src2, "Source registers must be ordered");
2742     // If the vector length is 16B, then use the Neon "tbl" instruction with two vector table
2743     tbl(dst, size, src1, 2, index);
2744   } else { // vector length == 8
2745     assert(UseSVE == 0, "must be Neon only");
2746     // We need to fit both the source vectors (src1, src2) in a 128-bit register because the
2747     // Neon "tbl" instruction supports only looking up 16B vectors. We then use the Neon "tbl"
2748     // instruction with one vector lookup
2749     ins(tmp, D, src1, 0, 0);
2750     ins(tmp, D, src2, 1, 0);
2751     tbl(dst, size, tmp, 1, index);
2752   }
2753 }
2754 
2755 // Selects elements from two source vectors (src1, src2) based on index values in the index register
2756 // using SVE/SVE2 instructions and places it in the destination vector element corresponding to the
2757 // index vector element. Each index in the index register must be in the range - [0, 2 * NUM_ELEM),
2758 // where NUM_ELEM is the number of BasicType elements per vector.
2759 // If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
2760 // Otherwise, selects src2[idx – NUM_ELEM]
2761 void C2_MacroAssembler::select_from_two_vectors_sve(FloatRegister dst, FloatRegister src1,
2762                                                     FloatRegister src2, FloatRegister index,
2763                                                     FloatRegister tmp, SIMD_RegVariant T,
2764                                                     unsigned vector_length_in_bytes) {
2765   assert_different_registers(dst, src1, src2, index, tmp);
2766 
2767   if (vector_length_in_bytes == 8) {
2768     // We need to fit both the source vectors (src1, src2) in a single vector register because the
2769     // SVE "tbl" instruction is unpredicated and works on the entire vector which can lead to
2770     // incorrect results if each source vector is only partially filled. We then use the SVE "tbl"
2771     // instruction with one vector lookup
2772     assert(UseSVE >= 1, "sve must be >= 1");
2773     ins(tmp, D, src1, 0, 0);
2774     ins(tmp, D, src2, 1, 0);
2775     sve_tbl(dst, T, tmp, index);
2776   } else {  // UseSVE == 2 and vector_length_in_bytes > 8
2777     // If the vector length is > 8, then use the SVE2 "tbl" instruction with the two vector table.
2778     // The assertion - vector_length_in_bytes == MaxVectorSize ensures that this operation
2779     // is not executed on machines where vector_length_in_bytes < MaxVectorSize
2780     // with the only exception of 8B vector length.
2781     assert(UseSVE == 2 && vector_length_in_bytes == MaxVectorSize, "must be");
2782     assert(src1->successor() == src2, "Source registers must be ordered");
2783     sve_tbl(dst, T, src1, src2, index);
2784   }
2785 }
2786 
2787 void C2_MacroAssembler::select_from_two_vectors(FloatRegister dst, FloatRegister src1,
2788                                                 FloatRegister src2, FloatRegister index,
2789                                                 FloatRegister tmp, BasicType bt,
2790                                                 unsigned vector_length_in_bytes) {
2791 
2792   assert_different_registers(dst, src1, src2, index, tmp);
2793 
2794   // The cases that can reach this method are -
2795   // - UseSVE = 0/1, vector_length_in_bytes = 8 or 16, excluding double and long types
2796   // - UseSVE = 2, vector_length_in_bytes >= 8, for all types
2797   //
2798   // SVE/SVE2 tbl instructions are generated when UseSVE = 1 with vector_length_in_bytes = 8
2799   // and UseSVE = 2 with vector_length_in_bytes >= 8
2800   //
2801   // Neon instructions are generated when UseSVE = 0 with vector_length_in_bytes = 8 or 16 and
2802   // UseSVE = 1 with vector_length_in_bytes = 16
2803 
2804   if ((UseSVE == 1 && vector_length_in_bytes == 8) || UseSVE == 2) {
2805     SIMD_RegVariant T = elemType_to_regVariant(bt);
2806     select_from_two_vectors_sve(dst, src1, src2, index, tmp, T, vector_length_in_bytes);
2807     return;
2808   }
2809 
2810   // The only BasicTypes that can reach here are T_SHORT, T_BYTE, T_INT and T_FLOAT
2811   assert(bt != T_DOUBLE && bt != T_LONG, "unsupported basic type");
2812   assert(vector_length_in_bytes <= 16, "length_in_bytes must be <= 16");
2813 
2814   bool isQ = vector_length_in_bytes == 16;
2815 
2816   SIMD_Arrangement size1 = isQ ? T16B : T8B;
2817   SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2818 
2819   // Neon "tbl" instruction only supports byte tables, so we need to look at chunks of
2820   // 2B for selecting shorts or chunks of 4B for selecting ints/floats from the table.
2821   // The index values in "index" register are in the range of [0, 2 * NUM_ELEM) where NUM_ELEM
2822   // is the number of elements that can fit in a vector. For ex. for T_SHORT with 64-bit vector length,
2823   // the indices can range from [0, 8).
2824   // As an example with 64-bit vector length and T_SHORT type - let index = [2, 5, 1, 0]
2825   // Move a constant 0x02 in every byte of tmp - tmp = [0x0202, 0x0202, 0x0202, 0x0202]
2826   // Multiply index vector with tmp to yield - dst = [0x0404, 0x0a0a, 0x0202, 0x0000]
2827   // Move a constant 0x0100 in every 2B of tmp - tmp = [0x0100, 0x0100, 0x0100, 0x0100]
2828   // Add the multiplied result to the vector in tmp to obtain the byte level
2829   // offsets - dst = [0x0504, 0x0b0a, 0x0302, 0x0100]
2830   // Use these offsets in the "tbl" instruction to select chunks of 2B.
2831 
2832   if (bt == T_BYTE) {
2833     select_from_two_vectors_neon(dst, src1, src2, index, tmp, vector_length_in_bytes);
2834   } else {
2835     int elem_size = (bt == T_SHORT) ? 2 : 4;
2836     uint64_t tbl_offset = (bt == T_SHORT) ? 0x0100u : 0x03020100u;
2837 
2838     mov(tmp, size1, elem_size);
2839     mulv(dst, size2, index, tmp);
2840     mov(tmp, size2, tbl_offset);
2841     addv(dst, size1, dst, tmp); // "dst" now contains the processed index elements
2842                                 // to select a set of 2B/4B
2843     select_from_two_vectors_neon(dst, src1, src2, dst, tmp, vector_length_in_bytes);
2844   }
2845 }
2846 
2847 // Vector expand implementation. Elements from the src vector are expanded into
2848 // the dst vector under the control of the vector mask.
// Since there are no native instructions directly corresponding to expand before
// SVE2p2, the following implementations mainly leverage the TBL instruction to
// implement expand. To compute the index input for TBL, the prefix sum algorithm
2852 // (https://en.wikipedia.org/wiki/Prefix_sum) is used. The same algorithm is used
2853 // for NEON and SVE, but with different instructions where appropriate.
2854 
// Vector expand implementation for NEON.
//
// An example of 128-bit Byte vector:
//   Data direction: high <== low
//   Input:
//         src   = g  f  e  d  c  b  a  9  8  7  6  5  4  3  2  1
//         mask  = 0  0 -1 -1  0  0 -1 -1  0  0 -1 -1  0  0 -1 -1
//   Expected result:
//         dst   = 0  0  8  7  0  0  6  5  0  0  4  3  0  0  2  1
//
// Parameters:
//   dst                    - result vector
//   src                    - vector holding the elements to expand
//   mask                   - vector mask controlling the expansion
//   tmp1, tmp2             - vector temporaries; all five vector registers must differ
//   bt                     - element type (not referenced by the code below, which
//                            works on bytes for all types)
//   vector_length_in_bytes - 8 or 16 selects T8B/T16B; must be <= 16
void C2_MacroAssembler::vector_expand_neon(FloatRegister dst, FloatRegister src, FloatRegister mask,
                                           FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
                                           int vector_length_in_bytes) {
  assert(vector_length_in_bytes <= 16, "the vector length in bytes for NEON must be <= 16");
  assert_different_registers(dst, src, mask, tmp1, tmp2);
  // Since the TBL instruction only supports byte table, we need to
  // compute indices in byte type for all types.
  SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
  // tmp1 =  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  dup(tmp1, size, zr);
  // dst  =  0  0  1  1  0  0  1  1  0  0  1  1  0  0  1  1
  negr(dst, size, mask);
  // Calculate vector index for TBL with prefix sum algorithm.
  // Each round adds a copy of dst shifted up by i bytes (zeros from tmp1 are
  // shifted in), with i doubling every round.
  // dst  =  8  8  8  7  6  6  6  5  4  4  4  3  2  2  2  1
  for (int i = 1; i < vector_length_in_bytes; i <<= 1) {
    ext(tmp2, size, tmp1, dst, vector_length_in_bytes - i);
    addv(dst, size, tmp2, dst);
  }
  // Copy the mask, since bsl overwrites its first operand with the selection.
  // tmp2 =  0  0 -1 -1  0  0 -1 -1  0  0 -1 -1  0  0 -1 -1
  orr(tmp2, size, mask, mask);
  // Keep the prefix sum where the mask is set and zero (tmp1) elsewhere.
  // tmp2 =  0  0  8  7  0  0  6  5  0  0  4  3  0  0  2  1
  bsl(tmp2, size, dst, tmp1);
  // tmp1 =  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  movi(tmp1, size, 1);
  // Convert the 1-based positions into 0-based TBL indices; the inactive
  // lanes become -1.
  // dst  = -1 -1  7  6 -1 -1  5  4 -1 -1  3  2 -1 -1  1  0
  subv(dst, size, tmp2, tmp1);
  // Lanes whose index is -1 come out as 0, as shown below.
  // dst  =  0  0  8  7  0  0  6  5  0  0  4  3  0  0  2  1
  tbl(dst, size, src, 1, dst);
}
2893 
// Vector expand implementation for SVE.
//
// An example of 128-bit Short vector:
//   Data direction: high <== low
//   Input:
//         src   = gf ed cb a9 87 65 43 21
//         pg    = 00 01 00 01 00 01 00 01
//   Expected result:
//         dst   = 00 87 00 65 00 43 00 21
//
// Parameters:
//   dst                    - result vector
//   src                    - vector holding the elements to expand
//   pg                     - predicate controlling the expansion
//   tmp1, tmp2             - vector temporaries; dst, src, tmp1 and tmp2 must differ
//   bt                     - element type; determines the lane size used
//   vector_length_in_bytes - length of the vector being expanded
void C2_MacroAssembler::vector_expand_sve(FloatRegister dst, FloatRegister src, PRegister pg,
                                          FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
                                          int vector_length_in_bytes) {
  assert(UseSVE > 0, "expand implementation only for SVE");
  assert_different_registers(dst, src, tmp1, tmp2);
  SIMD_RegVariant size = elemType_to_regVariant(bt);

  // tmp1 = 00 00 00 00 00 00 00 00
  sve_dup(tmp1, size, 0);
  // Start tmp2 from the zero vector, then write 1 into the lanes active in pg
  // (merging form, so the other lanes keep the zero).
  sve_movprfx(tmp2, tmp1);
  // tmp2 = 00 01 00 01 00 01 00 01
  sve_cpy(tmp2, size, pg, 1, true);
  // Calculate vector index for TBL with prefix sum algorithm.
  // The shift distance i starts at one element and doubles every round.
  // tmp2 = 04 04 03 03 02 02 01 01
  for (int i = type2aelembytes(bt); i < vector_length_in_bytes; i <<= 1) {
    sve_movprfx(dst, tmp1);
    // The EXT instruction operates on the full-width sve register. The correct
    // index calculation method is:
    // vector_length_in_bytes - i + MaxVectorSize - vector_length_in_bytes =>
    // MaxVectorSize - i.
    sve_ext(dst, tmp2, MaxVectorSize - i);
    sve_add(tmp2, size, dst, tmp2);
  }
  // Keep the prefix sum in the lanes active in pg and zero (tmp1) elsewhere.
  // dst  = 00 04 00 03 00 02 00 01
  sve_sel(dst, size, pg, tmp2, tmp1);
  // Convert the 1-based positions into 0-based TBL indices; the inactive
  // lanes become -1.
  // dst  = -1 03 -1 02 -1 01 -1 00
  sve_sub(dst, size, 1);
  // Lanes whose index is -1 come out as 0, as shown below.
  // dst  = 00 87 00 65 00 43 00 21
  sve_tbl(dst, size, src, dst);
}
2933 
2934 // Optimized SVE cpy (imm, zeroing) instruction.
2935 //
2936 // `movi; cpy(imm, merging)` and `cpy(imm, zeroing)` have the same
2937 // functionality, but test results show that `movi; cpy(imm, merging)` has
2938 // higher throughput on some microarchitectures. This would depend on
2939 // microarchitecture and so may vary between implementations.
2940 void C2_MacroAssembler::sve_cpy(FloatRegister dst, SIMD_RegVariant T,
2941                                 PRegister pg, int imm8, bool isMerge) {
2942   if (VM_Version::prefer_sve_merging_mode_cpy() && !isMerge) {
2943     // Generates a NEON instruction `movi V<dst>.2d, #0`.
2944     // On AArch64, Z and V registers alias in the low 128 bits, so V<dst> is
2945     // the low 128 bits of Z<dst>. A write to V<dst> also clears all bits of
2946     // Z<dst> above 128, so this `movi` instruction effectively zeroes the
2947     // entire Z<dst> register. According to the Arm Software Optimization
2948     // Guide, `movi` is zero latency.
2949     movi(dst, T2D, 0);
2950     isMerge = true;
2951   }
2952   Assembler::sve_cpy(dst, T, pg, imm8, isMerge);
2953 }
2954 
2955 int C2_MacroAssembler::vector_iota_entry_index(BasicType bt) {
2956   // The vector iota entries array is ordered by type B/S/I/L/F/D, and
2957   // the offset between two types is 16.
2958   switch(bt) {
2959   case T_BYTE:
2960     return 0;
2961   case T_SHORT:
2962     return 1;
2963   case T_INT:
2964     return 2;
2965   case T_LONG:
2966     return 3;
2967   case T_FLOAT:
2968     return 4;
2969   case T_DOUBLE:
2970     return 5;
2971   default:
2972     ShouldNotReachHere();
2973   }
2974 }