New src/hotspot/cpu/aarch64/c2_MacroAssembler

   1 /*
   2  * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "opto/c2_MacroAssembler.hpp"
  29 #include "opto/compile.hpp"
  30 #include "opto/intrinsicnode.hpp"
  31 #include "opto/matcher.hpp"
  32 #include "opto/output.hpp"
  33 #include "opto/subnode.hpp"
  34 #include "runtime/stubRoutines.hpp"
  35 #include "utilities/globalDefinitions.hpp"
  36 
  37 #ifdef PRODUCT
  38 #define BLOCK_COMMENT(str) /* nothing */
  39 #define STOP(error) stop(error)
  40 #else
  41 #define BLOCK_COMMENT(str) block_comment(str)
  42 #define STOP(error) block_comment(error); stop(error)
  43 #endif
  44 
  45 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  46 
  47 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
  48 
  49 void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg,
  50                                   Register tmp2Reg, Register tmp3Reg) {
  51   Register oop = objectReg;
  52   Register box = boxReg;
  53   Register disp_hdr = tmpReg;
  54   Register tmp = tmp2Reg;
  55   Label object_has_monitor;
  56   Label count, no_count;
  57 
  58   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  59   assert_different_registers(oop, box, tmp, disp_hdr, rscratch1);
  60 
  61   // Load markWord from object into displaced_header.
  62   ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));
  63 
  64   if (DiagnoseSyncOnValueBasedClasses != 0) {
  65     load_klass(tmp, oop);
  66     ldrw(tmp, Address(tmp, Klass::access_flags_offset()));
  67     tstw(tmp, JVM_ACC_IS_VALUE_BASED_CLASS);
  68     br(Assembler::NE, no_count);
  69   }
  70 
  71   // Check for existing monitor
  72   tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor);
  73 
  74   if (LockingMode == LM_MONITOR) {
  75     tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
  76     b(no_count);
  77   } else {
  78     assert(LockingMode == LM_LEGACY, "must be");
  79     // Set tmp to be (markWord of object | UNLOCK_VALUE).
  80     orr(tmp, disp_hdr, markWord::unlocked_value);
  81 
  82     // Initialize the box. (Must happen before we update the object mark!)
  83     str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
  84 
  85     // Compare object markWord with an unlocked value (tmp) and if
  86     // equal exchange the stack address of our box with object markWord.
  87     // On failure disp_hdr contains the possibly locked markWord.
  88     cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true,
  89             /*release*/ true, /*weak*/ false, disp_hdr);
  90     br(Assembler::EQ, count);
  91 
  92     assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
  93 
  94     // If the compare-and-exchange succeeded, then we found an unlocked
  95     // object, will have now locked it will continue at label cont
  96 
  97     // Check if the owner is self by comparing the value in the
  98     // markWord of object (disp_hdr) with the stack pointer.
  99     mov(rscratch1, sp);
 100     sub(disp_hdr, disp_hdr, rscratch1);
 101     mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place));
 102     // If condition is true we are cont and hence we can store 0 as the
 103     // displaced header in the box, which indicates that it is a recursive lock.
 104     ands(tmp/*==0?*/, disp_hdr, tmp);   // Sets flags for result
 105     str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 106     b(no_count);
 107   }
 108 
 109   // Handle existing monitor.
 110   bind(object_has_monitor);
 111 
 112   // The object's monitor m is unlocked iff m->owner == nullptr,
 113   // otherwise m->owner may contain a thread id, a stack address for LM_LEGACY,
 114   // or the ANONYMOUS_OWNER constant for LM_LIGHTWEIGHT.
 115   //
 116   // Try to CAS m->owner from null to current thread.
 117   ldr(rscratch2, Address(rthread, JavaThread::lock_id_offset()));
 118   add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value));
 119   cmpxchg(tmp, zr, rscratch2, Assembler::xword, /*acquire*/ true,
 120           /*release*/ true, /*weak*/ false, tmp3Reg); // Sets flags for result
 121 
 122   // Store a non-null value into the box to avoid looking like a re-entrant
 123   // lock. The fast-path monitor unlock code checks for
 124   // markWord::monitor_value so use markWord::unused_mark which has the
 125   // relevant bit set, and also matches ObjectSynchronizer::enter.
 126   mov(tmp, (address)markWord::unused_mark().value());
 127   str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 128 
 129   br(Assembler::EQ, no_count); // CAS success means locking succeeded
 130 
 131   cmp(tmp3Reg, rscratch2);
 132   br(Assembler::NE, no_count); // Check for recursive locking
 133 
 134   // Recursive lock case
 135   increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1);
 136   // flag == EQ still from the cmp above, checking if this is a reentrant lock
 137   b(no_count);
 138 
 139   bind(count);
 140   inc_held_monitor_count();
 141 
 142   bind(no_count);
 143   // flag == EQ indicates success
 144   // flag == NE indicates failure
 145 }
 146 
 147 void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg,
 148                                     Register tmp2Reg) {
 149   Register oop = objectReg;
 150   Register box = boxReg;
 151   Register disp_hdr = tmpReg;
 152   Register tmp = tmp2Reg;
 153   Label cont;
 154   Label object_has_monitor;
 155   Label no_count;
 156 
 157   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 158   assert_different_registers(oop, box, tmp, disp_hdr);
 159 
 160   if (LockingMode == LM_LEGACY) {
 161     // Find the lock address and load the displaced header from the stack.
 162     ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 163 
 164     // If the displaced header is 0, we have a recursive unlock.
 165     cmp(disp_hdr, zr);
 166     br(Assembler::EQ, no_count);
 167   }
 168 
 169   // Handle existing monitor.
 170   ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
 171   tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor);
 172 
 173   if (LockingMode == LM_MONITOR) {
 174     tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
 175     b(no_count);
 176   } else {
 177     assert(LockingMode == LM_LEGACY, "must be");
 178     // Check if it is still a light weight lock, this is is true if we
 179     // see the stack address of the basicLock in the markWord of the
 180     // object.
 181 
 182     cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false,
 183             /*release*/ true, /*weak*/ false, tmp);
 184     b(cont);
 185   }
 186 
 187   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
 188 
 189   // Handle existing monitor.
 190   bind(object_has_monitor);
 191   STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
 192   add(tmp, tmp, -(int)markWord::monitor_value); // monitor
 193 
 194   // If the owner is anonymous, we need to fix it -- in an outline stub.
 195   Register tmp2 = disp_hdr;
 196   ldr(tmp2, Address(tmp, ObjectMonitor::owner_offset()));
 197   // We cannot use tbnz here, the target might be too far away and cannot
 198   // be encoded.
 199   mov(rscratch1, (uint64_t)ObjectMonitor::ANONYMOUS_OWNER);
 200   cmp(tmp2, rscratch1);
 201   C2HandleAnonOMOwnerStub* stub = new (Compile::current()->comp_arena()) C2HandleAnonOMOwnerStub(tmp, tmp2);
 202   Compile::current()->output()->add_stub(stub);
 203   br(Assembler::EQ, stub->entry());
 204   bind(stub->continuation());
 205 
 206   ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
 207 
 208   Label notRecursive;
 209   cbz(disp_hdr, notRecursive);
 210 
 211   // Recursive lock
 212   sub(disp_hdr, disp_hdr, 1u);
 213   str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
 214   cmp(disp_hdr, disp_hdr); // Sets flags for result
 215   b(no_count);
 216 
 217   bind(notRecursive);
 218   ldr(rscratch1, Address(tmp, ObjectMonitor::EntryList_offset()));
 219   ldr(disp_hdr, Address(tmp, ObjectMonitor::cxq_offset()));
 220   orr(rscratch1, rscratch1, disp_hdr); // Will be 0 if both are 0.
 221   cmp(rscratch1, zr); // Sets flags for result
 222   cbnz(rscratch1, no_count);
 223   // need a release store here
 224   lea(tmp, Address(tmp, ObjectMonitor::owner_offset()));
 225   stlr(zr, tmp); // set unowned
 226   b(no_count);
 227 
 228   bind(cont);
 229   // flag == EQ indicates success
 230   // flag == NE indicates failure
 231   br(Assembler::NE, no_count);
 232 
 233   dec_held_monitor_count();
 234 
 235   bind(no_count);
 236 }
 237 
 238 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register t1,
 239                                               Register t2, Register t3) {
 240   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 241   assert_different_registers(obj, t1, t2, t3);
 242 
 243   // Handle inflated monitor.
 244   Label inflated;
 245   // Finish fast lock successfully. MUST branch to with flag == EQ
 246   Label locked;
 247   // Finish fast lock unsuccessfully. MUST branch to with flag == NE
 248   Label slow_path;
 249 
 250   if (DiagnoseSyncOnValueBasedClasses != 0) {
 251     load_klass(t1, obj);
 252     ldrw(t1, Address(t1, Klass::access_flags_offset()));
 253     tstw(t1, JVM_ACC_IS_VALUE_BASED_CLASS);
 254     br(Assembler::NE, slow_path);
 255   }
 256 
 257   const Register t1_mark = t1;
 258 
 259   { // Lightweight locking
 260 
 261     // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ
 262     Label push;
 263 
 264     const Register t2_top = t2;
 265     const Register t3_t = t3;
 266 
 267     // Check if lock-stack is full.
 268     ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 269     cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
 270     br(Assembler::GT, slow_path);
 271 
 272     // Check if recursive.
 273     subw(t3_t, t2_top, oopSize);
 274     ldr(t3_t, Address(rthread, t3_t));
 275     cmp(obj, t3_t);
 276     br(Assembler::EQ, push);
 277 
 278     // Relaxed normal load to check for monitor. Optimization for monitor case.
 279     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 280     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 281 
 282     // Not inflated
 283     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");
 284 
 285     // Try to lock. Transition lock-bits 0b01 => 0b00
 286     orr(t1_mark, t1_mark, markWord::unlocked_value);
 287     eor(t3_t, t1_mark, markWord::unlocked_value);
 288     cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
 289             /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
 290     br(Assembler::NE, slow_path);
 291 
 292     bind(push);
 293     // After successful lock, push object on lock-stack.
 294     str(obj, Address(rthread, t2_top));
 295     addw(t2_top, t2_top, oopSize);
 296     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 297     b(locked);
 298   }
 299 
 300   { // Handle inflated monitor.
 301     bind(inflated);
 302 
 303     // mark contains the tagged ObjectMonitor*.
 304     const Register t1_tagged_monitor = t1_mark;
 305     const uintptr_t monitor_tag = markWord::monitor_value;
 306     const Register t2_owner_addr = t2;
 307     const Register t3_owner = t3;
 308 
 309     // Compute owner address.
 310     lea(t2_owner_addr, Address(t1_tagged_monitor, (in_bytes(ObjectMonitor::owner_offset()) - monitor_tag)));
 311 
 312     // CAS owner (null => current thread id).
 313     ldr(rscratch2, Address(rthread, JavaThread::lock_id_offset()));
 314     cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, /*acquire*/ true,
 315             /*release*/ false, /*weak*/ false, t3_owner);
 316     br(Assembler::EQ, locked);
 317 
 318     // Check if recursive.
 319     cmp(t3_owner, rscratch2);
 320     br(Assembler::NE, slow_path);
 321 
 322     // Recursive.
 323     increment(Address(t1_tagged_monitor, in_bytes(ObjectMonitor::recursions_offset()) - monitor_tag), 1);
 324   }
 325 
 326   bind(locked);
 327 
 328 #ifdef ASSERT
 329   // Check that locked label is reached with Flags == EQ.
 330   Label flag_correct;
 331   br(Assembler::EQ, flag_correct);
 332   stop("Fast Lock Flag != EQ");
 333 #endif
 334 
 335   bind(slow_path);
 336 #ifdef ASSERT
 337   // Check that slow_path label is reached with Flags == NE.
 338   br(Assembler::NE, flag_correct);
 339   stop("Fast Lock Flag != NE");
 340   bind(flag_correct);
 341 #endif
 342   // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
 343 }
 344 
 345 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register t1, Register t2,
 346                                                 Register t3) {
 347   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 348   assert_different_registers(obj, t1, t2, t3);
 349 
 350   // Handle inflated monitor.
 351   Label inflated, inflated_load_monitor;
 352   // Finish fast unlock successfully. MUST branch to with flag == EQ
 353   Label unlocked;
 354   // Finish fast unlock unsuccessfully. MUST branch to with flag == NE
 355   Label slow_path;
 356 
 357   const Register t1_mark = t1;
 358   const Register t2_top = t2;
 359   const Register t3_t = t3;
 360 
 361   { // Lightweight unlock
 362 
 363     // Check if obj is top of lock-stack.
 364     ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 365     subw(t2_top, t2_top, oopSize);
 366     ldr(t3_t, Address(rthread, t2_top));
 367     cmp(obj, t3_t);
 368     // Top of lock stack was not obj. Must be monitor.
 369     br(Assembler::NE, inflated_load_monitor);
 370 
 371     // Pop lock-stack.
 372     DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
 373     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 374 
 375     // Check if recursive.
 376     subw(t3_t, t2_top, oopSize);
 377     ldr(t3_t, Address(rthread, t3_t));
 378     cmp(obj, t3_t);
 379     br(Assembler::EQ, unlocked);
 380 
 381     // Not recursive.
 382     // Load Mark.
 383     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 384 
 385     // Check header for monitor (0b10).
 386     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 387 
 388     // Try to unlock. Transition lock bits 0b00 => 0b01
 389     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
 390     orr(t3_t, t1_mark, markWord::unlocked_value);
 391     cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
 392             /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
 393     br(Assembler::EQ, unlocked);
 394 
 395     // Compare and exchange failed.
 396     // Restore lock-stack and handle the unlock in runtime.
 397     DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
 398     addw(t2_top, t2_top, oopSize);
 399     str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 400     b(slow_path);
 401   }
 402 
 403 
 404   { // Handle inflated monitor.
 405     bind(inflated_load_monitor);
 406     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 407 #ifdef ASSERT
 408     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 409     stop("Fast Unlock not monitor");
 410 #endif
 411 
 412     bind(inflated);
 413 
 414 #ifdef ASSERT
 415     Label check_done;
 416     subw(t2_top, t2_top, oopSize);
 417     cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
 418     br(Assembler::LT, check_done);
 419     ldr(t3_t, Address(rthread, t2_top));
 420     cmp(obj, t3_t);
 421     br(Assembler::NE, inflated);
 422     stop("Fast Unlock lock on stack");
 423     bind(check_done);
 424 #endif
 425 
 426     // mark contains the tagged ObjectMonitor*.
 427     const Register t1_monitor = t1_mark;
 428     const uintptr_t monitor_tag = markWord::monitor_value;
 429 
 430     // Untag the monitor.
 431     sub(t1_monitor, t1_mark, monitor_tag);
 432 
 433     const Register t2_recursions = t2;
 434     Label not_recursive;
 435 
 436     // Check if recursive.
 437     ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
 438     cbz(t2_recursions, not_recursive);
 439 
 440     // Recursive unlock.
 441     sub(t2_recursions, t2_recursions, 1u);
 442     str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
 443     // Set flag == EQ
 444     cmp(t2_recursions, t2_recursions);
 445     b(unlocked);
 446 
 447     bind(not_recursive);
 448 
 449     Label release;
 450     const Register t2_owner_addr = t2;
 451 
 452     // Compute owner address.
 453     lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));
 454 
 455     // Check if the entry lists are empty.
 456     ldr(rscratch1, Address(t1_monitor, ObjectMonitor::EntryList_offset()));
 457     ldr(t3_t, Address(t1_monitor, ObjectMonitor::cxq_offset()));
 458     orr(rscratch1, rscratch1, t3_t);
 459     cmp(rscratch1, zr);
 460     br(Assembler::EQ, release);
 461 
 462     // The owner may be anonymous and we removed the last obj entry in
 463     // the lock-stack. This loses the information about the owner.
 464     // Write the thread id to the owner field so the runtime knows the owner.
 465     ldr(t3_t, Address(rthread, JavaThread::lock_id_offset()));
 466     str(t3_t, Address(t2_owner_addr));
 467     b(slow_path);
 468 
 469     bind(release);
 470     // Set owner to null.
 471     // Release to satisfy the JMM
 472     stlr(zr, t2_owner_addr);
 473   }
 474 
 475   bind(unlocked);
 476 
 477 #ifdef ASSERT
 478   // Check that unlocked label is reached with Flags == EQ.
 479   Label flag_correct;
 480   br(Assembler::EQ, flag_correct);
 481   stop("Fast Unlock Flag != EQ");
 482 #endif
 483 
 484   bind(slow_path);
 485 #ifdef ASSERT
 486   // Check that slow_path label is reached with Flags == NE.
 487   br(Assembler::NE, flag_correct);
 488   stop("Fast Unlock Flag != NE");
 489   bind(flag_correct);
 490 #endif
 491   // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
 492 }
 493 
 494 // Search for str1 in str2 and return index or -1
 495 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
 496 void C2_MacroAssembler::string_indexof(Register str2, Register str1,
 497                                        Register cnt2, Register cnt1,
 498                                        Register tmp1, Register tmp2,
 499                                        Register tmp3, Register tmp4,
 500                                        Register tmp5, Register tmp6,
 501                                        int icnt1, Register result, int ae) {
 502   // NOTE: tmp5, tmp6 can be zr depending on specific method version
 503   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
 504 
 505   Register ch1 = rscratch1;
 506   Register ch2 = rscratch2;
 507   Register cnt1tmp = tmp1;
 508   Register cnt2tmp = tmp2;
 509   Register cnt1_neg = cnt1;
 510   Register cnt2_neg = cnt2;
 511   Register result_tmp = tmp4;
 512 
 513   bool isL = ae == StrIntrinsicNode::LL;
 514 
 515   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
 516   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
 517   int str1_chr_shift = str1_isL ? 0:1;
 518   int str2_chr_shift = str2_isL ? 0:1;
 519   int str1_chr_size = str1_isL ? 1:2;
 520   int str2_chr_size = str2_isL ? 1:2;
 521   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
 522                                       (chr_insn)&MacroAssembler::ldrh;
 523   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
 524                                       (chr_insn)&MacroAssembler::ldrh;
 525   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
 526   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
 527 
 528   // Note, inline_string_indexOf() generates checks:
 529   // if (substr.count > string.count) return -1;
 530   // if (substr.count == 0) return 0;
 531 
 532   // We have two strings, a source string in str2, cnt2 and a pattern string
 533   // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
 534 
 535   // For larger pattern and source we use a simplified Boyer Moore algorithm.
 536   // With a small pattern and source we use linear scan.
 537 
 538   if (icnt1 == -1) {
 539     sub(result_tmp, cnt2, cnt1);
 540     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
 541     br(LT, LINEARSEARCH);
 542     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
 543     subs(zr, cnt1, 256);
 544     lsr(tmp1, cnt2, 2);
 545     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
 546     br(GE, LINEARSTUB);
 547   }
 548 
 549 // The Boyer Moore alogorithm is based on the description here:-
 550 //
 551 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
 552 //
 553 // This describes and algorithm with 2 shift rules. The 'Bad Character' rule
 554 // and the 'Good Suffix' rule.
 555 //
 556 // These rules are essentially heuristics for how far we can shift the
 557 // pattern along the search string.
 558 //
 559 // The implementation here uses the 'Bad Character' rule only because of the
 560 // complexity of initialisation for the 'Good Suffix' rule.
 561 //
 562 // This is also known as the Boyer-Moore-Horspool algorithm:-
 563 //
 564 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
 565 //
 566 // This particular implementation has few java-specific optimizations.
 567 //
 568 // #define ASIZE 256
 569 //
 570 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
 571 //       int i, j;
 572 //       unsigned c;
 573 //       unsigned char bc[ASIZE];
 574 //
 575 //       /* Preprocessing */
 576 //       for (i = 0; i < ASIZE; ++i)
 577 //          bc[i] = m;
 578 //       for (i = 0; i < m - 1; ) {
 579 //          c = x[i];
 580 //          ++i;
 581 //          // c < 256 for Latin1 string, so, no need for branch
 582 //          #ifdef PATTERN_STRING_IS_LATIN1
 583 //          bc[c] = m - i;
 584 //          #else
 585 //          if (c < ASIZE) bc[c] = m - i;
 586 //          #endif
 587 //       }
 588 //
 589 //       /* Searching */
 590 //       j = 0;
 591 //       while (j <= n - m) {
 592 //          c = y[i+j];
 593 //          if (x[m-1] == c)
 594 //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
 595 //          if (i < 0) return j;
 596 //          // c < 256 for Latin1 string, so, no need for branch
 597 //          #ifdef SOURCE_STRING_IS_LATIN1
 598 //          // LL case: (c< 256) always true. Remove branch
 599 //          j += bc[y[j+m-1]];
 600 //          #endif
 601 //          #ifndef PATTERN_STRING_IS_UTF
 602 //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
 603 //          if (c < ASIZE)
 604 //            j += bc[y[j+m-1]];
 605 //          else
 606 //            j += 1
 607 //          #endif
 608 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
 609 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
 610 //          if (c < ASIZE)
 611 //            j += bc[y[j+m-1]];
 612 //          else
 613 //            j += m
 614 //          #endif
 615 //       }
 616 //    }
 617 
 618   if (icnt1 == -1) {
 619     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
 620         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
 621     Register cnt1end = tmp2;
 622     Register str2end = cnt2;
 623     Register skipch = tmp2;
 624 
 625     // str1 length is >=8, so, we can read at least 1 register for cases when
 626     // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
 627     // UL case. We'll re-read last character in inner pre-loop code to have
 628     // single outer pre-loop load
 629     const int firstStep = isL ? 7 : 3;
 630 
 631     const int ASIZE = 256;
 632     const int STORED_BYTES = 32; // amount of bytes stored per instruction
 633     sub(sp, sp, ASIZE);
 634     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
 635     mov(ch1, sp);
 636     BIND(BM_INIT_LOOP);
 637       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
 638       subs(tmp5, tmp5, 1);
 639       br(GT, BM_INIT_LOOP);
 640 
 641       sub(cnt1tmp, cnt1, 1);
 642       mov(tmp5, str2);
 643       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
 644       sub(ch2, cnt1, 1);
 645       mov(tmp3, str1);
 646     BIND(BCLOOP);
 647       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
 648       if (!str1_isL) {
 649         subs(zr, ch1, ASIZE);
 650         br(HS, BCSKIP);
 651       }
 652       strb(ch2, Address(sp, ch1));
 653     BIND(BCSKIP);
 654       subs(ch2, ch2, 1);
 655       br(GT, BCLOOP);
 656 
 657       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
 658       if (str1_isL == str2_isL) {
 659         // load last 8 bytes (8LL/4UU symbols)
 660         ldr(tmp6, Address(tmp6, -wordSize));
 661       } else {
 662         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
 663         // convert Latin1 to UTF. We'll have to wait until load completed, but
 664         // it's still faster than per-character loads+checks
 665         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
 666         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
 667         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
 668         andr(tmp6, tmp6, 0xFF); // str1[N-4]
 669         orr(ch2, ch1, ch2, LSL, 16);
 670         orr(tmp6, tmp6, tmp3, LSL, 48);
 671         orr(tmp6, tmp6, ch2, LSL, 16);
 672       }
 673     BIND(BMLOOPSTR2);
 674       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 675       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
 676       if (str1_isL == str2_isL) {
 677         // re-init tmp3. It's for free because it's executed in parallel with
 678         // load above. Alternative is to initialize it before loop, but it'll
 679         // affect performance on in-order systems with 2 or more ld/st pipelines
 680         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
 681       }
 682       if (!isL) { // UU/UL case
 683         lsl(ch2, cnt1tmp, 1); // offset in bytes
 684       }
 685       cmp(tmp3, skipch);
 686       br(NE, BMSKIP);
 687       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
 688       mov(ch1, tmp6);
 689       if (isL) {
 690         b(BMLOOPSTR1_AFTER_LOAD);
 691       } else {
 692         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
 693         b(BMLOOPSTR1_CMP);
 694       }
 695     BIND(BMLOOPSTR1);
 696       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
 697       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 698     BIND(BMLOOPSTR1_AFTER_LOAD);
 699       subs(cnt1tmp, cnt1tmp, 1);
 700       br(LT, BMLOOPSTR1_LASTCMP);
 701     BIND(BMLOOPSTR1_CMP);
 702       cmp(ch1, ch2);
 703       br(EQ, BMLOOPSTR1);
 704     BIND(BMSKIP);
 705       if (!isL) {
 706         // if we've met UTF symbol while searching Latin1 pattern, then we can
 707         // skip cnt1 symbols
 708         if (str1_isL != str2_isL) {
 709           mov(result_tmp, cnt1);
 710         } else {
 711           mov(result_tmp, 1);
 712         }
 713         subs(zr, skipch, ASIZE);
 714         br(HS, BMADV);
 715       }
 716       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
 717     BIND(BMADV);
 718       sub(cnt1tmp, cnt1, 1);
 719       add(str2, str2, result_tmp, LSL, str2_chr_shift);
 720       cmp(str2, str2end);
 721       br(LE, BMLOOPSTR2);
 722       add(sp, sp, ASIZE);
 723       b(NOMATCH);
 724     BIND(BMLOOPSTR1_LASTCMP);
 725       cmp(ch1, ch2);
 726       br(NE, BMSKIP);
 727     BIND(BMMATCH);
 728       sub(result, str2, tmp5);
 729       if (!str2_isL) lsr(result, result, 1);
 730       add(sp, sp, ASIZE);
 731       b(DONE);
 732 
 733     BIND(LINEARSTUB);
 734     cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
 735     br(LT, LINEAR_MEDIUM);
 736     mov(result, zr);
 737     RuntimeAddress stub = nullptr;
 738     if (isL) {
 739       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
 740       assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
 741     } else if (str1_isL) {
 742       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
 743        assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
 744     } else {
 745       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
 746       assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
 747     }
 748     address call = trampoline_call(stub);
 749     if (call == nullptr) {
 750       DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
 751       ciEnv::current()->record_failure("CodeCache is full");
 752       return;
 753     }
 754     b(DONE);
 755   }
 756 
 757   BIND(LINEARSEARCH);
 758   {
 759     Label DO1, DO2, DO3;
 760 
 761     Register str2tmp = tmp2;
 762     Register first = tmp3;
 763 
 764     if (icnt1 == -1)
 765     {
 766         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
 767 
 768         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
 769         br(LT, DOSHORT);
 770       BIND(LINEAR_MEDIUM);
 771         (this->*str1_load_1chr)(first, Address(str1));
 772         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
 773         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
 774         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 775         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 776 
 777       BIND(FIRST_LOOP);
 778         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 779         cmp(first, ch2);
 780         br(EQ, STR1_LOOP);
 781       BIND(STR2_NEXT);
 782         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 783         br(LE, FIRST_LOOP);
 784         b(NOMATCH);
 785 
 786       BIND(STR1_LOOP);
 787         adds(cnt1tmp, cnt1_neg, str1_chr_size);
 788         add(cnt2tmp, cnt2_neg, str2_chr_size);
 789         br(GE, MATCH);
 790 
 791       BIND(STR1_NEXT);
 792         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
 793         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 794         cmp(ch1, ch2);
 795         br(NE, STR2_NEXT);
 796         adds(cnt1tmp, cnt1tmp, str1_chr_size);
 797         add(cnt2tmp, cnt2tmp, str2_chr_size);
 798         br(LT, STR1_NEXT);
 799         b(MATCH);
 800 
 801       BIND(DOSHORT);
 802       if (str1_isL == str2_isL) {
 803         cmp(cnt1, (u1)2);
 804         br(LT, DO1);
 805         br(GT, DO3);
 806       }
 807     }
 808 
 809     if (icnt1 == 4) {
 810       Label CH1_LOOP;
 811 
 812         (this->*load_4chr)(ch1, str1);
 813         sub(result_tmp, cnt2, 4);
 814         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 815         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 816 
 817       BIND(CH1_LOOP);
 818         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
 819         cmp(ch1, ch2);
 820         br(EQ, MATCH);
 821         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 822         br(LE, CH1_LOOP);
 823         b(NOMATCH);
 824       }
 825 
 826     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
 827       Label CH1_LOOP;
 828 
 829       BIND(DO2);
 830         (this->*load_2chr)(ch1, str1);
 831         if (icnt1 == 2) {
 832           sub(result_tmp, cnt2, 2);
 833         }
 834         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 835         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 836       BIND(CH1_LOOP);
 837         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 838         cmp(ch1, ch2);
 839         br(EQ, MATCH);
 840         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 841         br(LE, CH1_LOOP);
 842         b(NOMATCH);
 843     }
 844 
 845     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
 846       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
 847 
 848       BIND(DO3);
 849         (this->*load_2chr)(first, str1);
 850         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
 851         if (icnt1 == 3) {
 852           sub(result_tmp, cnt2, 3);
 853         }
 854         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 855         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 856       BIND(FIRST_LOOP);
 857         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 858         cmpw(first, ch2);
 859         br(EQ, STR1_LOOP);
 860       BIND(STR2_NEXT);
 861         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 862         br(LE, FIRST_LOOP);
 863         b(NOMATCH);
 864 
 865       BIND(STR1_LOOP);
 866         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
 867         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 868         cmp(ch1, ch2);
 869         br(NE, STR2_NEXT);
 870         b(MATCH);
 871     }
 872 
 873     if (icnt1 == -1 || icnt1 == 1) {
 874       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
 875 
 876       BIND(DO1);
 877         (this->*str1_load_1chr)(ch1, str1);
 878         cmp(cnt2, (u1)8);
 879         br(LT, DO1_SHORT);
 880 
 881         sub(result_tmp, cnt2, 8/str2_chr_size);
 882         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 883         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 884         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 885 
 886         if (str2_isL) {
 887           orr(ch1, ch1, ch1, LSL, 8);
 888         }
 889         orr(ch1, ch1, ch1, LSL, 16);
 890         orr(ch1, ch1, ch1, LSL, 32);
 891       BIND(CH1_LOOP);
 892         ldr(ch2, Address(str2, cnt2_neg));
 893         eor(ch2, ch1, ch2);
 894         sub(tmp1, ch2, tmp3);
 895         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 896         bics(tmp1, tmp1, tmp2);
 897         br(NE, HAS_ZERO);
 898         adds(cnt2_neg, cnt2_neg, 8);
 899         br(LT, CH1_LOOP);
 900 
 901         cmp(cnt2_neg, (u1)8);
 902         mov(cnt2_neg, 0);
 903         br(LT, CH1_LOOP);
 904         b(NOMATCH);
 905 
 906       BIND(HAS_ZERO);
 907         rev(tmp1, tmp1);
 908         clz(tmp1, tmp1);
 909         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
 910         b(MATCH);
 911 
 912       BIND(DO1_SHORT);
 913         mov(result_tmp, cnt2);
 914         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
 915         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
 916       BIND(DO1_LOOP);
 917         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 918         cmpw(ch1, ch2);
 919         br(EQ, MATCH);
 920         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 921         br(LT, DO1_LOOP);
 922     }
 923   }
 924   BIND(NOMATCH);
 925     mov(result, -1);
 926     b(DONE);
 927   BIND(MATCH);
 928     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
 929   BIND(DONE);
 930 }
 931 
 932 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
 933 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
 934 
 935 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
 936                                             Register ch, Register result,
 937                                             Register tmp1, Register tmp2, Register tmp3)
 938 {
 939   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
 940   Register cnt1_neg = cnt1;
 941   Register ch1 = rscratch1;
 942   Register result_tmp = rscratch2;
 943 
 944   cbz(cnt1, NOMATCH);
 945 
 946   cmp(cnt1, (u1)4);
 947   br(LT, DO1_SHORT);
 948 
 949   orr(ch, ch, ch, LSL, 16);
 950   orr(ch, ch, ch, LSL, 32);
 951 
 952   sub(cnt1, cnt1, 4);
 953   mov(result_tmp, cnt1);
 954   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
 955   sub(cnt1_neg, zr, cnt1, LSL, 1);
 956 
 957   mov(tmp3, 0x0001000100010001);
 958 
 959   BIND(CH1_LOOP);
 960     ldr(ch1, Address(str1, cnt1_neg));
 961     eor(ch1, ch, ch1);
 962     sub(tmp1, ch1, tmp3);
 963     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
 964     bics(tmp1, tmp1, tmp2);
 965     br(NE, HAS_ZERO);
 966     adds(cnt1_neg, cnt1_neg, 8);
 967     br(LT, CH1_LOOP);
 968 
 969     cmp(cnt1_neg, (u1)8);
 970     mov(cnt1_neg, 0);
 971     br(LT, CH1_LOOP);
 972     b(NOMATCH);
 973 
 974   BIND(HAS_ZERO);
 975     rev(tmp1, tmp1);
 976     clz(tmp1, tmp1);
 977     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
 978     b(MATCH);
 979 
 980   BIND(DO1_SHORT);
 981     mov(result_tmp, cnt1);
 982     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
 983     sub(cnt1_neg, zr, cnt1, LSL, 1);
 984   BIND(DO1_LOOP);
 985     ldrh(ch1, Address(str1, cnt1_neg));
 986     cmpw(ch, ch1);
 987     br(EQ, MATCH);
 988     adds(cnt1_neg, cnt1_neg, 2);
 989     br(LT, DO1_LOOP);
 990   BIND(NOMATCH);
 991     mov(result, -1);
 992     b(DONE);
 993   BIND(MATCH);
 994     add(result, result_tmp, cnt1_neg, ASR, 1);
 995   BIND(DONE);
 996 }
 997 
 998 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
 999                                                 Register ch, Register result,
1000                                                 FloatRegister ztmp1,
1001                                                 FloatRegister ztmp2,
1002                                                 PRegister tmp_pg,
1003                                                 PRegister tmp_pdn, bool isL)
1004 {
1005   // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
1006   assert(tmp_pg->is_governing(),
1007          "this register has to be a governing predicate register");
1008 
1009   Label LOOP, MATCH, DONE, NOMATCH;
1010   Register vec_len = rscratch1;
1011   Register idx = rscratch2;
1012 
1013   SIMD_RegVariant T = (isL == true) ? B : H;
1014 
1015   cbz(cnt1, NOMATCH);
1016 
1017   // Assign the particular char throughout the vector.
1018   sve_dup(ztmp2, T, ch);
1019   if (isL) {
1020     sve_cntb(vec_len);
1021   } else {
1022     sve_cnth(vec_len);
1023   }
1024   mov(idx, 0);
1025 
1026   // Generate a predicate to control the reading of input string.
1027   sve_whilelt(tmp_pg, T, idx, cnt1);
1028 
1029   BIND(LOOP);
1030     // Read a vector of 8- or 16-bit data depending on the string type. Note
1031     // that inactive elements indicated by the predicate register won't cause
1032     // a data read from memory to the destination vector.
1033     if (isL) {
1034       sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
1035     } else {
1036       sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
1037     }
1038     add(idx, idx, vec_len);
1039 
1040     // Perform the comparison. An element of the destination predicate is set
1041     // to active if the particular char is matched.
1042     sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);
1043 
1044     // Branch if the particular char is found.
1045     br(NE, MATCH);
1046 
1047     sve_whilelt(tmp_pg, T, idx, cnt1);
1048 
1049     // Loop back if the particular char not found.
1050     br(MI, LOOP);
1051 
1052   BIND(NOMATCH);
1053     mov(result, -1);
1054     b(DONE);
1055 
1056   BIND(MATCH);
1057     // Undo the index increment.
1058     sub(idx, idx, vec_len);
1059 
1060     // Crop the vector to find its location.
1061     sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
1062     add(result, idx, -1);
1063     sve_incp(result, T, tmp_pdn);
1064   BIND(DONE);
1065 }
1066 
1067 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
1068                                             Register ch, Register result,
1069                                             Register tmp1, Register tmp2, Register tmp3)
1070 {
1071   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
1072   Register cnt1_neg = cnt1;
1073   Register ch1 = rscratch1;
1074   Register result_tmp = rscratch2;
1075 
1076   cbz(cnt1, NOMATCH);
1077 
1078   cmp(cnt1, (u1)8);
1079   br(LT, DO1_SHORT);
1080 
1081   orr(ch, ch, ch, LSL, 8);
1082   orr(ch, ch, ch, LSL, 16);
1083   orr(ch, ch, ch, LSL, 32);
1084 
1085   sub(cnt1, cnt1, 8);
1086   mov(result_tmp, cnt1);
1087   lea(str1, Address(str1, cnt1));
1088   sub(cnt1_neg, zr, cnt1);
1089 
1090   mov(tmp3, 0x0101010101010101);
1091 
1092   BIND(CH1_LOOP);
1093     ldr(ch1, Address(str1, cnt1_neg));
1094     eor(ch1, ch, ch1);
1095     sub(tmp1, ch1, tmp3);
1096     orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
1097     bics(tmp1, tmp1, tmp2);
1098     br(NE, HAS_ZERO);
1099     adds(cnt1_neg, cnt1_neg, 8);
1100     br(LT, CH1_LOOP);
1101 
1102     cmp(cnt1_neg, (u1)8);
1103     mov(cnt1_neg, 0);
1104     br(LT, CH1_LOOP);
1105     b(NOMATCH);
1106 
1107   BIND(HAS_ZERO);
1108     rev(tmp1, tmp1);
1109     clz(tmp1, tmp1);
1110     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
1111     b(MATCH);
1112 
1113   BIND(DO1_SHORT);
1114     mov(result_tmp, cnt1);
1115     lea(str1, Address(str1, cnt1));
1116     sub(cnt1_neg, zr, cnt1);
1117   BIND(DO1_LOOP);
1118     ldrb(ch1, Address(str1, cnt1_neg));
1119     cmp(ch, ch1);
1120     br(EQ, MATCH);
1121     adds(cnt1_neg, cnt1_neg, 1);
1122     br(LT, DO1_LOOP);
1123   BIND(NOMATCH);
1124     mov(result, -1);
1125     b(DONE);
1126   BIND(MATCH);
1127     add(result, result_tmp, cnt1_neg);
1128   BIND(DONE);
1129 }
1130 
1131 // Compare strings.
1132 void C2_MacroAssembler::string_compare(Register str1, Register str2,
1133     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
1134     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
1135     PRegister pgtmp1, PRegister pgtmp2, int ae) {
1136   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
1137       DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
1138       SHORT_LOOP_START, TAIL_CHECK;
1139 
1140   bool isLL = ae == StrIntrinsicNode::LL;
1141   bool isLU = ae == StrIntrinsicNode::LU;
1142   bool isUL = ae == StrIntrinsicNode::UL;
1143 
1144   // The stub threshold for LL strings is: 72 (64 + 8) chars
1145   // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
1146   // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
1147   const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);
1148 
1149   bool str1_isL = isLL || isLU;
1150   bool str2_isL = isLL || isUL;
1151 
1152   int str1_chr_shift = str1_isL ? 0 : 1;
1153   int str2_chr_shift = str2_isL ? 0 : 1;
1154   int str1_chr_size = str1_isL ? 1 : 2;
1155   int str2_chr_size = str2_isL ? 1 : 2;
1156   int minCharsInWord = isLL ? wordSize : wordSize/2;
1157 
1158   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
1159   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
1160                                       (chr_insn)&MacroAssembler::ldrh;
1161   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
1162                                       (chr_insn)&MacroAssembler::ldrh;
1163   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
1164                             (uxt_insn)&MacroAssembler::uxthw;
1165 
1166   BLOCK_COMMENT("string_compare {");
1167 
1168   // Bizarrely, the counts are passed in bytes, regardless of whether they
1169   // are L or U strings, however the result is always in characters.
1170   if (!str1_isL) asrw(cnt1, cnt1, 1);
1171   if (!str2_isL) asrw(cnt2, cnt2, 1);
1172 
1173   // Compute the minimum of the string lengths and save the difference.
1174   subsw(result, cnt1, cnt2);
1175   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
1176 
1177   // A very short string
1178   cmpw(cnt2, minCharsInWord);
1179   br(Assembler::LE, SHORT_STRING);
1180 
1181   // Compare longwords
1182   // load first parts of strings and finish initialization while loading
1183   {
1184     if (str1_isL == str2_isL) { // LL or UU
1185       ldr(tmp1, Address(str1));
1186       cmp(str1, str2);
1187       br(Assembler::EQ, DONE);
1188       ldr(tmp2, Address(str2));
1189       cmp(cnt2, stub_threshold);
1190       br(GE, STUB);
1191       subsw(cnt2, cnt2, minCharsInWord);
1192       br(EQ, TAIL_CHECK);
1193       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1194       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1195       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1196     } else if (isLU) {
1197       ldrs(vtmp, Address(str1));
1198       ldr(tmp2, Address(str2));
1199       cmp(cnt2, stub_threshold);
1200       br(GE, STUB);
1201       subw(cnt2, cnt2, 4);
1202       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1203       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1204       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1205       zip1(vtmp, T8B, vtmp, vtmpZ);
1206       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1207       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1208       add(cnt1, cnt1, 4);
1209       fmovd(tmp1, vtmp);
1210     } else { // UL case
1211       ldr(tmp1, Address(str1));
1212       ldrs(vtmp, Address(str2));
1213       cmp(cnt2, stub_threshold);
1214       br(GE, STUB);
1215       subw(cnt2, cnt2, 4);
1216       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1217       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1218       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1219       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1220       zip1(vtmp, T8B, vtmp, vtmpZ);
1221       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1222       add(cnt1, cnt1, 8);
1223       fmovd(tmp2, vtmp);
1224     }
1225     adds(cnt2, cnt2, isUL ? 4 : 8);
1226     br(GE, TAIL);
1227     eor(rscratch2, tmp1, tmp2);
1228     cbnz(rscratch2, DIFF);
1229     // main loop
1230     bind(NEXT_WORD);
1231     if (str1_isL == str2_isL) {
1232       ldr(tmp1, Address(str1, cnt2));
1233       ldr(tmp2, Address(str2, cnt2));
1234       adds(cnt2, cnt2, 8);
1235     } else if (isLU) {
1236       ldrs(vtmp, Address(str1, cnt1));
1237       ldr(tmp2, Address(str2, cnt2));
1238       add(cnt1, cnt1, 4);
1239       zip1(vtmp, T8B, vtmp, vtmpZ);
1240       fmovd(tmp1, vtmp);
1241       adds(cnt2, cnt2, 8);
1242     } else { // UL
1243       ldrs(vtmp, Address(str2, cnt2));
1244       ldr(tmp1, Address(str1, cnt1));
1245       zip1(vtmp, T8B, vtmp, vtmpZ);
1246       add(cnt1, cnt1, 8);
1247       fmovd(tmp2, vtmp);
1248       adds(cnt2, cnt2, 4);
1249     }
1250     br(GE, TAIL);
1251 
1252     eor(rscratch2, tmp1, tmp2);
1253     cbz(rscratch2, NEXT_WORD);
1254     b(DIFF);
1255     bind(TAIL);
1256     eor(rscratch2, tmp1, tmp2);
1257     cbnz(rscratch2, DIFF);
1258     // Last longword.  In the case where length == 4 we compare the
1259     // same longword twice, but that's still faster than another
1260     // conditional branch.
1261     if (str1_isL == str2_isL) {
1262       ldr(tmp1, Address(str1));
1263       ldr(tmp2, Address(str2));
1264     } else if (isLU) {
1265       ldrs(vtmp, Address(str1));
1266       ldr(tmp2, Address(str2));
1267       zip1(vtmp, T8B, vtmp, vtmpZ);
1268       fmovd(tmp1, vtmp);
1269     } else { // UL
1270       ldrs(vtmp, Address(str2));
1271       ldr(tmp1, Address(str1));
1272       zip1(vtmp, T8B, vtmp, vtmpZ);
1273       fmovd(tmp2, vtmp);
1274     }
1275     bind(TAIL_CHECK);
1276     eor(rscratch2, tmp1, tmp2);
1277     cbz(rscratch2, DONE);
1278 
1279     // Find the first different characters in the longwords and
1280     // compute their difference.
1281     bind(DIFF);
1282     rev(rscratch2, rscratch2);
1283     clz(rscratch2, rscratch2);
1284     andr(rscratch2, rscratch2, isLL ? -8 : -16);
1285     lsrv(tmp1, tmp1, rscratch2);
1286     (this->*ext_chr)(tmp1, tmp1);
1287     lsrv(tmp2, tmp2, rscratch2);
1288     (this->*ext_chr)(tmp2, tmp2);
1289     subw(result, tmp1, tmp2);
1290     b(DONE);
1291   }
1292 
1293   bind(STUB);
1294     RuntimeAddress stub = nullptr;
1295     switch(ae) {
1296       case StrIntrinsicNode::LL:
1297         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
1298         break;
1299       case StrIntrinsicNode::UU:
1300         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
1301         break;
1302       case StrIntrinsicNode::LU:
1303         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
1304         break;
1305       case StrIntrinsicNode::UL:
1306         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
1307         break;
1308       default:
1309         ShouldNotReachHere();
1310      }
1311     assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
1312     address call = trampoline_call(stub);
1313     if (call == nullptr) {
1314       DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
1315       ciEnv::current()->record_failure("CodeCache is full");
1316       return;
1317     }
1318     b(DONE);
1319 
1320   bind(SHORT_STRING);
1321   // Is the minimum length zero?
1322   cbz(cnt2, DONE);
1323   // arrange code to do most branches while loading and loading next characters
1324   // while comparing previous
1325   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1326   subs(cnt2, cnt2, 1);
1327   br(EQ, SHORT_LAST_INIT);
1328   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1329   b(SHORT_LOOP_START);
1330   bind(SHORT_LOOP);
1331   subs(cnt2, cnt2, 1);
1332   br(EQ, SHORT_LAST);
1333   bind(SHORT_LOOP_START);
1334   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
1335   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
1336   cmp(tmp1, cnt1);
1337   br(NE, SHORT_LOOP_TAIL);
1338   subs(cnt2, cnt2, 1);
1339   br(EQ, SHORT_LAST2);
1340   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1341   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1342   cmp(tmp2, rscratch1);
1343   br(EQ, SHORT_LOOP);
1344   sub(result, tmp2, rscratch1);
1345   b(DONE);
1346   bind(SHORT_LOOP_TAIL);
1347   sub(result, tmp1, cnt1);
1348   b(DONE);
1349   bind(SHORT_LAST2);
1350   cmp(tmp2, rscratch1);
1351   br(EQ, DONE);
1352   sub(result, tmp2, rscratch1);
1353 
1354   b(DONE);
1355   bind(SHORT_LAST_INIT);
1356   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1357   bind(SHORT_LAST);
1358   cmp(tmp1, cnt1);
1359   br(EQ, DONE);
1360   sub(result, tmp1, cnt1);
1361 
1362   bind(DONE);
1363 
1364   BLOCK_COMMENT("} string_compare");
1365 }
1366 
1367 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
1368                                      FloatRegister src2, Condition cond, bool isQ) {
1369   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1370   FloatRegister zn = src1, zm = src2;
1371   bool needs_negation = false;
1372   switch (cond) {
1373     case LT: cond = GT; zn = src2; zm = src1; break;
1374     case LE: cond = GE; zn = src2; zm = src1; break;
1375     case LO: cond = HI; zn = src2; zm = src1; break;
1376     case LS: cond = HS; zn = src2; zm = src1; break;
1377     case NE: cond = EQ; needs_negation = true; break;
1378     default:
1379       break;
1380   }
1381 
1382   if (is_floating_point_type(bt)) {
1383     fcm(cond, dst, size, zn, zm);
1384   } else {
1385     cm(cond, dst, size, zn, zm);
1386   }
1387 
1388   if (needs_negation) {
1389     notr(dst, isQ ? T16B : T8B, dst);
1390   }
1391 }
1392 
1393 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1394                                           Condition cond, bool isQ) {
1395   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1396   if (bt == T_FLOAT || bt == T_DOUBLE) {
1397     if (cond == Assembler::NE) {
1398       fcm(Assembler::EQ, dst, size, src);
1399       notr(dst, isQ ? T16B : T8B, dst);
1400     } else {
1401       fcm(cond, dst, size, src);
1402     }
1403   } else {
1404     if (cond == Assembler::NE) {
1405       cm(Assembler::EQ, dst, size, src);
1406       notr(dst, isQ ? T16B : T8B, dst);
1407     } else {
1408       cm(cond, dst, size, src);
1409     }
1410   }
1411 }
1412 
1413 // Compress the least significant bit of each byte to the rightmost and clear
1414 // the higher garbage bits.
1415 void C2_MacroAssembler::bytemask_compress(Register dst) {
1416   // Example input, dst = 0x01 00 00 00 01 01 00 01
1417   // The "??" bytes are garbage.
1418   orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
1419   orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
1420   orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
1421   andr(dst, dst, 0xff);                   // dst = 0x8D
1422 }
1423 
1424 // Pack the lowest-numbered bit of each mask element in src into a long value
1425 // in dst, at most the first 64 lane elements.
1426 // Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM.
1427 void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
1428                                          FloatRegister vtmp1, FloatRegister vtmp2) {
1429   assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1430   assert_different_registers(dst, rscratch1);
1431   assert_different_registers(vtmp1, vtmp2);
1432 
1433   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1434   // Example:   src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
1435   // Expected:  dst = 0x658D
1436 
1437   // Convert the mask into vector with sequential bytes.
1438   // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
1439   sve_cpy(vtmp1, size, src, 1, false);
1440   if (bt != T_BYTE) {
1441     sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
1442   }
1443 
1444   if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
1445     // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
1446     // is to compress each significant bit of the byte in a cross-lane way. Due
1447     // to the lack of a cross-lane bit-compress instruction, we use BEXT
1448     // (bit-compress in each lane) with the biggest lane size (T = D) then
1449     // concatenate the results.
1450 
1451     // The second source input of BEXT, initialized with 0x01 in each byte.
1452     // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1453     sve_dup(vtmp2, B, 1);
1454 
1455     // BEXT vtmp1.D, vtmp1.D, vtmp2.D
1456     // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1457     // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1458     //         ---------------------------------------
1459     // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1460     sve_bext(vtmp1, D, vtmp1, vtmp2);
1461 
1462     // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the
1463     // result to dst.
1464     // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1465     // dst   = 0x658D
1466     if (lane_cnt <= 8) {
1467       // No need to concatenate.
1468       umov(dst, vtmp1, B, 0);
1469     } else if (lane_cnt <= 16) {
1470       ins(vtmp1, B, vtmp1, 1, 8);
1471       umov(dst, vtmp1, H, 0);
1472     } else {
1473       // As the lane count is 64 at most, the final expected value must be in
1474       // the lowest 64 bits after narrowing vtmp1 from D to B.
1475       sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
1476       umov(dst, vtmp1, D, 0);
1477     }
1478   } else if (UseSVE > 0) {
1479     // Compress the lowest 8 bytes.
1480     fmovd(dst, vtmp1);
1481     bytemask_compress(dst);
1482     if (lane_cnt <= 8) return;
1483 
1484     // Repeat on higher bytes and join the results.
1485     // Compress 8 bytes in each iteration.
1486     for (int idx = 1; idx < (lane_cnt / 8); idx++) {
1487       sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
1488       bytemask_compress(rscratch1);
1489       orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
1490     }
1491   } else {
1492     assert(false, "unsupported");
1493     ShouldNotReachHere();
1494   }
1495 }
1496 
1497 // Unpack the mask, a long value in src, into predicate register dst based on the
1498 // corresponding data type. Note that dst can support at most 64 lanes.
1499 // Below example gives the expected dst predicate register in different types, with
1500 // a valid src(0x658D) on a 1024-bit vector size machine.
1501 // BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
1502 // SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
1503 // INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
1504 // LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1505 //
1506 // The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D which
1507 // has 24 significant bits would be an invalid input if dst predicate register refers to
1508 // a LONG type 1024-bit vector, which has at most 16 lanes.
1509 void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
1510                                            FloatRegister vtmp1, FloatRegister vtmp2) {
1511   assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
1512          lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
1513   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1514   // Example:   src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
1515   // Expected:  dst = 0b01101001 10001101
1516 
1517   // Put long value from general purpose register into the first lane of vector.
1518   // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1519   sve_dup(vtmp1, B, 0);
1520   mov(vtmp1, D, 0, src);
1521 
1522   // As sve_cmp generates mask value with the minimum unit in byte, we should
1523   // transform the value in the first lane which is mask in bit now to the
1524   // mask in byte, which can be done by SVE2's BDEP instruction.
1525 
1526   // The first source input of BDEP instruction. Deposite each byte in every 8 bytes.
1527   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1528   if (lane_cnt <= 8) {
1529     // Nothing. As only one byte exsits.
1530   } else if (lane_cnt <= 16) {
1531     ins(vtmp1, B, vtmp1, 8, 1);
1532     mov(vtmp1, B, 1, zr);
1533   } else {
1534     sve_vector_extend(vtmp1, D, vtmp1, B);
1535   }
1536 
1537   // The second source input of BDEP instruction, initialized with 0x01 for each byte.
1538   // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1539   sve_dup(vtmp2, B, 1);
1540 
1541   // BDEP vtmp1.D, vtmp1.D, vtmp2.D
1542   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1543   // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1544   //         ---------------------------------------
1545   // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1546   sve_bdep(vtmp1, D, vtmp1, vtmp2);
1547 
1548   if (bt != T_BYTE) {
1549     sve_vector_extend(vtmp1, size, vtmp1, B);
1550   }
1551   // Generate mask according to the given vector, in which the elements have been
1552   // extended to expected type.
1553   // dst = 0b01101001 10001101
1554   sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
1555 }
1556 
1557 // Clobbers: rflags
1558 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1559                                     FloatRegister zn, FloatRegister zm, Condition cond) {
1560   assert(pg->is_governing(), "This register has to be a governing predicate register");
1561   FloatRegister z1 = zn, z2 = zm;
1562   switch (cond) {
1563     case LE: z1 = zm; z2 = zn; cond = GE; break;
1564     case LT: z1 = zm; z2 = zn; cond = GT; break;
1565     case LO: z1 = zm; z2 = zn; cond = HI; break;
1566     case LS: z1 = zm; z2 = zn; cond = HS; break;
1567     default:
1568       break;
1569   }
1570 
1571   SIMD_RegVariant size = elemType_to_regVariant(bt);
1572   if (is_floating_point_type(bt)) {
1573     sve_fcm(cond, pd, size, pg, z1, z2);
1574   } else {
1575     assert(is_integral_type(bt), "unsupported element type");
1576     sve_cmp(cond, pd, size, pg, z1, z2);
1577   }
1578 }
1579 
1580 // Get index of the last mask lane that is set
1581 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
1582   SIMD_RegVariant size = elemType_to_regVariant(bt);
1583   sve_rev(ptmp, size, src);
1584   sve_brkb(ptmp, ptrue, ptmp, false);
1585   sve_cntp(dst, size, ptrue, ptmp);
1586   movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
1587   subw(dst, rscratch1, dst);
1588 }
1589 
1590 // Extend integer vector src to dst with the same lane count
1591 // but larger element size, e.g. 4B -> 4I
1592 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1593                                            FloatRegister src, BasicType src_bt, bool is_unsigned) {
1594   if (src_bt == T_BYTE) {
1595     if (dst_bt == T_SHORT) {
1596       // 4B/8B to 4S/8S
1597       _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1598     } else {
1599       // 4B to 4I
1600       assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1601       _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1602       _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1603     }
1604   } else if (src_bt == T_SHORT) {
1605     // 4S to 4I
1606     assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1607     _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1608   } else if (src_bt == T_INT) {
1609     // 2I to 2L
1610     assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1611     _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1612   } else {
1613     ShouldNotReachHere();
1614   }
1615 }
1616 
1617 // Narrow integer vector src down to dst with the same lane count
1618 // but smaller element size, e.g. 4I -> 4B
1619 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1620                                            FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1621   if (src_bt == T_SHORT) {
1622     // 4S/8S to 4B/8B
1623     assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1624     assert(dst_bt == T_BYTE, "unsupported");
1625     xtn(dst, T8B, src, T8H);
1626   } else if (src_bt == T_INT) {
1627     // 4I to 4B/4S
1628     assert(src_vlen_in_bytes == 16, "unsupported");
1629     assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1630     xtn(dst, T4H, src, T4S);
1631     if (dst_bt == T_BYTE) {
1632       xtn(dst, T8B, dst, T8H);
1633     }
1634   } else if (src_bt == T_LONG) {
1635     // 2L to 2I
1636     assert(src_vlen_in_bytes == 16, "unsupported");
1637     assert(dst_bt == T_INT, "unsupported");
1638     xtn(dst, T2S, src, T2D);
1639   } else {
1640     ShouldNotReachHere();
1641   }
1642 }
1643 
1644 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1645                                           FloatRegister src, SIMD_RegVariant src_size,
1646                                           bool is_unsigned) {
1647   assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
1648 
1649   if (src_size == B) {
1650     switch (dst_size) {
1651     case H:
1652       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1653       break;
1654     case S:
1655       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1656       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1657       break;
1658     case D:
1659       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1660       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1661       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1662       break;
1663     default:
1664       ShouldNotReachHere();
1665     }
1666   } else if (src_size == H) {
1667     if (dst_size == S) {
1668       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1669     } else { // D
1670       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1671       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1672     }
1673   } else if (src_size == S) {
1674     _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
1675   }
1676 }
1677 
1678 // Vector narrow from src to dst with specified element sizes.
1679 // High part of dst vector will be filled with zero.
1680 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1681                                           FloatRegister src, SIMD_RegVariant src_size,
1682                                           FloatRegister tmp) {
1683   assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1684   assert_different_registers(src, tmp);
1685   sve_dup(tmp, src_size, 0);
1686   if (src_size == D) {
1687     switch (dst_size) {
1688     case S:
1689       sve_uzp1(dst, S, src, tmp);
1690       break;
1691     case H:
1692       assert_different_registers(dst, tmp);
1693       sve_uzp1(dst, S, src, tmp);
1694       sve_uzp1(dst, H, dst, tmp);
1695       break;
1696     case B:
1697       assert_different_registers(dst, tmp);
1698       sve_uzp1(dst, S, src, tmp);
1699       sve_uzp1(dst, H, dst, tmp);
1700       sve_uzp1(dst, B, dst, tmp);
1701       break;
1702     default:
1703       ShouldNotReachHere();
1704     }
1705   } else if (src_size == S) {
1706     if (dst_size == H) {
1707       sve_uzp1(dst, H, src, tmp);
1708     } else { // B
1709       assert_different_registers(dst, tmp);
1710       sve_uzp1(dst, H, src, tmp);
1711       sve_uzp1(dst, B, dst, tmp);
1712     }
1713   } else if (src_size == H) {
1714     sve_uzp1(dst, B, src, tmp);
1715   }
1716 }
1717 
1718 // Extend src predicate to dst predicate with the same lane count but larger
1719 // element size, e.g. 64Byte -> 512Long
1720 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1721                                              uint dst_element_length_in_bytes,
1722                                              uint src_element_length_in_bytes) {
1723   if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1724     sve_punpklo(dst, src);
1725   } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1726     sve_punpklo(dst, src);
1727     sve_punpklo(dst, dst);
1728   } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1729     sve_punpklo(dst, src);
1730     sve_punpklo(dst, dst);
1731     sve_punpklo(dst, dst);
1732   } else {
1733     assert(false, "unsupported");
1734     ShouldNotReachHere();
1735   }
1736 }
1737 
1738 // Narrow src predicate to dst predicate with the same lane count but
1739 // smaller element size, e.g. 512Long -> 64Byte
1740 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1741                                              uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
1742   // The insignificant bits in src predicate are expected to be zero.
1743   // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
1744   // passed as the second argument. An example narrowing operation with a given mask would be -
1745   // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I
1746   // Mask (for 2 Longs) : TF
1747   // Predicate register for the above mask (16 bits) : 00000001 00000000
1748   // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
1749   // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0)
1750   assert_different_registers(src, ptmp);
1751   assert_different_registers(dst, ptmp);
1752   sve_pfalse(ptmp);
1753   if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1754     sve_uzp1(dst, B, src, ptmp);
1755   } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1756     sve_uzp1(dst, H, src, ptmp);
1757     sve_uzp1(dst, B, dst, ptmp);
1758   } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1759     sve_uzp1(dst, S, src, ptmp);
1760     sve_uzp1(dst, H, dst, ptmp);
1761     sve_uzp1(dst, B, dst, ptmp);
1762   } else {
1763     assert(false, "unsupported");
1764     ShouldNotReachHere();
1765   }
1766 }
1767 
1768 // Vector reduction add for integral type with ASIMD instructions.
1769 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1770                                                  Register isrc, FloatRegister vsrc,
1771                                                  unsigned vector_length_in_bytes,
1772                                                  FloatRegister vtmp) {
1773   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1774   assert_different_registers(dst, isrc);
1775   bool isQ = vector_length_in_bytes == 16;
1776 
1777   BLOCK_COMMENT("neon_reduce_add_integral {");
1778     switch(bt) {
1779       case T_BYTE:
1780         addv(vtmp, isQ ? T16B : T8B, vsrc);
1781         smov(dst, vtmp, B, 0);
1782         addw(dst, dst, isrc, ext::sxtb);
1783         break;
1784       case T_SHORT:
1785         addv(vtmp, isQ ? T8H : T4H, vsrc);
1786         smov(dst, vtmp, H, 0);
1787         addw(dst, dst, isrc, ext::sxth);
1788         break;
1789       case T_INT:
1790         isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1791         umov(dst, vtmp, S, 0);
1792         addw(dst, dst, isrc);
1793         break;
1794       case T_LONG:
1795         assert(isQ, "unsupported");
1796         addpd(vtmp, vsrc);
1797         umov(dst, vtmp, D, 0);
1798         add(dst, dst, isrc);
1799         break;
1800       default:
1801         assert(false, "unsupported");
1802         ShouldNotReachHere();
1803     }
1804   BLOCK_COMMENT("} neon_reduce_add_integral");
1805 }
1806 
1807 // Vector reduction multiply for integral type with ASIMD instructions.
1808 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
1809 // Clobbers: rscratch1
1810 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
1811                                                  Register isrc, FloatRegister vsrc,
1812                                                  unsigned vector_length_in_bytes,
1813                                                  FloatRegister vtmp1, FloatRegister vtmp2) {
1814   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1815   bool isQ = vector_length_in_bytes == 16;
1816 
1817   BLOCK_COMMENT("neon_reduce_mul_integral {");
1818     switch(bt) {
1819       case T_BYTE:
1820         if (isQ) {
1821           // Multiply the lower half and higher half of vector iteratively.
1822           // vtmp1 = vsrc[8:15]
1823           ins(vtmp1, D, vsrc, 0, 1);
1824           // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
1825           mulv(vtmp1, T8B, vtmp1, vsrc);
1826           // vtmp2 = vtmp1[4:7]
1827           ins(vtmp2, S, vtmp1, 0, 1);
1828           // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
1829           mulv(vtmp1, T8B, vtmp2, vtmp1);
1830         } else {
1831           ins(vtmp1, S, vsrc, 0, 1);
1832           mulv(vtmp1, T8B, vtmp1, vsrc);
1833         }
1834         // vtmp2 = vtmp1[2:3]
1835         ins(vtmp2, H, vtmp1, 0, 1);
1836         // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
1837         mulv(vtmp2, T8B, vtmp2, vtmp1);
1838         // dst = vtmp2[0] * isrc * vtmp2[1]
1839         umov(rscratch1, vtmp2, B, 0);
1840         mulw(dst, rscratch1, isrc);
1841         sxtb(dst, dst);
1842         umov(rscratch1, vtmp2, B, 1);
1843         mulw(dst, rscratch1, dst);
1844         sxtb(dst, dst);
1845         break;
1846       case T_SHORT:
1847         if (isQ) {
1848           ins(vtmp2, D, vsrc, 0, 1);
1849           mulv(vtmp2, T4H, vtmp2, vsrc);
1850           ins(vtmp1, S, vtmp2, 0, 1);
1851           mulv(vtmp1, T4H, vtmp1, vtmp2);
1852         } else {
1853           ins(vtmp1, S, vsrc, 0, 1);
1854           mulv(vtmp1, T4H, vtmp1, vsrc);
1855         }
1856         umov(rscratch1, vtmp1, H, 0);
1857         mulw(dst, rscratch1, isrc);
1858         sxth(dst, dst);
1859         umov(rscratch1, vtmp1, H, 1);
1860         mulw(dst, rscratch1, dst);
1861         sxth(dst, dst);
1862         break;
1863       case T_INT:
1864         if (isQ) {
1865           ins(vtmp1, D, vsrc, 0, 1);
1866           mulv(vtmp1, T2S, vtmp1, vsrc);
1867         } else {
1868           vtmp1 = vsrc;
1869         }
1870         umov(rscratch1, vtmp1, S, 0);
1871         mul(dst, rscratch1, isrc);
1872         umov(rscratch1, vtmp1, S, 1);
1873         mul(dst, rscratch1, dst);
1874         break;
1875       case T_LONG:
1876         umov(rscratch1, vsrc, D, 0);
1877         mul(dst, isrc, rscratch1);
1878         umov(rscratch1, vsrc, D, 1);
1879         mul(dst, dst, rscratch1);
1880         break;
1881       default:
1882         assert(false, "unsupported");
1883         ShouldNotReachHere();
1884     }
1885   BLOCK_COMMENT("} neon_reduce_mul_integral");
1886 }
1887 
1888 // Vector reduction multiply for floating-point type with ASIMD instructions.
1889 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
1890                                            FloatRegister fsrc, FloatRegister vsrc,
1891                                            unsigned vector_length_in_bytes,
1892                                            FloatRegister vtmp) {
1893   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1894   bool isQ = vector_length_in_bytes == 16;
1895 
1896   BLOCK_COMMENT("neon_reduce_mul_fp {");
1897     switch(bt) {
1898       case T_FLOAT:
1899         fmuls(dst, fsrc, vsrc);
1900         ins(vtmp, S, vsrc, 0, 1);
1901         fmuls(dst, dst, vtmp);
1902         if (isQ) {
1903           ins(vtmp, S, vsrc, 0, 2);
1904           fmuls(dst, dst, vtmp);
1905           ins(vtmp, S, vsrc, 0, 3);
1906           fmuls(dst, dst, vtmp);
1907          }
1908         break;
1909       case T_DOUBLE:
1910         assert(isQ, "unsupported");
1911         fmuld(dst, fsrc, vsrc);
1912         ins(vtmp, D, vsrc, 0, 1);
1913         fmuld(dst, dst, vtmp);
1914         break;
1915       default:
1916         assert(false, "unsupported");
1917         ShouldNotReachHere();
1918     }
1919   BLOCK_COMMENT("} neon_reduce_mul_fp");
1920 }
1921 
1922 // Helper to select logical instruction
1923 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
1924                                                    Register Rn, Register Rm,
1925                                                    enum shift_kind kind, unsigned shift) {
1926   switch(opc) {
1927     case Op_AndReductionV:
1928       is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
1929       break;
1930     case Op_OrReductionV:
1931       is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
1932       break;
1933     case Op_XorReductionV:
1934       is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
1935       break;
1936     default:
1937       assert(false, "unsupported");
1938       ShouldNotReachHere();
1939   }
1940 }
1941 
1942 // Vector reduction logical operations And, Or, Xor
1943 // Clobbers: rscratch1
1944 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
1945                                             Register isrc, FloatRegister vsrc,
1946                                             unsigned vector_length_in_bytes) {
1947   assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
1948          "unsupported");
1949   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1950   assert_different_registers(dst, isrc);
1951   bool isQ = vector_length_in_bytes == 16;
1952 
1953   BLOCK_COMMENT("neon_reduce_logical {");
1954     umov(rscratch1, vsrc, isQ ? D : S, 0);
1955     umov(dst, vsrc, isQ ? D : S, 1);
1956     neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
1957     switch(bt) {
1958       case T_BYTE:
1959         if (isQ) {
1960           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1961         }
1962         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1963         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
1964         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1965         sxtb(dst, dst);
1966         break;
1967       case T_SHORT:
1968         if (isQ) {
1969           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1970         }
1971         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1972         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1973         sxth(dst, dst);
1974         break;
1975       case T_INT:
1976         if (isQ) {
1977           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1978         }
1979         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1980         break;
1981       case T_LONG:
1982         assert(isQ, "unsupported");
1983         neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
1984         break;
1985       default:
1986         assert(false, "unsupported");
1987         ShouldNotReachHere();
1988     }
1989   BLOCK_COMMENT("} neon_reduce_logical");
1990 }
1991 
1992 // Vector reduction min/max for integral type with ASIMD instructions.
1993 // Note: vtmp is not used and expected to be fnoreg for T_LONG case.
1994 // Clobbers: rscratch1, rflags
1995 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
1996                                                     Register isrc, FloatRegister vsrc,
1997                                                     unsigned vector_length_in_bytes,
1998                                                     FloatRegister vtmp) {
1999   assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
2000   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2001   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
2002   assert_different_registers(dst, isrc);
2003   bool isQ = vector_length_in_bytes == 16;
2004   bool is_min = opc == Op_MinReductionV;
2005 
2006   BLOCK_COMMENT("neon_reduce_minmax_integral {");
2007     if (bt == T_LONG) {
2008       assert(vtmp == fnoreg, "should be");
2009       assert(isQ, "should be");
2010       umov(rscratch1, vsrc, D, 0);
2011       cmp(isrc, rscratch1);
2012       csel(dst, isrc, rscratch1, is_min ? LT : GT);
2013       umov(rscratch1, vsrc, D, 1);
2014       cmp(dst, rscratch1);
2015       csel(dst, dst, rscratch1, is_min ? LT : GT);
2016     } else {
2017       SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
2018       if (size == T2S) {
2019         is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
2020       } else {
2021         is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
2022       }
2023       if (bt == T_INT) {
2024         umov(dst, vtmp, S, 0);
2025       } else {
2026         smov(dst, vtmp, elemType_to_regVariant(bt), 0);
2027       }
2028       cmpw(dst, isrc);
2029       cselw(dst, dst, isrc, is_min ? LT : GT);
2030     }
2031   BLOCK_COMMENT("} neon_reduce_minmax_integral");
2032 }
2033 
2034 // Vector reduction for integral type with SVE instruction.
2035 // Supported operations are Add, And, Or, Xor, Max, Min.
2036 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
2037 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
2038                                             FloatRegister src2, PRegister pg, FloatRegister tmp) {
2039   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2040   assert(pg->is_governing(), "This register has to be a governing predicate register");
2041   assert_different_registers(src1, dst);
2042   // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
2043   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2044   switch (opc) {
2045     case Op_AddReductionVI: {
2046       sve_uaddv(tmp, size, pg, src2);
2047       if (bt == T_BYTE) {
2048         smov(dst, tmp, size, 0);
2049         addw(dst, src1, dst, ext::sxtb);
2050       } else if (bt == T_SHORT) {
2051         smov(dst, tmp, size, 0);
2052         addw(dst, src1, dst, ext::sxth);
2053       } else {
2054         umov(dst, tmp, size, 0);
2055         addw(dst, dst, src1);
2056       }
2057       break;
2058     }
2059     case Op_AddReductionVL: {
2060       sve_uaddv(tmp, size, pg, src2);
2061       umov(dst, tmp, size, 0);
2062       add(dst, dst, src1);
2063       break;
2064     }
2065     case Op_AndReductionV: {
2066       sve_andv(tmp, size, pg, src2);
2067       if (bt == T_INT || bt == T_LONG) {
2068         umov(dst, tmp, size, 0);
2069       } else {
2070         smov(dst, tmp, size, 0);
2071       }
2072       if (bt == T_LONG) {
2073         andr(dst, dst, src1);
2074       } else {
2075         andw(dst, dst, src1);
2076       }
2077       break;
2078     }
2079     case Op_OrReductionV: {
2080       sve_orv(tmp, size, pg, src2);
2081       if (bt == T_INT || bt == T_LONG) {
2082         umov(dst, tmp, size, 0);
2083       } else {
2084         smov(dst, tmp, size, 0);
2085       }
2086       if (bt == T_LONG) {
2087         orr(dst, dst, src1);
2088       } else {
2089         orrw(dst, dst, src1);
2090       }
2091       break;
2092     }
2093     case Op_XorReductionV: {
2094       sve_eorv(tmp, size, pg, src2);
2095       if (bt == T_INT || bt == T_LONG) {
2096         umov(dst, tmp, size, 0);
2097       } else {
2098         smov(dst, tmp, size, 0);
2099       }
2100       if (bt == T_LONG) {
2101         eor(dst, dst, src1);
2102       } else {
2103         eorw(dst, dst, src1);
2104       }
2105       break;
2106     }
2107     case Op_MaxReductionV: {
2108       sve_smaxv(tmp, size, pg, src2);
2109       if (bt == T_INT || bt == T_LONG) {
2110         umov(dst, tmp, size, 0);
2111       } else {
2112         smov(dst, tmp, size, 0);
2113       }
2114       if (bt == T_LONG) {
2115         cmp(dst, src1);
2116         csel(dst, dst, src1, Assembler::GT);
2117       } else {
2118         cmpw(dst, src1);
2119         cselw(dst, dst, src1, Assembler::GT);
2120       }
2121       break;
2122     }
2123     case Op_MinReductionV: {
2124       sve_sminv(tmp, size, pg, src2);
2125       if (bt == T_INT || bt == T_LONG) {
2126         umov(dst, tmp, size, 0);
2127       } else {
2128         smov(dst, tmp, size, 0);
2129       }
2130       if (bt == T_LONG) {
2131         cmp(dst, src1);
2132         csel(dst, dst, src1, Assembler::LT);
2133       } else {
2134         cmpw(dst, src1);
2135         cselw(dst, dst, src1, Assembler::LT);
2136       }
2137       break;
2138     }
2139     default:
2140       assert(false, "unsupported");
2141       ShouldNotReachHere();
2142   }
2143 
2144   if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
2145     if (bt == T_BYTE) {
2146       sxtb(dst, dst);
2147     } else if (bt == T_SHORT) {
2148       sxth(dst, dst);
2149     }
2150   }
2151 }
2152 
2153 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
2154 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
2155 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
2156 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
2157   uint32_t max_vector_length = Matcher::max_vector_size(bt);
2158   assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
2159 
2160   // Set all elements to false if the input "lane_cnt" is zero.
2161   if (lane_cnt == 0) {
2162     sve_pfalse(dst);
2163     return;
2164   }
2165 
2166   SIMD_RegVariant size = elemType_to_regVariant(bt);
2167   assert(size != Q, "invalid size");
2168 
2169   // Set all true if "lane_cnt" equals to the max lane count.
2170   if (lane_cnt == max_vector_length) {
2171     sve_ptrue(dst, size, /* ALL */ 0b11111);
2172     return;
2173   }
2174 
2175   // Fixed numbers for "ptrue".
2176   switch(lane_cnt) {
2177   case 1: /* VL1 */
2178   case 2: /* VL2 */
2179   case 3: /* VL3 */
2180   case 4: /* VL4 */
2181   case 5: /* VL5 */
2182   case 6: /* VL6 */
2183   case 7: /* VL7 */
2184   case 8: /* VL8 */
2185     sve_ptrue(dst, size, lane_cnt);
2186     return;
2187   case 16:
2188     sve_ptrue(dst, size, /* VL16 */ 0b01001);
2189     return;
2190   case 32:
2191     sve_ptrue(dst, size, /* VL32 */ 0b01010);
2192     return;
2193   case 64:
2194     sve_ptrue(dst, size, /* VL64 */ 0b01011);
2195     return;
2196   case 128:
2197     sve_ptrue(dst, size, /* VL128 */ 0b01100);
2198     return;
2199   case 256:
2200     sve_ptrue(dst, size, /* VL256 */ 0b01101);
2201     return;
2202   default:
2203     break;
2204   }
2205 
2206   // Special patterns for "ptrue".
2207   if (lane_cnt == round_down_power_of_2(max_vector_length)) {
2208     sve_ptrue(dst, size, /* POW2 */ 0b00000);
2209   } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
2210     sve_ptrue(dst, size, /* MUL4 */ 0b11101);
2211   } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
2212     sve_ptrue(dst, size, /* MUL3 */ 0b11110);
2213   } else {
2214     // Encode to "whileltw" for the remaining cases.
2215     mov(rscratch1, lane_cnt);
2216     sve_whileltw(dst, size, zr, rscratch1);
2217   }
2218 }
2219 
2220 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
2221 // Any remaining elements of dst will be filled with zero.
2222 // Clobbers: rscratch1
2223 // Preserves: src, mask
2224 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
2225                                            FloatRegister vtmp1, FloatRegister vtmp2,
2226                                            PRegister pgtmp) {
2227   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2228   assert_different_registers(dst, src, vtmp1, vtmp2);
2229   assert_different_registers(mask, pgtmp);
2230 
2231   // Example input:   src   = 8888 7777 6666 5555 4444 3333 2222 1111
2232   //                  mask  = 0001 0000 0000 0001 0001 0000 0001 0001
2233   // Expected result: dst   = 0000 0000 0000 8888 5555 4444 2222 1111
2234   sve_dup(vtmp2, H, 0);
2235 
2236   // Extend lowest half to type INT.
2237   // dst = 00004444 00003333 00002222 00001111
2238   sve_uunpklo(dst, S, src);
2239   // pgtmp = 00000001 00000000 00000001 00000001
2240   sve_punpklo(pgtmp, mask);
2241   // Pack the active elements in size of type INT to the right,
2242   // and fill the remainings with zero.
2243   // dst = 00000000 00004444 00002222 00001111
2244   sve_compact(dst, S, dst, pgtmp);
2245   // Narrow the result back to type SHORT.
2246   // dst = 0000 0000 0000 0000 0000 4444 2222 1111
2247   sve_uzp1(dst, H, dst, vtmp2);
2248   // Count the active elements of lowest half.
2249   // rscratch1 = 3
2250   sve_cntp(rscratch1, S, ptrue, pgtmp);
2251 
2252   // Repeat to the highest half.
2253   // pgtmp = 00000001 00000000 00000000 00000001
2254   sve_punpkhi(pgtmp, mask);
2255   // vtmp1 = 00008888 00007777 00006666 00005555
2256   sve_uunpkhi(vtmp1, S, src);
2257   // vtmp1 = 00000000 00000000 00008888 00005555
2258   sve_compact(vtmp1, S, vtmp1, pgtmp);
2259   // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2260   sve_uzp1(vtmp1, H, vtmp1, vtmp2);
2261 
2262   // Compressed low:   dst   = 0000 0000 0000 0000 0000 4444 2222 1111
2263   // Compressed high:  vtmp1 = 0000 0000 0000 0000 0000 0000 8888  5555
2264   // Left shift(cross lane) compressed high with TRUE_CNT lanes,
2265   // TRUE_CNT is the number of active elements in the compressed low.
2266   neg(rscratch1, rscratch1);
2267   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2268   sve_index(vtmp2, H, rscratch1, 1);
2269   // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
2270   sve_tbl(vtmp1, H, vtmp1, vtmp2);
2271 
2272   // Combine the compressed high(after shifted) with the compressed low.
2273   // dst = 0000 0000 0000 8888 5555 4444 2222 1111
2274   sve_orr(dst, dst, vtmp1);
2275 }
2276 
2277 // Clobbers: rscratch1, rscratch2
2278 // Preserves: src, mask
2279 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
2280                                           FloatRegister vtmp1, FloatRegister vtmp2,
2281                                           FloatRegister vtmp3, FloatRegister vtmp4,
2282                                           PRegister ptmp, PRegister pgtmp) {
2283   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2284   assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
2285   assert_different_registers(mask, ptmp, pgtmp);
2286   // Example input:   src   = 88 77 66 55 44 33 22 11
2287   //                  mask  = 01 00 00 01 01 00 01 01
2288   // Expected result: dst   = 00 00 00 88 55 44 22 11
2289 
2290   sve_dup(vtmp4, B, 0);
2291   // Extend lowest half to type SHORT.
2292   // vtmp1 = 0044 0033 0022 0011
2293   sve_uunpklo(vtmp1, H, src);
2294   // ptmp = 0001 0000 0001 0001
2295   sve_punpklo(ptmp, mask);
2296   // Count the active elements of lowest half.
2297   // rscratch2 = 3
2298   sve_cntp(rscratch2, H, ptrue, ptmp);
2299   // Pack the active elements in size of type SHORT to the right,
2300   // and fill the remainings with zero.
2301   // dst = 0000 0044 0022 0011
2302   sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
2303   // Narrow the result back to type BYTE.
2304   // dst = 00 00 00 00 00 44 22 11
2305   sve_uzp1(dst, B, dst, vtmp4);
2306 
2307   // Repeat to the highest half.
2308   // ptmp = 0001 0000 0000 0001
2309   sve_punpkhi(ptmp, mask);
2310   // vtmp1 = 0088 0077 0066 0055
2311   sve_uunpkhi(vtmp2, H, src);
2312   // vtmp1 = 0000 0000 0088 0055
2313   sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);
2314 
2315   sve_dup(vtmp4, B, 0);
2316   // vtmp1 = 00 00 00 00 00 00 88 55
2317   sve_uzp1(vtmp1, B, vtmp1, vtmp4);
2318 
2319   // Compressed low:   dst   = 00 00 00 00 00 44 22 11
2320   // Compressed high:  vtmp1 = 00 00 00 00 00 00 88 55
2321   // Left shift(cross lane) compressed high with TRUE_CNT lanes,
2322   // TRUE_CNT is the number of active elements in the compressed low.
2323   neg(rscratch2, rscratch2);
2324   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2325   sve_index(vtmp2, B, rscratch2, 1);
2326   // vtmp1 = 00 00 00 88 55 00 00 00
2327   sve_tbl(vtmp1, B, vtmp1, vtmp2);
2328   // Combine the compressed high(after shifted) with the compressed low.
2329   // dst = 00 00 00 88 55 44 22 11
2330   sve_orr(dst, dst, vtmp1);
2331 }
2332 
2333 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2334   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2335   SIMD_Arrangement size = isQ ? T16B : T8B;
2336   if (bt == T_BYTE) {
2337     rbit(dst, size, src);
2338   } else {
2339     neon_reverse_bytes(dst, src, bt, isQ);
2340     rbit(dst, size, dst);
2341   }
2342 }
2343 
2344 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2345   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2346   SIMD_Arrangement size = isQ ? T16B : T8B;
2347   switch (bt) {
2348     case T_BYTE:
2349       if (dst != src) {
2350         orr(dst, size, src, src);
2351       }
2352       break;
2353     case T_SHORT:
2354       rev16(dst, size, src);
2355       break;
2356     case T_INT:
2357       rev32(dst, size, src);
2358       break;
2359     case T_LONG:
2360       rev64(dst, size, src);
2361       break;
2362     default:
2363       assert(false, "unsupported");
2364       ShouldNotReachHere();
2365   }
2366 }
2367 
2368 // Extract a scalar element from an sve vector at position 'idx'.
2369 // The input elements in src are expected to be of integral type.
2370 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2371                                              int idx, FloatRegister vtmp) {
2372   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2373   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2374   if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
2375     if (bt == T_INT || bt == T_LONG) {
2376       umov(dst, src, size, idx);
2377     } else {
2378       smov(dst, src, size, idx);
2379     }
2380   } else {
2381     sve_orr(vtmp, src, src);
2382     sve_ext(vtmp, vtmp, idx << size);
2383     if (bt == T_INT || bt == T_LONG) {
2384       umov(dst, vtmp, size, 0);
2385     } else {
2386       smov(dst, vtmp, size, 0);
2387     }
2388   }
2389 }
2390 
2391 // java.lang.Math::round intrinsics
2392 
2393 // Clobbers: rscratch1, rflags
2394 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2395                                           FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
2396   assert_different_registers(tmp1, tmp2, tmp3, src, dst);
2397   switch (T) {
2398     case T2S:
2399     case T4S:
2400       fmovs(tmp1, T, 0.5f);
2401       mov(rscratch1, jint_cast(0x1.0p23f));
2402       break;
2403     case T2D:
2404       fmovd(tmp1, T, 0.5);
2405       mov(rscratch1, julong_cast(0x1.0p52));
2406       break;
2407     default:
2408       assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2409   }
2410   fadd(tmp1, T, tmp1, src);
2411   fcvtms(tmp1, T, tmp1);
2412   // tmp1 = floor(src + 0.5, ties to even)
2413 
2414   fcvtas(dst, T, src);
2415   // dst = round(src), ties to away
2416 
2417   fneg(tmp3, T, src);
2418   dup(tmp2, T, rscratch1);
2419   cm(HS, tmp3, T, tmp3, tmp2);
2420   // tmp3 is now a set of flags
2421 
2422   bif(dst, T16B, tmp1, tmp3);
2423   // result in dst
2424 }
2425 
2426 // Clobbers: rscratch1, rflags
2427 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2428                                          FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
2429   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2430   assert_different_registers(tmp1, tmp2, src, dst);
2431 
2432   switch (T) {
2433     case S:
2434       mov(rscratch1, jint_cast(0x1.0p23f));
2435       break;
2436     case D:
2437       mov(rscratch1, julong_cast(0x1.0p52));
2438       break;
2439     default:
2440       assert(T == S || T == D, "invalid register variant");
2441   }
2442 
2443   sve_frinta(dst, T, ptrue, src);
2444   // dst = round(src), ties to away
2445 
2446   Label none;
2447 
2448   sve_fneg(tmp1, T, ptrue, src);
2449   sve_dup(tmp2, T, rscratch1);
2450   sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
2451   br(EQ, none);
2452   {
2453     sve_cpy(tmp1, T, pgtmp, 0.5);
2454     sve_fadd(tmp1, T, pgtmp, src);
2455     sve_frintm(dst, T, pgtmp, tmp1);
2456     // dst = floor(src + 0.5, ties to even)
2457   }
2458   bind(none);
2459 
2460   sve_fcvtzs(dst, T, ptrue, dst, T);
2461   // result in dst
2462 }
2463 
2464 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2465                                            FloatRegister one, SIMD_Arrangement T) {
2466   assert_different_registers(dst, src, zero, one);
2467   assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2468 
2469   facgt(dst, T, src, zero);
2470   ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
2471   bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2472 }
2473 
2474 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2475                                           FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
2476     assert_different_registers(dst, src, zero, one, vtmp);
2477     assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2478 
2479     sve_orr(vtmp, src, src);
2480     sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pmtp=0 for +-0.0 and NaN. 0x1 otherwise
2481     switch (T) {
2482     case S:
2483       sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
2484       sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
2485                                         // on the sign of the float value
2486       break;
2487     case D:
2488       sve_and(vtmp, T, min_jlong);
2489       sve_orr(vtmp, T, jlong_cast(1.0));
2490       break;
2491     default:
2492       assert(false, "unsupported");
2493       ShouldNotReachHere();
2494     }
2495     sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
2496                                        // Result in dst
2497 }
2498 
2499 bool C2_MacroAssembler::in_scratch_emit_size() {
2500   if (ciEnv::current()->task() != nullptr) {
2501     PhaseOutput* phase_output = Compile::current()->output();
2502     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2503       return true;
2504     }
2505   }
2506   return MacroAssembler::in_scratch_emit_size();
2507 }