/*
 * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/matcher.hpp"
#include "opto/output.hpp"
#include "opto/subnode.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/globalDefinitions.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

void C2_MacroAssembler::entry_barrier() {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
    // Dummy labels for just measuring the code size
    Label dummy_slow_path;
    Label dummy_continuation;
    Label dummy_guard;
    Label* slow_path = &dummy_slow_path;
    Label* continuation = &dummy_continuation;
    Label* guard = &dummy_guard;
    if (!Compile::current()->output()->in_scratch_emit_size()) {
      // Use real labels from actual stub when not emitting code for the purpose of measuring its size
      C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
      Compile::current()->output()->add_stub(stub);
      slow_path = &stub->entry();
      continuation = &stub->continuation();
      guard = &stub->guard();
    }
    // In the C2 code, we move the non-hot part of nmethod entry barriers out-of-line to a stub.
    bs->nmethod_entry_barrier(this, slow_path, continuation, guard);
  }
}

void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg,
                                  Register tmp2Reg, Register tmp3Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;

  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  assert_different_registers(oop, box, tmp, disp_hdr);

  // Load markWord from object into displaced_header.
  ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmp, oop);
    ldrw(tmp, Address(tmp, Klass::access_flags_offset()));
    tstw(tmp, JVM_ACC_IS_VALUE_BASED_CLASS);
    br(Assembler::NE, cont);
  }

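  // The low two bits of the markWord encode the lock state: 0b01 means
  // unlocked, 0b00 stack-locked (the mark is a pointer to a BasicLock on
  // the owning thread's stack), and 0b10 inflated (the mark is a tagged
  // ObjectMonitor*).
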
  // Check for existing monitor
  tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Set tmp to be (markWord of object | UNLOCK_VALUE).
    orr(tmp, disp_hdr, markWord::unlocked_value);

    if (EnableValhalla) {
      // Mask inline_type bit such that we go to the slow path if object is an inline type
      andr(tmp, tmp, ~((int) markWord::inline_type_bit_in_place));
    }

    // Initialize the box. (Must happen before we update the object mark!)
    str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // Compare object markWord with an unlocked value (tmp) and if
    // equal exchange the stack address of our box with object markWord.
    // On failure disp_hdr contains the possibly locked markWord.
    cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true,
            /*release*/ true, /*weak*/ false, disp_hdr);
    br(Assembler::EQ, cont);

    assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

    // If the compare-and-exchange succeeded, then we found an unlocked
    // object, have now locked it, and will continue at label cont.

    // Check if the owner is self by comparing the value in the
    // markWord of object (disp_hdr) with the stack pointer.
    mov(rscratch1, sp);
    sub(disp_hdr, disp_hdr, rscratch1);
    mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place));
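    // disp_hdr now holds (original markWord - sp). ANDed with
    // ~(page_size - 1) | lock_mask this is zero only if the mark was a
    // stack address within one page of sp with clear lock bits, i.e. a
    // stack-lock owned by the current thread: a recursive lock.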
    // If the test yields zero, flags are EQ (success) and we can store 0
    // as the displaced header in the box, which marks a recursive lock.
    ands(tmp/*==0?*/, disp_hdr, tmp);   // Sets flags for result
    str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
    b(cont);
  }

  // Handle existing monitor.
  bind(object_has_monitor);

  // The object's monitor m is unlocked iff m->owner == nullptr,
  // otherwise m->owner may contain a thread or a stack address.
  //
  // Try to CAS m->owner from null to current thread.
  add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value));
  cmpxchg(tmp, zr, rthread, Assembler::xword, /*acquire*/ true,
          /*release*/ true, /*weak*/ false, tmp3Reg); // Sets flags for result

  // Store a non-null value into the box to avoid looking like a re-entrant
  // lock. The fast-path monitor unlock code checks for
  // markWord::monitor_value so use markWord::unused_mark which has the
  // relevant bit set, and also matches ObjectSynchronizer::enter.
  mov(tmp, (address)markWord::unused_mark().value());
  str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

  br(Assembler::EQ, cont); // CAS success means locking succeeded

  cmp(tmp3Reg, rthread);
  br(Assembler::NE, cont); // Check for recursive locking

  // Recursive lock case
  increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1);
  // flag == EQ still from the cmp above, checking if this is a reentrant lock

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  increment(Address(rthread, JavaThread::held_monitor_count_offset()));

  bind(no_count);
}

void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg,
                                    Register tmp2Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;

  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert_different_registers(oop, box, tmp, disp_hdr);

  if (LockingMode == LM_LEGACY) {
    // Find the lock address and load the displaced header from the stack.
    ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // If the displaced header is 0, we have a recursive unlock.
    cmp(disp_hdr, zr);
    br(Assembler::EQ, cont);
  }

  // Handle existing monitor.
  ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
  tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Check if it is still a lightweight lock; this is true if we see
    // the stack address of the BasicLock in the markWord of the
    // object.

    cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false,
            /*release*/ true, /*weak*/ false, tmp);
    b(cont);
  }

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // Handle existing monitor.
  bind(object_has_monitor);
  STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
  add(tmp, tmp, -(int)markWord::monitor_value); // monitor

  ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));

  Label notRecursive;
  cbz(disp_hdr, notRecursive);

  // Recursive lock
  sub(disp_hdr, disp_hdr, 1u);
  str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
  cmp(disp_hdr, disp_hdr); // Sets flags for result
  b(cont);

  bind(notRecursive);
  ldr(rscratch1, Address(tmp, ObjectMonitor::EntryList_offset()));
  ldr(disp_hdr, Address(tmp, ObjectMonitor::cxq_offset()));
  orr(rscratch1, rscratch1, disp_hdr); // Will be 0 if both are 0.
  cmp(rscratch1, zr); // Sets flags for result
  cbnz(rscratch1, cont);
  // need a release store here
  lea(tmp, Address(tmp, ObjectMonitor::owner_offset()));
  stlr(zr, tmp); // set unowned

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  decrement(Address(rthread, JavaThread::held_monitor_count_offset()));

  bind(no_count);
}

void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register t1,
                                              Register t2, Register t3) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert_different_registers(obj, t1, t2, t3);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. MUST branch to this label with flag == EQ
  Label locked;
  // Finish fast lock unsuccessfully. MUST branch to this label with flag == NE
  Label slow_path;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(t1, obj);
    ldrw(t1, Address(t1, Klass::access_flags_offset()));
    tstw(t1, JVM_ACC_IS_VALUE_BASED_CLASS);
    br(Assembler::NE, slow_path);
  }

  const Register t1_mark = t1;

  { // Lightweight locking

    // Push lock to the lock stack and finish successfully.
    // MUST branch to this label with flag == EQ
    Label push;

    const Register t2_top = t2;
    const Register t3_t = t3;

    // Check if lock-stack is full.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
    br(Assembler::GT, slow_path);

    // Check if recursive.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, push);

    // Relaxed normal load to check for monitor. Optimization for monitor case.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);

    // Not inflated
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");

    // Try to lock. Transition lock-bits 0b01 => 0b00
    orr(t1_mark, t1_mark, markWord::unlocked_value);
    eor(t3_t, t1_mark, markWord::unlocked_value);
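    // orr above forces the expected mark to have the unlocked bit set;
    // eor clears it to form the locked value, so the CAS succeeds only
    // if the object really was unlocked.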
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
    br(Assembler::NE, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    str(obj, Address(rthread, t2_top));
    addw(t2_top, t2_top, oopSize);
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    // mark contains the tagged ObjectMonitor*.
    const Register t1_tagged_monitor = t1_mark;
    const uintptr_t monitor_tag = markWord::monitor_value;
    const Register t2_owner_addr = t2;
    const Register t3_owner = t3;

    // Compute owner address.
    lea(t2_owner_addr, Address(t1_tagged_monitor, (in_bytes(ObjectMonitor::owner_offset()) - monitor_tag)));

    // CAS owner (null => current thread).
    cmpxchg(t2_owner_addr, zr, rthread, Assembler::xword, /*acquire*/ true,
            /*release*/ false, /*weak*/ false, t3_owner);
    br(Assembler::EQ, locked);

    // Check if recursive.
    cmp(t3_owner, rthread);
    br(Assembler::NE, slow_path);

    // Recursive.
    increment(Address(t1_tagged_monitor, in_bytes(ObjectMonitor::recursions_offset()) - monitor_tag), 1);
  }

  bind(locked);
  increment(Address(rthread, JavaThread::held_monitor_count_offset()));

#ifdef ASSERT
  // Check that locked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Lock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Lock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}

void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register t1, Register t2,
                                                Register t3) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert_different_registers(obj, t1, t2, t3);

  // Handle inflated monitor.
  Label inflated, inflated_load_monitor;
  // Finish fast unlock successfully. MUST branch to this label with flag == EQ
  Label unlocked;
  // Finish fast unlock unsuccessfully. MUST branch to this label with flag == NE
  Label slow_path;

  const Register t1_mark = t1;
  const Register t2_top = t2;
  const Register t3_t = t3;

  { // Lightweight unlock

    // Check if obj is top of lock-stack.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    subw(t2_top, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    // Top of lock stack was not obj. Must be monitor.
    br(Assembler::NE, inflated_load_monitor);

    // Pop lock-stack.
    DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));

    // Check if recursive.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, unlocked);

    // Not recursive.
    // Load Mark.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Check header for monitor (0b10).
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);

    // Try to unlock. Transition lock bits 0b00 => 0b01
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
    orr(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
    br(Assembler::EQ, unlocked);

    // Compare and exchange failed.
    // Restore lock-stack and handle the unlock in runtime.
    DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
    addw(t2_top, t2_top, oopSize);
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(slow_path);
  }

  { // Handle inflated monitor.
    bind(inflated_load_monitor);
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
#ifdef ASSERT
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

#ifdef ASSERT
    Label check_done;
    subw(t2_top, t2_top, oopSize);
    cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
    br(Assembler::LT, check_done);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    br(Assembler::NE, inflated);
    stop("Fast Unlock lock on stack");
    bind(check_done);
#endif

    // mark contains the tagged ObjectMonitor*.
    const Register t1_monitor = t1_mark;
    const uintptr_t monitor_tag = markWord::monitor_value;

    // Untag the monitor.
    sub(t1_monitor, t1_mark, monitor_tag);

    const Register t2_recursions = t2;
    Label not_recursive;

    // Check if recursive.
    ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
    cbz(t2_recursions, not_recursive);

    // Recursive unlock.
    sub(t2_recursions, t2_recursions, 1u);
    str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
    // Set flag == EQ
    cmp(t2_recursions, t2_recursions);
    b(unlocked);

    bind(not_recursive);

    Label release;
    const Register t2_owner_addr = t2;

    // Compute owner address.
    lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));

    // Check if the entry lists are empty.
    ldr(rscratch1, Address(t1_monitor, ObjectMonitor::EntryList_offset()));
    ldr(t3_t, Address(t1_monitor, ObjectMonitor::cxq_offset()));
    orr(rscratch1, rscratch1, t3_t);
    cmp(rscratch1, zr);
    br(Assembler::EQ, release);

    // The owner may be anonymous and we removed the last obj entry in
    // the lock-stack. This loses the information about the owner.
    // Write the thread to the owner field so the runtime knows the owner.
    str(rthread, Address(t2_owner_addr));
    b(slow_path);

    bind(release);
    // Set owner to null.
    // Release to satisfy the JMM
    stlr(zr, t2_owner_addr);
  }

  bind(unlocked);
  decrement(Address(rthread, JavaThread::held_monitor_count_offset()));

#ifdef ASSERT
  // Check that unlocked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Unlock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Unlock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}

// Search for str1 in str2 and return index or -1
// Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer-Moore algorithm.
  // With a small pattern and source we use a linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

// The Boyer-Moore algorithm is based on the description here:
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// This describes an algorithm with 2 shift rules: the 'Bad Character' rule
// and the 'Good Suffix' rule.
//
// These rules are essentially heuristics for how far we can shift the
// pattern along the search string.
//
// The implementation here uses the 'Bad Character' rule only because of the
// complexity of initialisation for the 'Good Suffix' rule.
//
// This is also known as the Boyer-Moore-Horspool algorithm:
//
// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
//
// This particular implementation has a few Java-specific optimizations.
//
// #define ASIZE 256
//
//    int bm(unsigned char *x, int m, unsigned char *y, int n) {
//       int i, j;
//       unsigned c;
//       unsigned char bc[ASIZE];
//
//       /* Preprocessing */
//       for (i = 0; i < ASIZE; ++i)
//          bc[i] = m;
//       for (i = 0; i < m - 1; ) {
//          c = x[i];
//          ++i;
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef PATTERN_STRING_IS_LATIN1
//          bc[c] = m - i;
//          #else
//          if (c < ASIZE) bc[c] = m - i;
//          #endif
//       }
//
//       /* Searching */
//       j = 0;
//       while (j <= n - m) {
//          c = y[j+m-1];
//          if (x[m-1] == c) {
//             for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//             if (i < 0) return j;
//          }
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef SOURCE_STRING_IS_LATIN1
//          // LL case: (c< 256) always true. Remove branch
//          j += bc[y[j+m-1]];
//          #endif
//          #ifndef PATTERN_STRING_IS_UTF
//          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += 1
//          #endif
//          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
//          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += m
//          #endif
//       }
//    }

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >= 8, so we can read at least one register for the cases
    // when UTF->Latin1 conversion is not needed (8 LL or 4 UU symbols) and
    // half a register for the UL case. We'll re-read the last character in
    // the inner pre-loop code to keep a single load in the outer pre-loop.
    const int firstStep = isL ? 7 : 3;

    const int ASIZE = 256;
    const int STORED_BYTES = 32; // number of bytes stored per instruction
    sub(sp, sp, ASIZE);
    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
    mov(ch1, sp);
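    // Fill the ASIZE-byte bad-character table on the stack with the pattern
    // length: v0 was pre-filled with cnt1 (dup above; cnt1 < 256 here), and
    // each stpq stores 32 bytes of it.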
    BIND(BM_INIT_LOOP);
      stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
      subs(tmp5, tmp5, 1);
      br(GT, BM_INIT_LOOP);

      sub(cnt1tmp, cnt1, 1);
      mov(tmp5, str2);
      add(str2end, str2, result_tmp, LSL, str2_chr_shift);
      sub(ch2, cnt1, 1);
      mov(tmp3, str1);
    BIND(BCLOOP);
      (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
      if (!str1_isL) {
        subs(zr, ch1, ASIZE);
        br(HS, BCSKIP);
      }
      strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
      subs(ch2, ch2, 1);
      br(GT, BCLOOP);

      add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
      if (str1_isL == str2_isL) {
        // load last 8 bytes (8LL/4UU symbols)
        ldr(tmp6, Address(tmp6, -wordSize));
      } else {
        ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes (4 symbols)
        // Convert Latin1 to UTF. We'll have to wait until the load completes,
        // but it's still faster than per-character loads+checks.
        lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
        ubfx(ch1, tmp6, 8, 8); // str1[N-2]
        ubfx(ch2, tmp6, 16, 8); // str1[N-3]
        andr(tmp6, tmp6, 0xFF); // str1[N-4]
        orr(ch2, ch1, ch2, LSL, 16);
        orr(tmp6, tmp6, tmp3, LSL, 48);
        orr(tmp6, tmp6, ch2, LSL, 16);
      }
    BIND(BMLOOPSTR2);
      (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
      sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
      if (str1_isL == str2_isL) {
        // re-init tmp3. It's free because it's executed in parallel with
        // the load above. The alternative is to initialize it before the
        // loop, but it'll affect performance on in-order systems with 2 or
        // more ld/st pipelines.
        lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
      }
      if (!isL) { // UU/UL case
        lsl(ch2, cnt1tmp, 1); // offset in bytes
      }
      cmp(tmp3, skipch);
      br(NE, BMSKIP);
      ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
      mov(ch1, tmp6);
      if (isL) {
        b(BMLOOPSTR1_AFTER_LOAD);
      } else {
        sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
        b(BMLOOPSTR1_CMP);
      }
    BIND(BMLOOPSTR1);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    BIND(BMLOOPSTR1_AFTER_LOAD);
      subs(cnt1tmp, cnt1tmp, 1);
      br(LT, BMLOOPSTR1_LASTCMP);
    BIND(BMLOOPSTR1_CMP);
      cmp(ch1, ch2);
      br(EQ, BMLOOPSTR1);
    BIND(BMSKIP);
      if (!isL) {
        // if we've met a UTF symbol while searching a Latin1 pattern, then
        // we can skip cnt1 symbols
        if (str1_isL != str2_isL) {
          mov(result_tmp, cnt1);
        } else {
          mov(result_tmp, 1);
        }
        subs(zr, skipch, ASIZE);
        br(HS, BMADV);
      }
      ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
      sub(cnt1tmp, cnt1, 1);
      add(str2, str2, result_tmp, LSL, str2_chr_shift);
      cmp(str2, str2end);
      br(LE, BMLOOPSTR2);
      add(sp, sp, ASIZE);
      b(NOMATCH);
    BIND(BMLOOPSTR1_LASTCMP);
      cmp(ch1, ch2);
      br(NE, BMSKIP);
    BIND(BMMATCH);
      sub(result, str2, tmp5);
      if (!str2_isL) lsr(result, result, 1);
      add(sp, sp, ASIZE);
      b(DONE);

    BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
    br(LT, LINEAR_MEDIUM);
    mov(result, zr);
    RuntimeAddress stub = nullptr;
    if (isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
      assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
    } else if (str1_isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
    } else {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
      assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
    }
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);
  }

  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    Register str2tmp = tmp2;
    Register first = tmp3;

    if (icnt1 == -1)
    {
        Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

        cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
        br(LT, DOSHORT);
      BIND(LINEAR_MEDIUM);
        (this->*str1_load_1chr)(first, Address(str1));
        lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
        sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(FIRST_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmp(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        adds(cnt1tmp, cnt1_neg, str1_chr_size);
        add(cnt2tmp, cnt2_neg, str2_chr_size);
        br(GE, MATCH);

      BIND(STR1_NEXT);
        (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        adds(cnt1tmp, cnt1tmp, str1_chr_size);
        add(cnt2tmp, cnt2tmp, str2_chr_size);
        br(LT, STR1_NEXT);
        b(MATCH);

      BIND(DOSHORT);
      if (str1_isL == str2_isL) {
        cmp(cnt1, (u1)2);
        br(LT, DO1);
        br(GT, DO3);
      }
    }

    if (icnt1 == 4) {
      Label CH1_LOOP;

        (this->*load_4chr)(ch1, str1);
        sub(result_tmp, cnt2, 4);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(CH1_LOOP);
        (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
      Label CH1_LOOP;

      BIND(DO2);
        (this->*load_2chr)(ch1, str1);
        if (icnt1 == 2) {
          sub(result_tmp, cnt2, 2);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(CH1_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

      BIND(DO3);
        (this->*load_2chr)(first, str1);
        (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
        if (icnt1 == 3) {
          sub(result_tmp, cnt2, 3);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(FIRST_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmpw(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        b(MATCH);
    }

    if (icnt1 == -1 || icnt1 == 1) {
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;

      BIND(DO1);
        (this->*str1_load_1chr)(ch1, str1);
        cmp(cnt2, (u1)8);
        br(LT, DO1_SHORT);

        sub(result_tmp, cnt2, 8/str2_chr_size);
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
        mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

        if (str2_isL) {
          orr(ch1, ch1, ch1, LSL, 8);
        }
        orr(ch1, ch1, ch1, LSL, 16);
        orr(ch1, ch1, ch1, LSL, 32);
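      // SWAR (SIMD-within-a-register) search: ch1 now holds the pattern
      // character replicated into every lane of a 64-bit word. A sketch of
      // the idea in C (16-bit lanes; the 8-bit case uses 0x0101.../0x7f7f...):
      //   uint64_t v = chunk ^ replicated_ch;          // zero lane == match
      //   uint64_t z = (v - 0x0001000100010001)
      //                & ~v & 0x8000800080008000;      // MSB set in zero lanes
      // The orr below builds (v | 0x7fff...), so the bics computes
      // (v - 0x0001...) & ~(v | 0x7fff...) == z and sets the flags.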
      BIND(CH1_LOOP);
        ldr(ch2, Address(str2, cnt2_neg));
        eor(ch2, ch1, ch2);
        sub(tmp1, ch2, tmp3);
        orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
        bics(tmp1, tmp1, tmp2);
        br(NE, HAS_ZERO);
        adds(cnt2_neg, cnt2_neg, 8);
        br(LT, CH1_LOOP);

        cmp(cnt2_neg, (u1)8);
        mov(cnt2_neg, 0);
        br(LT, CH1_LOOP);
        b(NOMATCH);

      BIND(HAS_ZERO);
        rev(tmp1, tmp1);
        clz(tmp1, tmp1);
        add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
        b(MATCH);

      BIND(DO1_SHORT);
        mov(result_tmp, cnt2);
        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
      BIND(DO1_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmpw(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);

void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);

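  // Same SWAR zero-lane trick as in string_indexof above (16-bit lanes).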
  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7fff7fff7fff7fff);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1, Address::uxtw(1)));
    sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
    ldrh(ch1, Address(str1, cnt1_neg));
    cmpw(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 2);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}

void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
                                                Register ch, Register result,
                                                FloatRegister ztmp1,
                                                FloatRegister ztmp2,
                                                PRegister tmp_pg,
                                                PRegister tmp_pdn, bool isL)
{
  // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
  assert(tmp_pg->is_governing(),
         "this register has to be a governing predicate register");

  Label LOOP, MATCH, DONE, NOMATCH;
  Register vec_len = rscratch1;
  Register idx = rscratch2;

  SIMD_RegVariant T = isL ? B : H;

  cbz(cnt1, NOMATCH);

  // Assign the particular char throughout the vector.
  sve_dup(ztmp2, T, ch);
  if (isL) {
    sve_cntb(vec_len);
  } else {
    sve_cnth(vec_len);
  }
  mov(idx, 0);

  // Generate a predicate to control the reading of input string.
  sve_whilelt(tmp_pg, T, idx, cnt1);

  BIND(LOOP);
    // Read a vector of 8- or 16-bit data depending on the string type. Note
    // that inactive elements indicated by the predicate register won't cause
    // a data read from memory to the destination vector.
    if (isL) {
      sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
    } else {
      sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
    }
    add(idx, idx, vec_len);

    // Perform the comparison. An element of the destination predicate is set
    // to active if the particular char is matched.
    sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);

    // Branch if the particular char is found.
    br(NE, MATCH);

    sve_whilelt(tmp_pg, T, idx, cnt1);

    // Loop back if the particular char not found.
    br(MI, LOOP);

  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);

  BIND(MATCH);
    // Undo the index increment.
    sub(idx, idx, vec_len);

    // Locate the match: brka activates all lanes up to and including the
    // first match, and incp then advances result by the number of active
    // lanes, yielding idx - 1 + (match position + 1).
    sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
    add(result, idx, -1);
    sve_incp(result, T, tmp_pdn);
  BIND(DONE);
}

void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)8);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 8);
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 8);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);

  mov(tmp3, 0x0101010101010101);
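  // Same SWAR zero-byte trick as in string_indexof above (8-bit lanes).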

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1));
    sub(cnt1_neg, zr, cnt1);
  BIND(DO1_LOOP);
    ldrb(ch1, Address(str1, cnt1_neg));
    cmp(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 1);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg);
  BIND(DONE);
}

// Compare strings.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; however, the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
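      // zip1 interleaves the Latin1 bytes in vtmp with the zero bytes of
      // vtmpZ, inflating them to 16-bit chars for direct comparison with
      // the UTF-16 string.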
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      ldr(tmp1, Address(str1));
      ldrs(vtmp, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFF);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFF);
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
    RuntimeAddress stub = nullptr;
    switch (ae) {
      case StrIntrinsicNode::LL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
        break;
      case StrIntrinsicNode::UU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
        break;
      case StrIntrinsicNode::LU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
        break;
      case StrIntrinsicNode::UL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
        break;
      default:
        ShouldNotReachHere();
    }
    assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // Arrange the code so that branches are done while loads are in flight,
  // and the next characters are loaded while the previous ones compare.
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}

void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
                                     FloatRegister src2, Condition cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
  FloatRegister zn = src1, zm = src2;
  bool needs_negation = false;
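  // NEON provides only the EQ/GT/GE/HI/HS compares directly; synthesize
  // LT/LE/LO/LS by swapping the operands and NE by negating the EQ result.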
  switch (cond) {
    case LT: cond = GT; zn = src2; zm = src1; break;
    case LE: cond = GE; zn = src2; zm = src1; break;
    case LO: cond = HI; zn = src2; zm = src1; break;
    case LS: cond = HS; zn = src2; zm = src1; break;
    case NE: cond = EQ; needs_negation = true; break;
    default:
      break;
  }

  if (is_floating_point_type(bt)) {
    fcm(cond, dst, size, zn, zm);
  } else {
    cm(cond, dst, size, zn, zm);
  }

  if (needs_negation) {
    notr(dst, isQ ? T16B : T8B, dst);
  }
}

void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
                                          Condition cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
  if (bt == T_FLOAT || bt == T_DOUBLE) {
    if (cond == Assembler::NE) {
      fcm(Assembler::EQ, dst, size, src);
      notr(dst, isQ ? T16B : T8B, dst);
    } else {
      fcm(cond, dst, size, src);
    }
  } else {
    if (cond == Assembler::NE) {
      cm(Assembler::EQ, dst, size, src);
      notr(dst, isQ ? T16B : T8B, dst);
    } else {
      cm(cond, dst, size, src);
    }
  }
}

// Compress the least significant bit of each byte to the rightmost and clear
// the higher garbage bits.
void C2_MacroAssembler::bytemask_compress(Register dst) {
  // Example input, dst = 0x01 00 00 00 01 01 00 01
  // The "??" bytes are garbage.
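  // Each orr folds two groups of live bits together: LSR #7 puts adjacent
  // bytes' LSBs side by side, and the #14 / #28 steps repeat the doubling
  // until all eight bits are contiguous at the bottom.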
1435   orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
1436   orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
1437   orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
1438   andr(dst, dst, 0xff);                   // dst = 0x8D
1439 }
1440 
1441 // Pack the lowest-numbered bit of each mask element in src into a long value
1442 // in dst, at most the first 64 lane elements.
1443 // Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM.
1444 void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
1445                                          FloatRegister vtmp1, FloatRegister vtmp2) {
1446   assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1447   assert_different_registers(dst, rscratch1);
1448   assert_different_registers(vtmp1, vtmp2);
1449 
1450   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1451   // Example:   src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
1452   // Expected:  dst = 0x658D
1453 
1454   // Convert the mask into vector with sequential bytes.
1455   // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
1456   sve_cpy(vtmp1, size, src, 1, false);
1457   if (bt != T_BYTE) {
1458     sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
1459   }
1460 
1461   if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
1462     // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
1463     // is to compress each significant bit of the byte in a cross-lane way. Due
1464     // to the lack of a cross-lane bit-compress instruction, we use BEXT
    // (bit-compress in each lane) with the biggest lane size (T = D) and then
1466     // concatenate the results.
1467 
1468     // The second source input of BEXT, initialized with 0x01 in each byte.
1469     // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1470     sve_dup(vtmp2, B, 1);
1471 
1472     // BEXT vtmp1.D, vtmp1.D, vtmp2.D
1473     // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1474     // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1475     //         ---------------------------------------
1476     // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1477     sve_bext(vtmp1, D, vtmp1, vtmp2);
1478 
    // Concatenate the least significant 8 bits of each 8-byte group, and
    // extract the result to dst.
1481     // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1482     // dst   = 0x658D
1483     if (lane_cnt <= 8) {
1484       // No need to concatenate.
1485       umov(dst, vtmp1, B, 0);
1486     } else if (lane_cnt <= 16) {
1487       ins(vtmp1, B, vtmp1, 1, 8);
1488       umov(dst, vtmp1, H, 0);
1489     } else {
1490       // As the lane count is 64 at most, the final expected value must be in
1491       // the lowest 64 bits after narrowing vtmp1 from D to B.
1492       sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
1493       umov(dst, vtmp1, D, 0);
1494     }
1495   } else if (UseSVE > 0) {
1496     // Compress the lowest 8 bytes.
1497     fmovd(dst, vtmp1);
1498     bytemask_compress(dst);
1499     if (lane_cnt <= 8) return;
1500 
1501     // Repeat on higher bytes and join the results.
1502     // Compress 8 bytes in each iteration.
1503     for (int idx = 1; idx < (lane_cnt / 8); idx++) {
1504       sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
1505       bytemask_compress(rscratch1);
1506       orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
1507     }
1508   } else {
1509     assert(false, "unsupported");
1510     ShouldNotReachHere();
1511   }
1512 }
1513 
1514 // Unpack the mask, a long value in src, into predicate register dst based on the
1515 // corresponding data type. Note that dst can support at most 64 lanes.
1516 // Below example gives the expected dst predicate register in different types, with
1517 // a valid src(0x658D) on a 1024-bit vector size machine.
1518 // BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
1519 // SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
1520 // INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
1521 // LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1522 //
// The number of significant bits of src must not exceed lane_cnt. E.g., 0xFF658D,
// which has 24 significant bits, would be an invalid input if the dst predicate
// register refers to a LONG type 1024-bit vector, which has at most 16 lanes.
1526 void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
1527                                            FloatRegister vtmp1, FloatRegister vtmp2) {
1528   assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
1529          lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
1530   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1531   // Example:   src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
  // Expected:  dst = 0b01100101 10001101
1533 
1534   // Put long value from general purpose register into the first lane of vector.
1535   // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1536   sve_dup(vtmp1, B, 0);
1537   mov(vtmp1, D, 0, src);
1538 
  // Since sve_cmp generates the mask with a minimum element unit of one byte,
  // we need to transform the bit mask held in the first lane into a byte
  // mask, which can be done with SVE2's BDEP instruction.
1542 
  // The first source input of the BDEP instruction. Deposit each byte into its own 8-byte lane.
1544   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1545   if (lane_cnt <= 8) {
    // Nothing to do, as only one byte exists.
1547   } else if (lane_cnt <= 16) {
1548     ins(vtmp1, B, vtmp1, 8, 1);
1549     mov(vtmp1, B, 1, zr);
1550   } else {
1551     sve_vector_extend(vtmp1, D, vtmp1, B);
1552   }
1553 
  // The second source input of the BDEP instruction, initialized with 0x01 in each byte.
1555   // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1556   sve_dup(vtmp2, B, 1);
1557 
1558   // BDEP vtmp1.D, vtmp1.D, vtmp2.D
1559   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1560   // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1561   //         ---------------------------------------
1562   // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1563   sve_bdep(vtmp1, D, vtmp1, vtmp2);
1564 
1565   if (bt != T_BYTE) {
1566     sve_vector_extend(vtmp1, size, vtmp1, B);
1567   }
1568   // Generate mask according to the given vector, in which the elements have been
1569   // extended to expected type.
  // dst = 0b01100101 10001101
1571   sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
1572 }
1573 
1574 // Clobbers: rflags
1575 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1576                                     FloatRegister zn, FloatRegister zm, Condition cond) {
1577   assert(pg->is_governing(), "This register has to be a governing predicate register");
1578   FloatRegister z1 = zn, z2 = zm;
1579   switch (cond) {
1580     case LE: z1 = zm; z2 = zn; cond = GE; break;
1581     case LT: z1 = zm; z2 = zn; cond = GT; break;
1582     case LO: z1 = zm; z2 = zn; cond = HI; break;
1583     case LS: z1 = zm; z2 = zn; cond = HS; break;
1584     default:
1585       break;
1586   }
1587 
1588   SIMD_RegVariant size = elemType_to_regVariant(bt);
1589   if (is_floating_point_type(bt)) {
1590     sve_fcm(cond, pd, size, pg, z1, z2);
1591   } else {
1592     assert(is_integral_type(bt), "unsupported element type");
1593     sve_cmp(cond, pd, size, pg, z1, z2);
1594   }
1595 }
1596 
1597 // Get index of the last mask lane that is set
1598 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
1599   SIMD_RegVariant size = elemType_to_regVariant(bt);
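  // Reverse the lane order so the last set lane of src becomes the first.
  // BRKB then activates exactly the lanes before that first set lane, and
  // CNTP counts them: if the last set lane of src has index i out of n lanes,
  // the count is n - 1 - i, so dst = (n - 1) - cntp(...) recovers i
  // (or -1 when no lane is set at all).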
1600   sve_rev(ptmp, size, src);
1601   sve_brkb(ptmp, ptrue, ptmp, false);
1602   sve_cntp(dst, size, ptrue, ptmp);
1603   movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
1604   subw(dst, rscratch1, dst);
1605 }
1606 
1607 // Extend integer vector src to dst with the same lane count
1608 // but larger element size, e.g. 4B -> 4I
1609 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1610                                            FloatRegister src, BasicType src_bt, bool is_unsigned) {
1611   if (src_bt == T_BYTE) {
1612     if (dst_bt == T_SHORT) {
1613       // 4B/8B to 4S/8S
1614       _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1615     } else {
1616       // 4B to 4I
1617       assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1618       _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1619       _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1620     }
1621   } else if (src_bt == T_SHORT) {
1622     // 4S to 4I
1623     assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1624     _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1625   } else if (src_bt == T_INT) {
1626     // 2I to 2L
1627     assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1628     _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1629   } else {
1630     ShouldNotReachHere();
1631   }
1632 }
1633 
1634 // Narrow integer vector src down to dst with the same lane count
1635 // but smaller element size, e.g. 4I -> 4B
1636 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1637                                            FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1638   if (src_bt == T_SHORT) {
1639     // 4S/8S to 4B/8B
1640     assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1641     assert(dst_bt == T_BYTE, "unsupported");
1642     xtn(dst, T8B, src, T8H);
1643   } else if (src_bt == T_INT) {
1644     // 4I to 4B/4S
1645     assert(src_vlen_in_bytes == 16, "unsupported");
1646     assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1647     xtn(dst, T4H, src, T4S);
1648     if (dst_bt == T_BYTE) {
1649       xtn(dst, T8B, dst, T8H);
1650     }
1651   } else if (src_bt == T_LONG) {
1652     // 2L to 2I
1653     assert(src_vlen_in_bytes == 16, "unsupported");
1654     assert(dst_bt == T_INT, "unsupported");
1655     xtn(dst, T2S, src, T2D);
1656   } else {
1657     ShouldNotReachHere();
1658   }
1659 }
1660 
1661 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1662                                           FloatRegister src, SIMD_RegVariant src_size,
1663                                           bool is_unsigned) {
1664   assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
1665 
1666   if (src_size == B) {
1667     switch (dst_size) {
1668     case H:
1669       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1670       break;
1671     case S:
1672       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1673       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1674       break;
1675     case D:
1676       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1677       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1678       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1679       break;
1680     default:
1681       ShouldNotReachHere();
1682     }
1683   } else if (src_size == H) {
1684     if (dst_size == S) {
1685       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1686     } else { // D
1687       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1688       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1689     }
1690   } else if (src_size == S) {
1691     _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
1692   }
1693 }
1694 
// Narrow vector src down to dst with the specified element sizes.
// The high part of the dst vector will be filled with zero.
1697 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1698                                           FloatRegister src, SIMD_RegVariant src_size,
1699                                           FloatRegister tmp) {
1700   assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1701   assert_different_registers(src, tmp);
1702   sve_dup(tmp, src_size, 0);
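  // UZP1 concatenates the even-numbered elements of its two sources. Pairing
  // src with an all-zero vector keeps the low half of every src element in
  // the low half of dst and fills the high part of dst with zero.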
1703   if (src_size == D) {
1704     switch (dst_size) {
1705     case S:
1706       sve_uzp1(dst, S, src, tmp);
1707       break;
1708     case H:
1709       assert_different_registers(dst, tmp);
1710       sve_uzp1(dst, S, src, tmp);
1711       sve_uzp1(dst, H, dst, tmp);
1712       break;
1713     case B:
1714       assert_different_registers(dst, tmp);
1715       sve_uzp1(dst, S, src, tmp);
1716       sve_uzp1(dst, H, dst, tmp);
1717       sve_uzp1(dst, B, dst, tmp);
1718       break;
1719     default:
1720       ShouldNotReachHere();
1721     }
1722   } else if (src_size == S) {
1723     if (dst_size == H) {
1724       sve_uzp1(dst, H, src, tmp);
1725     } else { // B
1726       assert_different_registers(dst, tmp);
1727       sve_uzp1(dst, H, src, tmp);
1728       sve_uzp1(dst, B, dst, tmp);
1729     }
1730   } else if (src_size == H) {
1731     sve_uzp1(dst, B, src, tmp);
1732   }
1733 }
1734 
1735 // Extend src predicate to dst predicate with the same lane count but larger
1736 // element size, e.g. 64Byte -> 512Long
1737 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1738                                              uint dst_element_length_in_bytes,
1739                                              uint src_element_length_in_bytes) {
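  // PUNPKLO widens the predicate elements of the low half by a factor of two,
  // so it is applied log2(dst_size / src_size) times.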
1740   if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1741     sve_punpklo(dst, src);
1742   } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1743     sve_punpklo(dst, src);
1744     sve_punpklo(dst, dst);
1745   } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1746     sve_punpklo(dst, src);
1747     sve_punpklo(dst, dst);
1748     sve_punpklo(dst, dst);
1749   } else {
1750     assert(false, "unsupported");
1751     ShouldNotReachHere();
1752   }
1753 }
1754 
1755 // Narrow src predicate to dst predicate with the same lane count but
1756 // smaller element size, e.g. 512Long -> 64Byte
1757 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1758                                              uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
1759   // The insignificant bits in src predicate are expected to be zero.
1760   // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
// passed as the second argument. An example narrowing operation with a given mask would be:
// 128Long -> 64Int on a 128-bit machine, i.e. 2L -> 2I
// Mask (for 2 Longs) : TF
// Predicate register for the above mask (16 bits) : 00000001 00000000
// After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
// This translates to a mask for 2 integers : TF (the lower half is considered, the upper half is 0)
1767   assert_different_registers(src, ptmp);
1768   assert_different_registers(dst, ptmp);
1769   sve_pfalse(ptmp);
1770   if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1771     sve_uzp1(dst, B, src, ptmp);
1772   } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1773     sve_uzp1(dst, H, src, ptmp);
1774     sve_uzp1(dst, B, dst, ptmp);
1775   } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1776     sve_uzp1(dst, S, src, ptmp);
1777     sve_uzp1(dst, H, dst, ptmp);
1778     sve_uzp1(dst, B, dst, ptmp);
1779   } else {
1780     assert(false, "unsupported");
1781     ShouldNotReachHere();
1782   }
1783 }
1784 
1785 // Vector reduction add for integral type with ASIMD instructions.
1786 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1787                                                  Register isrc, FloatRegister vsrc,
1788                                                  unsigned vector_length_in_bytes,
1789                                                  FloatRegister vtmp) {
1790   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1791   assert_different_registers(dst, isrc);
1792   bool isQ = vector_length_in_bytes == 16;
1793 
1794   BLOCK_COMMENT("neon_reduce_add_integral {");
1795     switch(bt) {
1796       case T_BYTE:
1797         addv(vtmp, isQ ? T16B : T8B, vsrc);
1798         smov(dst, vtmp, B, 0);
1799         addw(dst, dst, isrc, ext::sxtb);
1800         break;
1801       case T_SHORT:
1802         addv(vtmp, isQ ? T8H : T4H, vsrc);
1803         smov(dst, vtmp, H, 0);
1804         addw(dst, dst, isrc, ext::sxth);
1805         break;
1806       case T_INT:
1807         isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1808         umov(dst, vtmp, S, 0);
1809         addw(dst, dst, isrc);
1810         break;
1811       case T_LONG:
1812         assert(isQ, "unsupported");
1813         addpd(vtmp, vsrc);
1814         umov(dst, vtmp, D, 0);
1815         add(dst, dst, isrc);
1816         break;
1817       default:
1818         assert(false, "unsupported");
1819         ShouldNotReachHere();
1820     }
1821   BLOCK_COMMENT("} neon_reduce_add_integral");
1822 }
1823 
1824 // Vector reduction multiply for integral type with ASIMD instructions.
1825 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
1826 // Clobbers: rscratch1
1827 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
1828                                                  Register isrc, FloatRegister vsrc,
1829                                                  unsigned vector_length_in_bytes,
1830                                                  FloatRegister vtmp1, FloatRegister vtmp2) {
1831   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1832   bool isQ = vector_length_in_bytes == 16;
1833 
1834   BLOCK_COMMENT("neon_reduce_mul_integral {");
1835     switch(bt) {
1836       case T_BYTE:
1837         if (isQ) {
1838           // Multiply the lower half and higher half of vector iteratively.
1839           // vtmp1 = vsrc[8:15]
1840           ins(vtmp1, D, vsrc, 0, 1);
1841           // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
1842           mulv(vtmp1, T8B, vtmp1, vsrc);
1843           // vtmp2 = vtmp1[4:7]
1844           ins(vtmp2, S, vtmp1, 0, 1);
1845           // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
1846           mulv(vtmp1, T8B, vtmp2, vtmp1);
1847         } else {
1848           ins(vtmp1, S, vsrc, 0, 1);
1849           mulv(vtmp1, T8B, vtmp1, vsrc);
1850         }
1851         // vtmp2 = vtmp1[2:3]
1852         ins(vtmp2, H, vtmp1, 0, 1);
1853         // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
1854         mulv(vtmp2, T8B, vtmp2, vtmp1);
1855         // dst = vtmp2[0] * isrc * vtmp2[1]
1856         umov(rscratch1, vtmp2, B, 0);
1857         mulw(dst, rscratch1, isrc);
1858         sxtb(dst, dst);
1859         umov(rscratch1, vtmp2, B, 1);
1860         mulw(dst, rscratch1, dst);
1861         sxtb(dst, dst);
1862         break;
1863       case T_SHORT:
1864         if (isQ) {
1865           ins(vtmp2, D, vsrc, 0, 1);
1866           mulv(vtmp2, T4H, vtmp2, vsrc);
1867           ins(vtmp1, S, vtmp2, 0, 1);
1868           mulv(vtmp1, T4H, vtmp1, vtmp2);
1869         } else {
1870           ins(vtmp1, S, vsrc, 0, 1);
1871           mulv(vtmp1, T4H, vtmp1, vsrc);
1872         }
1873         umov(rscratch1, vtmp1, H, 0);
1874         mulw(dst, rscratch1, isrc);
1875         sxth(dst, dst);
1876         umov(rscratch1, vtmp1, H, 1);
1877         mulw(dst, rscratch1, dst);
1878         sxth(dst, dst);
1879         break;
1880       case T_INT:
1881         if (isQ) {
1882           ins(vtmp1, D, vsrc, 0, 1);
1883           mulv(vtmp1, T2S, vtmp1, vsrc);
1884         } else {
1885           vtmp1 = vsrc;
1886         }
1887         umov(rscratch1, vtmp1, S, 0);
1888         mul(dst, rscratch1, isrc);
1889         umov(rscratch1, vtmp1, S, 1);
1890         mul(dst, rscratch1, dst);
1891         break;
1892       case T_LONG:
1893         umov(rscratch1, vsrc, D, 0);
1894         mul(dst, isrc, rscratch1);
1895         umov(rscratch1, vsrc, D, 1);
1896         mul(dst, dst, rscratch1);
1897         break;
1898       default:
1899         assert(false, "unsupported");
1900         ShouldNotReachHere();
1901     }
1902   BLOCK_COMMENT("} neon_reduce_mul_integral");
1903 }
1904 
1905 // Vector reduction multiply for floating-point type with ASIMD instructions.
1906 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
1907                                            FloatRegister fsrc, FloatRegister vsrc,
1908                                            unsigned vector_length_in_bytes,
1909                                            FloatRegister vtmp) {
1910   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1911   bool isQ = vector_length_in_bytes == 16;
1912 
1913   BLOCK_COMMENT("neon_reduce_mul_fp {");
1914     switch(bt) {
1915       case T_FLOAT:
1916         fmuls(dst, fsrc, vsrc);
1917         ins(vtmp, S, vsrc, 0, 1);
1918         fmuls(dst, dst, vtmp);
1919         if (isQ) {
1920           ins(vtmp, S, vsrc, 0, 2);
1921           fmuls(dst, dst, vtmp);
1922           ins(vtmp, S, vsrc, 0, 3);
1923           fmuls(dst, dst, vtmp);
        }
1925         break;
1926       case T_DOUBLE:
1927         assert(isQ, "unsupported");
1928         fmuld(dst, fsrc, vsrc);
1929         ins(vtmp, D, vsrc, 0, 1);
1930         fmuld(dst, dst, vtmp);
1931         break;
1932       default:
1933         assert(false, "unsupported");
1934         ShouldNotReachHere();
1935     }
1936   BLOCK_COMMENT("} neon_reduce_mul_fp");
1937 }
1938 
1939 // Helper to select logical instruction
1940 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
1941                                                    Register Rn, Register Rm,
1942                                                    enum shift_kind kind, unsigned shift) {
1943   switch(opc) {
1944     case Op_AndReductionV:
1945       is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
1946       break;
1947     case Op_OrReductionV:
1948       is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
1949       break;
1950     case Op_XorReductionV:
1951       is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
1952       break;
1953     default:
1954       assert(false, "unsupported");
1955       ShouldNotReachHere();
1956   }
1957 }
1958 
1959 // Vector reduction logical operations And, Or, Xor
1960 // Clobbers: rscratch1
1961 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
1962                                             Register isrc, FloatRegister vsrc,
1963                                             unsigned vector_length_in_bytes) {
1964   assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
1965          "unsupported");
1966   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1967   assert_different_registers(dst, isrc);
1968   bool isQ = vector_length_in_bytes == 16;
1969 
1970   BLOCK_COMMENT("neon_reduce_logical {");
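    // Combine the two halves of the vector into a single scalar first, then
    // keep folding that scalar in half with shifted logical ops until only
    // the lowest element remains, and finally merge it with isrc.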
1971     umov(rscratch1, vsrc, isQ ? D : S, 0);
1972     umov(dst, vsrc, isQ ? D : S, 1);
1973     neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
1974     switch(bt) {
1975       case T_BYTE:
1976         if (isQ) {
1977           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1978         }
1979         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1980         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
1981         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1982         sxtb(dst, dst);
1983         break;
1984       case T_SHORT:
1985         if (isQ) {
1986           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1987         }
1988         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1989         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1990         sxth(dst, dst);
1991         break;
1992       case T_INT:
1993         if (isQ) {
1994           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1995         }
1996         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1997         break;
1998       case T_LONG:
1999         assert(isQ, "unsupported");
2000         neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
2001         break;
2002       default:
2003         assert(false, "unsupported");
2004         ShouldNotReachHere();
2005     }
2006   BLOCK_COMMENT("} neon_reduce_logical");
2007 }
2008 
2009 // Vector reduction min/max for integral type with ASIMD instructions.
// Note: vtmp is not used and is expected to be fnoreg for the T_LONG case.
2011 // Clobbers: rscratch1, rflags
2012 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
2013                                                     Register isrc, FloatRegister vsrc,
2014                                                     unsigned vector_length_in_bytes,
2015                                                     FloatRegister vtmp) {
2016   assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
2017   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2018   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
2019   assert_different_registers(dst, isrc);
2020   bool isQ = vector_length_in_bytes == 16;
2021   bool is_min = opc == Op_MinReductionV;
2022 
2023   BLOCK_COMMENT("neon_reduce_minmax_integral {");
2024     if (bt == T_LONG) {
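      // The ASIMD across-vector min/max instructions (SMINV/SMAXV) do not
      // support 64-bit elements, so reduce T_LONG with scalar compares and
      // conditional selects.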
2025       assert(vtmp == fnoreg, "should be");
2026       assert(isQ, "should be");
2027       umov(rscratch1, vsrc, D, 0);
2028       cmp(isrc, rscratch1);
2029       csel(dst, isrc, rscratch1, is_min ? LT : GT);
2030       umov(rscratch1, vsrc, D, 1);
2031       cmp(dst, rscratch1);
2032       csel(dst, dst, rscratch1, is_min ? LT : GT);
2033     } else {
2034       SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
2035       if (size == T2S) {
2036         is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
2037       } else {
2038         is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
2039       }
2040       if (bt == T_INT) {
2041         umov(dst, vtmp, S, 0);
2042       } else {
2043         smov(dst, vtmp, elemType_to_regVariant(bt), 0);
2044       }
2045       cmpw(dst, isrc);
2046       cselw(dst, dst, isrc, is_min ? LT : GT);
2047     }
2048   BLOCK_COMMENT("} neon_reduce_minmax_integral");
2049 }
2050 
2051 // Vector reduction for integral type with SVE instruction.
2052 // Supported operations are Add, And, Or, Xor, Max, Min.
2053 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
2054 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
2055                                             FloatRegister src2, PRegister pg, FloatRegister tmp) {
2056   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2057   assert(pg->is_governing(), "This register has to be a governing predicate register");
2058   assert_different_registers(src1, dst);
  // Registers "dst" and "tmp" are clobbered; "src1" and "src2" are preserved.
2060   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
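  // Sub-word (BYTE/SHORT) results are extracted with smov so that the scalar
  // op below sees a sign-extended value; full-width INT/LONG elements use umov.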
2061   switch (opc) {
2062     case Op_AddReductionVI: {
2063       sve_uaddv(tmp, size, pg, src2);
2064       if (bt == T_BYTE) {
2065         smov(dst, tmp, size, 0);
2066         addw(dst, src1, dst, ext::sxtb);
2067       } else if (bt == T_SHORT) {
2068         smov(dst, tmp, size, 0);
2069         addw(dst, src1, dst, ext::sxth);
2070       } else {
2071         umov(dst, tmp, size, 0);
2072         addw(dst, dst, src1);
2073       }
2074       break;
2075     }
2076     case Op_AddReductionVL: {
2077       sve_uaddv(tmp, size, pg, src2);
2078       umov(dst, tmp, size, 0);
2079       add(dst, dst, src1);
2080       break;
2081     }
2082     case Op_AndReductionV: {
2083       sve_andv(tmp, size, pg, src2);
2084       if (bt == T_INT || bt == T_LONG) {
2085         umov(dst, tmp, size, 0);
2086       } else {
2087         smov(dst, tmp, size, 0);
2088       }
2089       if (bt == T_LONG) {
2090         andr(dst, dst, src1);
2091       } else {
2092         andw(dst, dst, src1);
2093       }
2094       break;
2095     }
2096     case Op_OrReductionV: {
2097       sve_orv(tmp, size, pg, src2);
2098       if (bt == T_INT || bt == T_LONG) {
2099         umov(dst, tmp, size, 0);
2100       } else {
2101         smov(dst, tmp, size, 0);
2102       }
2103       if (bt == T_LONG) {
2104         orr(dst, dst, src1);
2105       } else {
2106         orrw(dst, dst, src1);
2107       }
2108       break;
2109     }
2110     case Op_XorReductionV: {
2111       sve_eorv(tmp, size, pg, src2);
2112       if (bt == T_INT || bt == T_LONG) {
2113         umov(dst, tmp, size, 0);
2114       } else {
2115         smov(dst, tmp, size, 0);
2116       }
2117       if (bt == T_LONG) {
2118         eor(dst, dst, src1);
2119       } else {
2120         eorw(dst, dst, src1);
2121       }
2122       break;
2123     }
2124     case Op_MaxReductionV: {
2125       sve_smaxv(tmp, size, pg, src2);
2126       if (bt == T_INT || bt == T_LONG) {
2127         umov(dst, tmp, size, 0);
2128       } else {
2129         smov(dst, tmp, size, 0);
2130       }
2131       if (bt == T_LONG) {
2132         cmp(dst, src1);
2133         csel(dst, dst, src1, Assembler::GT);
2134       } else {
2135         cmpw(dst, src1);
2136         cselw(dst, dst, src1, Assembler::GT);
2137       }
2138       break;
2139     }
2140     case Op_MinReductionV: {
2141       sve_sminv(tmp, size, pg, src2);
2142       if (bt == T_INT || bt == T_LONG) {
2143         umov(dst, tmp, size, 0);
2144       } else {
2145         smov(dst, tmp, size, 0);
2146       }
2147       if (bt == T_LONG) {
2148         cmp(dst, src1);
2149         csel(dst, dst, src1, Assembler::LT);
2150       } else {
2151         cmpw(dst, src1);
2152         cselw(dst, dst, src1, Assembler::LT);
2153       }
2154       break;
2155     }
2156     default:
2157       assert(false, "unsupported");
2158       ShouldNotReachHere();
2159   }
2160 
2161   if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
2162     if (bt == T_BYTE) {
2163       sxtb(dst, dst);
2164     } else if (bt == T_SHORT) {
2165       sxth(dst, dst);
2166     }
2167   }
2168 }
2169 
2170 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
2171 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
2172 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
2173 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
2174   uint32_t max_vector_length = Matcher::max_vector_size(bt);
2175   assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
2176 
2177   // Set all elements to false if the input "lane_cnt" is zero.
2178   if (lane_cnt == 0) {
2179     sve_pfalse(dst);
2180     return;
2181   }
2182 
2183   SIMD_RegVariant size = elemType_to_regVariant(bt);
2184   assert(size != Q, "invalid size");
2185 
  // Set all elements to true if "lane_cnt" equals the max lane count.
2187   if (lane_cnt == max_vector_length) {
2188     sve_ptrue(dst, size, /* ALL */ 0b11111);
2189     return;
2190   }
2191 
2192   // Fixed numbers for "ptrue".
2193   switch(lane_cnt) {
2194   case 1: /* VL1 */
2195   case 2: /* VL2 */
2196   case 3: /* VL3 */
2197   case 4: /* VL4 */
2198   case 5: /* VL5 */
2199   case 6: /* VL6 */
2200   case 7: /* VL7 */
2201   case 8: /* VL8 */
2202     sve_ptrue(dst, size, lane_cnt);
2203     return;
2204   case 16:
2205     sve_ptrue(dst, size, /* VL16 */ 0b01001);
2206     return;
2207   case 32:
2208     sve_ptrue(dst, size, /* VL32 */ 0b01010);
2209     return;
2210   case 64:
2211     sve_ptrue(dst, size, /* VL64 */ 0b01011);
2212     return;
2213   case 128:
2214     sve_ptrue(dst, size, /* VL128 */ 0b01100);
2215     return;
2216   case 256:
2217     sve_ptrue(dst, size, /* VL256 */ 0b01101);
2218     return;
2219   default:
2220     break;
2221   }
2222 
2223   // Special patterns for "ptrue".
2224   if (lane_cnt == round_down_power_of_2(max_vector_length)) {
2225     sve_ptrue(dst, size, /* POW2 */ 0b00000);
2226   } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
2227     sve_ptrue(dst, size, /* MUL4 */ 0b11101);
2228   } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
2229     sve_ptrue(dst, size, /* MUL3 */ 0b11110);
2230   } else {
2231     // Encode to "whileltw" for the remaining cases.
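    // E.g. lane_cnt == 12: "whileltw dst, zr, 12" activates exactly the
    // first 12 lanes.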
2232     mov(rscratch1, lane_cnt);
2233     sve_whileltw(dst, size, zr, rscratch1);
2234   }
2235 }
2236 
2237 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
2238 // Any remaining elements of dst will be filled with zero.
2239 // Clobbers: rscratch1
2240 // Preserves: src, mask
2241 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
2242                                            FloatRegister vtmp1, FloatRegister vtmp2,
2243                                            PRegister pgtmp) {
2244   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2245   assert_different_registers(dst, src, vtmp1, vtmp2);
2246   assert_different_registers(mask, pgtmp);
2247 
2248   // Example input:   src   = 8888 7777 6666 5555 4444 3333 2222 1111
2249   //                  mask  = 0001 0000 0000 0001 0001 0000 0001 0001
2250   // Expected result: dst   = 0000 0000 0000 8888 5555 4444 2222 1111
2251   sve_dup(vtmp2, H, 0);
2252 
2253   // Extend lowest half to type INT.
2254   // dst = 00004444 00003333 00002222 00001111
2255   sve_uunpklo(dst, S, src);
2256   // pgtmp = 00000001 00000000 00000001 00000001
2257   sve_punpklo(pgtmp, mask);
  // Pack the active elements in size of type INT to the right,
  // and fill the remaining elements with zero.
2260   // dst = 00000000 00004444 00002222 00001111
2261   sve_compact(dst, S, dst, pgtmp);
2262   // Narrow the result back to type SHORT.
2263   // dst = 0000 0000 0000 0000 0000 4444 2222 1111
2264   sve_uzp1(dst, H, dst, vtmp2);
2265   // Count the active elements of lowest half.
2266   // rscratch1 = 3
2267   sve_cntp(rscratch1, S, ptrue, pgtmp);
2268 
2269   // Repeat to the highest half.
2270   // pgtmp = 00000001 00000000 00000000 00000001
2271   sve_punpkhi(pgtmp, mask);
2272   // vtmp1 = 00008888 00007777 00006666 00005555
2273   sve_uunpkhi(vtmp1, S, src);
2274   // vtmp1 = 00000000 00000000 00008888 00005555
2275   sve_compact(vtmp1, S, vtmp1, pgtmp);
2276   // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2277   sve_uzp1(vtmp1, H, vtmp1, vtmp2);
2278 
2279   // Compressed low:   dst   = 0000 0000 0000 0000 0000 4444 2222 1111
  // Compressed high:  vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  // Shift the compressed high part left (cross-lane) by TRUE_CNT lanes, where
  // TRUE_CNT is the number of active elements in the compressed low part.
2283   neg(rscratch1, rscratch1);
2284   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2285   sve_index(vtmp2, H, rscratch1, 1);
2286   // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
2287   sve_tbl(vtmp1, H, vtmp1, vtmp2);
2288 
  // Combine the compressed high part (after shifting) with the compressed low.
2290   // dst = 0000 0000 0000 8888 5555 4444 2222 1111
2291   sve_orr(dst, dst, vtmp1);
2292 }
2293 
2294 // Clobbers: rscratch1, rscratch2
2295 // Preserves: src, mask
2296 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
2297                                           FloatRegister vtmp1, FloatRegister vtmp2,
2298                                           FloatRegister vtmp3, FloatRegister vtmp4,
2299                                           PRegister ptmp, PRegister pgtmp) {
2300   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2301   assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
2302   assert_different_registers(mask, ptmp, pgtmp);
2303   // Example input:   src   = 88 77 66 55 44 33 22 11
2304   //                  mask  = 01 00 00 01 01 00 01 01
2305   // Expected result: dst   = 00 00 00 88 55 44 22 11
2306 
2307   sve_dup(vtmp4, B, 0);
2308   // Extend lowest half to type SHORT.
2309   // vtmp1 = 0044 0033 0022 0011
2310   sve_uunpklo(vtmp1, H, src);
2311   // ptmp = 0001 0000 0001 0001
2312   sve_punpklo(ptmp, mask);
2313   // Count the active elements of lowest half.
2314   // rscratch2 = 3
2315   sve_cntp(rscratch2, H, ptrue, ptmp);
  // Pack the active elements in size of type SHORT to the right,
  // and fill the remaining elements with zero.
2318   // dst = 0000 0044 0022 0011
2319   sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
2320   // Narrow the result back to type BYTE.
2321   // dst = 00 00 00 00 00 44 22 11
2322   sve_uzp1(dst, B, dst, vtmp4);
2323 
2324   // Repeat to the highest half.
2325   // ptmp = 0001 0000 0000 0001
2326   sve_punpkhi(ptmp, mask);
  // vtmp2 = 0088 0077 0066 0055
2328   sve_uunpkhi(vtmp2, H, src);
2329   // vtmp1 = 0000 0000 0088 0055
2330   sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);
2331 
2332   sve_dup(vtmp4, B, 0);
2333   // vtmp1 = 00 00 00 00 00 00 88 55
2334   sve_uzp1(vtmp1, B, vtmp1, vtmp4);
2335 
2336   // Compressed low:   dst   = 00 00 00 00 00 44 22 11
2337   // Compressed high:  vtmp1 = 00 00 00 00 00 00 88 55
  // Shift the compressed high part left (cross-lane) by TRUE_CNT lanes, where
  // TRUE_CNT is the number of active elements in the compressed low part.
2340   neg(rscratch2, rscratch2);
2341   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2342   sve_index(vtmp2, B, rscratch2, 1);
2343   // vtmp1 = 00 00 00 88 55 00 00 00
2344   sve_tbl(vtmp1, B, vtmp1, vtmp2);
  // Combine the compressed high part (after shifting) with the compressed low.
2346   // dst = 00 00 00 88 55 44 22 11
2347   sve_orr(dst, dst, vtmp1);
2348 }
2349 
2350 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2351   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2352   SIMD_Arrangement size = isQ ? T16B : T8B;
2353   if (bt == T_BYTE) {
2354     rbit(dst, size, src);
2355   } else {
2356     neon_reverse_bytes(dst, src, bt, isQ);
2357     rbit(dst, size, dst);
2358   }
2359 }
2360 
2361 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2362   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2363   SIMD_Arrangement size = isQ ? T16B : T8B;
2364   switch (bt) {
2365     case T_BYTE:
2366       if (dst != src) {
2367         orr(dst, size, src, src);
2368       }
2369       break;
2370     case T_SHORT:
2371       rev16(dst, size, src);
2372       break;
2373     case T_INT:
2374       rev32(dst, size, src);
2375       break;
2376     case T_LONG:
2377       rev64(dst, size, src);
2378       break;
2379     default:
2380       assert(false, "unsupported");
2381       ShouldNotReachHere();
2382   }
2383 }
2384 
// Extract a scalar element from an SVE vector at position 'idx'.
2386 // The input elements in src are expected to be of integral type.
2387 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2388                                              int idx, FloatRegister vtmp) {
2389   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2390   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2391   if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
2392     if (bt == T_INT || bt == T_LONG) {
2393       umov(dst, src, size, idx);
2394     } else {
2395       smov(dst, src, size, idx);
2396     }
2397   } else {
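    // The element lies beyond the low 128 bits, so shift the vector down by
    // idx * element_size bytes with EXT and extract from element 0.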
2398     sve_orr(vtmp, src, src);
2399     sve_ext(vtmp, vtmp, idx << size);
2400     if (bt == T_INT || bt == T_LONG) {
2401       umov(dst, vtmp, size, 0);
2402     } else {
2403       smov(dst, vtmp, size, 0);
2404     }
2405   }
2406 }
2407 
2408 // java.lang.Math::round intrinsics
2409 
2410 // Clobbers: rscratch1, rflags
2411 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2412                                           FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
2413   assert_different_registers(tmp1, tmp2, tmp3, src, dst);
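  // java.lang.Math.round rounds ties towards positive infinity. fcvtas
  // (round to nearest, ties away from zero) already matches that for
  // non-negative inputs, while small-magnitude negative inputs need
  // floor(src + 0.5) instead. The mask computed below selects between the
  // two results per lane; large-magnitude lanes are already integral and
  // safely take the fcvtas result.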
2414   switch (T) {
2415     case T2S:
2416     case T4S:
2417       fmovs(tmp1, T, 0.5f);
2418       mov(rscratch1, jint_cast(0x1.0p23f));
2419       break;
2420     case T2D:
2421       fmovd(tmp1, T, 0.5);
2422       mov(rscratch1, julong_cast(0x1.0p52));
2423       break;
2424     default:
2425       assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2426   }
2427   fadd(tmp1, T, tmp1, src);
2428   fcvtms(tmp1, T, tmp1);
2429   // tmp1 = floor(src + 0.5, ties to even)
2430 
2431   fcvtas(dst, T, src);
2432   // dst = round(src), ties to away
2433 
2434   fneg(tmp3, T, src);
2435   dup(tmp2, T, rscratch1);
2436   cm(HS, tmp3, T, tmp3, tmp2);
2437   // tmp3 is now a set of flags
2438 
2439   bif(dst, T16B, tmp1, tmp3);
2440   // result in dst
2441 }
2442 
2443 // Clobbers: rscratch1, rflags
2444 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2445                                          FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
2446   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2447   assert_different_registers(tmp1, tmp2, src, dst);
2448 
2449   switch (T) {
2450     case S:
2451       mov(rscratch1, jint_cast(0x1.0p23f));
2452       break;
2453     case D:
2454       mov(rscratch1, julong_cast(0x1.0p52));
2455       break;
2456     default:
2457       assert(T == S || T == D, "invalid register variant");
2458   }
2459 
2460   sve_frinta(dst, T, ptrue, src);
2461   // dst = round(src), ties to away
2462 
2463   Label none;
2464 
2465   sve_fneg(tmp1, T, ptrue, src);
2466   sve_dup(tmp2, T, rscratch1);
2467   sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
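  // A predicate-setting SVE compare also sets the NZCV flags; EQ here is the
  // "none" condition, so the floor(src + 0.5) fix-up is skipped entirely
  // when no lane needs it.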
2468   br(EQ, none);
2469   {
2470     sve_cpy(tmp1, T, pgtmp, 0.5);
2471     sve_fadd(tmp1, T, pgtmp, src);
2472     sve_frintm(dst, T, pgtmp, tmp1);
2473     // dst = floor(src + 0.5, ties to even)
2474   }
2475   bind(none);
2476 
2477   sve_fcvtzs(dst, T, ptrue, dst, T);
2478   // result in dst
2479 }
2480 
2481 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2482                                            FloatRegister one, SIMD_Arrangement T) {
2483   assert_different_registers(dst, src, zero, one);
2484   assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
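  // signum(x) is +/-1.0 with the sign of x, or x itself for +/-0.0 and NaN.
  // For ordinary lanes, build a mask of all bits except the sign bit; BSL then
  // takes the sign bit from src and every other bit from one, giving +/-1.0.
  // Zero and NaN lanes get an all-zero mask and pass src through unchanged.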
2485 
2486   facgt(dst, T, src, zero);
2487   ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
2488   bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2489 }
2490 
2491 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2492                                           FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
  assert_different_registers(dst, src, zero, one, vtmp);
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");

  sve_orr(vtmp, src, src);
  sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise
  switch (T) {
  case S:
    sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
    sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
                                      // on the sign of the float value
    break;
  case D:
    sve_and(vtmp, T, min_jlong);
    sve_orr(vtmp, T, jlong_cast(1.0));
    break;
  default:
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
  sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
                                     // Result in dst
2514 }
2515 
2516 bool C2_MacroAssembler::in_scratch_emit_size() {
2517   if (ciEnv::current()->task() != nullptr) {
2518     PhaseOutput* phase_output = Compile::current()->output();
2519     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2520       return true;
2521     }
2522   }
2523   return MacroAssembler::in_scratch_emit_size();
2524 }