1 /*
   2  * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "asm/assembler.hpp"
  26 #include "asm/assembler.inline.hpp"
  27 #include "opto/c2_MacroAssembler.hpp"
  28 #include "opto/compile.hpp"
  29 #include "opto/intrinsicnode.hpp"
  30 #include "opto/matcher.hpp"
  31 #include "opto/output.hpp"
  32 #include "opto/subnode.hpp"
  33 #include "runtime/stubRoutines.hpp"
  34 #include "utilities/globalDefinitions.hpp"
  35 #include "utilities/powerOfTwo.hpp"
  36 
  37 #ifdef PRODUCT
  38 #define BLOCK_COMMENT(str) /* nothing */
  39 #define STOP(error) stop(error)
  40 #else
  41 #define BLOCK_COMMENT(str) block_comment(str)
  42 #define STOP(error) block_comment(error); stop(error)
  43 #endif
  44 
  45 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  46 
  47 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
  48 
  49 void C2_MacroAssembler::entry_barrier() {
  50   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  51   // Dummy labels for just measuring the code size
  52   Label dummy_slow_path;
  53   Label dummy_continuation;
  54   Label dummy_guard;
  55   Label* slow_path = &dummy_slow_path;
  56   Label* continuation = &dummy_continuation;
  57   Label* guard = &dummy_guard;
  58   if (!Compile::current()->output()->in_scratch_emit_size()) {
  59     // Use real labels from actual stub when not emitting code for the purpose of measuring its size
  60     C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
  61     Compile::current()->output()->add_stub(stub);
  62     slow_path = &stub->entry();
  63     continuation = &stub->continuation();
  64     guard = &stub->guard();
  65   }
  66   // In the C2 code, we move the non-hot part of nmethod entry barriers out-of-line to a stub.
  67   bs->nmethod_entry_barrier(this, slow_path, continuation, guard);
  68 }
  69 
  70 // jdk.internal.util.ArraysSupport.vectorizedHashCode
  71 address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
  72                                            FloatRegister vdata0, FloatRegister vdata1,
  73                                            FloatRegister vdata2, FloatRegister vdata3,
  74                                            FloatRegister vmul0, FloatRegister vmul1,
  75                                            FloatRegister vmul2, FloatRegister vmul3,
  76                                            FloatRegister vpow, FloatRegister vpowm,
  77                                            BasicType eltype) {
  78   ARRAYS_HASHCODE_REGISTERS;
  79 
  80   Register tmp1 = rscratch1, tmp2 = rscratch2;
  81 
  82   Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE;
  83 
  84   // Vectorization factor. Number of array elements loaded to one SIMD&FP registers by the stubs. We
  85   // use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible to
  86   // use 4H for chars and shorts instead, but using 8H gives better performance.
  87   const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8
  88                     : eltype == T_CHAR || eltype == T_SHORT ? 8
  89                     : eltype == T_INT                       ? 4
  90                                                             : 0;
  91   guarantee(vf, "unsupported eltype");
  92 
  93   // Unroll factor for the scalar loop below. The value is chosen based on performance analysis.
  94   const size_t unroll_factor = 4;
  95 
  96   switch (eltype) {
  97   case T_BOOLEAN:
  98     BLOCK_COMMENT("arrays_hashcode(unsigned byte) {");
  99     break;
 100   case T_CHAR:
 101     BLOCK_COMMENT("arrays_hashcode(char) {");
 102     break;
 103   case T_BYTE:
 104     BLOCK_COMMENT("arrays_hashcode(byte) {");
 105     break;
 106   case T_SHORT:
 107     BLOCK_COMMENT("arrays_hashcode(short) {");
 108     break;
 109   case T_INT:
 110     BLOCK_COMMENT("arrays_hashcode(int) {");
 111     break;
 112   default:
 113     ShouldNotReachHere();
 114   }
 115 
 116   // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop
 117   // implemented by the stub executes just once. Call the stub only if at least two iterations will
 118   // be executed.
 119   const size_t large_threshold = eltype == T_INT ? vf * 2 : vf;
 120   cmpw(cnt, large_threshold);
 121   br(Assembler::HS, LARGE);
 122 
 123   bind(TAIL);
 124 
 125   // The andr performs cnt % uf where uf = unroll_factor. The subtract shifted by 3 offsets past
 126   // uf - (cnt % uf) pairs of load + madd insns i.e. it only executes cnt % uf load + madd pairs.
 127   // Iteration eats up the remainder, uf elements at a time.
 128   assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC");
 129   andr(tmp2, cnt, unroll_factor - 1);
 130   adr(tmp1, BR_BASE);
 131   // For Cortex-A53 offset is 4 because 2 nops are generated.
 132   sub(tmp1, tmp1, tmp2, ext::sxtw, VM_Version::supports_a53mac() ? 4 : 3);
 133   movw(tmp2, 0x1f);
 134   br(tmp1);
 135 
 136   bind(LOOP);
 137   for (size_t i = 0; i < unroll_factor; ++i) {
 138     load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype);
 139     maddw(result, result, tmp2, tmp1);
 140     // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
 141     // Generate 2nd nop to have 4 instructions per iteration.
 142     if (VM_Version::supports_a53mac()) {
 143       nop();
 144     }
 145   }
 146   bind(BR_BASE);
 147   subsw(cnt, cnt, unroll_factor);
 148   br(Assembler::HS, LOOP);
 149 
 150   b(DONE);
 151 
 152   bind(LARGE);
 153 
 154   RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype));
 155   assert(stub.target() != nullptr, "array_hashcode stub has not been generated");
 156   address tpc = trampoline_call(stub);
 157   if (tpc == nullptr) {
 158     DEBUG_ONLY(reset_labels(TAIL, BR_BASE));
 159     postcond(pc() == badAddress);
 160     return nullptr;
 161   }
 162 
 163   bind(DONE);
 164 
 165   BLOCK_COMMENT("} // arrays_hashcode");
 166 
 167   postcond(pc() != badAddress);
 168   return pc();
 169 }
 170 
 171 void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg,
 172                                   Register tmp2Reg, Register tmp3Reg) {
 173   Register oop = objectReg;
 174   Register box = boxReg;
 175   Register disp_hdr = tmpReg;
 176   Register tmp = tmp2Reg;
 177   Label cont;
 178   Label object_has_monitor;
 179   Label count, no_count;
 180 
 181   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
 182   assert_different_registers(oop, box, tmp, disp_hdr, rscratch2);
 183 
 184   // Load markWord from object into displaced_header.
 185   ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));
 186 
 187   if (DiagnoseSyncOnValueBasedClasses != 0) {
 188     load_klass(tmp, oop);
 189     ldrb(tmp, Address(tmp, Klass::misc_flags_offset()));
 190     tst(tmp, KlassFlags::_misc_is_value_based_class);
 191     br(Assembler::NE, cont);
 192   }
 193 
 194   // Check for existing monitor
 195   tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor);
 196 
 197   if (LockingMode == LM_MONITOR) {
 198     tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
 199     b(cont);
 200   } else {
 201     assert(LockingMode == LM_LEGACY, "must be");
 202     // Set tmp to be (markWord of object | UNLOCK_VALUE).
 203     orr(tmp, disp_hdr, markWord::unlocked_value);
 204 
 205     if (EnableValhalla) {
 206       // Mask inline_type bit such that we go to the slow path if object is an inline type
 207       andr(tmp, tmp, ~((int) markWord::inline_type_bit_in_place));
 208     }
 209 
 210     // Initialize the box. (Must happen before we update the object mark!)
 211     str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 212 
 213     // Compare object markWord with an unlocked value (tmp) and if
 214     // equal exchange the stack address of our box with object markWord.
 215     // On failure disp_hdr contains the possibly locked markWord.
 216     cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true,
 217             /*release*/ true, /*weak*/ false, disp_hdr);
 218     br(Assembler::EQ, cont);
 219 
 220     assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
 221 
 222     // If the compare-and-exchange succeeded, then we found an unlocked
 223     // object, will have now locked it will continue at label cont
 224 
 225     // Check if the owner is self by comparing the value in the
 226     // markWord of object (disp_hdr) with the stack pointer.
 227     mov(rscratch1, sp);
 228     sub(disp_hdr, disp_hdr, rscratch1);
 229     mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place));
 230     // If condition is true we are cont and hence we can store 0 as the
 231     // displaced header in the box, which indicates that it is a recursive lock.
 232     ands(tmp/*==0?*/, disp_hdr, tmp);   // Sets flags for result
 233     str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 234     b(cont);
 235   }
 236 
 237   // Handle existing monitor.
 238   bind(object_has_monitor);
 239 
 240   // Try to CAS owner (no owner => current thread's _monitor_owner_id).
 241   ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
 242   add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value));
 243   cmpxchg(tmp, zr, rscratch2, Assembler::xword, /*acquire*/ true,
 244           /*release*/ true, /*weak*/ false, tmp3Reg); // Sets flags for result
 245 
 246   // Store a non-null value into the box to avoid looking like a re-entrant
 247   // lock. The fast-path monitor unlock code checks for
 248   // markWord::monitor_value so use markWord::unused_mark which has the
 249   // relevant bit set, and also matches ObjectSynchronizer::enter.
 250   mov(tmp, (address)markWord::unused_mark().value());
 251   str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 252 
 253   br(Assembler::EQ, cont); // CAS success means locking succeeded
 254 
 255   cmp(tmp3Reg, rscratch2);
 256   br(Assembler::NE, cont); // Check for recursive locking
 257 
 258   // Recursive lock case
 259   increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1);
 260   // flag == EQ still from the cmp above, checking if this is a reentrant lock
 261 
 262   bind(cont);
 263   // flag == EQ indicates success
 264   // flag == NE indicates failure
 265   br(Assembler::NE, no_count);
 266 
 267   bind(count);
 268   if (LockingMode == LM_LEGACY) {
 269     inc_held_monitor_count(rscratch1);
 270   }
 271 
 272   bind(no_count);
 273 }
 274 
 275 void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg,
 276                                     Register tmp2Reg) {
 277   Register oop = objectReg;
 278   Register box = boxReg;
 279   Register disp_hdr = tmpReg;
 280   Register owner_addr = tmpReg;
 281   Register tmp = tmp2Reg;
 282   Label cont;
 283   Label object_has_monitor;
 284   Label count, no_count;
 285   Label unlocked;
 286 
 287   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 288   assert_different_registers(oop, box, tmp, disp_hdr);
 289 
 290   if (LockingMode == LM_LEGACY) {
 291     // Find the lock address and load the displaced header from the stack.
 292     ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 293 
 294     // If the displaced header is 0, we have a recursive unlock.
 295     cmp(disp_hdr, zr);
 296     br(Assembler::EQ, cont);
 297   }
 298 
 299   // Handle existing monitor.
 300   ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
 301   tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor);
 302 
 303   if (LockingMode == LM_MONITOR) {
 304     tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
 305     b(cont);
 306   } else {
 307     assert(LockingMode == LM_LEGACY, "must be");
 308     // Check if it is still a light weight lock, this is is true if we
 309     // see the stack address of the basicLock in the markWord of the
 310     // object.
 311 
 312     cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false,
 313             /*release*/ true, /*weak*/ false, tmp);
 314     b(cont);
 315   }
 316 
 317   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
 318 
 319   // Handle existing monitor.
 320   bind(object_has_monitor);
 321   STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
 322   add(tmp, tmp, -(int)markWord::monitor_value); // monitor
 323 
 324   ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
 325 
 326   Label notRecursive;
 327   cbz(disp_hdr, notRecursive);
 328 
 329   // Recursive lock
 330   sub(disp_hdr, disp_hdr, 1u);
 331   str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
 332   cmp(disp_hdr, disp_hdr); // Sets flags for result
 333   b(cont);
 334 
 335   bind(notRecursive);
 336 
 337   // Compute owner address.
 338   lea(owner_addr, Address(tmp, ObjectMonitor::owner_offset()));
 339 
 340   // Set owner to null.
 341   // Release to satisfy the JMM
 342   stlr(zr, owner_addr);
 343   // We need a full fence after clearing owner to avoid stranding.
 344   // StoreLoad achieves this.
 345   membar(StoreLoad);
 346 
 347   // Check if the entry_list is empty.
 348   ldr(rscratch1, Address(tmp, ObjectMonitor::entry_list_offset()));
 349   cmp(rscratch1, zr);
 350   br(Assembler::EQ, cont);     // If so we are done.
 351 
 352   // Check if there is a successor.
 353   ldr(rscratch1, Address(tmp, ObjectMonitor::succ_offset()));
 354   cmp(rscratch1, zr);
 355   br(Assembler::NE, unlocked); // If so we are done.
 356 
 357   // Save the monitor pointer in the current thread, so we can try to
 358   // reacquire the lock in SharedRuntime::monitor_exit_helper().
 359   str(tmp, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));
 360 
 361   cmp(zr, rthread); // Set Flag to NE => slow path
 362   b(cont);
 363 
 364   bind(unlocked);
 365   cmp(zr, zr); // Set Flag to EQ => fast path
 366 
 367   // Intentional fall-through
 368 
 369   bind(cont);
 370   // flag == EQ indicates success
 371   // flag == NE indicates failure
 372   br(Assembler::NE, no_count);
 373 
 374   bind(count);
 375   if (LockingMode == LM_LEGACY) {
 376     dec_held_monitor_count(rscratch1);
 377   }
 378 
 379   bind(no_count);
 380 }
 381 
 382 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register t1,
 383                                               Register t2, Register t3) {
 384   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 385   assert_different_registers(obj, box, t1, t2, t3, rscratch2);
 386 
 387   // Handle inflated monitor.
 388   Label inflated;
 389   // Finish fast lock successfully. MUST branch to with flag == EQ
 390   Label locked;
 391   // Finish fast lock unsuccessfully. MUST branch to with flag == NE
 392   Label slow_path;
 393 
 394   if (UseObjectMonitorTable) {
 395     // Clear cache in case fast locking succeeds or we need to take the slow-path.
 396     str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 397   }
 398 
 399   if (DiagnoseSyncOnValueBasedClasses != 0) {
 400     load_klass(t1, obj);
 401     ldrb(t1, Address(t1, Klass::misc_flags_offset()));
 402     tst(t1, KlassFlags::_misc_is_value_based_class);
 403     br(Assembler::NE, slow_path);
 404   }
 405 
 406   const Register t1_mark = t1;
 407   const Register t3_t = t3;
 408 
 409   { // Lightweight locking
 410 
 411     // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ
 412     Label push;
 413 
 414     const Register t2_top = t2;
 415 
 416     // Check if lock-stack is full.
 417     ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 418     cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
 419     br(Assembler::GT, slow_path);
 420 
 421     // Check if recursive.
 422     subw(t3_t, t2_top, oopSize);
 423     ldr(t3_t, Address(rthread, t3_t));
 424     cmp(obj, t3_t);
 425     br(Assembler::EQ, push);
 426 
 427     // Relaxed normal load to check for monitor. Optimization for monitor case.
 428     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 429     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 430 
 431     // Not inflated
 432     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");
 433 
 434     // Try to lock. Transition lock-bits 0b01 => 0b00
 435     orr(t1_mark, t1_mark, markWord::unlocked_value);
 436     eor(t3_t, t1_mark, markWord::unlocked_value);
 437     cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
 438             /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
 439     br(Assembler::NE, slow_path);
 440 
 441     bind(push);
 442     // After successful lock, push object on lock-stack.
 443     str(obj, Address(rthread, t2_top));
 444     addw(t2_top, t2_top, oopSize);
 445     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 446     b(locked);
 447   }
 448 
 449   { // Handle inflated monitor.
 450     bind(inflated);
 451 
 452     const Register t1_monitor = t1;
 453 
 454     if (!UseObjectMonitorTable) {
 455       assert(t1_monitor == t1_mark, "should be the same here");
 456     } else {
 457       Label monitor_found;
 458 
 459       // Load cache address
 460       lea(t3_t, Address(rthread, JavaThread::om_cache_oops_offset()));
 461 
 462       const int num_unrolled = 2;
 463       for (int i = 0; i < num_unrolled; i++) {
 464         ldr(t1, Address(t3_t));
 465         cmp(obj, t1);
 466         br(Assembler::EQ, monitor_found);
 467         increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
 468       }
 469 
 470       Label loop;
 471 
 472       // Search for obj in cache.
 473       bind(loop);
 474 
 475       // Check for match.
 476       ldr(t1, Address(t3_t));
 477       cmp(obj, t1);
 478       br(Assembler::EQ, monitor_found);
 479 
 480       // Search until null encountered, guaranteed _null_sentinel at end.
 481       increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
 482       cbnz(t1, loop);
 483       // Cache Miss, NE set from cmp above, cbnz does not set flags
 484       b(slow_path);
 485 
 486       bind(monitor_found);
 487       ldr(t1_monitor, Address(t3_t, OMCache::oop_to_monitor_difference()));
 488     }
 489 
 490     const Register t2_owner_addr = t2;
 491     const Register t3_owner = t3;
 492     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 493     const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
 494     const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);
 495 
 496     Label monitor_locked;
 497 
 498     // Compute owner address.
 499     lea(t2_owner_addr, owner_address);
 500 
 501     // Try to CAS owner (no owner => current thread's _monitor_owner_id).
 502     ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
 503     cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, /*acquire*/ true,
 504             /*release*/ false, /*weak*/ false, t3_owner);
 505     br(Assembler::EQ, monitor_locked);
 506 
 507     // Check if recursive.
 508     cmp(t3_owner, rscratch2);
 509     br(Assembler::NE, slow_path);
 510 
 511     // Recursive.
 512     increment(recursions_address, 1);
 513 
 514     bind(monitor_locked);
 515     if (UseObjectMonitorTable) {
 516       str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 517     }
 518   }
 519 
 520   bind(locked);
 521 
 522 #ifdef ASSERT
 523   // Check that locked label is reached with Flags == EQ.
 524   Label flag_correct;
 525   br(Assembler::EQ, flag_correct);
 526   stop("Fast Lock Flag != EQ");
 527 #endif
 528 
 529   bind(slow_path);
 530 #ifdef ASSERT
 531   // Check that slow_path label is reached with Flags == NE.
 532   br(Assembler::NE, flag_correct);
 533   stop("Fast Lock Flag != NE");
 534   bind(flag_correct);
 535 #endif
 536   // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
 537 }
 538 
 539 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register box, Register t1,
 540                                                 Register t2, Register t3) {
 541   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 542   assert_different_registers(obj, box, t1, t2, t3);
 543 
 544   // Handle inflated monitor.
 545   Label inflated, inflated_load_mark;
 546   // Finish fast unlock successfully. MUST branch to with flag == EQ
 547   Label unlocked;
 548   // Finish fast unlock unsuccessfully. MUST branch to with flag == NE
 549   Label slow_path;
 550 
 551   const Register t1_mark = t1;
 552   const Register t2_top = t2;
 553   const Register t3_t = t3;
 554 
 555   { // Lightweight unlock
 556 
 557     Label push_and_slow_path;
 558 
 559     // Check if obj is top of lock-stack.
 560     ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 561     subw(t2_top, t2_top, oopSize);
 562     ldr(t3_t, Address(rthread, t2_top));
 563     cmp(obj, t3_t);
 564     // Top of lock stack was not obj. Must be monitor.
 565     br(Assembler::NE, inflated_load_mark);
 566 
 567     // Pop lock-stack.
 568     DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
 569     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 570 
 571     // Check if recursive.
 572     subw(t3_t, t2_top, oopSize);
 573     ldr(t3_t, Address(rthread, t3_t));
 574     cmp(obj, t3_t);
 575     br(Assembler::EQ, unlocked);
 576 
 577     // Not recursive.
 578     // Load Mark.
 579     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 580 
 581     // Check header for monitor (0b10).
 582     // Because we got here by popping (meaning we pushed in locked)
 583     // there will be no monitor in the box. So we need to push back the obj
 584     // so that the runtime can fix any potential anonymous owner.
 585     tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);
 586 
 587     // Try to unlock. Transition lock bits 0b00 => 0b01
 588     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
 589     orr(t3_t, t1_mark, markWord::unlocked_value);
 590     cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
 591             /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
 592     br(Assembler::EQ, unlocked);
 593 
 594     bind(push_and_slow_path);
 595     // Compare and exchange failed.
 596     // Restore lock-stack and handle the unlock in runtime.
 597     DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
 598     addw(t2_top, t2_top, oopSize);
 599     str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 600     b(slow_path);
 601   }
 602 
 603 
 604   { // Handle inflated monitor.
 605     bind(inflated_load_mark);
 606     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 607 #ifdef ASSERT
 608     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 609     stop("Fast Unlock not monitor");
 610 #endif
 611 
 612     bind(inflated);
 613 
 614 #ifdef ASSERT
 615     Label check_done;
 616     subw(t2_top, t2_top, oopSize);
 617     cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
 618     br(Assembler::LT, check_done);
 619     ldr(t3_t, Address(rthread, t2_top));
 620     cmp(obj, t3_t);
 621     br(Assembler::NE, inflated);
 622     stop("Fast Unlock lock on stack");
 623     bind(check_done);
 624 #endif
 625 
 626     const Register t1_monitor = t1;
 627 
 628     if (!UseObjectMonitorTable) {
 629       assert(t1_monitor == t1_mark, "should be the same here");
 630 
 631       // Untag the monitor.
 632       add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
 633     } else {
 634       ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 635       // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
 636       cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
 637       br(Assembler::LO, slow_path);
 638     }
 639 
 640     const Register t2_recursions = t2;
 641     Label not_recursive;
 642 
 643     // Check if recursive.
 644     ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
 645     cbz(t2_recursions, not_recursive);
 646 
 647     // Recursive unlock.
 648     sub(t2_recursions, t2_recursions, 1u);
 649     str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
 650     // Set flag == EQ
 651     cmp(t2_recursions, t2_recursions);
 652     b(unlocked);
 653 
 654     bind(not_recursive);
 655 
 656     const Register t2_owner_addr = t2;
 657 
 658     // Compute owner address.
 659     lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));
 660 
 661     // Set owner to null.
 662     // Release to satisfy the JMM
 663     stlr(zr, t2_owner_addr);
 664     // We need a full fence after clearing owner to avoid stranding.
 665     // StoreLoad achieves this.
 666     membar(StoreLoad);
 667 
 668     // Check if the entry_list is empty.
 669     ldr(rscratch1, Address(t1_monitor, ObjectMonitor::entry_list_offset()));
 670     cmp(rscratch1, zr);
 671     br(Assembler::EQ, unlocked);  // If so we are done.
 672 
 673     // Check if there is a successor.
 674     ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
 675     cmp(rscratch1, zr);
 676     br(Assembler::NE, unlocked);  // If so we are done.
 677 
 678     // Save the monitor pointer in the current thread, so we can try to
 679     // reacquire the lock in SharedRuntime::monitor_exit_helper().
 680     str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));
 681 
 682     cmp(zr, rthread); // Set Flag to NE => slow path
 683     b(slow_path);
 684   }
 685 
 686   bind(unlocked);
 687   cmp(zr, zr); // Set Flags to EQ => fast path
 688 
 689 #ifdef ASSERT
 690   // Check that unlocked label is reached with Flags == EQ.
 691   Label flag_correct;
 692   br(Assembler::EQ, flag_correct);
 693   stop("Fast Unlock Flag != EQ");
 694 #endif
 695 
 696   bind(slow_path);
 697 #ifdef ASSERT
 698   // Check that slow_path label is reached with Flags == NE.
 699   br(Assembler::NE, flag_correct);
 700   stop("Fast Unlock Flag != NE");
 701   bind(flag_correct);
 702 #endif
 703   // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
 704 }
 705 
 706 // Search for str1 in str2 and return index or -1
 707 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
 708 void C2_MacroAssembler::string_indexof(Register str2, Register str1,
 709                                        Register cnt2, Register cnt1,
 710                                        Register tmp1, Register tmp2,
 711                                        Register tmp3, Register tmp4,
 712                                        Register tmp5, Register tmp6,
 713                                        int icnt1, Register result, int ae) {
 714   // NOTE: tmp5, tmp6 can be zr depending on specific method version
 715   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
 716 
 717   Register ch1 = rscratch1;
 718   Register ch2 = rscratch2;
 719   Register cnt1tmp = tmp1;
 720   Register cnt2tmp = tmp2;
 721   Register cnt1_neg = cnt1;
 722   Register cnt2_neg = cnt2;
 723   Register result_tmp = tmp4;
 724 
 725   bool isL = ae == StrIntrinsicNode::LL;
 726 
 727   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
 728   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
 729   int str1_chr_shift = str1_isL ? 0:1;
 730   int str2_chr_shift = str2_isL ? 0:1;
 731   int str1_chr_size = str1_isL ? 1:2;
 732   int str2_chr_size = str2_isL ? 1:2;
 733   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
 734                                       (chr_insn)&MacroAssembler::ldrh;
 735   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
 736                                       (chr_insn)&MacroAssembler::ldrh;
 737   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
 738   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
 739 
 740   // Note, inline_string_indexOf() generates checks:
 741   // if (substr.count > string.count) return -1;
 742   // if (substr.count == 0) return 0;
 743 
 744   // We have two strings, a source string in str2, cnt2 and a pattern string
 745   // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
 746 
 747   // For larger pattern and source we use a simplified Boyer Moore algorithm.
 748   // With a small pattern and source we use linear scan.
 749 
 750   if (icnt1 == -1) {
 751     sub(result_tmp, cnt2, cnt1);
 752     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
 753     br(LT, LINEARSEARCH);
 754     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
 755     subs(zr, cnt1, 256);
 756     lsr(tmp1, cnt2, 2);
 757     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
 758     br(GE, LINEARSTUB);
 759   }
 760 
 761 // The Boyer Moore alogorithm is based on the description here:-
 762 //
 763 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
 764 //
 765 // This describes and algorithm with 2 shift rules. The 'Bad Character' rule
 766 // and the 'Good Suffix' rule.
 767 //
 768 // These rules are essentially heuristics for how far we can shift the
 769 // pattern along the search string.
 770 //
 771 // The implementation here uses the 'Bad Character' rule only because of the
 772 // complexity of initialisation for the 'Good Suffix' rule.
 773 //
 774 // This is also known as the Boyer-Moore-Horspool algorithm:-
 775 //
 776 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
 777 //
 778 // This particular implementation has few java-specific optimizations.
 779 //
 780 // #define ASIZE 256
 781 //
 782 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
 783 //       int i, j;
 784 //       unsigned c;
 785 //       unsigned char bc[ASIZE];
 786 //
 787 //       /* Preprocessing */
 788 //       for (i = 0; i < ASIZE; ++i)
 789 //          bc[i] = m;
 790 //       for (i = 0; i < m - 1; ) {
 791 //          c = x[i];
 792 //          ++i;
 793 //          // c < 256 for Latin1 string, so, no need for branch
 794 //          #ifdef PATTERN_STRING_IS_LATIN1
 795 //          bc[c] = m - i;
 796 //          #else
 797 //          if (c < ASIZE) bc[c] = m - i;
 798 //          #endif
 799 //       }
 800 //
 801 //       /* Searching */
 802 //       j = 0;
 803 //       while (j <= n - m) {
 804 //          c = y[i+j];
 805 //          if (x[m-1] == c)
 806 //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
 807 //          if (i < 0) return j;
 808 //          // c < 256 for Latin1 string, so, no need for branch
 809 //          #ifdef SOURCE_STRING_IS_LATIN1
 810 //          // LL case: (c< 256) always true. Remove branch
 811 //          j += bc[y[j+m-1]];
 812 //          #endif
 813 //          #ifndef PATTERN_STRING_IS_UTF
 814 //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
 815 //          if (c < ASIZE)
 816 //            j += bc[y[j+m-1]];
 817 //          else
 818 //            j += 1
 819 //          #endif
 820 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
 821 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
 822 //          if (c < ASIZE)
 823 //            j += bc[y[j+m-1]];
 824 //          else
 825 //            j += m
 826 //          #endif
 827 //       }
 828 //    }
 829 
 830   if (icnt1 == -1) {
 831     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
 832         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
 833     Register cnt1end = tmp2;
 834     Register str2end = cnt2;
 835     Register skipch = tmp2;
 836 
 837     // str1 length is >=8, so, we can read at least 1 register for cases when
 838     // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
 839     // UL case. We'll re-read last character in inner pre-loop code to have
 840     // single outer pre-loop load
 841     const int firstStep = isL ? 7 : 3;
 842 
 843     const int ASIZE = 256;
 844     const int STORED_BYTES = 32; // amount of bytes stored per instruction
 845     sub(sp, sp, ASIZE);
 846     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
 847     mov(ch1, sp);
 848     BIND(BM_INIT_LOOP);
 849       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
 850       subs(tmp5, tmp5, 1);
 851       br(GT, BM_INIT_LOOP);
 852 
 853       sub(cnt1tmp, cnt1, 1);
 854       mov(tmp5, str2);
 855       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
 856       sub(ch2, cnt1, 1);
 857       mov(tmp3, str1);
 858     BIND(BCLOOP);
 859       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
 860       if (!str1_isL) {
 861         subs(zr, ch1, ASIZE);
 862         br(HS, BCSKIP);
 863       }
 864       strb(ch2, Address(sp, ch1));
 865     BIND(BCSKIP);
 866       subs(ch2, ch2, 1);
 867       br(GT, BCLOOP);
 868 
 869       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
 870       if (str1_isL == str2_isL) {
 871         // load last 8 bytes (8LL/4UU symbols)
 872         ldr(tmp6, Address(tmp6, -wordSize));
 873       } else {
 874         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
 875         // convert Latin1 to UTF. We'll have to wait until load completed, but
 876         // it's still faster than per-character loads+checks
 877         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
 878         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
 879         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
 880         andr(tmp6, tmp6, 0xFF); // str1[N-4]
 881         orr(ch2, ch1, ch2, LSL, 16);
 882         orr(tmp6, tmp6, tmp3, LSL, 48);
 883         orr(tmp6, tmp6, ch2, LSL, 16);
 884       }
 885     BIND(BMLOOPSTR2);
 886       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 887       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
 888       if (str1_isL == str2_isL) {
 889         // re-init tmp3. It's for free because it's executed in parallel with
 890         // load above. Alternative is to initialize it before loop, but it'll
 891         // affect performance on in-order systems with 2 or more ld/st pipelines
 892         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
 893       }
 894       if (!isL) { // UU/UL case
 895         lsl(ch2, cnt1tmp, 1); // offset in bytes
 896       }
 897       cmp(tmp3, skipch);
 898       br(NE, BMSKIP);
 899       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
 900       mov(ch1, tmp6);
 901       if (isL) {
 902         b(BMLOOPSTR1_AFTER_LOAD);
 903       } else {
 904         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
 905         b(BMLOOPSTR1_CMP);
 906       }
 907     BIND(BMLOOPSTR1);
 908       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
 909       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 910     BIND(BMLOOPSTR1_AFTER_LOAD);
 911       subs(cnt1tmp, cnt1tmp, 1);
 912       br(LT, BMLOOPSTR1_LASTCMP);
 913     BIND(BMLOOPSTR1_CMP);
 914       cmp(ch1, ch2);
 915       br(EQ, BMLOOPSTR1);
 916     BIND(BMSKIP);
 917       if (!isL) {
 918         // if we've met UTF symbol while searching Latin1 pattern, then we can
 919         // skip cnt1 symbols
 920         if (str1_isL != str2_isL) {
 921           mov(result_tmp, cnt1);
 922         } else {
 923           mov(result_tmp, 1);
 924         }
 925         subs(zr, skipch, ASIZE);
 926         br(HS, BMADV);
 927       }
 928       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
 929     BIND(BMADV);
 930       sub(cnt1tmp, cnt1, 1);
 931       add(str2, str2, result_tmp, LSL, str2_chr_shift);
 932       cmp(str2, str2end);
 933       br(LE, BMLOOPSTR2);
 934       add(sp, sp, ASIZE);
 935       b(NOMATCH);
 936     BIND(BMLOOPSTR1_LASTCMP);
 937       cmp(ch1, ch2);
 938       br(NE, BMSKIP);
 939     BIND(BMMATCH);
 940       sub(result, str2, tmp5);
 941       if (!str2_isL) lsr(result, result, 1);
 942       add(sp, sp, ASIZE);
 943       b(DONE);
 944 
 945     BIND(LINEARSTUB);
 946     cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
 947     br(LT, LINEAR_MEDIUM);
 948     mov(result, zr);
 949     RuntimeAddress stub = nullptr;
 950     if (isL) {
 951       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
 952       assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
 953     } else if (str1_isL) {
 954       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
 955        assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
 956     } else {
 957       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
 958       assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
 959     }
 960     address call = trampoline_call(stub);
 961     if (call == nullptr) {
 962       DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
 963       ciEnv::current()->record_failure("CodeCache is full");
 964       return;
 965     }
 966     b(DONE);
 967   }
 968 
 969   BIND(LINEARSEARCH);
 970   {
 971     Label DO1, DO2, DO3;
 972 
 973     Register str2tmp = tmp2;
 974     Register first = tmp3;
 975 
 976     if (icnt1 == -1)
 977     {
 978         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
 979 
 980         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
 981         br(LT, DOSHORT);
 982       BIND(LINEAR_MEDIUM);
 983         (this->*str1_load_1chr)(first, Address(str1));
 984         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
 985         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
 986         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 987         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 988 
 989       BIND(FIRST_LOOP);
 990         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 991         cmp(first, ch2);
 992         br(EQ, STR1_LOOP);
 993       BIND(STR2_NEXT);
 994         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 995         br(LE, FIRST_LOOP);
 996         b(NOMATCH);
 997 
 998       BIND(STR1_LOOP);
 999         adds(cnt1tmp, cnt1_neg, str1_chr_size);
1000         add(cnt2tmp, cnt2_neg, str2_chr_size);
1001         br(GE, MATCH);
1002 
1003       BIND(STR1_NEXT);
1004         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
1005         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
1006         cmp(ch1, ch2);
1007         br(NE, STR2_NEXT);
1008         adds(cnt1tmp, cnt1tmp, str1_chr_size);
1009         add(cnt2tmp, cnt2tmp, str2_chr_size);
1010         br(LT, STR1_NEXT);
1011         b(MATCH);
1012 
1013       BIND(DOSHORT);
1014       if (str1_isL == str2_isL) {
1015         cmp(cnt1, (u1)2);
1016         br(LT, DO1);
1017         br(GT, DO3);
1018       }
1019     }
1020 
1021     if (icnt1 == 4) {
1022       Label CH1_LOOP;
1023 
1024         (this->*load_4chr)(ch1, str1);
1025         sub(result_tmp, cnt2, 4);
1026         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
1027         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
1028 
1029       BIND(CH1_LOOP);
1030         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
1031         cmp(ch1, ch2);
1032         br(EQ, MATCH);
1033         adds(cnt2_neg, cnt2_neg, str2_chr_size);
1034         br(LE, CH1_LOOP);
1035         b(NOMATCH);
1036       }
1037 
1038     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
1039       Label CH1_LOOP;
1040 
1041       BIND(DO2);
1042         (this->*load_2chr)(ch1, str1);
1043         if (icnt1 == 2) {
1044           sub(result_tmp, cnt2, 2);
1045         }
1046         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
1047         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
1048       BIND(CH1_LOOP);
1049         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
1050         cmp(ch1, ch2);
1051         br(EQ, MATCH);
1052         adds(cnt2_neg, cnt2_neg, str2_chr_size);
1053         br(LE, CH1_LOOP);
1054         b(NOMATCH);
1055     }
1056 
1057     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
1058       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
1059 
1060       BIND(DO3);
1061         (this->*load_2chr)(first, str1);
1062         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
1063         if (icnt1 == 3) {
1064           sub(result_tmp, cnt2, 3);
1065         }
1066         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
1067         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
1068       BIND(FIRST_LOOP);
1069         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
1070         cmpw(first, ch2);
1071         br(EQ, STR1_LOOP);
1072       BIND(STR2_NEXT);
1073         adds(cnt2_neg, cnt2_neg, str2_chr_size);
1074         br(LE, FIRST_LOOP);
1075         b(NOMATCH);
1076 
1077       BIND(STR1_LOOP);
1078         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
1079         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
1080         cmp(ch1, ch2);
1081         br(NE, STR2_NEXT);
1082         b(MATCH);
1083     }
1084 
1085     if (icnt1 == -1 || icnt1 == 1) {
1086       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
1087 
1088       BIND(DO1);
1089         (this->*str1_load_1chr)(ch1, str1);
1090         cmp(cnt2, (u1)8);
1091         br(LT, DO1_SHORT);
1092 
1093         sub(result_tmp, cnt2, 8/str2_chr_size);
1094         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
1095         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
1096         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
1097 
1098         if (str2_isL) {
1099           orr(ch1, ch1, ch1, LSL, 8);
1100         }
1101         orr(ch1, ch1, ch1, LSL, 16);
1102         orr(ch1, ch1, ch1, LSL, 32);
1103       BIND(CH1_LOOP);
1104         ldr(ch2, Address(str2, cnt2_neg));
1105         eor(ch2, ch1, ch2);
1106         sub(tmp1, ch2, tmp3);
1107         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
1108         bics(tmp1, tmp1, tmp2);
1109         br(NE, HAS_ZERO);
1110         adds(cnt2_neg, cnt2_neg, 8);
1111         br(LT, CH1_LOOP);
1112 
1113         cmp(cnt2_neg, (u1)8);
1114         mov(cnt2_neg, 0);
1115         br(LT, CH1_LOOP);
1116         b(NOMATCH);
1117 
1118       BIND(HAS_ZERO);
1119         rev(tmp1, tmp1);
1120         clz(tmp1, tmp1);
1121         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
1122         b(MATCH);
1123 
1124       BIND(DO1_SHORT);
1125         mov(result_tmp, cnt2);
1126         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
1127         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
1128       BIND(DO1_LOOP);
1129         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
1130         cmpw(ch1, ch2);
1131         br(EQ, MATCH);
1132         adds(cnt2_neg, cnt2_neg, str2_chr_size);
1133         br(LT, DO1_LOOP);
1134     }
1135   }
1136   BIND(NOMATCH);
1137     mov(result, -1);
1138     b(DONE);
1139   BIND(MATCH);
1140     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
1141   BIND(DONE);
1142 }
1143 
1144 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
1145 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
1146 
1147 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
1148                                             Register ch, Register result,
1149                                             Register tmp1, Register tmp2, Register tmp3)
1150 {
1151   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
1152   Register cnt1_neg = cnt1;
1153   Register ch1 = rscratch1;
1154   Register result_tmp = rscratch2;
1155 
1156   cbz(cnt1, NOMATCH);
1157 
1158   cmp(cnt1, (u1)4);
1159   br(LT, DO1_SHORT);
1160 
1161   orr(ch, ch, ch, LSL, 16);
1162   orr(ch, ch, ch, LSL, 32);
1163 
1164   sub(cnt1, cnt1, 4);
1165   mov(result_tmp, cnt1);
1166   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
1167   sub(cnt1_neg, zr, cnt1, LSL, 1);
1168 
1169   mov(tmp3, 0x0001000100010001);
1170 
1171   BIND(CH1_LOOP);
1172     ldr(ch1, Address(str1, cnt1_neg));
1173     eor(ch1, ch, ch1);
1174     sub(tmp1, ch1, tmp3);
1175     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
1176     bics(tmp1, tmp1, tmp2);
1177     br(NE, HAS_ZERO);
1178     adds(cnt1_neg, cnt1_neg, 8);
1179     br(LT, CH1_LOOP);
1180 
1181     cmp(cnt1_neg, (u1)8);
1182     mov(cnt1_neg, 0);
1183     br(LT, CH1_LOOP);
1184     b(NOMATCH);
1185 
1186   BIND(HAS_ZERO);
1187     rev(tmp1, tmp1);
1188     clz(tmp1, tmp1);
1189     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
1190     b(MATCH);
1191 
1192   BIND(DO1_SHORT);
1193     mov(result_tmp, cnt1);
1194     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
1195     sub(cnt1_neg, zr, cnt1, LSL, 1);
1196   BIND(DO1_LOOP);
1197     ldrh(ch1, Address(str1, cnt1_neg));
1198     cmpw(ch, ch1);
1199     br(EQ, MATCH);
1200     adds(cnt1_neg, cnt1_neg, 2);
1201     br(LT, DO1_LOOP);
1202   BIND(NOMATCH);
1203     mov(result, -1);
1204     b(DONE);
1205   BIND(MATCH);
1206     add(result, result_tmp, cnt1_neg, ASR, 1);
1207   BIND(DONE);
1208 }
1209 
1210 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
1211                                                 Register ch, Register result,
1212                                                 FloatRegister ztmp1,
1213                                                 FloatRegister ztmp2,
1214                                                 PRegister tmp_pg,
1215                                                 PRegister tmp_pdn, bool isL)
1216 {
1217   // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
1218   assert(tmp_pg->is_governing(),
1219          "this register has to be a governing predicate register");
1220 
1221   Label LOOP, MATCH, DONE, NOMATCH;
1222   Register vec_len = rscratch1;
1223   Register idx = rscratch2;
1224 
1225   SIMD_RegVariant T = (isL == true) ? B : H;
1226 
1227   cbz(cnt1, NOMATCH);
1228 
1229   // Assign the particular char throughout the vector.
1230   sve_dup(ztmp2, T, ch);
1231   if (isL) {
1232     sve_cntb(vec_len);
1233   } else {
1234     sve_cnth(vec_len);
1235   }
1236   mov(idx, 0);
1237 
1238   // Generate a predicate to control the reading of input string.
1239   sve_whilelt(tmp_pg, T, idx, cnt1);
1240 
1241   BIND(LOOP);
1242     // Read a vector of 8- or 16-bit data depending on the string type. Note
1243     // that inactive elements indicated by the predicate register won't cause
1244     // a data read from memory to the destination vector.
1245     if (isL) {
1246       sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
1247     } else {
1248       sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
1249     }
1250     add(idx, idx, vec_len);
1251 
1252     // Perform the comparison. An element of the destination predicate is set
1253     // to active if the particular char is matched.
1254     sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);
1255 
1256     // Branch if the particular char is found.
1257     br(NE, MATCH);
1258 
1259     sve_whilelt(tmp_pg, T, idx, cnt1);
1260 
1261     // Loop back if the particular char not found.
1262     br(MI, LOOP);
1263 
1264   BIND(NOMATCH);
1265     mov(result, -1);
1266     b(DONE);
1267 
1268   BIND(MATCH);
1269     // Undo the index increment.
1270     sub(idx, idx, vec_len);
1271 
1272     // Crop the vector to find its location.
1273     sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
1274     add(result, idx, -1);
1275     sve_incp(result, T, tmp_pdn);
1276   BIND(DONE);
1277 }
1278 
1279 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
1280                                             Register ch, Register result,
1281                                             Register tmp1, Register tmp2, Register tmp3)
1282 {
1283   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
1284   Register cnt1_neg = cnt1;
1285   Register ch1 = rscratch1;
1286   Register result_tmp = rscratch2;
1287 
1288   cbz(cnt1, NOMATCH);
1289 
1290   cmp(cnt1, (u1)8);
1291   br(LT, DO1_SHORT);
1292 
1293   orr(ch, ch, ch, LSL, 8);
1294   orr(ch, ch, ch, LSL, 16);
1295   orr(ch, ch, ch, LSL, 32);
1296 
1297   sub(cnt1, cnt1, 8);
1298   mov(result_tmp, cnt1);
1299   lea(str1, Address(str1, cnt1));
1300   sub(cnt1_neg, zr, cnt1);
1301 
1302   mov(tmp3, 0x0101010101010101);
1303 
1304   BIND(CH1_LOOP);
1305     ldr(ch1, Address(str1, cnt1_neg));
1306     eor(ch1, ch, ch1);
1307     sub(tmp1, ch1, tmp3);
1308     orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
1309     bics(tmp1, tmp1, tmp2);
1310     br(NE, HAS_ZERO);
1311     adds(cnt1_neg, cnt1_neg, 8);
1312     br(LT, CH1_LOOP);
1313 
1314     cmp(cnt1_neg, (u1)8);
1315     mov(cnt1_neg, 0);
1316     br(LT, CH1_LOOP);
1317     b(NOMATCH);
1318 
1319   BIND(HAS_ZERO);
1320     rev(tmp1, tmp1);
1321     clz(tmp1, tmp1);
1322     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
1323     b(MATCH);
1324 
1325   BIND(DO1_SHORT);
1326     mov(result_tmp, cnt1);
1327     lea(str1, Address(str1, cnt1));
1328     sub(cnt1_neg, zr, cnt1);
1329   BIND(DO1_LOOP);
1330     ldrb(ch1, Address(str1, cnt1_neg));
1331     cmp(ch, ch1);
1332     br(EQ, MATCH);
1333     adds(cnt1_neg, cnt1_neg, 1);
1334     br(LT, DO1_LOOP);
1335   BIND(NOMATCH);
1336     mov(result, -1);
1337     b(DONE);
1338   BIND(MATCH);
1339     add(result, result_tmp, cnt1_neg);
1340   BIND(DONE);
1341 }
1342 
1343 // Compare strings.
1344 void C2_MacroAssembler::string_compare(Register str1, Register str2,
1345     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
1346     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
1347     PRegister pgtmp1, PRegister pgtmp2, int ae) {
1348   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
1349       DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
1350       SHORT_LOOP_START, TAIL_CHECK;
1351 
1352   bool isLL = ae == StrIntrinsicNode::LL;
1353   bool isLU = ae == StrIntrinsicNode::LU;
1354   bool isUL = ae == StrIntrinsicNode::UL;
1355 
1356   // The stub threshold for LL strings is: 72 (64 + 8) chars
1357   // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
1358   // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
1359   const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);
1360 
1361   bool str1_isL = isLL || isLU;
1362   bool str2_isL = isLL || isUL;
1363 
1364   int str1_chr_shift = str1_isL ? 0 : 1;
1365   int str2_chr_shift = str2_isL ? 0 : 1;
1366   int str1_chr_size = str1_isL ? 1 : 2;
1367   int str2_chr_size = str2_isL ? 1 : 2;
1368   int minCharsInWord = isLL ? wordSize : wordSize/2;
1369 
1370   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
1371   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
1372                                       (chr_insn)&MacroAssembler::ldrh;
1373   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
1374                                       (chr_insn)&MacroAssembler::ldrh;
1375   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
1376                             (uxt_insn)&MacroAssembler::uxthw;
1377 
1378   BLOCK_COMMENT("string_compare {");
1379 
1380   // Bizarrely, the counts are passed in bytes, regardless of whether they
1381   // are L or U strings, however the result is always in characters.
1382   if (!str1_isL) asrw(cnt1, cnt1, 1);
1383   if (!str2_isL) asrw(cnt2, cnt2, 1);
1384 
1385   // Compute the minimum of the string lengths and save the difference.
1386   subsw(result, cnt1, cnt2);
1387   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
1388 
1389   // A very short string
1390   cmpw(cnt2, minCharsInWord);
1391   br(Assembler::LE, SHORT_STRING);
1392 
1393   // Compare longwords
1394   // load first parts of strings and finish initialization while loading
1395   {
1396     if (str1_isL == str2_isL) { // LL or UU
1397       ldr(tmp1, Address(str1));
1398       cmp(str1, str2);
1399       br(Assembler::EQ, DONE);
1400       ldr(tmp2, Address(str2));
1401       cmp(cnt2, stub_threshold);
1402       br(GE, STUB);
1403       subsw(cnt2, cnt2, minCharsInWord);
1404       br(EQ, TAIL_CHECK);
1405       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1406       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1407       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1408     } else if (isLU) {
1409       ldrs(vtmp, Address(str1));
1410       ldr(tmp2, Address(str2));
1411       cmp(cnt2, stub_threshold);
1412       br(GE, STUB);
1413       subw(cnt2, cnt2, 4);
1414       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1415       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1416       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1417       zip1(vtmp, T8B, vtmp, vtmpZ);
1418       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1419       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1420       add(cnt1, cnt1, 4);
1421       fmovd(tmp1, vtmp);
1422     } else { // UL case
1423       ldr(tmp1, Address(str1));
1424       ldrs(vtmp, Address(str2));
1425       cmp(cnt2, stub_threshold);
1426       br(GE, STUB);
1427       subw(cnt2, cnt2, 4);
1428       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1429       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1430       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1431       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1432       zip1(vtmp, T8B, vtmp, vtmpZ);
1433       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1434       add(cnt1, cnt1, 8);
1435       fmovd(tmp2, vtmp);
1436     }
1437     adds(cnt2, cnt2, isUL ? 4 : 8);
1438     br(GE, TAIL);
1439     eor(rscratch2, tmp1, tmp2);
1440     cbnz(rscratch2, DIFF);
1441     // main loop
1442     bind(NEXT_WORD);
1443     if (str1_isL == str2_isL) {
1444       ldr(tmp1, Address(str1, cnt2));
1445       ldr(tmp2, Address(str2, cnt2));
1446       adds(cnt2, cnt2, 8);
1447     } else if (isLU) {
1448       ldrs(vtmp, Address(str1, cnt1));
1449       ldr(tmp2, Address(str2, cnt2));
1450       add(cnt1, cnt1, 4);
1451       zip1(vtmp, T8B, vtmp, vtmpZ);
1452       fmovd(tmp1, vtmp);
1453       adds(cnt2, cnt2, 8);
1454     } else { // UL
1455       ldrs(vtmp, Address(str2, cnt2));
1456       ldr(tmp1, Address(str1, cnt1));
1457       zip1(vtmp, T8B, vtmp, vtmpZ);
1458       add(cnt1, cnt1, 8);
1459       fmovd(tmp2, vtmp);
1460       adds(cnt2, cnt2, 4);
1461     }
1462     br(GE, TAIL);
1463 
1464     eor(rscratch2, tmp1, tmp2);
1465     cbz(rscratch2, NEXT_WORD);
1466     b(DIFF);
1467     bind(TAIL);
1468     eor(rscratch2, tmp1, tmp2);
1469     cbnz(rscratch2, DIFF);
1470     // Last longword.  In the case where length == 4 we compare the
1471     // same longword twice, but that's still faster than another
1472     // conditional branch.
1473     if (str1_isL == str2_isL) {
1474       ldr(tmp1, Address(str1));
1475       ldr(tmp2, Address(str2));
1476     } else if (isLU) {
1477       ldrs(vtmp, Address(str1));
1478       ldr(tmp2, Address(str2));
1479       zip1(vtmp, T8B, vtmp, vtmpZ);
1480       fmovd(tmp1, vtmp);
1481     } else { // UL
1482       ldrs(vtmp, Address(str2));
1483       ldr(tmp1, Address(str1));
1484       zip1(vtmp, T8B, vtmp, vtmpZ);
1485       fmovd(tmp2, vtmp);
1486     }
1487     bind(TAIL_CHECK);
1488     eor(rscratch2, tmp1, tmp2);
1489     cbz(rscratch2, DONE);
1490 
1491     // Find the first different characters in the longwords and
1492     // compute their difference.
1493     bind(DIFF);
1494     rev(rscratch2, rscratch2);
1495     clz(rscratch2, rscratch2);
1496     andr(rscratch2, rscratch2, isLL ? -8 : -16);
1497     lsrv(tmp1, tmp1, rscratch2);
1498     (this->*ext_chr)(tmp1, tmp1);
1499     lsrv(tmp2, tmp2, rscratch2);
1500     (this->*ext_chr)(tmp2, tmp2);
1501     subw(result, tmp1, tmp2);
1502     b(DONE);
1503   }
1504 
1505   bind(STUB);
1506     RuntimeAddress stub = nullptr;
1507     switch(ae) {
1508       case StrIntrinsicNode::LL:
1509         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
1510         break;
1511       case StrIntrinsicNode::UU:
1512         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
1513         break;
1514       case StrIntrinsicNode::LU:
1515         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
1516         break;
1517       case StrIntrinsicNode::UL:
1518         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
1519         break;
1520       default:
1521         ShouldNotReachHere();
1522      }
1523     assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
1524     address call = trampoline_call(stub);
1525     if (call == nullptr) {
1526       DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
1527       ciEnv::current()->record_failure("CodeCache is full");
1528       return;
1529     }
1530     b(DONE);
1531 
1532   bind(SHORT_STRING);
1533   // Is the minimum length zero?
1534   cbz(cnt2, DONE);
1535   // arrange code to do most branches while loading and loading next characters
1536   // while comparing previous
1537   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1538   subs(cnt2, cnt2, 1);
1539   br(EQ, SHORT_LAST_INIT);
1540   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1541   b(SHORT_LOOP_START);
1542   bind(SHORT_LOOP);
1543   subs(cnt2, cnt2, 1);
1544   br(EQ, SHORT_LAST);
1545   bind(SHORT_LOOP_START);
1546   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
1547   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
1548   cmp(tmp1, cnt1);
1549   br(NE, SHORT_LOOP_TAIL);
1550   subs(cnt2, cnt2, 1);
1551   br(EQ, SHORT_LAST2);
1552   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1553   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1554   cmp(tmp2, rscratch1);
1555   br(EQ, SHORT_LOOP);
1556   sub(result, tmp2, rscratch1);
1557   b(DONE);
1558   bind(SHORT_LOOP_TAIL);
1559   sub(result, tmp1, cnt1);
1560   b(DONE);
1561   bind(SHORT_LAST2);
1562   cmp(tmp2, rscratch1);
1563   br(EQ, DONE);
1564   sub(result, tmp2, rscratch1);
1565 
1566   b(DONE);
1567   bind(SHORT_LAST_INIT);
1568   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1569   bind(SHORT_LAST);
1570   cmp(tmp1, cnt1);
1571   br(EQ, DONE);
1572   sub(result, tmp1, cnt1);
1573 
1574   bind(DONE);
1575 
1576   BLOCK_COMMENT("} string_compare");
1577 }
1578 
1579 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
1580                                      FloatRegister src2, Condition cond, bool isQ) {
1581   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1582   FloatRegister zn = src1, zm = src2;
1583   bool needs_negation = false;
1584   switch (cond) {
1585     case LT: cond = GT; zn = src2; zm = src1; break;
1586     case LE: cond = GE; zn = src2; zm = src1; break;
1587     case LO: cond = HI; zn = src2; zm = src1; break;
1588     case LS: cond = HS; zn = src2; zm = src1; break;
1589     case NE: cond = EQ; needs_negation = true; break;
1590     default:
1591       break;
1592   }
1593 
1594   if (is_floating_point_type(bt)) {
1595     fcm(cond, dst, size, zn, zm);
1596   } else {
1597     cm(cond, dst, size, zn, zm);
1598   }
1599 
1600   if (needs_negation) {
1601     notr(dst, isQ ? T16B : T8B, dst);
1602   }
1603 }
1604 
1605 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1606                                           Condition cond, bool isQ) {
1607   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1608   if (bt == T_FLOAT || bt == T_DOUBLE) {
1609     if (cond == Assembler::NE) {
1610       fcm(Assembler::EQ, dst, size, src);
1611       notr(dst, isQ ? T16B : T8B, dst);
1612     } else {
1613       fcm(cond, dst, size, src);
1614     }
1615   } else {
1616     if (cond == Assembler::NE) {
1617       cm(Assembler::EQ, dst, size, src);
1618       notr(dst, isQ ? T16B : T8B, dst);
1619     } else {
1620       cm(cond, dst, size, src);
1621     }
1622   }
1623 }
1624 
1625 // Compress the least significant bit of each byte to the rightmost and clear
1626 // the higher garbage bits.
1627 void C2_MacroAssembler::bytemask_compress(Register dst) {
1628   // Example input, dst = 0x01 00 00 00 01 01 00 01
1629   // The "??" bytes are garbage.
1630   orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
1631   orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
1632   orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
1633   andr(dst, dst, 0xff);                   // dst = 0x8D
1634 }
1635 
1636 // Pack the lowest-numbered bit of each mask element in src into a long value
1637 // in dst, at most the first 64 lane elements.
1638 // Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM.
1639 void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
1640                                          FloatRegister vtmp1, FloatRegister vtmp2) {
1641   assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1642   assert_different_registers(dst, rscratch1);
1643   assert_different_registers(vtmp1, vtmp2);
1644 
1645   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1646   // Example:   src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
1647   // Expected:  dst = 0x658D
1648 
1649   // Convert the mask into vector with sequential bytes.
1650   // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
1651   sve_cpy(vtmp1, size, src, 1, false);
1652   if (bt != T_BYTE) {
1653     sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
1654   }
1655 
1656   if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
1657     // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
1658     // is to compress each significant bit of the byte in a cross-lane way. Due
1659     // to the lack of a cross-lane bit-compress instruction, we use BEXT
1660     // (bit-compress in each lane) with the biggest lane size (T = D) then
1661     // concatenate the results.
1662 
1663     // The second source input of BEXT, initialized with 0x01 in each byte.
1664     // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1665     sve_dup(vtmp2, B, 1);
1666 
1667     // BEXT vtmp1.D, vtmp1.D, vtmp2.D
1668     // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1669     // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1670     //         ---------------------------------------
1671     // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1672     sve_bext(vtmp1, D, vtmp1, vtmp2);
1673 
1674     // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the
1675     // result to dst.
1676     // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1677     // dst   = 0x658D
1678     if (lane_cnt <= 8) {
1679       // No need to concatenate.
1680       umov(dst, vtmp1, B, 0);
1681     } else if (lane_cnt <= 16) {
1682       ins(vtmp1, B, vtmp1, 1, 8);
1683       umov(dst, vtmp1, H, 0);
1684     } else {
1685       // As the lane count is 64 at most, the final expected value must be in
1686       // the lowest 64 bits after narrowing vtmp1 from D to B.
1687       sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
1688       umov(dst, vtmp1, D, 0);
1689     }
1690   } else if (UseSVE > 0) {
1691     // Compress the lowest 8 bytes.
1692     fmovd(dst, vtmp1);
1693     bytemask_compress(dst);
1694     if (lane_cnt <= 8) return;
1695 
1696     // Repeat on higher bytes and join the results.
1697     // Compress 8 bytes in each iteration.
1698     for (int idx = 1; idx < (lane_cnt / 8); idx++) {
1699       sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
1700       bytemask_compress(rscratch1);
1701       orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
1702     }
1703   } else {
1704     assert(false, "unsupported");
1705     ShouldNotReachHere();
1706   }
1707 }
1708 
1709 // Unpack the mask, a long value in src, into predicate register dst based on the
1710 // corresponding data type. Note that dst can support at most 64 lanes.
1711 // Below example gives the expected dst predicate register in different types, with
1712 // a valid src(0x658D) on a 1024-bit vector size machine.
1713 // BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
1714 // SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
1715 // INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
1716 // LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1717 //
1718 // The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D which
1719 // has 24 significant bits would be an invalid input if dst predicate register refers to
1720 // a LONG type 1024-bit vector, which has at most 16 lanes.
1721 void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
1722                                            FloatRegister vtmp1, FloatRegister vtmp2) {
1723   assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
1724          lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
1725   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1726   // Example:   src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
1727   // Expected:  dst = 0b01101001 10001101
1728 
1729   // Put long value from general purpose register into the first lane of vector.
1730   // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1731   sve_dup(vtmp1, B, 0);
1732   mov(vtmp1, D, 0, src);
1733 
1734   // As sve_cmp generates mask value with the minimum unit in byte, we should
1735   // transform the value in the first lane which is mask in bit now to the
1736   // mask in byte, which can be done by SVE2's BDEP instruction.
1737 
1738   // The first source input of BDEP instruction. Deposite each byte in every 8 bytes.
1739   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1740   if (lane_cnt <= 8) {
1741     // Nothing. As only one byte exsits.
1742   } else if (lane_cnt <= 16) {
1743     ins(vtmp1, B, vtmp1, 8, 1);
1744     mov(vtmp1, B, 1, zr);
1745   } else {
1746     sve_vector_extend(vtmp1, D, vtmp1, B);
1747   }
1748 
1749   // The second source input of BDEP instruction, initialized with 0x01 for each byte.
1750   // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1751   sve_dup(vtmp2, B, 1);
1752 
1753   // BDEP vtmp1.D, vtmp1.D, vtmp2.D
1754   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1755   // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1756   //         ---------------------------------------
1757   // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1758   sve_bdep(vtmp1, D, vtmp1, vtmp2);
1759 
1760   if (bt != T_BYTE) {
1761     sve_vector_extend(vtmp1, size, vtmp1, B);
1762   }
1763   // Generate mask according to the given vector, in which the elements have been
1764   // extended to expected type.
1765   // dst = 0b01101001 10001101
1766   sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
1767 }
1768 
1769 // Clobbers: rflags
1770 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1771                                     FloatRegister zn, FloatRegister zm, Condition cond) {
1772   assert(pg->is_governing(), "This register has to be a governing predicate register");
1773   FloatRegister z1 = zn, z2 = zm;
1774   switch (cond) {
1775     case LE: z1 = zm; z2 = zn; cond = GE; break;
1776     case LT: z1 = zm; z2 = zn; cond = GT; break;
1777     case LO: z1 = zm; z2 = zn; cond = HI; break;
1778     case LS: z1 = zm; z2 = zn; cond = HS; break;
1779     default:
1780       break;
1781   }
1782 
1783   SIMD_RegVariant size = elemType_to_regVariant(bt);
1784   if (is_floating_point_type(bt)) {
1785     sve_fcm(cond, pd, size, pg, z1, z2);
1786   } else {
1787     assert(is_integral_type(bt), "unsupported element type");
1788     sve_cmp(cond, pd, size, pg, z1, z2);
1789   }
1790 }
1791 
1792 // Get index of the last mask lane that is set
1793 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
1794   SIMD_RegVariant size = elemType_to_regVariant(bt);
1795   sve_rev(ptmp, size, src);
1796   sve_brkb(ptmp, ptrue, ptmp, false);
1797   sve_cntp(dst, size, ptrue, ptmp);
1798   movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
1799   subw(dst, rscratch1, dst);
1800 }
1801 
1802 // Extend integer vector src to dst with the same lane count
1803 // but larger element size, e.g. 4B -> 4I
1804 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1805                                            FloatRegister src, BasicType src_bt, bool is_unsigned) {
1806   if (src_bt == T_BYTE) {
1807     if (dst_bt == T_SHORT) {
1808       // 4B/8B to 4S/8S
1809       _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1810     } else {
1811       // 4B to 4I
1812       assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1813       _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1814       _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1815     }
1816   } else if (src_bt == T_SHORT) {
1817     // 4S to 4I
1818     assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1819     _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1820   } else if (src_bt == T_INT) {
1821     // 2I to 2L
1822     assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1823     _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1824   } else {
1825     ShouldNotReachHere();
1826   }
1827 }
1828 
1829 // Narrow integer vector src down to dst with the same lane count
1830 // but smaller element size, e.g. 4I -> 4B
1831 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1832                                            FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1833   if (src_bt == T_SHORT) {
1834     // 4S/8S to 4B/8B
1835     assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1836     assert(dst_bt == T_BYTE, "unsupported");
1837     xtn(dst, T8B, src, T8H);
1838   } else if (src_bt == T_INT) {
1839     // 4I to 4B/4S
1840     assert(src_vlen_in_bytes == 16, "unsupported");
1841     assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1842     xtn(dst, T4H, src, T4S);
1843     if (dst_bt == T_BYTE) {
1844       xtn(dst, T8B, dst, T8H);
1845     }
1846   } else if (src_bt == T_LONG) {
1847     // 2L to 2I
1848     assert(src_vlen_in_bytes == 16, "unsupported");
1849     assert(dst_bt == T_INT, "unsupported");
1850     xtn(dst, T2S, src, T2D);
1851   } else {
1852     ShouldNotReachHere();
1853   }
1854 }
1855 
1856 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1857                                           FloatRegister src, SIMD_RegVariant src_size,
1858                                           bool is_unsigned) {
1859   assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
1860 
1861   if (src_size == B) {
1862     switch (dst_size) {
1863     case H:
1864       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1865       break;
1866     case S:
1867       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1868       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1869       break;
1870     case D:
1871       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1872       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1873       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1874       break;
1875     default:
1876       ShouldNotReachHere();
1877     }
1878   } else if (src_size == H) {
1879     if (dst_size == S) {
1880       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1881     } else { // D
1882       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1883       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1884     }
1885   } else if (src_size == S) {
1886     _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
1887   }
1888 }
1889 
1890 // Vector narrow from src to dst with specified element sizes.
1891 // High part of dst vector will be filled with zero.
1892 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1893                                           FloatRegister src, SIMD_RegVariant src_size,
1894                                           FloatRegister tmp) {
1895   assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1896   assert_different_registers(src, tmp);
1897   sve_dup(tmp, src_size, 0);
1898   if (src_size == D) {
1899     switch (dst_size) {
1900     case S:
1901       sve_uzp1(dst, S, src, tmp);
1902       break;
1903     case H:
1904       assert_different_registers(dst, tmp);
1905       sve_uzp1(dst, S, src, tmp);
1906       sve_uzp1(dst, H, dst, tmp);
1907       break;
1908     case B:
1909       assert_different_registers(dst, tmp);
1910       sve_uzp1(dst, S, src, tmp);
1911       sve_uzp1(dst, H, dst, tmp);
1912       sve_uzp1(dst, B, dst, tmp);
1913       break;
1914     default:
1915       ShouldNotReachHere();
1916     }
1917   } else if (src_size == S) {
1918     if (dst_size == H) {
1919       sve_uzp1(dst, H, src, tmp);
1920     } else { // B
1921       assert_different_registers(dst, tmp);
1922       sve_uzp1(dst, H, src, tmp);
1923       sve_uzp1(dst, B, dst, tmp);
1924     }
1925   } else if (src_size == H) {
1926     sve_uzp1(dst, B, src, tmp);
1927   }
1928 }
1929 
1930 // Extend src predicate to dst predicate with the same lane count but larger
1931 // element size, e.g. 64Byte -> 512Long
1932 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1933                                              uint dst_element_length_in_bytes,
1934                                              uint src_element_length_in_bytes) {
1935   if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1936     sve_punpklo(dst, src);
1937   } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1938     sve_punpklo(dst, src);
1939     sve_punpklo(dst, dst);
1940   } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1941     sve_punpklo(dst, src);
1942     sve_punpklo(dst, dst);
1943     sve_punpklo(dst, dst);
1944   } else {
1945     assert(false, "unsupported");
1946     ShouldNotReachHere();
1947   }
1948 }
1949 
1950 // Narrow src predicate to dst predicate with the same lane count but
1951 // smaller element size, e.g. 512Long -> 64Byte
1952 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1953                                              uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
1954   // The insignificant bits in src predicate are expected to be zero.
1955   // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
1956   // passed as the second argument. An example narrowing operation with a given mask would be -
1957   // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I
1958   // Mask (for 2 Longs) : TF
1959   // Predicate register for the above mask (16 bits) : 00000001 00000000
1960   // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
1961   // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0)
1962   assert_different_registers(src, ptmp);
1963   assert_different_registers(dst, ptmp);
1964   sve_pfalse(ptmp);
1965   if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1966     sve_uzp1(dst, B, src, ptmp);
1967   } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1968     sve_uzp1(dst, H, src, ptmp);
1969     sve_uzp1(dst, B, dst, ptmp);
1970   } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1971     sve_uzp1(dst, S, src, ptmp);
1972     sve_uzp1(dst, H, dst, ptmp);
1973     sve_uzp1(dst, B, dst, ptmp);
1974   } else {
1975     assert(false, "unsupported");
1976     ShouldNotReachHere();
1977   }
1978 }
1979 
1980 // Vector reduction add for integral type with ASIMD instructions.
1981 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1982                                                  Register isrc, FloatRegister vsrc,
1983                                                  unsigned vector_length_in_bytes,
1984                                                  FloatRegister vtmp) {
1985   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1986   assert_different_registers(dst, isrc);
1987   bool isQ = vector_length_in_bytes == 16;
1988 
1989   BLOCK_COMMENT("neon_reduce_add_integral {");
1990     switch(bt) {
1991       case T_BYTE:
1992         addv(vtmp, isQ ? T16B : T8B, vsrc);
1993         smov(dst, vtmp, B, 0);
1994         addw(dst, dst, isrc, ext::sxtb);
1995         break;
1996       case T_SHORT:
1997         addv(vtmp, isQ ? T8H : T4H, vsrc);
1998         smov(dst, vtmp, H, 0);
1999         addw(dst, dst, isrc, ext::sxth);
2000         break;
2001       case T_INT:
2002         isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
2003         umov(dst, vtmp, S, 0);
2004         addw(dst, dst, isrc);
2005         break;
2006       case T_LONG:
2007         assert(isQ, "unsupported");
2008         addpd(vtmp, vsrc);
2009         umov(dst, vtmp, D, 0);
2010         add(dst, dst, isrc);
2011         break;
2012       default:
2013         assert(false, "unsupported");
2014         ShouldNotReachHere();
2015     }
2016   BLOCK_COMMENT("} neon_reduce_add_integral");
2017 }
2018 
2019 // Vector reduction multiply for integral type with ASIMD instructions.
2020 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
2021 // Clobbers: rscratch1
2022 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
2023                                                  Register isrc, FloatRegister vsrc,
2024                                                  unsigned vector_length_in_bytes,
2025                                                  FloatRegister vtmp1, FloatRegister vtmp2) {
2026   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2027   bool isQ = vector_length_in_bytes == 16;
2028 
2029   BLOCK_COMMENT("neon_reduce_mul_integral {");
2030     switch(bt) {
2031       case T_BYTE:
2032         if (isQ) {
2033           // Multiply the lower half and higher half of vector iteratively.
2034           // vtmp1 = vsrc[8:15]
2035           ins(vtmp1, D, vsrc, 0, 1);
2036           // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
2037           mulv(vtmp1, T8B, vtmp1, vsrc);
2038           // vtmp2 = vtmp1[4:7]
2039           ins(vtmp2, S, vtmp1, 0, 1);
2040           // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
2041           mulv(vtmp1, T8B, vtmp2, vtmp1);
2042         } else {
2043           ins(vtmp1, S, vsrc, 0, 1);
2044           mulv(vtmp1, T8B, vtmp1, vsrc);
2045         }
2046         // vtmp2 = vtmp1[2:3]
2047         ins(vtmp2, H, vtmp1, 0, 1);
2048         // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
2049         mulv(vtmp2, T8B, vtmp2, vtmp1);
2050         // dst = vtmp2[0] * isrc * vtmp2[1]
2051         umov(rscratch1, vtmp2, B, 0);
2052         mulw(dst, rscratch1, isrc);
2053         sxtb(dst, dst);
2054         umov(rscratch1, vtmp2, B, 1);
2055         mulw(dst, rscratch1, dst);
2056         sxtb(dst, dst);
2057         break;
2058       case T_SHORT:
2059         if (isQ) {
2060           ins(vtmp2, D, vsrc, 0, 1);
2061           mulv(vtmp2, T4H, vtmp2, vsrc);
2062           ins(vtmp1, S, vtmp2, 0, 1);
2063           mulv(vtmp1, T4H, vtmp1, vtmp2);
2064         } else {
2065           ins(vtmp1, S, vsrc, 0, 1);
2066           mulv(vtmp1, T4H, vtmp1, vsrc);
2067         }
2068         umov(rscratch1, vtmp1, H, 0);
2069         mulw(dst, rscratch1, isrc);
2070         sxth(dst, dst);
2071         umov(rscratch1, vtmp1, H, 1);
2072         mulw(dst, rscratch1, dst);
2073         sxth(dst, dst);
2074         break;
2075       case T_INT:
2076         if (isQ) {
2077           ins(vtmp1, D, vsrc, 0, 1);
2078           mulv(vtmp1, T2S, vtmp1, vsrc);
2079         } else {
2080           vtmp1 = vsrc;
2081         }
2082         umov(rscratch1, vtmp1, S, 0);
2083         mul(dst, rscratch1, isrc);
2084         umov(rscratch1, vtmp1, S, 1);
2085         mul(dst, rscratch1, dst);
2086         break;
2087       case T_LONG:
2088         umov(rscratch1, vsrc, D, 0);
2089         mul(dst, isrc, rscratch1);
2090         umov(rscratch1, vsrc, D, 1);
2091         mul(dst, dst, rscratch1);
2092         break;
2093       default:
2094         assert(false, "unsupported");
2095         ShouldNotReachHere();
2096     }
2097   BLOCK_COMMENT("} neon_reduce_mul_integral");
2098 }
2099 
2100 // Vector reduction multiply for floating-point type with ASIMD instructions.
2101 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
2102                                            FloatRegister fsrc, FloatRegister vsrc,
2103                                            unsigned vector_length_in_bytes,
2104                                            FloatRegister vtmp) {
2105   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2106   bool isQ = vector_length_in_bytes == 16;
2107 
2108   BLOCK_COMMENT("neon_reduce_mul_fp {");
2109     switch(bt) {
2110       case T_FLOAT:
2111         fmuls(dst, fsrc, vsrc);
2112         ins(vtmp, S, vsrc, 0, 1);
2113         fmuls(dst, dst, vtmp);
2114         if (isQ) {
2115           ins(vtmp, S, vsrc, 0, 2);
2116           fmuls(dst, dst, vtmp);
2117           ins(vtmp, S, vsrc, 0, 3);
2118           fmuls(dst, dst, vtmp);
2119          }
2120         break;
2121       case T_DOUBLE:
2122         assert(isQ, "unsupported");
2123         fmuld(dst, fsrc, vsrc);
2124         ins(vtmp, D, vsrc, 0, 1);
2125         fmuld(dst, dst, vtmp);
2126         break;
2127       default:
2128         assert(false, "unsupported");
2129         ShouldNotReachHere();
2130     }
2131   BLOCK_COMMENT("} neon_reduce_mul_fp");
2132 }
2133 
2134 // Helper to select logical instruction
2135 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
2136                                                    Register Rn, Register Rm,
2137                                                    enum shift_kind kind, unsigned shift) {
2138   switch(opc) {
2139     case Op_AndReductionV:
2140       is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
2141       break;
2142     case Op_OrReductionV:
2143       is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
2144       break;
2145     case Op_XorReductionV:
2146       is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
2147       break;
2148     default:
2149       assert(false, "unsupported");
2150       ShouldNotReachHere();
2151   }
2152 }
2153 
2154 // Vector reduction logical operations And, Or, Xor
2155 // Clobbers: rscratch1
2156 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
2157                                             Register isrc, FloatRegister vsrc,
2158                                             unsigned vector_length_in_bytes) {
2159   assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
2160          "unsupported");
2161   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2162   assert_different_registers(dst, isrc);
2163   bool isQ = vector_length_in_bytes == 16;
2164 
2165   BLOCK_COMMENT("neon_reduce_logical {");
2166     umov(rscratch1, vsrc, isQ ? D : S, 0);
2167     umov(dst, vsrc, isQ ? D : S, 1);
2168     neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
2169     switch(bt) {
2170       case T_BYTE:
2171         if (isQ) {
2172           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2173         }
2174         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
2175         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
2176         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2177         sxtb(dst, dst);
2178         break;
2179       case T_SHORT:
2180         if (isQ) {
2181           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2182         }
2183         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
2184         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2185         sxth(dst, dst);
2186         break;
2187       case T_INT:
2188         if (isQ) {
2189           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2190         }
2191         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2192         break;
2193       case T_LONG:
2194         assert(isQ, "unsupported");
2195         neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
2196         break;
2197       default:
2198         assert(false, "unsupported");
2199         ShouldNotReachHere();
2200     }
2201   BLOCK_COMMENT("} neon_reduce_logical");
2202 }
2203 
2204 // Vector reduction min/max for integral type with ASIMD instructions.
2205 // Note: vtmp is not used and expected to be fnoreg for T_LONG case.
2206 // Clobbers: rscratch1, rflags
2207 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
2208                                                     Register isrc, FloatRegister vsrc,
2209                                                     unsigned vector_length_in_bytes,
2210                                                     FloatRegister vtmp) {
2211   assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
2212   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2213   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
2214   assert_different_registers(dst, isrc);
2215   bool isQ = vector_length_in_bytes == 16;
2216   bool is_min = opc == Op_MinReductionV;
2217 
2218   BLOCK_COMMENT("neon_reduce_minmax_integral {");
2219     if (bt == T_LONG) {
2220       assert(vtmp == fnoreg, "should be");
2221       assert(isQ, "should be");
2222       umov(rscratch1, vsrc, D, 0);
2223       cmp(isrc, rscratch1);
2224       csel(dst, isrc, rscratch1, is_min ? LT : GT);
2225       umov(rscratch1, vsrc, D, 1);
2226       cmp(dst, rscratch1);
2227       csel(dst, dst, rscratch1, is_min ? LT : GT);
2228     } else {
2229       SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
2230       if (size == T2S) {
2231         is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
2232       } else {
2233         is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
2234       }
2235       if (bt == T_INT) {
2236         umov(dst, vtmp, S, 0);
2237       } else {
2238         smov(dst, vtmp, elemType_to_regVariant(bt), 0);
2239       }
2240       cmpw(dst, isrc);
2241       cselw(dst, dst, isrc, is_min ? LT : GT);
2242     }
2243   BLOCK_COMMENT("} neon_reduce_minmax_integral");
2244 }
2245 
2246 // Vector reduction for integral type with SVE instruction.
2247 // Supported operations are Add, And, Or, Xor, Max, Min.
2248 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
2249 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
2250                                             FloatRegister src2, PRegister pg, FloatRegister tmp) {
2251   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2252   assert(pg->is_governing(), "This register has to be a governing predicate register");
2253   assert_different_registers(src1, dst);
2254   // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
2255   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2256   switch (opc) {
2257     case Op_AddReductionVI: {
2258       sve_uaddv(tmp, size, pg, src2);
2259       if (bt == T_BYTE) {
2260         smov(dst, tmp, size, 0);
2261         addw(dst, src1, dst, ext::sxtb);
2262       } else if (bt == T_SHORT) {
2263         smov(dst, tmp, size, 0);
2264         addw(dst, src1, dst, ext::sxth);
2265       } else {
2266         umov(dst, tmp, size, 0);
2267         addw(dst, dst, src1);
2268       }
2269       break;
2270     }
2271     case Op_AddReductionVL: {
2272       sve_uaddv(tmp, size, pg, src2);
2273       umov(dst, tmp, size, 0);
2274       add(dst, dst, src1);
2275       break;
2276     }
2277     case Op_AndReductionV: {
2278       sve_andv(tmp, size, pg, src2);
2279       if (bt == T_INT || bt == T_LONG) {
2280         umov(dst, tmp, size, 0);
2281       } else {
2282         smov(dst, tmp, size, 0);
2283       }
2284       if (bt == T_LONG) {
2285         andr(dst, dst, src1);
2286       } else {
2287         andw(dst, dst, src1);
2288       }
2289       break;
2290     }
2291     case Op_OrReductionV: {
2292       sve_orv(tmp, size, pg, src2);
2293       if (bt == T_INT || bt == T_LONG) {
2294         umov(dst, tmp, size, 0);
2295       } else {
2296         smov(dst, tmp, size, 0);
2297       }
2298       if (bt == T_LONG) {
2299         orr(dst, dst, src1);
2300       } else {
2301         orrw(dst, dst, src1);
2302       }
2303       break;
2304     }
2305     case Op_XorReductionV: {
2306       sve_eorv(tmp, size, pg, src2);
2307       if (bt == T_INT || bt == T_LONG) {
2308         umov(dst, tmp, size, 0);
2309       } else {
2310         smov(dst, tmp, size, 0);
2311       }
2312       if (bt == T_LONG) {
2313         eor(dst, dst, src1);
2314       } else {
2315         eorw(dst, dst, src1);
2316       }
2317       break;
2318     }
2319     case Op_MaxReductionV: {
2320       sve_smaxv(tmp, size, pg, src2);
2321       if (bt == T_INT || bt == T_LONG) {
2322         umov(dst, tmp, size, 0);
2323       } else {
2324         smov(dst, tmp, size, 0);
2325       }
2326       if (bt == T_LONG) {
2327         cmp(dst, src1);
2328         csel(dst, dst, src1, Assembler::GT);
2329       } else {
2330         cmpw(dst, src1);
2331         cselw(dst, dst, src1, Assembler::GT);
2332       }
2333       break;
2334     }
2335     case Op_MinReductionV: {
2336       sve_sminv(tmp, size, pg, src2);
2337       if (bt == T_INT || bt == T_LONG) {
2338         umov(dst, tmp, size, 0);
2339       } else {
2340         smov(dst, tmp, size, 0);
2341       }
2342       if (bt == T_LONG) {
2343         cmp(dst, src1);
2344         csel(dst, dst, src1, Assembler::LT);
2345       } else {
2346         cmpw(dst, src1);
2347         cselw(dst, dst, src1, Assembler::LT);
2348       }
2349       break;
2350     }
2351     default:
2352       assert(false, "unsupported");
2353       ShouldNotReachHere();
2354   }
2355 
2356   if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
2357     if (bt == T_BYTE) {
2358       sxtb(dst, dst);
2359     } else if (bt == T_SHORT) {
2360       sxth(dst, dst);
2361     }
2362   }
2363 }
2364 
2365 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
2366 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
2367 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
2368 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
2369   uint32_t max_vector_length = Matcher::max_vector_size(bt);
2370   assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
2371 
2372   // Set all elements to false if the input "lane_cnt" is zero.
2373   if (lane_cnt == 0) {
2374     sve_pfalse(dst);
2375     return;
2376   }
2377 
2378   SIMD_RegVariant size = elemType_to_regVariant(bt);
2379   assert(size != Q, "invalid size");
2380 
2381   // Set all true if "lane_cnt" equals to the max lane count.
2382   if (lane_cnt == max_vector_length) {
2383     sve_ptrue(dst, size, /* ALL */ 0b11111);
2384     return;
2385   }
2386 
2387   // Fixed numbers for "ptrue".
2388   switch(lane_cnt) {
2389   case 1: /* VL1 */
2390   case 2: /* VL2 */
2391   case 3: /* VL3 */
2392   case 4: /* VL4 */
2393   case 5: /* VL5 */
2394   case 6: /* VL6 */
2395   case 7: /* VL7 */
2396   case 8: /* VL8 */
2397     sve_ptrue(dst, size, lane_cnt);
2398     return;
2399   case 16:
2400     sve_ptrue(dst, size, /* VL16 */ 0b01001);
2401     return;
2402   case 32:
2403     sve_ptrue(dst, size, /* VL32 */ 0b01010);
2404     return;
2405   case 64:
2406     sve_ptrue(dst, size, /* VL64 */ 0b01011);
2407     return;
2408   case 128:
2409     sve_ptrue(dst, size, /* VL128 */ 0b01100);
2410     return;
2411   case 256:
2412     sve_ptrue(dst, size, /* VL256 */ 0b01101);
2413     return;
2414   default:
2415     break;
2416   }
2417 
2418   // Special patterns for "ptrue".
2419   if (lane_cnt == round_down_power_of_2(max_vector_length)) {
2420     sve_ptrue(dst, size, /* POW2 */ 0b00000);
2421   } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
2422     sve_ptrue(dst, size, /* MUL4 */ 0b11101);
2423   } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
2424     sve_ptrue(dst, size, /* MUL3 */ 0b11110);
2425   } else {
2426     // Encode to "whileltw" for the remaining cases.
2427     mov(rscratch1, lane_cnt);
2428     sve_whileltw(dst, size, zr, rscratch1);
2429   }
2430 }
2431 
2432 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
2433 // Any remaining elements of dst will be filled with zero.
2434 // Clobbers: rscratch1
2435 // Preserves: src, mask
2436 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
2437                                            FloatRegister vtmp1, FloatRegister vtmp2,
2438                                            PRegister pgtmp) {
2439   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2440   assert_different_registers(dst, src, vtmp1, vtmp2);
2441   assert_different_registers(mask, pgtmp);
2442 
2443   // Example input:   src   = 8888 7777 6666 5555 4444 3333 2222 1111
2444   //                  mask  = 0001 0000 0000 0001 0001 0000 0001 0001
2445   // Expected result: dst   = 0000 0000 0000 8888 5555 4444 2222 1111
2446   sve_dup(vtmp2, H, 0);
2447 
2448   // Extend lowest half to type INT.
2449   // dst = 00004444 00003333 00002222 00001111
2450   sve_uunpklo(dst, S, src);
2451   // pgtmp = 00000001 00000000 00000001 00000001
2452   sve_punpklo(pgtmp, mask);
2453   // Pack the active elements in size of type INT to the right,
2454   // and fill the remainings with zero.
2455   // dst = 00000000 00004444 00002222 00001111
2456   sve_compact(dst, S, dst, pgtmp);
2457   // Narrow the result back to type SHORT.
2458   // dst = 0000 0000 0000 0000 0000 4444 2222 1111
2459   sve_uzp1(dst, H, dst, vtmp2);
2460   // Count the active elements of lowest half.
2461   // rscratch1 = 3
2462   sve_cntp(rscratch1, S, ptrue, pgtmp);
2463 
2464   // Repeat to the highest half.
2465   // pgtmp = 00000001 00000000 00000000 00000001
2466   sve_punpkhi(pgtmp, mask);
2467   // vtmp1 = 00008888 00007777 00006666 00005555
2468   sve_uunpkhi(vtmp1, S, src);
2469   // vtmp1 = 00000000 00000000 00008888 00005555
2470   sve_compact(vtmp1, S, vtmp1, pgtmp);
2471   // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2472   sve_uzp1(vtmp1, H, vtmp1, vtmp2);
2473 
2474   // Compressed low:   dst   = 0000 0000 0000 0000 0000 4444 2222 1111
2475   // Compressed high:  vtmp1 = 0000 0000 0000 0000 0000 0000 8888  5555
2476   // Left shift(cross lane) compressed high with TRUE_CNT lanes,
2477   // TRUE_CNT is the number of active elements in the compressed low.
2478   neg(rscratch1, rscratch1);
2479   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2480   sve_index(vtmp2, H, rscratch1, 1);
2481   // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
2482   sve_tbl(vtmp1, H, vtmp1, vtmp2);
2483 
2484   // Combine the compressed high(after shifted) with the compressed low.
2485   // dst = 0000 0000 0000 8888 5555 4444 2222 1111
2486   sve_orr(dst, dst, vtmp1);
2487 }
2488 
2489 // Clobbers: rscratch1, rscratch2
2490 // Preserves: src, mask
2491 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
2492                                           FloatRegister vtmp1, FloatRegister vtmp2,
2493                                           FloatRegister vtmp3, FloatRegister vtmp4,
2494                                           PRegister ptmp, PRegister pgtmp) {
2495   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2496   assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
2497   assert_different_registers(mask, ptmp, pgtmp);
2498   // Example input:   src   = 88 77 66 55 44 33 22 11
2499   //                  mask  = 01 00 00 01 01 00 01 01
2500   // Expected result: dst   = 00 00 00 88 55 44 22 11
2501 
2502   sve_dup(vtmp4, B, 0);
2503   // Extend lowest half to type SHORT.
2504   // vtmp1 = 0044 0033 0022 0011
2505   sve_uunpklo(vtmp1, H, src);
2506   // ptmp = 0001 0000 0001 0001
2507   sve_punpklo(ptmp, mask);
2508   // Count the active elements of lowest half.
2509   // rscratch2 = 3
2510   sve_cntp(rscratch2, H, ptrue, ptmp);
2511   // Pack the active elements in size of type SHORT to the right,
2512   // and fill the remainings with zero.
2513   // dst = 0000 0044 0022 0011
2514   sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
2515   // Narrow the result back to type BYTE.
2516   // dst = 00 00 00 00 00 44 22 11
2517   sve_uzp1(dst, B, dst, vtmp4);
2518 
2519   // Repeat to the highest half.
2520   // ptmp = 0001 0000 0000 0001
2521   sve_punpkhi(ptmp, mask);
2522   // vtmp1 = 0088 0077 0066 0055
2523   sve_uunpkhi(vtmp2, H, src);
2524   // vtmp1 = 0000 0000 0088 0055
2525   sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);
2526 
2527   sve_dup(vtmp4, B, 0);
2528   // vtmp1 = 00 00 00 00 00 00 88 55
2529   sve_uzp1(vtmp1, B, vtmp1, vtmp4);
2530 
2531   // Compressed low:   dst   = 00 00 00 00 00 44 22 11
2532   // Compressed high:  vtmp1 = 00 00 00 00 00 00 88 55
2533   // Left shift(cross lane) compressed high with TRUE_CNT lanes,
2534   // TRUE_CNT is the number of active elements in the compressed low.
2535   neg(rscratch2, rscratch2);
2536   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2537   sve_index(vtmp2, B, rscratch2, 1);
2538   // vtmp1 = 00 00 00 88 55 00 00 00
2539   sve_tbl(vtmp1, B, vtmp1, vtmp2);
2540   // Combine the compressed high(after shifted) with the compressed low.
2541   // dst = 00 00 00 88 55 44 22 11
2542   sve_orr(dst, dst, vtmp1);
2543 }
2544 
2545 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2546   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2547   SIMD_Arrangement size = isQ ? T16B : T8B;
2548   if (bt == T_BYTE) {
2549     rbit(dst, size, src);
2550   } else {
2551     neon_reverse_bytes(dst, src, bt, isQ);
2552     rbit(dst, size, dst);
2553   }
2554 }
2555 
2556 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2557   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2558   SIMD_Arrangement size = isQ ? T16B : T8B;
2559   switch (bt) {
2560     case T_BYTE:
2561       if (dst != src) {
2562         orr(dst, size, src, src);
2563       }
2564       break;
2565     case T_SHORT:
2566       rev16(dst, size, src);
2567       break;
2568     case T_INT:
2569       rev32(dst, size, src);
2570       break;
2571     case T_LONG:
2572       rev64(dst, size, src);
2573       break;
2574     default:
2575       assert(false, "unsupported");
2576       ShouldNotReachHere();
2577   }
2578 }
2579 
2580 // VectorRearrange implementation for short/int/float/long/double types with NEON
2581 // instructions. For VectorRearrange short/int/float, we use NEON tbl instruction.
2582 // But since it supports bytes table only, we need to lookup 2/4 bytes as a group.
2583 // For VectorRearrange long/double, we compare the shuffle input with iota indices,
2584 // and use bsl to implement the operation.
2585 void C2_MacroAssembler::neon_rearrange_hsd(FloatRegister dst, FloatRegister src,
2586                                            FloatRegister shuffle, FloatRegister tmp,
2587                                            BasicType bt, bool isQ) {
2588   assert_different_registers(dst, src, shuffle, tmp);
2589   SIMD_Arrangement size1 = isQ ? T16B : T8B;
2590   SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2591 
2592   // Here is an example that rearranges a NEON vector with 4 ints:
2593   // Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1]
2594   //   1. We assume the shuffle input is Vi int[2, 3, 0, 1].
2595   //   2. Multiply Vi int[2, 3, 0, 1] with constant int vector
2596   //      [0x04040404, 0x04040404, 0x04040404, 0x04040404], and get
2597   //      tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404].
2598   //   3. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100],
2599   //      and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504]
2600   //   4. Use Vm as index register, and use V1 as table register.
2601   //      Then get V2 as the result by tbl NEON instructions.
2602   switch (bt) {
2603     case T_SHORT:
2604       mov(tmp, size1, 0x02);
2605       mulv(dst, size2, shuffle, tmp);
2606       mov(tmp, size2, 0x0100);
2607       addv(dst, size1, dst, tmp);
2608       tbl(dst, size1, src, 1, dst);
2609       break;
2610     case T_INT:
2611     case T_FLOAT:
2612       mov(tmp, size1, 0x04);
2613       mulv(dst, size2, shuffle, tmp);
2614       mov(tmp, size2, 0x03020100);
2615       addv(dst, size1, dst, tmp);
2616       tbl(dst, size1, src, 1, dst);
2617       break;
2618     case T_LONG:
2619     case T_DOUBLE:
2620       // Load the iota indices for Long type. The indices are ordered by
2621       // type B/S/I/L/F/D, and the offset between two types is 16; Hence
2622       // the offset for L is 48.
2623       lea(rscratch1,
2624           ExternalAddress(StubRoutines::aarch64::vector_iota_indices() + 48));
2625       ldrq(tmp, rscratch1);
2626       // Check whether the input "shuffle" is the same with iota indices.
2627       // Return "src" if true, otherwise swap the two elements of "src".
2628       cm(EQ, dst, size2, shuffle, tmp);
2629       ext(tmp, size1, src, src, 8);
2630       bsl(dst, size1, src, tmp);
2631       break;
2632     default:
2633       assert(false, "unsupported element type");
2634       ShouldNotReachHere();
2635   }
2636 }
2637 
2638 // Extract a scalar element from an sve vector at position 'idx'.
2639 // The input elements in src are expected to be of integral type.
2640 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2641                                              int idx, FloatRegister vtmp) {
2642   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2643   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2644   if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
2645     if (bt == T_INT || bt == T_LONG) {
2646       umov(dst, src, size, idx);
2647     } else {
2648       smov(dst, src, size, idx);
2649     }
2650   } else {
2651     sve_orr(vtmp, src, src);
2652     sve_ext(vtmp, vtmp, idx << size);
2653     if (bt == T_INT || bt == T_LONG) {
2654       umov(dst, vtmp, size, 0);
2655     } else {
2656       smov(dst, vtmp, size, 0);
2657     }
2658   }
2659 }
2660 
2661 // java.lang.Math::round intrinsics
2662 
2663 // Clobbers: rscratch1, rflags
2664 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2665                                           FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
2666   assert_different_registers(tmp1, tmp2, tmp3, src, dst);
2667   switch (T) {
2668     case T2S:
2669     case T4S:
2670       fmovs(tmp1, T, 0.5f);
2671       mov(rscratch1, jint_cast(0x1.0p23f));
2672       break;
2673     case T2D:
2674       fmovd(tmp1, T, 0.5);
2675       mov(rscratch1, julong_cast(0x1.0p52));
2676       break;
2677     default:
2678       assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2679   }
2680   fadd(tmp1, T, tmp1, src);
2681   fcvtms(tmp1, T, tmp1);
2682   // tmp1 = floor(src + 0.5, ties to even)
2683 
2684   fcvtas(dst, T, src);
2685   // dst = round(src), ties to away
2686 
2687   fneg(tmp3, T, src);
2688   dup(tmp2, T, rscratch1);
2689   cm(HS, tmp3, T, tmp3, tmp2);
2690   // tmp3 is now a set of flags
2691 
2692   bif(dst, T16B, tmp1, tmp3);
2693   // result in dst
2694 }
2695 
2696 // Clobbers: rscratch1, rflags
2697 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2698                                          FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
2699   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2700   assert_different_registers(tmp1, tmp2, src, dst);
2701 
2702   switch (T) {
2703     case S:
2704       mov(rscratch1, jint_cast(0x1.0p23f));
2705       break;
2706     case D:
2707       mov(rscratch1, julong_cast(0x1.0p52));
2708       break;
2709     default:
2710       assert(T == S || T == D, "invalid register variant");
2711   }
2712 
2713   sve_frinta(dst, T, ptrue, src);
2714   // dst = round(src), ties to away
2715 
2716   Label none;
2717 
2718   sve_fneg(tmp1, T, ptrue, src);
2719   sve_dup(tmp2, T, rscratch1);
2720   sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
2721   br(EQ, none);
2722   {
2723     sve_cpy(tmp1, T, pgtmp, 0.5);
2724     sve_fadd(tmp1, T, pgtmp, src);
2725     sve_frintm(dst, T, pgtmp, tmp1);
2726     // dst = floor(src + 0.5, ties to even)
2727   }
2728   bind(none);
2729 
2730   sve_fcvtzs(dst, T, ptrue, dst, T);
2731   // result in dst
2732 }
2733 
2734 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2735                                            FloatRegister one, SIMD_Arrangement T) {
2736   assert_different_registers(dst, src, zero, one);
2737   assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2738 
2739   facgt(dst, T, src, zero);
2740   ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
2741   bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2742 }
2743 
2744 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2745                                           FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
2746     assert_different_registers(dst, src, zero, one, vtmp);
2747     assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2748 
2749     sve_orr(vtmp, src, src);
2750     sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pmtp=0 for +-0.0 and NaN. 0x1 otherwise
2751     switch (T) {
2752     case S:
2753       sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
2754       sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
2755                                         // on the sign of the float value
2756       break;
2757     case D:
2758       sve_and(vtmp, T, min_jlong);
2759       sve_orr(vtmp, T, jlong_cast(1.0));
2760       break;
2761     default:
2762       assert(false, "unsupported");
2763       ShouldNotReachHere();
2764     }
2765     sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
2766                                        // Result in dst
2767 }
2768 
2769 bool C2_MacroAssembler::in_scratch_emit_size() {
2770   if (ciEnv::current()->task() != nullptr) {
2771     PhaseOutput* phase_output = Compile::current()->output();
2772     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2773       return true;
2774     }
2775   }
2776   return MacroAssembler::in_scratch_emit_size();
2777 }
2778 
2779 static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
2780   fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
2781 }
2782 
2783 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register rval, Register rtmp) {
2784   assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
2785   if (t == TypeInt::INT) {
2786     return;
2787   }
2788   BLOCK_COMMENT("verify_int_in_range {");
2789   Label L_success, L_failure;
2790 
2791   jint lo = t->_lo;
2792   jint hi = t->_hi;
2793 
2794   if (lo != min_jint && hi != max_jint) {
2795     subsw(rtmp, rval, lo);
2796     br(Assembler::LT, L_failure);
2797     subsw(rtmp, rval, hi);
2798     br(Assembler::LE, L_success);
2799   } else if (lo != min_jint) {
2800     subsw(rtmp, rval, lo);
2801     br(Assembler::GE, L_success);
2802   } else if (hi != max_jint) {
2803     subsw(rtmp, rval, hi);
2804     br(Assembler::LE, L_success);
2805   } else {
2806     ShouldNotReachHere();
2807   }
2808 
2809   bind(L_failure);
2810   movw(c_rarg0, idx);
2811   mov(c_rarg1, rval);
2812   movw(c_rarg2, lo);
2813   movw(c_rarg3, hi);
2814   reconstruct_frame_pointer(rtmp);
2815   rt_call(CAST_FROM_FN_PTR(address, abort_verify_int_in_range), rtmp);
2816   hlt(0);
2817 
2818   bind(L_success);
2819   BLOCK_COMMENT("} verify_int_in_range");
2820 }
2821 
2822 static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
2823   fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
2824 }
2825 
2826 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register rval, Register rtmp) {
2827   assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
2828   if (t == TypeLong::LONG) {
2829     return;
2830   }
2831   BLOCK_COMMENT("verify_long_in_range {");
2832   Label L_success, L_failure;
2833 
2834   jlong lo = t->_lo;
2835   jlong hi = t->_hi;
2836 
2837   if (lo != min_jlong && hi != max_jlong) {
2838     subs(rtmp, rval, lo);
2839     br(Assembler::LT, L_failure);
2840     subs(rtmp, rval, hi);
2841     br(Assembler::LE, L_success);
2842   } else if (lo != min_jlong) {
2843     subs(rtmp, rval, lo);
2844     br(Assembler::GE, L_success);
2845   } else if (hi != max_jlong) {
2846     subs(rtmp, rval, hi);
2847     br(Assembler::LE, L_success);
2848   } else {
2849     ShouldNotReachHere();
2850   }
2851 
2852   bind(L_failure);
2853   movw(c_rarg0, idx);
2854   mov(c_rarg1, rval);
2855   mov(c_rarg2, lo);
2856   mov(c_rarg3, hi);
2857   reconstruct_frame_pointer(rtmp);
2858   rt_call(CAST_FROM_FN_PTR(address, abort_verify_long_in_range), rtmp);
2859   hlt(0);
2860 
2861   bind(L_success);
2862   BLOCK_COMMENT("} verify_long_in_range");
2863 }
2864 
2865 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
2866   const int framesize = Compile::current()->output()->frame_size_in_bytes();
2867   if (PreserveFramePointer) {
2868     // frame pointer is valid
2869 #ifdef ASSERT
2870     // Verify frame pointer value in rfp.
2871     add(rtmp, sp, framesize - 2 * wordSize);
2872     Label L_success;
2873     cmp(rfp, rtmp);
2874     br(Assembler::EQ, L_success);
2875     stop("frame pointer mismatch");
2876     bind(L_success);
2877 #endif // ASSERT
2878   } else {
2879     add(rfp, sp, framesize - 2 * wordSize);
2880   }
2881 }