1 /*
   2  * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "asm/assembler.hpp"
  26 #include "asm/assembler.inline.hpp"
  27 #include "opto/c2_MacroAssembler.hpp"
  28 #include "opto/compile.hpp"
  29 #include "opto/intrinsicnode.hpp"
  30 #include "opto/matcher.hpp"
  31 #include "opto/output.hpp"
  32 #include "opto/subnode.hpp"
  33 #include "runtime/stubRoutines.hpp"
  34 #include "utilities/globalDefinitions.hpp"
  35 #include "utilities/powerOfTwo.hpp"
  36 
  37 #ifdef PRODUCT
  38 #define BLOCK_COMMENT(str) /* nothing */
  39 #define STOP(error) stop(error)
  40 #else
  41 #define BLOCK_COMMENT(str) block_comment(str)
  42 #define STOP(error) block_comment(error); stop(error)
  43 #endif
  44 
  45 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  46 
  47 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
  48 
  49 void C2_MacroAssembler::entry_barrier() {
  50   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  51   // Dummy labels for just measuring the code size
  52   Label dummy_slow_path;
  53   Label dummy_continuation;
  54   Label dummy_guard;
  55   Label* slow_path = &dummy_slow_path;
  56   Label* continuation = &dummy_continuation;
  57   Label* guard = &dummy_guard;
  58   if (!Compile::current()->output()->in_scratch_emit_size()) {
  59     // Use real labels from actual stub when not emitting code for the purpose of measuring its size
  60     C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
  61     Compile::current()->output()->add_stub(stub);
  62     slow_path = &stub->entry();
  63     continuation = &stub->continuation();
  64     guard = &stub->guard();
  65   }
  66   // In the C2 code, we move the non-hot part of nmethod entry barriers out-of-line to a stub.
  67   bs->nmethod_entry_barrier(this, slow_path, continuation, guard);
  68 }
  69 
  70 // jdk.internal.util.ArraysSupport.vectorizedHashCode
  71 address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
  72                                            FloatRegister vdata0, FloatRegister vdata1,
  73                                            FloatRegister vdata2, FloatRegister vdata3,
  74                                            FloatRegister vmul0, FloatRegister vmul1,
  75                                            FloatRegister vmul2, FloatRegister vmul3,
  76                                            FloatRegister vpow, FloatRegister vpowm,
  77                                            BasicType eltype) {
  78   ARRAYS_HASHCODE_REGISTERS;
  79 
  80   Register tmp1 = rscratch1, tmp2 = rscratch2;
  81 
  82   Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE;
  83 
  // Vectorization factor. Number of array elements loaded into one SIMD&FP register by the stubs.
  // We use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible
  // to use 4H for chars and shorts instead, but using 8H gives better performance.
  87   const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8
  88                     : eltype == T_CHAR || eltype == T_SHORT ? 8
  89                     : eltype == T_INT                       ? 4
  90                                                             : 0;
  91   guarantee(vf, "unsupported eltype");
  92 
  93   // Unroll factor for the scalar loop below. The value is chosen based on performance analysis.
  94   const size_t unroll_factor = 4;
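
  // What this method computes, in scalar form (a sketch; 0x1f == 31, the
  // hash multiplier, is loaded into tmp2 below):
  //
  //   int hash = result;
  //   for (int i = 0; i < cnt; ++i) {
  //     hash = 31 * hash + ary[i];
  //   }
  //   return hash;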
  95 
  96   switch (eltype) {
  97   case T_BOOLEAN:
  98     BLOCK_COMMENT("arrays_hashcode(unsigned byte) {");
  99     break;
 100   case T_CHAR:
 101     BLOCK_COMMENT("arrays_hashcode(char) {");
 102     break;
 103   case T_BYTE:
 104     BLOCK_COMMENT("arrays_hashcode(byte) {");
 105     break;
 106   case T_SHORT:
 107     BLOCK_COMMENT("arrays_hashcode(short) {");
 108     break;
 109   case T_INT:
 110     BLOCK_COMMENT("arrays_hashcode(int) {");
 111     break;
 112   default:
 113     ShouldNotReachHere();
 114   }
 115 
 116   // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop
 117   // implemented by the stub executes just once. Call the stub only if at least two iterations will
 118   // be executed.
 119   const size_t large_threshold = eltype == T_INT ? vf * 2 : vf;
 120   cmpw(cnt, large_threshold);
 121   br(Assembler::HS, LARGE);
 122 
 123   bind(TAIL);
 124 
  // The andr computes cnt % uf where uf = unroll_factor. The subtract shifted by 3 offsets the
  // jump target past uf - (cnt % uf) pairs of load + madd insns, i.e. only cnt % uf load + madd
  // pairs are executed. Iteration then eats up the remainder, uf elements at a time.
 128   assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC");
 129   andr(tmp2, cnt, unroll_factor - 1);
 130   adr(tmp1, BR_BASE);
 131   sub(tmp1, tmp1, tmp2, ext::sxtw, 3);
 132   movw(tmp2, 0x1f);
 133   br(tmp1);
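  // Example: with unroll_factor == 4 and cnt % 4 == 3, the br above lands
  // 3 * 8 bytes before BR_BASE, so only the last 3 of the 4 unrolled
  // load + madd pairs below execute (each pair is two 4-byte insns).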
 134 
 135   bind(LOOP);
 136   for (size_t i = 0; i < unroll_factor; ++i) {
 137     load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype);
 138     maddw(result, result, tmp2, tmp1);
 139   }
 140   bind(BR_BASE);
 141   subsw(cnt, cnt, unroll_factor);
 142   br(Assembler::HS, LOOP);
 143 
 144   b(DONE);
 145 
 146   bind(LARGE);
 147 
 148   RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype));
 149   assert(stub.target() != nullptr, "array_hashcode stub has not been generated");
 150   address tpc = trampoline_call(stub);
 151   if (tpc == nullptr) {
 152     DEBUG_ONLY(reset_labels(TAIL, BR_BASE));
 153     postcond(pc() == badAddress);
 154     return nullptr;
 155   }
 156 
 157   bind(DONE);
 158 
 159   BLOCK_COMMENT("} // arrays_hashcode");
 160 
 161   postcond(pc() != badAddress);
 162   return pc();
 163 }
 164 
 165 void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg,
 166                                   Register tmp2Reg, Register tmp3Reg) {
 167   Register oop = objectReg;
 168   Register box = boxReg;
 169   Register disp_hdr = tmpReg;
 170   Register tmp = tmp2Reg;
 171   Label cont;
 172   Label object_has_monitor;
 173   Label count, no_count;
 174 
 175   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
 176   assert_different_registers(oop, box, tmp, disp_hdr, rscratch2);
 177 
 178   // Load markWord from object into displaced_header.
 179   ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));
 180 
 181   if (DiagnoseSyncOnValueBasedClasses != 0) {
 182     load_klass(tmp, oop);
 183     ldrb(tmp, Address(tmp, Klass::misc_flags_offset()));
 184     tst(tmp, KlassFlags::_misc_is_value_based_class);
 185     br(Assembler::NE, cont);
 186   }
 187 
 188   // Check for existing monitor
 189   tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor);
 190 
 191   if (LockingMode == LM_MONITOR) {
 192     tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
 193     b(cont);
 194   } else {
 195     assert(LockingMode == LM_LEGACY, "must be");
 196     // Set tmp to be (markWord of object | UNLOCK_VALUE).
 197     orr(tmp, disp_hdr, markWord::unlocked_value);
 198 
 199     if (EnableValhalla) {
 200       // Mask inline_type bit such that we go to the slow path if object is an inline type
 201       andr(tmp, tmp, ~((int) markWord::inline_type_bit_in_place));
 202     }
 203 
 204     // Initialize the box. (Must happen before we update the object mark!)
 205     str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 206 
 207     // Compare object markWord with an unlocked value (tmp) and if
 208     // equal exchange the stack address of our box with object markWord.
 209     // On failure disp_hdr contains the possibly locked markWord.
 210     cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true,
 211             /*release*/ true, /*weak*/ false, disp_hdr);
 212     br(Assembler::EQ, cont);
 213 
 214     assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
 215 
    // If the compare-and-exchange succeeded, then we found an unlocked
    // object, have now locked it, and will continue at label cont.
 218 
 219     // Check if the owner is self by comparing the value in the
 220     // markWord of object (disp_hdr) with the stack pointer.
 221     mov(rscratch1, sp);
 222     sub(disp_hdr, disp_hdr, rscratch1);
 223     mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place));
    // If the condition is true we take the cont path, and hence we can store 0 as the
    // displaced header in the box, which indicates that it is a recursive lock.
 226     ands(tmp/*==0?*/, disp_hdr, tmp);   // Sets flags for result
 227     str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 228     b(cont);
 229   }
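  // The recursion test above, as pseudocode (a sketch; lock_mask covers the
  // two low mark bits):
  //   uintptr_t diff = displaced_mark - sp;
  //   if ((diff & (~(page_size - 1) | lock_mask)) == 0) {
  //     // the mark is a stack address within a page of our sp, i.e. our
  //     // own BasicLock => recursive lock; store 0 as the displaced header
  //   }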
 230 
 231   // Handle existing monitor.
 232   bind(object_has_monitor);
 233 
 234   // Try to CAS owner (no owner => current thread's _monitor_owner_id).
 235   ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
 236   add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value));
 237   cmpxchg(tmp, zr, rscratch2, Assembler::xword, /*acquire*/ true,
 238           /*release*/ true, /*weak*/ false, tmp3Reg); // Sets flags for result
 239 
 240   // Store a non-null value into the box to avoid looking like a re-entrant
 241   // lock. The fast-path monitor unlock code checks for
 242   // markWord::monitor_value so use markWord::unused_mark which has the
 243   // relevant bit set, and also matches ObjectSynchronizer::enter.
 244   mov(tmp, (address)markWord::unused_mark().value());
 245   str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 246 
 247   br(Assembler::EQ, cont); // CAS success means locking succeeded
 248 
 249   cmp(tmp3Reg, rscratch2);
 250   br(Assembler::NE, cont); // Check for recursive locking
 251 
 252   // Recursive lock case
 253   increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1);
 254   // flag == EQ still from the cmp above, checking if this is a reentrant lock
 255 
 256   bind(cont);
 257   // flag == EQ indicates success
 258   // flag == NE indicates failure
 259   br(Assembler::NE, no_count);
 260 
 261   bind(count);
 262   if (LockingMode == LM_LEGACY) {
 263     inc_held_monitor_count(rscratch1);
 264   }
 265 
 266   bind(no_count);
 267 }
 268 
 269 void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg,
 270                                     Register tmp2Reg) {
 271   Register oop = objectReg;
 272   Register box = boxReg;
 273   Register disp_hdr = tmpReg;
 274   Register owner_addr = tmpReg;
 275   Register tmp = tmp2Reg;
 276   Label cont;
 277   Label object_has_monitor;
 278   Label count, no_count;
 279   Label unlocked;
 280 
 281   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 282   assert_different_registers(oop, box, tmp, disp_hdr);
 283 
 284   if (LockingMode == LM_LEGACY) {
 285     // Find the lock address and load the displaced header from the stack.
 286     ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 287 
 288     // If the displaced header is 0, we have a recursive unlock.
 289     cmp(disp_hdr, zr);
 290     br(Assembler::EQ, cont);
 291   }
 292 
 293   // Handle existing monitor.
 294   ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
 295   tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor);
 296 
 297   if (LockingMode == LM_MONITOR) {
 298     tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
 299     b(cont);
 300   } else {
 301     assert(LockingMode == LM_LEGACY, "must be");
    // Check if it is still a lightweight lock, this is true if we
    // see the stack address of the basicLock in the markWord of the
    // object.
 305 
 306     cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false,
 307             /*release*/ true, /*weak*/ false, tmp);
 308     b(cont);
 309   }
 310 
 311   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
 312 
 313   // Handle existing monitor.
 314   bind(object_has_monitor);
 315   STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
 316   add(tmp, tmp, -(int)markWord::monitor_value); // monitor
 317 
 318   ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
 319 
 320   Label notRecursive;
 321   cbz(disp_hdr, notRecursive);
 322 
 323   // Recursive lock
 324   sub(disp_hdr, disp_hdr, 1u);
 325   str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
 326   cmp(disp_hdr, disp_hdr); // Sets flags for result
 327   b(cont);
 328 
 329   bind(notRecursive);
 330 
 331   // Compute owner address.
 332   lea(owner_addr, Address(tmp, ObjectMonitor::owner_offset()));
 333 
 334   // Set owner to null.
 335   // Release to satisfy the JMM
 336   stlr(zr, owner_addr);
 337   // We need a full fence after clearing owner to avoid stranding.
 338   // StoreLoad achieves this.
 339   membar(StoreLoad);
 340 
 341   // Check if the entry_list is empty.
 342   ldr(rscratch1, Address(tmp, ObjectMonitor::entry_list_offset()));
 343   cmp(rscratch1, zr);
 344   br(Assembler::EQ, cont);     // If so we are done.
 345 
 346   // Check if there is a successor.
 347   ldr(rscratch1, Address(tmp, ObjectMonitor::succ_offset()));
 348   cmp(rscratch1, zr);
 349   br(Assembler::NE, unlocked); // If so we are done.
 350 
 351   // Save the monitor pointer in the current thread, so we can try to
 352   // reacquire the lock in SharedRuntime::monitor_exit_helper().
 353   str(tmp, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));
 354 
 355   cmp(zr, rthread); // Set Flag to NE => slow path
 356   b(cont);
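
  // The exit protocol above, as pseudocode (a sketch):
  //   owner = nullptr;                    // release store
  //   full_fence();                       // StoreLoad, prevents stranding
  //   if (entry_list == nullptr) done;    // nobody is waiting
  //   if (succ != nullptr) done;          // a successor will retry
  //   // otherwise publish the monitor and take the slow path, which may
  //   // reacquire the lock and wake a waiter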
 357 
 358   bind(unlocked);
 359   cmp(zr, zr); // Set Flag to EQ => fast path
 360 
 361   // Intentional fall-through
 362 
 363   bind(cont);
 364   // flag == EQ indicates success
 365   // flag == NE indicates failure
 366   br(Assembler::NE, no_count);
 367 
 368   bind(count);
 369   if (LockingMode == LM_LEGACY) {
 370     dec_held_monitor_count(rscratch1);
 371   }
 372 
 373   bind(no_count);
 374 }
 375 
 376 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register t1,
 377                                               Register t2, Register t3) {
 378   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 379   assert_different_registers(obj, box, t1, t2, t3, rscratch2);
 380 
 381   // Handle inflated monitor.
 382   Label inflated;
  // Finish fast lock successfully. MUST be reached with flag == EQ
  Label locked;
  // Finish fast lock unsuccessfully. MUST be reached with flag == NE
  Label slow_path;
 387 
 388   if (UseObjectMonitorTable) {
 389     // Clear cache in case fast locking succeeds.
 390     str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 391   }
 392 
 393   if (DiagnoseSyncOnValueBasedClasses != 0) {
 394     load_klass(t1, obj);
 395     ldrb(t1, Address(t1, Klass::misc_flags_offset()));
 396     tst(t1, KlassFlags::_misc_is_value_based_class);
 397     br(Assembler::NE, slow_path);
 398   }
 399 
 400   const Register t1_mark = t1;
 401   const Register t3_t = t3;
 402 
 403   { // Lightweight locking
 404 
    // Push lock to the lock stack and finish successfully. MUST be reached with flag == EQ
 406     Label push;
 407 
 408     const Register t2_top = t2;
 409 
 410     // Check if lock-stack is full.
 411     ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 412     cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
 413     br(Assembler::GT, slow_path);
 414 
 415     // Check if recursive.
 416     subw(t3_t, t2_top, oopSize);
 417     ldr(t3_t, Address(rthread, t3_t));
 418     cmp(obj, t3_t);
 419     br(Assembler::EQ, push);
 420 
 421     // Relaxed normal load to check for monitor. Optimization for monitor case.
 422     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 423     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 424 
 425     // Not inflated
 426     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");
 427 
 428     // Try to lock. Transition lock-bits 0b01 => 0b00
 429     orr(t1_mark, t1_mark, markWord::unlocked_value);
 430     eor(t3_t, t1_mark, markWord::unlocked_value);
 431     cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
 432             /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
 433     br(Assembler::NE, slow_path);
 434 
 435     bind(push);
 436     // After successful lock, push object on lock-stack.
 437     str(obj, Address(rthread, t2_top));
 438     addw(t2_top, t2_top, oopSize);
 439     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 440     b(locked);
 441   }
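
  // The lightweight-lock fast path above, as pseudocode (a sketch):
  //   if (top == end)           goto slow;      // lock-stack full
  //   if (stack[top-1] == obj)  goto push;      // recursive case
  //   if (mark & monitor_bit)   goto inflated;
  //   if (!CAS(&obj->mark, mark | 1, mark & ~1)) goto slow;  // 0b01 -> 0b00
  //   push: stack[top++] = obj;                 // flags == EQ on success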
 442 
 443   { // Handle inflated monitor.
 444     bind(inflated);
 445 
 446     const Register t1_monitor = t1;
 447 
 448     if (!UseObjectMonitorTable) {
 449       assert(t1_monitor == t1_mark, "should be the same here");
 450     } else {
 451       Label monitor_found;
 452 
 453       // Load cache address
 454       lea(t3_t, Address(rthread, JavaThread::om_cache_oops_offset()));
 455 
 456       const int num_unrolled = 2;
 457       for (int i = 0; i < num_unrolled; i++) {
 458         ldr(t1, Address(t3_t));
 459         cmp(obj, t1);
 460         br(Assembler::EQ, monitor_found);
 461         increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
 462       }
 463 
 464       Label loop;
 465 
 466       // Search for obj in cache.
 467       bind(loop);
 468 
 469       // Check for match.
 470       ldr(t1, Address(t3_t));
 471       cmp(obj, t1);
 472       br(Assembler::EQ, monitor_found);
 473 
 474       // Search until null encountered, guaranteed _null_sentinel at end.
 475       increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
 476       cbnz(t1, loop);
 477       // Cache Miss, NE set from cmp above, cbnz does not set flags
 478       b(slow_path);
 479 
 480       bind(monitor_found);
 481       ldr(t1_monitor, Address(t3_t, OMCache::oop_to_monitor_difference()));
 482     }
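
    // The cache probe above, as pseudocode (a sketch; the first
    // num_unrolled entries are checked without a loop):
    //   for (p = cache; ; p++) {
    //     if (p->oop == obj)     goto monitor_found;  // hit
    //     if (p->oop == nullptr) goto slow_path;      // sentinel => miss
    //   }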
 483 
 484     const Register t2_owner_addr = t2;
 485     const Register t3_owner = t3;
 486     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 487     const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
 488     const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);
 489 
 490     Label monitor_locked;
 491 
 492     // Compute owner address.
 493     lea(t2_owner_addr, owner_address);
 494 
 495     // Try to CAS owner (no owner => current thread's _monitor_owner_id).
 496     ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
 497     cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, /*acquire*/ true,
 498             /*release*/ false, /*weak*/ false, t3_owner);
 499     br(Assembler::EQ, monitor_locked);
 500 
 501     // Check if recursive.
 502     cmp(t3_owner, rscratch2);
 503     br(Assembler::NE, slow_path);
 504 
 505     // Recursive.
 506     increment(recursions_address, 1);
 507 
 508     bind(monitor_locked);
 509     if (UseObjectMonitorTable) {
 510       str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 511     }
 512   }
 513 
 514   bind(locked);
 515 
 516 #ifdef ASSERT
 517   // Check that locked label is reached with Flags == EQ.
 518   Label flag_correct;
 519   br(Assembler::EQ, flag_correct);
 520   stop("Fast Lock Flag != EQ");
 521 #endif
 522 
 523   bind(slow_path);
 524 #ifdef ASSERT
 525   // Check that slow_path label is reached with Flags == NE.
 526   br(Assembler::NE, flag_correct);
 527   stop("Fast Lock Flag != NE");
 528   bind(flag_correct);
 529 #endif
 530   // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
 531 }
 532 
 533 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register box, Register t1,
 534                                                 Register t2, Register t3) {
 535   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 536   assert_different_registers(obj, box, t1, t2, t3);
 537 
 538   // Handle inflated monitor.
 539   Label inflated, inflated_load_mark;
  // Finish fast unlock successfully. MUST be reached with flag == EQ
  Label unlocked;
  // Finish fast unlock unsuccessfully. MUST be reached with flag == NE
  Label slow_path;
 544 
 545   const Register t1_mark = t1;
 546   const Register t2_top = t2;
 547   const Register t3_t = t3;
 548 
 549   { // Lightweight unlock
 550 
 551     Label push_and_slow_path;
 552 
 553     // Check if obj is top of lock-stack.
 554     ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 555     subw(t2_top, t2_top, oopSize);
 556     ldr(t3_t, Address(rthread, t2_top));
 557     cmp(obj, t3_t);
 558     // Top of lock stack was not obj. Must be monitor.
 559     br(Assembler::NE, inflated_load_mark);
 560 
 561     // Pop lock-stack.
 562     DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
 563     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 564 
 565     // Check if recursive.
 566     subw(t3_t, t2_top, oopSize);
 567     ldr(t3_t, Address(rthread, t3_t));
 568     cmp(obj, t3_t);
 569     br(Assembler::EQ, unlocked);
 570 
 571     // Not recursive.
 572     // Load Mark.
 573     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 574 
 575     // Check header for monitor (0b10).
 576     // Because we got here by popping (meaning we pushed in locked)
 577     // there will be no monitor in the box. So we need to push back the obj
 578     // so that the runtime can fix any potential anonymous owner.
 579     tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);
 580 
 581     // Try to unlock. Transition lock bits 0b00 => 0b01
 582     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
 583     orr(t3_t, t1_mark, markWord::unlocked_value);
 584     cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
 585             /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
 586     br(Assembler::EQ, unlocked);
 587 
 588     bind(push_and_slow_path);
 589     // Compare and exchange failed.
 590     // Restore lock-stack and handle the unlock in runtime.
 591     DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
 592     addw(t2_top, t2_top, oopSize);
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 594     b(slow_path);
 595   }
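  // The lightweight-unlock fast path above, as pseudocode (a sketch):
  //   if (stack[top-1] != obj)  goto inflated;   // obj is not on top
  //   top--;                                     // pop
  //   if (stack[top-1] == obj)  goto unlocked;   // recursive: pop suffices
  //   if (mark & monitor_bit)   goto inflated;   // or repush + slow path
  //   if (CAS(&obj->mark, mark, mark | 1)) goto unlocked;  // 0b00 -> 0b01
  //   top++; goto slow;                          // restore stack, go slow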
 596 
 597 
 598   { // Handle inflated monitor.
 599     bind(inflated_load_mark);
 600     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 601 #ifdef ASSERT
 602     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 603     stop("Fast Unlock not monitor");
 604 #endif
 605 
 606     bind(inflated);
 607 
 608 #ifdef ASSERT
 609     Label check_done;
 610     subw(t2_top, t2_top, oopSize);
 611     cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
 612     br(Assembler::LT, check_done);
 613     ldr(t3_t, Address(rthread, t2_top));
 614     cmp(obj, t3_t);
 615     br(Assembler::NE, inflated);
 616     stop("Fast Unlock lock on stack");
 617     bind(check_done);
 618 #endif
 619 
 620     const Register t1_monitor = t1;
 621 
 622     if (!UseObjectMonitorTable) {
 623       assert(t1_monitor == t1_mark, "should be the same here");
 624 
 625       // Untag the monitor.
 626       add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
 627     } else {
 628       ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 629       // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
 630       cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
 631       br(Assembler::LO, slow_path);
 632     }
 633 
 634     const Register t2_recursions = t2;
 635     Label not_recursive;
 636 
 637     // Check if recursive.
 638     ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
 639     cbz(t2_recursions, not_recursive);
 640 
 641     // Recursive unlock.
 642     sub(t2_recursions, t2_recursions, 1u);
 643     str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
 644     // Set flag == EQ
 645     cmp(t2_recursions, t2_recursions);
 646     b(unlocked);
 647 
 648     bind(not_recursive);
 649 
 650     const Register t2_owner_addr = t2;
 651 
 652     // Compute owner address.
 653     lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));
 654 
 655     // Set owner to null.
 656     // Release to satisfy the JMM
 657     stlr(zr, t2_owner_addr);
 658     // We need a full fence after clearing owner to avoid stranding.
 659     // StoreLoad achieves this.
 660     membar(StoreLoad);
 661 
 662     // Check if the entry_list is empty.
 663     ldr(rscratch1, Address(t1_monitor, ObjectMonitor::entry_list_offset()));
 664     cmp(rscratch1, zr);
 665     br(Assembler::EQ, unlocked);  // If so we are done.
 666 
 667     // Check if there is a successor.
 668     ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
 669     cmp(rscratch1, zr);
 670     br(Assembler::NE, unlocked);  // If so we are done.
 671 
 672     // Save the monitor pointer in the current thread, so we can try to
 673     // reacquire the lock in SharedRuntime::monitor_exit_helper().
 674     str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));
 675 
 676     cmp(zr, rthread); // Set Flag to NE => slow path
 677     b(slow_path);
 678   }
 679 
 680   bind(unlocked);
 681   cmp(zr, zr); // Set Flags to EQ => fast path
 682 
 683 #ifdef ASSERT
 684   // Check that unlocked label is reached with Flags == EQ.
 685   Label flag_correct;
 686   br(Assembler::EQ, flag_correct);
 687   stop("Fast Unlock Flag != EQ");
 688 #endif
 689 
 690   bind(slow_path);
 691 #ifdef ASSERT
 692   // Check that slow_path label is reached with Flags == NE.
 693   br(Assembler::NE, flag_correct);
 694   stop("Fast Unlock Flag != NE");
 695   bind(flag_correct);
 696 #endif
 697   // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
 698 }
 699 
 700 // Search for str1 in str2 and return index or -1
 701 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
 702 void C2_MacroAssembler::string_indexof(Register str2, Register str1,
 703                                        Register cnt2, Register cnt1,
 704                                        Register tmp1, Register tmp2,
 705                                        Register tmp3, Register tmp4,
 706                                        Register tmp5, Register tmp6,
 707                                        int icnt1, Register result, int ae) {
 708   // NOTE: tmp5, tmp6 can be zr depending on specific method version
 709   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
 710 
 711   Register ch1 = rscratch1;
 712   Register ch2 = rscratch2;
 713   Register cnt1tmp = tmp1;
 714   Register cnt2tmp = tmp2;
 715   Register cnt1_neg = cnt1;
 716   Register cnt2_neg = cnt2;
 717   Register result_tmp = tmp4;
 718 
 719   bool isL = ae == StrIntrinsicNode::LL;
 720 
 721   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
 722   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
 723   int str1_chr_shift = str1_isL ? 0:1;
 724   int str2_chr_shift = str2_isL ? 0:1;
 725   int str1_chr_size = str1_isL ? 1:2;
 726   int str2_chr_size = str2_isL ? 1:2;
 727   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
 728                                       (chr_insn)&MacroAssembler::ldrh;
 729   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
 730                                       (chr_insn)&MacroAssembler::ldrh;
 731   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
 732   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
 733 
 734   // Note, inline_string_indexOf() generates checks:
 735   // if (substr.count > string.count) return -1;
 736   // if (substr.count == 0) return 0;
 737 
 738   // We have two strings, a source string in str2, cnt2 and a pattern string
 739   // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
 740 
 741   // For larger pattern and source we use a simplified Boyer Moore algorithm.
 742   // With a small pattern and source we use linear scan.
 743 
 744   if (icnt1 == -1) {
 745     sub(result_tmp, cnt2, cnt1);
 746     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
 747     br(LT, LINEARSEARCH);
 748     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
 749     subs(zr, cnt1, 256);
 750     lsr(tmp1, cnt2, 2);
 751     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
 752     br(GE, LINEARSTUB);
 753   }
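
  // Dispatch (a sketch): cnt1 < 8 goes to LINEARSEARCH; cnt1 >= 256 or
  // cnt2 < 4 * cnt1 goes to LINEARSTUB; otherwise we fall through to the
  // Boyer-Moore(-Horspool) code below.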
 754 
// The Boyer Moore algorithm is based on the description here:-
 756 //
 757 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
 758 //
// This describes an algorithm with 2 shift rules. The 'Bad Character' rule
 760 // and the 'Good Suffix' rule.
 761 //
 762 // These rules are essentially heuristics for how far we can shift the
 763 // pattern along the search string.
 764 //
 765 // The implementation here uses the 'Bad Character' rule only because of the
 766 // complexity of initialisation for the 'Good Suffix' rule.
 767 //
 768 // This is also known as the Boyer-Moore-Horspool algorithm:-
 769 //
 770 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
 771 //
// This particular implementation has a few java-specific optimizations.
 773 //
 774 // #define ASIZE 256
 775 //
 776 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
 777 //       int i, j;
 778 //       unsigned c;
 779 //       unsigned char bc[ASIZE];
 780 //
 781 //       /* Preprocessing */
 782 //       for (i = 0; i < ASIZE; ++i)
 783 //          bc[i] = m;
 784 //       for (i = 0; i < m - 1; ) {
 785 //          c = x[i];
 786 //          ++i;
 787 //          // c < 256 for Latin1 string, so, no need for branch
 788 //          #ifdef PATTERN_STRING_IS_LATIN1
 789 //          bc[c] = m - i;
 790 //          #else
 791 //          if (c < ASIZE) bc[c] = m - i;
 792 //          #endif
 793 //       }
 794 //
 795 //       /* Searching */
 796 //       j = 0;
 797 //       while (j <= n - m) {
//          c = y[j+m-1];
//          i = m - 1;
//          if (x[m-1] == c)
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//          if (i < 0) return j;
 802 //          // c < 256 for Latin1 string, so, no need for branch
 803 //          #ifdef SOURCE_STRING_IS_LATIN1
//          // LL case: (c < 256) always true. Remove branch
 805 //          j += bc[y[j+m-1]];
 806 //          #endif
 807 //          #ifndef PATTERN_STRING_IS_UTF
 808 //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
 809 //          if (c < ASIZE)
 810 //            j += bc[y[j+m-1]];
 811 //          else
//            j += 1;
 813 //          #endif
 814 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
 815 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
 816 //          if (c < ASIZE)
 817 //            j += bc[y[j+m-1]];
 818 //          else
//            j += m;
 820 //          #endif
 821 //       }
 822 //    }
 823 
 824   if (icnt1 == -1) {
 825     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
 826         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
 827     Register cnt1end = tmp2;
 828     Register str2end = cnt2;
 829     Register skipch = tmp2;
 830 
    // str1 length is >= 8, so we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed (8 LL or 4 UU) and half a register
    // for the UL case. We'll re-read the last character in the inner pre-loop
    // code to have a single outer pre-loop load.
 835     const int firstStep = isL ? 7 : 3;
 836 
 837     const int ASIZE = 256;
    const int STORED_BYTES = 32; // number of bytes stored per instruction
 839     sub(sp, sp, ASIZE);
 840     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
 841     mov(ch1, sp);
 842     BIND(BM_INIT_LOOP);
 843       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
 844       subs(tmp5, tmp5, 1);
 845       br(GT, BM_INIT_LOOP);
 846 
 847       sub(cnt1tmp, cnt1, 1);
 848       mov(tmp5, str2);
 849       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
 850       sub(ch2, cnt1, 1);
 851       mov(tmp3, str1);
 852     BIND(BCLOOP);
 853       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
 854       if (!str1_isL) {
 855         subs(zr, ch1, ASIZE);
 856         br(HS, BCSKIP);
 857       }
 858       strb(ch2, Address(sp, ch1));
 859     BIND(BCSKIP);
 860       subs(ch2, ch2, 1);
 861       br(GT, BCLOOP);
 862 
 863       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
 864       if (str1_isL == str2_isL) {
 865         // load last 8 bytes (8LL/4UU symbols)
 866         ldr(tmp6, Address(tmp6, -wordSize));
 867       } else {
 868         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
        // convert Latin1 to UTF. We'll have to wait until the load completes,
        // but it's still faster than per-character loads+checks
 871         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
 872         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
 873         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
 874         andr(tmp6, tmp6, 0xFF); // str1[N-4]
 875         orr(ch2, ch1, ch2, LSL, 16);
 876         orr(tmp6, tmp6, tmp3, LSL, 48);
 877         orr(tmp6, tmp6, ch2, LSL, 16);
 878       }
 879     BIND(BMLOOPSTR2);
 880       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 881       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
 882       if (str1_isL == str2_isL) {
        // re-init tmp3. It's for free because it's executed in parallel with the
        // load above. The alternative is to initialize it before the loop, but that
        // would affect performance on in-order systems with 2 or more ld/st pipelines
 886         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
 887       }
 888       if (!isL) { // UU/UL case
 889         lsl(ch2, cnt1tmp, 1); // offset in bytes
 890       }
 891       cmp(tmp3, skipch);
 892       br(NE, BMSKIP);
 893       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
 894       mov(ch1, tmp6);
 895       if (isL) {
 896         b(BMLOOPSTR1_AFTER_LOAD);
 897       } else {
 898         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
 899         b(BMLOOPSTR1_CMP);
 900       }
 901     BIND(BMLOOPSTR1);
 902       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
 903       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 904     BIND(BMLOOPSTR1_AFTER_LOAD);
 905       subs(cnt1tmp, cnt1tmp, 1);
 906       br(LT, BMLOOPSTR1_LASTCMP);
 907     BIND(BMLOOPSTR1_CMP);
 908       cmp(ch1, ch2);
 909       br(EQ, BMLOOPSTR1);
 910     BIND(BMSKIP);
 911       if (!isL) {
        // if we've met a UTF symbol while searching the Latin1 pattern, then we
        // can skip cnt1 symbols
 914         if (str1_isL != str2_isL) {
 915           mov(result_tmp, cnt1);
 916         } else {
 917           mov(result_tmp, 1);
 918         }
 919         subs(zr, skipch, ASIZE);
 920         br(HS, BMADV);
 921       }
 922       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
 923     BIND(BMADV);
 924       sub(cnt1tmp, cnt1, 1);
 925       add(str2, str2, result_tmp, LSL, str2_chr_shift);
 926       cmp(str2, str2end);
 927       br(LE, BMLOOPSTR2);
 928       add(sp, sp, ASIZE);
 929       b(NOMATCH);
 930     BIND(BMLOOPSTR1_LASTCMP);
 931       cmp(ch1, ch2);
 932       br(NE, BMSKIP);
 933     BIND(BMMATCH);
 934       sub(result, str2, tmp5);
 935       if (!str2_isL) lsr(result, result, 1);
 936       add(sp, sp, ASIZE);
 937       b(DONE);
 938 
 939     BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
 941     br(LT, LINEAR_MEDIUM);
 942     mov(result, zr);
 943     RuntimeAddress stub = nullptr;
 944     if (isL) {
 945       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
 946       assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
    } else if (str1_isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
 950     } else {
 951       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
 952       assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
 953     }
 954     address call = trampoline_call(stub);
 955     if (call == nullptr) {
 956       DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
 957       ciEnv::current()->record_failure("CodeCache is full");
 958       return;
 959     }
 960     b(DONE);
 961   }
 962 
 963   BIND(LINEARSEARCH);
 964   {
 965     Label DO1, DO2, DO3;
 966 
 967     Register str2tmp = tmp2;
 968     Register first = tmp3;
 969 
 970     if (icnt1 == -1)
 971     {
 972         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
 973 
 974         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
 975         br(LT, DOSHORT);
 976       BIND(LINEAR_MEDIUM);
 977         (this->*str1_load_1chr)(first, Address(str1));
 978         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
 979         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
 980         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 981         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 982 
 983       BIND(FIRST_LOOP);
 984         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 985         cmp(first, ch2);
 986         br(EQ, STR1_LOOP);
 987       BIND(STR2_NEXT);
 988         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 989         br(LE, FIRST_LOOP);
 990         b(NOMATCH);
 991 
 992       BIND(STR1_LOOP);
 993         adds(cnt1tmp, cnt1_neg, str1_chr_size);
 994         add(cnt2tmp, cnt2_neg, str2_chr_size);
 995         br(GE, MATCH);
 996 
 997       BIND(STR1_NEXT);
 998         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
 999         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
1000         cmp(ch1, ch2);
1001         br(NE, STR2_NEXT);
1002         adds(cnt1tmp, cnt1tmp, str1_chr_size);
1003         add(cnt2tmp, cnt2tmp, str2_chr_size);
1004         br(LT, STR1_NEXT);
1005         b(MATCH);
1006 
1007       BIND(DOSHORT);
1008       if (str1_isL == str2_isL) {
1009         cmp(cnt1, (u1)2);
1010         br(LT, DO1);
1011         br(GT, DO3);
1012       }
1013     }
1014 
1015     if (icnt1 == 4) {
1016       Label CH1_LOOP;
1017 
1018         (this->*load_4chr)(ch1, str1);
1019         sub(result_tmp, cnt2, 4);
1020         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
1021         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
1022 
1023       BIND(CH1_LOOP);
1024         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
1025         cmp(ch1, ch2);
1026         br(EQ, MATCH);
1027         adds(cnt2_neg, cnt2_neg, str2_chr_size);
1028         br(LE, CH1_LOOP);
1029         b(NOMATCH);
1030       }
1031 
1032     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
1033       Label CH1_LOOP;
1034 
1035       BIND(DO2);
1036         (this->*load_2chr)(ch1, str1);
1037         if (icnt1 == 2) {
1038           sub(result_tmp, cnt2, 2);
1039         }
1040         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
1041         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
1042       BIND(CH1_LOOP);
1043         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
1044         cmp(ch1, ch2);
1045         br(EQ, MATCH);
1046         adds(cnt2_neg, cnt2_neg, str2_chr_size);
1047         br(LE, CH1_LOOP);
1048         b(NOMATCH);
1049     }
1050 
1051     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
1052       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
1053 
1054       BIND(DO3);
1055         (this->*load_2chr)(first, str1);
1056         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
1057         if (icnt1 == 3) {
1058           sub(result_tmp, cnt2, 3);
1059         }
1060         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
1061         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
1062       BIND(FIRST_LOOP);
1063         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
1064         cmpw(first, ch2);
1065         br(EQ, STR1_LOOP);
1066       BIND(STR2_NEXT);
1067         adds(cnt2_neg, cnt2_neg, str2_chr_size);
1068         br(LE, FIRST_LOOP);
1069         b(NOMATCH);
1070 
1071       BIND(STR1_LOOP);
1072         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
1073         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
1074         cmp(ch1, ch2);
1075         br(NE, STR2_NEXT);
1076         b(MATCH);
1077     }
1078 
1079     if (icnt1 == -1 || icnt1 == 1) {
1080       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
1081 
1082       BIND(DO1);
1083         (this->*str1_load_1chr)(ch1, str1);
1084         cmp(cnt2, (u1)8);
1085         br(LT, DO1_SHORT);
1086 
1087         sub(result_tmp, cnt2, 8/str2_chr_size);
1088         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
1089         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
1090         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
1091 
1092         if (str2_isL) {
1093           orr(ch1, ch1, ch1, LSL, 8);
1094         }
1095         orr(ch1, ch1, ch1, LSL, 16);
1096         orr(ch1, ch1, ch1, LSL, 32);
1097       BIND(CH1_LOOP);
1098         ldr(ch2, Address(str2, cnt2_neg));
1099         eor(ch2, ch1, ch2);
1100         sub(tmp1, ch2, tmp3);
1101         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
1102         bics(tmp1, tmp1, tmp2);
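        // SWAR match test: after the eor, a zero byte (halfword for UU)
        // lane in ch2 marks a match; tmp1 = (ch2 - 0x01..01) & ~ch2 &
        // 0x80..80 is non-zero, i.e. NE, iff such a zero lane exists.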
1103         br(NE, HAS_ZERO);
1104         adds(cnt2_neg, cnt2_neg, 8);
1105         br(LT, CH1_LOOP);
1106 
1107         cmp(cnt2_neg, (u1)8);
1108         mov(cnt2_neg, 0);
1109         br(LT, CH1_LOOP);
1110         b(NOMATCH);
1111 
1112       BIND(HAS_ZERO);
1113         rev(tmp1, tmp1);
1114         clz(tmp1, tmp1);
1115         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
1116         b(MATCH);
1117 
1118       BIND(DO1_SHORT);
1119         mov(result_tmp, cnt2);
1120         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
1121         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
1122       BIND(DO1_LOOP);
1123         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
1124         cmpw(ch1, ch2);
1125         br(EQ, MATCH);
1126         adds(cnt2_neg, cnt2_neg, str2_chr_size);
1127         br(LT, DO1_LOOP);
1128     }
1129   }
1130   BIND(NOMATCH);
1131     mov(result, -1);
1132     b(DONE);
1133   BIND(MATCH);
1134     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
1135   BIND(DONE);
1136 }
1137 
1138 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
1139 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
1140 
1141 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
1142                                             Register ch, Register result,
1143                                             Register tmp1, Register tmp2, Register tmp3)
1144 {
1145   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
1146   Register cnt1_neg = cnt1;
1147   Register ch1 = rscratch1;
1148   Register result_tmp = rscratch2;
1149 
1150   cbz(cnt1, NOMATCH);
1151 
1152   cmp(cnt1, (u1)4);
1153   br(LT, DO1_SHORT);
1154 
1155   orr(ch, ch, ch, LSL, 16);
1156   orr(ch, ch, ch, LSL, 32);
1157 
1158   sub(cnt1, cnt1, 4);
1159   mov(result_tmp, cnt1);
1160   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
1161   sub(cnt1_neg, zr, cnt1, LSL, 1);
1162 
1163   mov(tmp3, 0x0001000100010001);
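
  // 0x0001000100010001 is the per-halfword LSB mask of the SWAR test in the
  // loop below: (x - mask) & ~x & 0x8000... is non-zero iff some halfword of
  // x is zero, i.e. iff the broadcast char in ch matched at that position.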
1164 
1165   BIND(CH1_LOOP);
1166     ldr(ch1, Address(str1, cnt1_neg));
1167     eor(ch1, ch, ch1);
1168     sub(tmp1, ch1, tmp3);
1169     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
1170     bics(tmp1, tmp1, tmp2);
1171     br(NE, HAS_ZERO);
1172     adds(cnt1_neg, cnt1_neg, 8);
1173     br(LT, CH1_LOOP);
1174 
1175     cmp(cnt1_neg, (u1)8);
1176     mov(cnt1_neg, 0);
1177     br(LT, CH1_LOOP);
1178     b(NOMATCH);
1179 
1180   BIND(HAS_ZERO);
1181     rev(tmp1, tmp1);
1182     clz(tmp1, tmp1);
1183     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
1184     b(MATCH);
1185 
1186   BIND(DO1_SHORT);
1187     mov(result_tmp, cnt1);
1188     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
1189     sub(cnt1_neg, zr, cnt1, LSL, 1);
1190   BIND(DO1_LOOP);
1191     ldrh(ch1, Address(str1, cnt1_neg));
1192     cmpw(ch, ch1);
1193     br(EQ, MATCH);
1194     adds(cnt1_neg, cnt1_neg, 2);
1195     br(LT, DO1_LOOP);
1196   BIND(NOMATCH);
1197     mov(result, -1);
1198     b(DONE);
1199   BIND(MATCH);
1200     add(result, result_tmp, cnt1_neg, ASR, 1);
1201   BIND(DONE);
1202 }
1203 
1204 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
1205                                                 Register ch, Register result,
1206                                                 FloatRegister ztmp1,
1207                                                 FloatRegister ztmp2,
1208                                                 PRegister tmp_pg,
1209                                                 PRegister tmp_pdn, bool isL)
1210 {
  // Note that `tmp_pdn` should *NOT* be used as a governing predicate register.
1212   assert(tmp_pg->is_governing(),
1213          "this register has to be a governing predicate register");
1214 
1215   Label LOOP, MATCH, DONE, NOMATCH;
1216   Register vec_len = rscratch1;
1217   Register idx = rscratch2;
1218 
  SIMD_RegVariant T = isL ? B : H;
1220 
1221   cbz(cnt1, NOMATCH);
1222 
1223   // Assign the particular char throughout the vector.
1224   sve_dup(ztmp2, T, ch);
1225   if (isL) {
1226     sve_cntb(vec_len);
1227   } else {
1228     sve_cnth(vec_len);
1229   }
1230   mov(idx, 0);
1231 
1232   // Generate a predicate to control the reading of input string.
1233   sve_whilelt(tmp_pg, T, idx, cnt1);
1234 
1235   BIND(LOOP);
1236     // Read a vector of 8- or 16-bit data depending on the string type. Note
1237     // that inactive elements indicated by the predicate register won't cause
1238     // a data read from memory to the destination vector.
1239     if (isL) {
1240       sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
1241     } else {
1242       sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
1243     }
1244     add(idx, idx, vec_len);
1245 
1246     // Perform the comparison. An element of the destination predicate is set
1247     // to active if the particular char is matched.
1248     sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);
1249 
1250     // Branch if the particular char is found.
1251     br(NE, MATCH);
1252 
1253     sve_whilelt(tmp_pg, T, idx, cnt1);
1254 
    // Loop back if the particular char is not found.
1256     br(MI, LOOP);
1257 
1258   BIND(NOMATCH);
1259     mov(result, -1);
1260     b(DONE);
1261 
1262   BIND(MATCH);
1263     // Undo the index increment.
1264     sub(idx, idx, vec_len);
1265 
1266     // Crop the vector to find its location.
1267     sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
1268     add(result, idx, -1);
1269     sve_incp(result, T, tmp_pdn);
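    // brka leaves all lanes up to and including the match active, so incp
    // adds (matched lane index + 1); with the -1 above this makes
    // result = idx + lane index of the matched char.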
1270   BIND(DONE);
1271 }
1272 
1273 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
1274                                             Register ch, Register result,
1275                                             Register tmp1, Register tmp2, Register tmp3)
1276 {
1277   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
1278   Register cnt1_neg = cnt1;
1279   Register ch1 = rscratch1;
1280   Register result_tmp = rscratch2;
1281 
1282   cbz(cnt1, NOMATCH);
1283 
1284   cmp(cnt1, (u1)8);
1285   br(LT, DO1_SHORT);
1286 
1287   orr(ch, ch, ch, LSL, 8);
1288   orr(ch, ch, ch, LSL, 16);
1289   orr(ch, ch, ch, LSL, 32);
1290 
1291   sub(cnt1, cnt1, 8);
1292   mov(result_tmp, cnt1);
1293   lea(str1, Address(str1, cnt1));
1294   sub(cnt1_neg, zr, cnt1);
1295 
1296   mov(tmp3, 0x0101010101010101);
1297 
1298   BIND(CH1_LOOP);
1299     ldr(ch1, Address(str1, cnt1_neg));
1300     eor(ch1, ch, ch1);
1301     sub(tmp1, ch1, tmp3);
1302     orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
1303     bics(tmp1, tmp1, tmp2);
1304     br(NE, HAS_ZERO);
1305     adds(cnt1_neg, cnt1_neg, 8);
1306     br(LT, CH1_LOOP);
1307 
1308     cmp(cnt1_neg, (u1)8);
1309     mov(cnt1_neg, 0);
1310     br(LT, CH1_LOOP);
1311     b(NOMATCH);
1312 
1313   BIND(HAS_ZERO);
1314     rev(tmp1, tmp1);
1315     clz(tmp1, tmp1);
1316     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
1317     b(MATCH);
1318 
1319   BIND(DO1_SHORT);
1320     mov(result_tmp, cnt1);
1321     lea(str1, Address(str1, cnt1));
1322     sub(cnt1_neg, zr, cnt1);
1323   BIND(DO1_LOOP);
1324     ldrb(ch1, Address(str1, cnt1_neg));
1325     cmp(ch, ch1);
1326     br(EQ, MATCH);
1327     adds(cnt1_neg, cnt1_neg, 1);
1328     br(LT, DO1_LOOP);
1329   BIND(NOMATCH);
1330     mov(result, -1);
1331     b(DONE);
1332   BIND(MATCH);
1333     add(result, result_tmp, cnt1_neg);
1334   BIND(DONE);
1335 }
1336 
1337 // Compare strings.
1338 void C2_MacroAssembler::string_compare(Register str1, Register str2,
1339     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
1340     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
1341     PRegister pgtmp1, PRegister pgtmp2, int ae) {
1342   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
1343       DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
1344       SHORT_LOOP_START, TAIL_CHECK;
1345 
1346   bool isLL = ae == StrIntrinsicNode::LL;
1347   bool isLU = ae == StrIntrinsicNode::LU;
1348   bool isUL = ae == StrIntrinsicNode::UL;
1349 
1350   // The stub threshold for LL strings is: 72 (64 + 8) chars
1351   // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
1352   // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
1353   const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);
1354 
1355   bool str1_isL = isLL || isLU;
1356   bool str2_isL = isLL || isUL;
1357 
1358   int str1_chr_shift = str1_isL ? 0 : 1;
1359   int str2_chr_shift = str2_isL ? 0 : 1;
1360   int str1_chr_size = str1_isL ? 1 : 2;
1361   int str2_chr_size = str2_isL ? 1 : 2;
1362   int minCharsInWord = isLL ? wordSize : wordSize/2;
1363 
1364   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
1365   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
1366                                       (chr_insn)&MacroAssembler::ldrh;
1367   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
1368                                       (chr_insn)&MacroAssembler::ldrh;
1369   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
1370                             (uxt_insn)&MacroAssembler::uxthw;
1371 
1372   BLOCK_COMMENT("string_compare {");
1373 
  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; however, the result is always in characters.
1376   if (!str1_isL) asrw(cnt1, cnt1, 1);
1377   if (!str2_isL) asrw(cnt2, cnt2, 1);
1378 
1379   // Compute the minimum of the string lengths and save the difference.
1380   subsw(result, cnt1, cnt2);
1381   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
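
  // result now holds cnt1 - cnt2, the value to return if one string is a
  // prefix of the other; cnt2 holds min(cnt1, cnt2) in characters.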
1382 
1383   // A very short string
1384   cmpw(cnt2, minCharsInWord);
1385   br(Assembler::LE, SHORT_STRING);
1386 
1387   // Compare longwords
1388   // load first parts of strings and finish initialization while loading
1389   {
1390     if (str1_isL == str2_isL) { // LL or UU
1391       ldr(tmp1, Address(str1));
1392       cmp(str1, str2);
1393       br(Assembler::EQ, DONE);
1394       ldr(tmp2, Address(str2));
1395       cmp(cnt2, stub_threshold);
1396       br(GE, STUB);
1397       subsw(cnt2, cnt2, minCharsInWord);
1398       br(EQ, TAIL_CHECK);
1399       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1400       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1401       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1402     } else if (isLU) {
1403       ldrs(vtmp, Address(str1));
1404       ldr(tmp2, Address(str2));
1405       cmp(cnt2, stub_threshold);
1406       br(GE, STUB);
1407       subw(cnt2, cnt2, 4);
1408       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1409       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1410       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1411       zip1(vtmp, T8B, vtmp, vtmpZ);
1412       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1413       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1414       add(cnt1, cnt1, 4);
1415       fmovd(tmp1, vtmp);
1416     } else { // UL case
1417       ldr(tmp1, Address(str1));
1418       ldrs(vtmp, Address(str2));
1419       cmp(cnt2, stub_threshold);
1420       br(GE, STUB);
1421       subw(cnt2, cnt2, 4);
1422       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1423       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1424       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1425       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1426       zip1(vtmp, T8B, vtmp, vtmpZ);
1427       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1428       add(cnt1, cnt1, 8);
1429       fmovd(tmp2, vtmp);
1430     }
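    // In the LU/UL paths above, ldrs loads 4 Latin1 bytes into vtmp, zip1
    // with the zeroed vtmpZ interleaves them with 0x00 (Latin1 -> UTF-16),
    // and fmovd moves the widened 8 bytes to the GPR side for comparison.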
1431     adds(cnt2, cnt2, isUL ? 4 : 8);
1432     br(GE, TAIL);
1433     eor(rscratch2, tmp1, tmp2);
1434     cbnz(rscratch2, DIFF);
1435     // main loop
1436     bind(NEXT_WORD);
1437     if (str1_isL == str2_isL) {
1438       ldr(tmp1, Address(str1, cnt2));
1439       ldr(tmp2, Address(str2, cnt2));
1440       adds(cnt2, cnt2, 8);
1441     } else if (isLU) {
1442       ldrs(vtmp, Address(str1, cnt1));
1443       ldr(tmp2, Address(str2, cnt2));
1444       add(cnt1, cnt1, 4);
1445       zip1(vtmp, T8B, vtmp, vtmpZ);
1446       fmovd(tmp1, vtmp);
1447       adds(cnt2, cnt2, 8);
1448     } else { // UL
1449       ldrs(vtmp, Address(str2, cnt2));
1450       ldr(tmp1, Address(str1, cnt1));
1451       zip1(vtmp, T8B, vtmp, vtmpZ);
1452       add(cnt1, cnt1, 8);
1453       fmovd(tmp2, vtmp);
1454       adds(cnt2, cnt2, 4);
1455     }
1456     br(GE, TAIL);
1457 
1458     eor(rscratch2, tmp1, tmp2);
1459     cbz(rscratch2, NEXT_WORD);
1460     b(DIFF);
1461     bind(TAIL);
1462     eor(rscratch2, tmp1, tmp2);
1463     cbnz(rscratch2, DIFF);
1464     // Last longword.  In the case where length == 4 we compare the
1465     // same longword twice, but that's still faster than another
1466     // conditional branch.
1467     if (str1_isL == str2_isL) {
1468       ldr(tmp1, Address(str1));
1469       ldr(tmp2, Address(str2));
1470     } else if (isLU) {
1471       ldrs(vtmp, Address(str1));
1472       ldr(tmp2, Address(str2));
1473       zip1(vtmp, T8B, vtmp, vtmpZ);
1474       fmovd(tmp1, vtmp);
1475     } else { // UL
1476       ldrs(vtmp, Address(str2));
1477       ldr(tmp1, Address(str1));
1478       zip1(vtmp, T8B, vtmp, vtmpZ);
1479       fmovd(tmp2, vtmp);
1480     }
1481     bind(TAIL_CHECK);
1482     eor(rscratch2, tmp1, tmp2);
1483     cbz(rscratch2, DONE);
1484 
1485     // Find the first different characters in the longwords and
1486     // compute their difference.
1487     bind(DIFF);
1488     rev(rscratch2, rscratch2);
1489     clz(rscratch2, rscratch2);
1490     andr(rscratch2, rscratch2, isLL ? -8 : -16);
1491     lsrv(tmp1, tmp1, rscratch2);
1492     (this->*ext_chr)(tmp1, tmp1);
1493     lsrv(tmp2, tmp2, rscratch2);
1494     (this->*ext_chr)(tmp2, tmp2);
1495     subw(result, tmp1, tmp2);
1496     b(DONE);
1497   }
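
       // A scalar sketch of the DIFF sequence above (documentation only; the
       // helpers byteswap64/clz64 are hypothetical stand-ins for rev/clz):
       //
       //   int diff_model(uint64_t w1, uint64_t w2, int chr_bits /* 8 or 16 */) {
       //     uint64_t x = w1 ^ w2;                       // non-zero iff the words differ
       //     int sh = clz64(byteswap64(x)) & -chr_bits;  // bit offset of first differing char
       //     int c1 = (int)((w1 >> sh) & ((1 << chr_bits) - 1));
       //     int c2 = (int)((w2 >> sh) & ((1 << chr_bits) - 1));
       //     return c1 - c2;                             // same sign as subw(result, tmp1, tmp2)
       //   }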
1498 
1499   bind(STUB);
1500     RuntimeAddress stub = nullptr;
1501     switch(ae) {
1502       case StrIntrinsicNode::LL:
1503         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
1504         break;
1505       case StrIntrinsicNode::UU:
1506         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
1507         break;
1508       case StrIntrinsicNode::LU:
1509         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
1510         break;
1511       case StrIntrinsicNode::UL:
1512         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
1513         break;
1514       default:
1515         ShouldNotReachHere();
1516     }
1517     assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
1518     address call = trampoline_call(stub);
1519     if (call == nullptr) {
1520       DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
1521       ciEnv::current()->record_failure("CodeCache is full");
1522       return;
1523     }
1524     b(DONE);
1525 
1526   bind(SHORT_STRING);
1527   // Is the minimum length zero?
1528   cbz(cnt2, DONE);
1529   // Arrange the code so that most branches are resolved while loading, and the
1530   // next characters are loaded while the previous ones are being compared.
1531   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1532   subs(cnt2, cnt2, 1);
1533   br(EQ, SHORT_LAST_INIT);
1534   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1535   b(SHORT_LOOP_START);
1536   bind(SHORT_LOOP);
1537   subs(cnt2, cnt2, 1);
1538   br(EQ, SHORT_LAST);
1539   bind(SHORT_LOOP_START);
1540   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
1541   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
1542   cmp(tmp1, cnt1);
1543   br(NE, SHORT_LOOP_TAIL);
1544   subs(cnt2, cnt2, 1);
1545   br(EQ, SHORT_LAST2);
1546   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1547   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1548   cmp(tmp2, rscratch1);
1549   br(EQ, SHORT_LOOP);
1550   sub(result, tmp2, rscratch1);
1551   b(DONE);
1552   bind(SHORT_LOOP_TAIL);
1553   sub(result, tmp1, cnt1);
1554   b(DONE);
1555   bind(SHORT_LAST2);
1556   cmp(tmp2, rscratch1);
1557   br(EQ, DONE);
1558   sub(result, tmp2, rscratch1);
1559 
1560   b(DONE);
1561   bind(SHORT_LAST_INIT);
1562   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1563   bind(SHORT_LAST);
1564   cmp(tmp1, cnt1);
1565   br(EQ, DONE);
1566   sub(result, tmp1, cnt1);
1567 
1568   bind(DONE);
1569 
1570   BLOCK_COMMENT("} string_compare");
1571 }
1572 
1573 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
1574                                      FloatRegister src2, Condition cond, bool isQ) {
1575   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1576   FloatRegister zn = src1, zm = src2;
1577   bool needs_negation = false;
1578   switch (cond) {
1579     case LT: cond = GT; zn = src2; zm = src1; break;
1580     case LE: cond = GE; zn = src2; zm = src1; break;
1581     case LO: cond = HI; zn = src2; zm = src1; break;
1582     case LS: cond = HS; zn = src2; zm = src1; break;
1583     case NE: cond = EQ; needs_negation = true; break;
1584     default:
1585       break;
1586   }
1587 
1588   if (is_floating_point_type(bt)) {
1589     fcm(cond, dst, size, zn, zm);
1590   } else {
1591     cm(cond, dst, size, zn, zm);
1592   }
1593 
1594   if (needs_negation) {
1595     notr(dst, isQ ? T16B : T8B, dst);
1596   }
1597 }
1598 
1599 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1600                                           Condition cond, bool isQ) {
1601   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1602   if (bt == T_FLOAT || bt == T_DOUBLE) {
1603     if (cond == Assembler::NE) {
1604       fcm(Assembler::EQ, dst, size, src);
1605       notr(dst, isQ ? T16B : T8B, dst);
1606     } else {
1607       fcm(cond, dst, size, src);
1608     }
1609   } else {
1610     if (cond == Assembler::NE) {
1611       cm(Assembler::EQ, dst, size, src);
1612       notr(dst, isQ ? T16B : T8B, dst);
1613     } else {
1614       cm(cond, dst, size, src);
1615     }
1616   }
1617 }
1618 
1619 // Compress the least significant bit of each byte into the lowest byte of dst
1620 // and clear the higher (garbage) bits.
1621 void C2_MacroAssembler::bytemask_compress(Register dst) {
1622   // Example input, dst = 0x01 00 00 00 01 01 00 01
1623   // The "??" bytes are garbage.
1624   orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
1625   orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x?? ?? ?? 08 ?? ?? ?? 0D
1626   orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x?? ?? ?? ?? ?? ?? ?? 8D
1627   andr(dst, dst, 0xff);                   // dst = 0x8D
1628 }
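
     // Scalar model of the OR-shift compression above (documentation only):
     //
     //   uint8_t bytemask_compress_model(uint64_t x) {
     //     x |= x >> 7;        // gather pairs of lane bits into 2-bit groups
     //     x |= x >> 14;       // gather the 2-bit groups into nibbles
     //     x |= x >> 28;       // gather the nibbles into the lowest byte
     //     return (uint8_t)x;  // andr(dst, dst, 0xff)
     //   }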
1629 
1630 // Pack the lowest-numbered bit of each mask element in src into a long value
1631 // in dst, covering at most the first 64 lanes.
1632 // Clobbers: rscratch1 if UseSVE == 1 or the hardware doesn't support FEAT_BITPERM.
1633 void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
1634                                          FloatRegister vtmp1, FloatRegister vtmp2) {
1635   assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1636   assert_different_registers(dst, rscratch1);
1637   assert_different_registers(vtmp1, vtmp2);
1638 
1639   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1640   // Example:   src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
1641   // Expected:  dst = 0x658D
1642 
1643   // Convert the mask into a vector of consecutive 0x00/0x01 bytes.
1644   // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
1645   sve_cpy(vtmp1, size, src, 1, false);
1646   if (bt != T_BYTE) {
1647     sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
1648   }
1649 
1650   if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
1651     // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
1652     // is to compress the significant bit of each byte in a cross-lane way. Due
1653     // to the lack of a cross-lane bit-compress instruction, we use BEXT
1654     // (bit-compress in each lane) with the biggest lane size (T = D) then
1655     // concatenate the results.
1656 
1657     // The second source input of BEXT, initialized with 0x01 in each byte.
1658     // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1659     sve_dup(vtmp2, B, 1);
1660 
1661     // BEXT vtmp1.D, vtmp1.D, vtmp2.D
1662     // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1663     // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1664     //         ---------------------------------------
1665     // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1666     sve_bext(vtmp1, D, vtmp1, vtmp2);
1667 
1668     // Concatenate the least significant 8 bits of each 8-byte group, and extract
1669     // the result into dst.
1670     // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1671     // dst   = 0x658D
1672     if (lane_cnt <= 8) {
1673       // No need to concatenate.
1674       umov(dst, vtmp1, B, 0);
1675     } else if (lane_cnt <= 16) {
1676       ins(vtmp1, B, vtmp1, 1, 8);
1677       umov(dst, vtmp1, H, 0);
1678     } else {
1679       // As the lane count is 64 at most, the final expected value must be in
1680       // the lowest 64 bits after narrowing vtmp1 from D to B.
1681       sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
1682       umov(dst, vtmp1, D, 0);
1683     }
1684   } else if (UseSVE > 0) {
1685     // Compress the lowest 8 bytes.
1686     fmovd(dst, vtmp1);
1687     bytemask_compress(dst);
1688     if (lane_cnt <= 8) return;
1689 
1690     // Repeat on higher bytes and join the results.
1691     // Compress 8 bytes in each iteration.
1692     for (int idx = 1; idx < (lane_cnt / 8); idx++) {
1693       sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
1694       bytemask_compress(rscratch1);
1695       orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
1696     }
1697   } else {
1698     assert(false, "unsupported");
1699     ShouldNotReachHere();
1700   }
1701 }
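
     // Scalar model of the per-lane BEXT step used above (documentation only):
     // with the mask 0x0101..01, BEXT packs bit 0 of each byte of a 64-bit lane
     // into the low bits of the result.
     //
     //   uint64_t bext64_model(uint64_t data, uint64_t mask) {
     //     uint64_t res = 0;
     //     for (int i = 0, out = 0; i < 64; i++) {
     //       if ((mask >> i) & 1) {
     //         res |= ((data >> i) & 1) << out++;
     //       }
     //     }
     //     return res;
     //   }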
1702 
1703 // Unpack the mask, a long value in src, into predicate register dst based on the
1704 // corresponding data type. Note that dst can support at most 64 lanes.
1705 // The example below gives the expected dst predicate register for different types,
1706 // with a valid src (0x658D) on a machine with a 1024-bit vector size.
1707 // BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
1708 // SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
1709 // INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
1710 // LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1711 //
1712 // The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D,
1713 // which has 24 significant bits, would be an invalid input if the dst predicate
1714 // register refers to a 1024-bit vector of LONG type, which has at most 16 lanes.
1715 void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
1716                                            FloatRegister vtmp1, FloatRegister vtmp2) {
1717   assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
1718          lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
1719   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1720   // Example:   src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
1721   // Expected:  dst = 0b01101001 10001101
1722 
1723   // Put the long value from the general-purpose register into the first lane of the vector.
1724   // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1725   sve_dup(vtmp1, B, 0);
1726   mov(vtmp1, D, 0, src);
1727 
1728   // As sve_cmp generates the mask with byte granularity at minimum, we need to
1729   // transform the bit-level mask now in the first lane into a byte-level mask,
1730   // which can be done with SVE2's BDEP instruction.
1731 
1732   // The first source input of the BDEP instruction. Deposit one mask byte into each 8-byte group.
1733   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1734   if (lane_cnt <= 8) {
1735     // Nothing to do, as only one byte exists.
1736   } else if (lane_cnt <= 16) {
1737     ins(vtmp1, B, vtmp1, 8, 1);
1738     mov(vtmp1, B, 1, zr);
1739   } else {
1740     sve_vector_extend(vtmp1, D, vtmp1, B);
1741   }
1742 
1743   // The second source input of the BDEP instruction, initialized with 0x01 in each byte.
1744   // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1745   sve_dup(vtmp2, B, 1);
1746 
1747   // BDEP vtmp1.D, vtmp1.D, vtmp2.D
1748   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1749   // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1750   //         ---------------------------------------
1751   // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1752   sve_bdep(vtmp1, D, vtmp1, vtmp2);
1753 
1754   if (bt != T_BYTE) {
1755     sve_vector_extend(vtmp1, size, vtmp1, B);
1756   }
1757   // Generate the mask from the given vector, in which the elements have been
1758   // extended to the expected type.
1759   // dst = 0b01101001 10001101
1760   sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
1761 }
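
     // Scalar model of the per-lane BDEP step used above (documentation only):
     // BDEP is the inverse of BEXT -- it scatters the low bits of the data into
     // the bit positions selected by the mask, here bit 0 of each byte.
     //
     //   uint64_t bdep64_model(uint64_t data, uint64_t mask) {
     //     uint64_t res = 0;
     //     for (int i = 0, in = 0; i < 64; i++) {
     //       if ((mask >> i) & 1) {
     //         res |= ((data >> in++) & 1) << i;
     //       }
     //     }
     //     return res;
     //   }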
1762 
1763 // Clobbers: rflags
1764 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1765                                     FloatRegister zn, FloatRegister zm, Condition cond) {
1766   assert(pg->is_governing(), "This register has to be a governing predicate register");
1767   FloatRegister z1 = zn, z2 = zm;
1768   switch (cond) {
1769     case LE: z1 = zm; z2 = zn; cond = GE; break;
1770     case LT: z1 = zm; z2 = zn; cond = GT; break;
1771     case LO: z1 = zm; z2 = zn; cond = HI; break;
1772     case LS: z1 = zm; z2 = zn; cond = HS; break;
1773     default:
1774       break;
1775   }
1776 
1777   SIMD_RegVariant size = elemType_to_regVariant(bt);
1778   if (is_floating_point_type(bt)) {
1779     sve_fcm(cond, pd, size, pg, z1, z2);
1780   } else {
1781     assert(is_integral_type(bt), "unsupported element type");
1782     sve_cmp(cond, pd, size, pg, z1, z2);
1783   }
1784 }
1785 
1786 // Get index of the last mask lane that is set
1787 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
1788   SIMD_RegVariant size = elemType_to_regVariant(bt);
1789   sve_rev(ptmp, size, src);
1790   sve_brkb(ptmp, ptrue, ptmp, false);
1791   sve_cntp(dst, size, ptrue, ptmp);
1792   movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
1793   subw(dst, rscratch1, dst);
1794 }
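
     // Scalar sketch of the rev/brkb/cntp sequence above (documentation only):
     // reversing the mask and counting the lanes before the first true element
     // yields the index of the last set lane, or -1 if no lane is set.
     //
     //   int lasttrue_model(const bool mask[], int lane_cnt) {
     //     for (int i = lane_cnt - 1; i >= 0; i--) {
     //       if (mask[i]) return i;   // index of the last set lane
     //     }
     //     return -1;                 // no lane set
     //   }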
1795 
1796 // Extend integer vector src to dst with the same lane count
1797 // but larger element size, e.g. 4B -> 4I
1798 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1799                                            FloatRegister src, BasicType src_bt, bool is_unsigned) {
1800   if (src_bt == T_BYTE) {
1801     if (dst_bt == T_SHORT) {
1802       // 4B/8B to 4S/8S
1803       _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1804     } else {
1805       // 4B to 4I
1806       assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1807       _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1808       _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1809     }
1810   } else if (src_bt == T_SHORT) {
1811     // 4S to 4I
1812     assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1813     _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1814   } else if (src_bt == T_INT) {
1815     // 2I to 2L
1816     assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1817     _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1818   } else {
1819     ShouldNotReachHere();
1820   }
1821 }
1822 
1823 // Narrow integer vector src down to dst with the same lane count
1824 // but smaller element size, e.g. 4I -> 4B
1825 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1826                                            FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1827   if (src_bt == T_SHORT) {
1828     // 4S/8S to 4B/8B
1829     assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1830     assert(dst_bt == T_BYTE, "unsupported");
1831     xtn(dst, T8B, src, T8H);
1832   } else if (src_bt == T_INT) {
1833     // 4I to 4B/4S
1834     assert(src_vlen_in_bytes == 16, "unsupported");
1835     assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1836     xtn(dst, T4H, src, T4S);
1837     if (dst_bt == T_BYTE) {
1838       xtn(dst, T8B, dst, T8H);
1839     }
1840   } else if (src_bt == T_LONG) {
1841     // 2L to 2I
1842     assert(src_vlen_in_bytes == 16, "unsupported");
1843     assert(dst_bt == T_INT, "unsupported");
1844     xtn(dst, T2S, src, T2D);
1845   } else {
1846     ShouldNotReachHere();
1847   }
1848 }
1849 
1850 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1851                                           FloatRegister src, SIMD_RegVariant src_size,
1852                                           bool is_unsigned) {
1853   assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
1854 
1855   if (src_size == B) {
1856     switch (dst_size) {
1857     case H:
1858       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1859       break;
1860     case S:
1861       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1862       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1863       break;
1864     case D:
1865       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1866       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1867       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1868       break;
1869     default:
1870       ShouldNotReachHere();
1871     }
1872   } else if (src_size == H) {
1873     if (dst_size == S) {
1874       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1875     } else { // D
1876       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1877       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1878     }
1879   } else if (src_size == S) {
1880     _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
1881   }
1882 }
1883 
1884 // Vector narrow from src to dst with the specified element sizes.
1885 // The high part of the dst vector will be filled with zero.
1886 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1887                                           FloatRegister src, SIMD_RegVariant src_size,
1888                                           FloatRegister tmp) {
1889   assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1890   assert_different_registers(src, tmp);
1891   sve_dup(tmp, src_size, 0);
1892   if (src_size == D) {
1893     switch (dst_size) {
1894     case S:
1895       sve_uzp1(dst, S, src, tmp);
1896       break;
1897     case H:
1898       assert_different_registers(dst, tmp);
1899       sve_uzp1(dst, S, src, tmp);
1900       sve_uzp1(dst, H, dst, tmp);
1901       break;
1902     case B:
1903       assert_different_registers(dst, tmp);
1904       sve_uzp1(dst, S, src, tmp);
1905       sve_uzp1(dst, H, dst, tmp);
1906       sve_uzp1(dst, B, dst, tmp);
1907       break;
1908     default:
1909       ShouldNotReachHere();
1910     }
1911   } else if (src_size == S) {
1912     if (dst_size == H) {
1913       sve_uzp1(dst, H, src, tmp);
1914     } else { // B
1915       assert_different_registers(dst, tmp);
1916       sve_uzp1(dst, H, src, tmp);
1917       sve_uzp1(dst, B, dst, tmp);
1918     }
1919   } else if (src_size == H) {
1920     sve_uzp1(dst, B, src, tmp);
1921   }
1922 }
1923 
1924 // Extend src predicate to dst predicate with the same lane count but larger
1925 // element size, e.g. 64Byte -> 512Long
1926 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1927                                              uint dst_element_length_in_bytes,
1928                                              uint src_element_length_in_bytes) {
1929   if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1930     sve_punpklo(dst, src);
1931   } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1932     sve_punpklo(dst, src);
1933     sve_punpklo(dst, dst);
1934   } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1935     sve_punpklo(dst, src);
1936     sve_punpklo(dst, dst);
1937     sve_punpklo(dst, dst);
1938   } else {
1939     assert(false, "unsupported");
1940     ShouldNotReachHere();
1941   }
1942 }
1943 
1944 // Narrow src predicate to dst predicate with the same lane count but
1945 // smaller element size, e.g. 512Long -> 64Byte
1946 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1947                                              uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
1948   // The insignificant bits in the src predicate are expected to be zero.
1949   // To ensure the higher-order bits of the resultant narrowed vector are 0, an
1950   // all-zero predicate is passed as the second argument. An example narrowing
1951   // operation with a given mask: 128Long -> 64Int on a 128-bit machine, i.e. 2L -> 2I.
1952   // Mask (for 2 Longs) : TF
1953   // Predicate register for the above mask (16 bits) : 00000001 00000000
1954   // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
1955   // This translates to a mask for 2 ints: TF (the lower half is considered; the upper half is 0)
1956   assert_different_registers(src, ptmp);
1957   assert_different_registers(dst, ptmp);
1958   sve_pfalse(ptmp);
1959   if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1960     sve_uzp1(dst, B, src, ptmp);
1961   } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1962     sve_uzp1(dst, H, src, ptmp);
1963     sve_uzp1(dst, B, dst, ptmp);
1964   } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1965     sve_uzp1(dst, S, src, ptmp);
1966     sve_uzp1(dst, H, dst, ptmp);
1967     sve_uzp1(dst, B, dst, ptmp);
1968   } else {
1969     assert(false, "unsupported");
1970     ShouldNotReachHere();
1971   }
1972 }
1973 
1974 // Vector reduction add for integral type with ASIMD instructions.
1975 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1976                                                  Register isrc, FloatRegister vsrc,
1977                                                  unsigned vector_length_in_bytes,
1978                                                  FloatRegister vtmp) {
1979   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1980   assert_different_registers(dst, isrc);
1981   bool isQ = vector_length_in_bytes == 16;
1982 
1983   BLOCK_COMMENT("neon_reduce_add_integral {");
1984     switch(bt) {
1985       case T_BYTE:
1986         addv(vtmp, isQ ? T16B : T8B, vsrc);
1987         smov(dst, vtmp, B, 0);
1988         addw(dst, dst, isrc, ext::sxtb);
1989         break;
1990       case T_SHORT:
1991         addv(vtmp, isQ ? T8H : T4H, vsrc);
1992         smov(dst, vtmp, H, 0);
1993         addw(dst, dst, isrc, ext::sxth);
1994         break;
1995       case T_INT:
1996         isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1997         umov(dst, vtmp, S, 0);
1998         addw(dst, dst, isrc);
1999         break;
2000       case T_LONG:
2001         assert(isQ, "unsupported");
2002         addpd(vtmp, vsrc);
2003         umov(dst, vtmp, D, 0);
2004         add(dst, dst, isrc);
2005         break;
2006       default:
2007         assert(false, "unsupported");
2008         ShouldNotReachHere();
2009     }
2010   BLOCK_COMMENT("} neon_reduce_add_integral");
2011 }
2012 
2013 // Vector reduction multiply for integral type with ASIMD instructions.
2014 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
2015 // Clobbers: rscratch1
2016 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
2017                                                  Register isrc, FloatRegister vsrc,
2018                                                  unsigned vector_length_in_bytes,
2019                                                  FloatRegister vtmp1, FloatRegister vtmp2) {
2020   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2021   bool isQ = vector_length_in_bytes == 16;
2022 
2023   BLOCK_COMMENT("neon_reduce_mul_integral {");
2024     switch(bt) {
2025       case T_BYTE:
2026         if (isQ) {
2027           // Multiply the lower and upper halves of the vector iteratively.
2028           // vtmp1 = vsrc[8:15]
2029           ins(vtmp1, D, vsrc, 0, 1);
2030           // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
2031           mulv(vtmp1, T8B, vtmp1, vsrc);
2032           // vtmp2 = vtmp1[4:7]
2033           ins(vtmp2, S, vtmp1, 0, 1);
2034           // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
2035           mulv(vtmp1, T8B, vtmp2, vtmp1);
2036         } else {
2037           ins(vtmp1, S, vsrc, 0, 1);
2038           mulv(vtmp1, T8B, vtmp1, vsrc);
2039         }
2040         // vtmp2 = vtmp1[2:3]
2041         ins(vtmp2, H, vtmp1, 0, 1);
2042         // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
2043         mulv(vtmp2, T8B, vtmp2, vtmp1);
2044         // dst = vtmp2[0] * isrc * vtmp2[1]
2045         umov(rscratch1, vtmp2, B, 0);
2046         mulw(dst, rscratch1, isrc);
2047         sxtb(dst, dst);
2048         umov(rscratch1, vtmp2, B, 1);
2049         mulw(dst, rscratch1, dst);
2050         sxtb(dst, dst);
2051         break;
2052       case T_SHORT:
2053         if (isQ) {
2054           ins(vtmp2, D, vsrc, 0, 1);
2055           mulv(vtmp2, T4H, vtmp2, vsrc);
2056           ins(vtmp1, S, vtmp2, 0, 1);
2057           mulv(vtmp1, T4H, vtmp1, vtmp2);
2058         } else {
2059           ins(vtmp1, S, vsrc, 0, 1);
2060           mulv(vtmp1, T4H, vtmp1, vsrc);
2061         }
2062         umov(rscratch1, vtmp1, H, 0);
2063         mulw(dst, rscratch1, isrc);
2064         sxth(dst, dst);
2065         umov(rscratch1, vtmp1, H, 1);
2066         mulw(dst, rscratch1, dst);
2067         sxth(dst, dst);
2068         break;
2069       case T_INT:
2070         if (isQ) {
2071           ins(vtmp1, D, vsrc, 0, 1);
2072           mulv(vtmp1, T2S, vtmp1, vsrc);
2073         } else {
2074           vtmp1 = vsrc;
2075         }
2076         umov(rscratch1, vtmp1, S, 0);
2077         mul(dst, rscratch1, isrc);
2078         umov(rscratch1, vtmp1, S, 1);
2079         mul(dst, rscratch1, dst);
2080         break;
2081       case T_LONG:
2082         umov(rscratch1, vsrc, D, 0);
2083         mul(dst, isrc, rscratch1);
2084         umov(rscratch1, vsrc, D, 1);
2085         mul(dst, dst, rscratch1);
2086         break;
2087       default:
2088         assert(false, "unsupported");
2089         ShouldNotReachHere();
2090     }
2091   BLOCK_COMMENT("} neon_reduce_mul_integral");
2092 }
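
     // Scalar model of the halving strategy above (documentation only): multiply
     // the upper half of the vector into the lower half until one lane is left,
     // then fold in the scalar input. Shown for the byte case:
     //
     //   int8_t mul_reduce_model(const int8_t v[], int n, int8_t isrc) {
     //     int8_t tmp[16];
     //     for (int i = 0; i < n; i++) tmp[i] = v[i];
     //     for (int half = n / 2; half >= 1; half /= 2) {
     //       for (int i = 0; i < half; i++) {
     //         tmp[i] = (int8_t)(tmp[i] * tmp[i + half]);
     //       }
     //     }
     //     return (int8_t)(tmp[0] * isrc);
     //   }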
2093 
2094 // Vector reduction multiply for floating-point type with ASIMD instructions.
2095 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
2096                                            FloatRegister fsrc, FloatRegister vsrc,
2097                                            unsigned vector_length_in_bytes,
2098                                            FloatRegister vtmp) {
2099   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2100   bool isQ = vector_length_in_bytes == 16;
2101 
2102   BLOCK_COMMENT("neon_reduce_mul_fp {");
2103     switch(bt) {
2104       case T_FLOAT:
2105         fmuls(dst, fsrc, vsrc);
2106         ins(vtmp, S, vsrc, 0, 1);
2107         fmuls(dst, dst, vtmp);
2108         if (isQ) {
2109           ins(vtmp, S, vsrc, 0, 2);
2110           fmuls(dst, dst, vtmp);
2111           ins(vtmp, S, vsrc, 0, 3);
2112           fmuls(dst, dst, vtmp);
2113          }
2114         break;
2115       case T_DOUBLE:
2116         assert(isQ, "unsupported");
2117         fmuld(dst, fsrc, vsrc);
2118         ins(vtmp, D, vsrc, 0, 1);
2119         fmuld(dst, dst, vtmp);
2120         break;
2121       default:
2122         assert(false, "unsupported");
2123         ShouldNotReachHere();
2124     }
2125   BLOCK_COMMENT("} neon_reduce_mul_fp");
2126 }
2127 
2128 // Helper to select logical instruction
2129 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
2130                                                    Register Rn, Register Rm,
2131                                                    enum shift_kind kind, unsigned shift) {
2132   switch(opc) {
2133     case Op_AndReductionV:
2134       is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
2135       break;
2136     case Op_OrReductionV:
2137       is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
2138       break;
2139     case Op_XorReductionV:
2140       is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
2141       break;
2142     default:
2143       assert(false, "unsupported");
2144       ShouldNotReachHere();
2145   }
2146 }
2147 
2148 // Vector reduction logical operations And, Or, Xor
2149 // Clobbers: rscratch1
2150 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
2151                                             Register isrc, FloatRegister vsrc,
2152                                             unsigned vector_length_in_bytes) {
2153   assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
2154          "unsupported");
2155   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2156   assert_different_registers(dst, isrc);
2157   bool isQ = vector_length_in_bytes == 16;
2158 
2159   BLOCK_COMMENT("neon_reduce_logical {");
2160     umov(rscratch1, vsrc, isQ ? D : S, 0);
2161     umov(dst, vsrc, isQ ? D : S, 1);
2162     neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
2163     switch(bt) {
2164       case T_BYTE:
2165         if (isQ) {
2166           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2167         }
2168         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
2169         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
2170         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2171         sxtb(dst, dst);
2172         break;
2173       case T_SHORT:
2174         if (isQ) {
2175           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2176         }
2177         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
2178         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2179         sxth(dst, dst);
2180         break;
2181       case T_INT:
2182         if (isQ) {
2183           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2184         }
2185         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2186         break;
2187       case T_LONG:
2188         assert(isQ, "unsupported");
2189         neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
2190         break;
2191       default:
2192         assert(false, "unsupported");
2193         ShouldNotReachHere();
2194     }
2195   BLOCK_COMMENT("} neon_reduce_logical");
2196 }
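
     // Scalar model of the shift-and-fold above (documentation only), shown for
     // an OR reduction of bytes over a 64-bit value; AND and XOR are analogous:
     //
     //   int8_t or_reduce_model(uint64_t x, int8_t isrc) {
     //     x |= x >> 32;                        // fold the two 32-bit halves
     //     x |= x >> 16;                        // fold the two 16-bit halves
     //     x |= x >> 8;                         // fold the remaining two bytes
     //     return (int8_t)(x | (uint8_t)isrc);  // sign-extended by sxtb
     //   }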
2197 
2198 // Vector reduction min/max for integral type with ASIMD instructions.
2199 // Note: vtmp is not used and is expected to be fnoreg for the T_LONG case.
2200 // Clobbers: rscratch1, rflags
2201 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
2202                                                     Register isrc, FloatRegister vsrc,
2203                                                     unsigned vector_length_in_bytes,
2204                                                     FloatRegister vtmp) {
2205   assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
2206   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2207   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
2208   assert_different_registers(dst, isrc);
2209   bool isQ = vector_length_in_bytes == 16;
2210   bool is_min = opc == Op_MinReductionV;
2211 
2212   BLOCK_COMMENT("neon_reduce_minmax_integral {");
2213     if (bt == T_LONG) {
2214       assert(vtmp == fnoreg, "should be");
2215       assert(isQ, "should be");
2216       umov(rscratch1, vsrc, D, 0);
2217       cmp(isrc, rscratch1);
2218       csel(dst, isrc, rscratch1, is_min ? LT : GT);
2219       umov(rscratch1, vsrc, D, 1);
2220       cmp(dst, rscratch1);
2221       csel(dst, dst, rscratch1, is_min ? LT : GT);
2222     } else {
2223       SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
2224       if (size == T2S) {
2225         is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
2226       } else {
2227         is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
2228       }
2229       if (bt == T_INT) {
2230         umov(dst, vtmp, S, 0);
2231       } else {
2232         smov(dst, vtmp, elemType_to_regVariant(bt), 0);
2233       }
2234       cmpw(dst, isrc);
2235       cselw(dst, dst, isrc, is_min ? LT : GT);
2236     }
2237   BLOCK_COMMENT("} neon_reduce_minmax_integral");
2238 }
2239 
2240 // Vector reduction for integral type with SVE instructions.
2241 // Supported operations are Add, And, Or, Xor, Max, Min.
2242 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
2243 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
2244                                             FloatRegister src2, PRegister pg, FloatRegister tmp) {
2245   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2246   assert(pg->is_governing(), "This register has to be a governing predicate register");
2247   assert_different_registers(src1, dst);
2248   // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
2249   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2250   switch (opc) {
2251     case Op_AddReductionVI: {
2252       sve_uaddv(tmp, size, pg, src2);
2253       if (bt == T_BYTE) {
2254         smov(dst, tmp, size, 0);
2255         addw(dst, src1, dst, ext::sxtb);
2256       } else if (bt == T_SHORT) {
2257         smov(dst, tmp, size, 0);
2258         addw(dst, src1, dst, ext::sxth);
2259       } else {
2260         umov(dst, tmp, size, 0);
2261         addw(dst, dst, src1);
2262       }
2263       break;
2264     }
2265     case Op_AddReductionVL: {
2266       sve_uaddv(tmp, size, pg, src2);
2267       umov(dst, tmp, size, 0);
2268       add(dst, dst, src1);
2269       break;
2270     }
2271     case Op_AndReductionV: {
2272       sve_andv(tmp, size, pg, src2);
2273       if (bt == T_INT || bt == T_LONG) {
2274         umov(dst, tmp, size, 0);
2275       } else {
2276         smov(dst, tmp, size, 0);
2277       }
2278       if (bt == T_LONG) {
2279         andr(dst, dst, src1);
2280       } else {
2281         andw(dst, dst, src1);
2282       }
2283       break;
2284     }
2285     case Op_OrReductionV: {
2286       sve_orv(tmp, size, pg, src2);
2287       if (bt == T_INT || bt == T_LONG) {
2288         umov(dst, tmp, size, 0);
2289       } else {
2290         smov(dst, tmp, size, 0);
2291       }
2292       if (bt == T_LONG) {
2293         orr(dst, dst, src1);
2294       } else {
2295         orrw(dst, dst, src1);
2296       }
2297       break;
2298     }
2299     case Op_XorReductionV: {
2300       sve_eorv(tmp, size, pg, src2);
2301       if (bt == T_INT || bt == T_LONG) {
2302         umov(dst, tmp, size, 0);
2303       } else {
2304         smov(dst, tmp, size, 0);
2305       }
2306       if (bt == T_LONG) {
2307         eor(dst, dst, src1);
2308       } else {
2309         eorw(dst, dst, src1);
2310       }
2311       break;
2312     }
2313     case Op_MaxReductionV: {
2314       sve_smaxv(tmp, size, pg, src2);
2315       if (bt == T_INT || bt == T_LONG) {
2316         umov(dst, tmp, size, 0);
2317       } else {
2318         smov(dst, tmp, size, 0);
2319       }
2320       if (bt == T_LONG) {
2321         cmp(dst, src1);
2322         csel(dst, dst, src1, Assembler::GT);
2323       } else {
2324         cmpw(dst, src1);
2325         cselw(dst, dst, src1, Assembler::GT);
2326       }
2327       break;
2328     }
2329     case Op_MinReductionV: {
2330       sve_sminv(tmp, size, pg, src2);
2331       if (bt == T_INT || bt == T_LONG) {
2332         umov(dst, tmp, size, 0);
2333       } else {
2334         smov(dst, tmp, size, 0);
2335       }
2336       if (bt == T_LONG) {
2337         cmp(dst, src1);
2338         csel(dst, dst, src1, Assembler::LT);
2339       } else {
2340         cmpw(dst, src1);
2341         cselw(dst, dst, src1, Assembler::LT);
2342       }
2343       break;
2344     }
2345     default:
2346       assert(false, "unsupported");
2347       ShouldNotReachHere();
2348   }
2349 
2350   if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
2351     if (bt == T_BYTE) {
2352       sxtb(dst, dst);
2353     } else if (bt == T_SHORT) {
2354       sxth(dst, dst);
2355     }
2356   }
2357 }
2358 
2359 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), and
2360 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
2361 // max vector length of the basic type. Clobbers: rscratch1, rflags.
2362 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
2363   uint32_t max_vector_length = Matcher::max_vector_size(bt);
2364   assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
2365 
2366   // Set all elements to false if the input "lane_cnt" is zero.
2367   if (lane_cnt == 0) {
2368     sve_pfalse(dst);
2369     return;
2370   }
2371 
2372   SIMD_RegVariant size = elemType_to_regVariant(bt);
2373   assert(size != Q, "invalid size");
2374 
2375   // Set all true if "lane_cnt" equals to the max lane count.
2376   if (lane_cnt == max_vector_length) {
2377     sve_ptrue(dst, size, /* ALL */ 0b11111);
2378     return;
2379   }
2380 
2381   // Fixed numbers for "ptrue".
2382   switch(lane_cnt) {
2383   case 1: /* VL1 */
2384   case 2: /* VL2 */
2385   case 3: /* VL3 */
2386   case 4: /* VL4 */
2387   case 5: /* VL5 */
2388   case 6: /* VL6 */
2389   case 7: /* VL7 */
2390   case 8: /* VL8 */
2391     sve_ptrue(dst, size, lane_cnt);
2392     return;
2393   case 16:
2394     sve_ptrue(dst, size, /* VL16 */ 0b01001);
2395     return;
2396   case 32:
2397     sve_ptrue(dst, size, /* VL32 */ 0b01010);
2398     return;
2399   case 64:
2400     sve_ptrue(dst, size, /* VL64 */ 0b01011);
2401     return;
2402   case 128:
2403     sve_ptrue(dst, size, /* VL128 */ 0b01100);
2404     return;
2405   case 256:
2406     sve_ptrue(dst, size, /* VL256 */ 0b01101);
2407     return;
2408   default:
2409     break;
2410   }
2411 
2412   // Special patterns for "ptrue".
2413   if (lane_cnt == round_down_power_of_2(max_vector_length)) {
2414     sve_ptrue(dst, size, /* POW2 */ 0b00000);
2415   } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
2416     sve_ptrue(dst, size, /* MUL4 */ 0b11101);
2417   } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
2418     sve_ptrue(dst, size, /* MUL3 */ 0b11110);
2419   } else {
2420     // Encode to "whileltw" for the remaining cases.
2421     mov(rscratch1, lane_cnt);
2422     sve_whileltw(dst, size, zr, rscratch1);
2423   }
2424 }
2425 
2426 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
2427 // Any remaining elements of dst will be filled with zero.
2428 // Clobbers: rscratch1
2429 // Preserves: src, mask
2430 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
2431                                            FloatRegister vtmp1, FloatRegister vtmp2,
2432                                            PRegister pgtmp) {
2433   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2434   assert_different_registers(dst, src, vtmp1, vtmp2);
2435   assert_different_registers(mask, pgtmp);
2436 
2437   // Example input:   src   = 8888 7777 6666 5555 4444 3333 2222 1111
2438   //                  mask  = 0001 0000 0000 0001 0001 0000 0001 0001
2439   // Expected result: dst   = 0000 0000 0000 8888 5555 4444 2222 1111
2440   sve_dup(vtmp2, H, 0);
2441 
2442   // Extend lowest half to type INT.
2443   // dst = 00004444 00003333 00002222 00001111
2444   sve_uunpklo(dst, S, src);
2445   // pgtmp = 00000001 00000000 00000001 00000001
2446   sve_punpklo(pgtmp, mask);
2447   // Pack the active elements of type INT to the right,
2448   // and fill the remaining lanes with zero.
2449   // dst = 00000000 00004444 00002222 00001111
2450   sve_compact(dst, S, dst, pgtmp);
2451   // Narrow the result back to type SHORT.
2452   // dst = 0000 0000 0000 0000 0000 4444 2222 1111
2453   sve_uzp1(dst, H, dst, vtmp2);
2454   // Count the active elements of lowest half.
2455   // rscratch1 = 3
2456   sve_cntp(rscratch1, S, ptrue, pgtmp);
2457 
2458   // Repeat to the highest half.
2459   // pgtmp = 00000001 00000000 00000000 00000001
2460   sve_punpkhi(pgtmp, mask);
2461   // vtmp1 = 00008888 00007777 00006666 00005555
2462   sve_uunpkhi(vtmp1, S, src);
2463   // vtmp1 = 00000000 00000000 00008888 00005555
2464   sve_compact(vtmp1, S, vtmp1, pgtmp);
2465   // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2466   sve_uzp1(vtmp1, H, vtmp1, vtmp2);
2467 
2468   // Compressed low:   dst   = 0000 0000 0000 0000 0000 4444 2222 1111
2469   // Compressed high:  vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2470   // Left-shift (cross-lane) the compressed high part by TRUE_CNT lanes, where
2471   // TRUE_CNT is the number of active elements in the compressed low part.
2472   neg(rscratch1, rscratch1);
2473   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2474   sve_index(vtmp2, H, rscratch1, 1);
2475   // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
2476   sve_tbl(vtmp1, H, vtmp1, vtmp2);
2477 
2478   // Combine the compressed high part (after the shift) with the compressed low part.
2479   // dst = 0000 0000 0000 8888 5555 4444 2222 1111
2480   sve_orr(dst, dst, vtmp1);
2481 }
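
     // What the sequence above computes, as a scalar sketch (documentation only),
     // for the eight 16-bit lanes shown in the example:
     //
     //   void compress_model(uint16_t dst[8], const uint16_t src[8], const bool mask[8]) {
     //     int n = 0;
     //     for (int i = 0; i < 8; i++) {
     //       if (mask[i]) dst[n++] = src[i];  // keep active elements, lowest first
     //     }
     //     while (n < 8) dst[n++] = 0;        // zero-fill the remaining lanes
     //   }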
2482 
2483 // Clobbers: rscratch1, rscratch2
2484 // Preserves: src, mask
2485 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
2486                                           FloatRegister vtmp1, FloatRegister vtmp2,
2487                                           FloatRegister vtmp3, FloatRegister vtmp4,
2488                                           PRegister ptmp, PRegister pgtmp) {
2489   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2490   assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
2491   assert_different_registers(mask, ptmp, pgtmp);
2492   // Example input:   src   = 88 77 66 55 44 33 22 11
2493   //                  mask  = 01 00 00 01 01 00 01 01
2494   // Expected result: dst   = 00 00 00 88 55 44 22 11
2495 
2496   sve_dup(vtmp4, B, 0);
2497   // Extend lowest half to type SHORT.
2498   // vtmp1 = 0044 0033 0022 0011
2499   sve_uunpklo(vtmp1, H, src);
2500   // ptmp = 0001 0000 0001 0001
2501   sve_punpklo(ptmp, mask);
2502   // Count the active elements of lowest half.
2503   // rscratch2 = 3
2504   sve_cntp(rscratch2, H, ptrue, ptmp);
2505   // Pack the active elements of type SHORT to the right,
2506   // and fill the remaining lanes with zero.
2507   // dst = 0000 0044 0022 0011
2508   sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
2509   // Narrow the result back to type BYTE.
2510   // dst = 00 00 00 00 00 44 22 11
2511   sve_uzp1(dst, B, dst, vtmp4);
2512 
2513   // Repeat to the highest half.
2514   // ptmp = 0001 0000 0000 0001
2515   sve_punpkhi(ptmp, mask);
2516   // vtmp2 = 0088 0077 0066 0055
2517   sve_uunpkhi(vtmp2, H, src);
2518   // vtmp1 = 0000 0000 0088 0055
2519   sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);
2520 
2521   sve_dup(vtmp4, B, 0);
2522   // vtmp1 = 00 00 00 00 00 00 88 55
2523   sve_uzp1(vtmp1, B, vtmp1, vtmp4);
2524 
2525   // Compressed low:   dst   = 00 00 00 00 00 44 22 11
2526   // Compressed high:  vtmp1 = 00 00 00 00 00 00 88 55
2527   // Left-shift (cross-lane) the compressed high part by TRUE_CNT lanes, where
2528   // TRUE_CNT is the number of active elements in the compressed low part.
2529   neg(rscratch2, rscratch2);
2530   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2531   sve_index(vtmp2, B, rscratch2, 1);
2532   // vtmp1 = 00 00 00 88 55 00 00 00
2533   sve_tbl(vtmp1, B, vtmp1, vtmp2);
2534   // Combine the compressed high part (after the shift) with the compressed low part.
2535   // dst = 00 00 00 88 55 44 22 11
2536   sve_orr(dst, dst, vtmp1);
2537 }
2538 
2539 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2540   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2541   SIMD_Arrangement size = isQ ? T16B : T8B;
2542   if (bt == T_BYTE) {
2543     rbit(dst, size, src);
2544   } else {
2545     neon_reverse_bytes(dst, src, bt, isQ);
2546     rbit(dst, size, dst);
2547   }
2548 }
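
     // Scalar model of the decomposition above (documentation only): reversing
     // the bits of an element equals reversing its bytes (rev16/rev32/rev64),
     // then reversing the bits within each byte (rbit).
     //
     //   uint32_t reverse_bits32_model(uint32_t x) {
     //     uint32_t r = 0;
     //     for (int i = 0; i < 32; i++) {
     //       r |= ((x >> i) & 1u) << (31 - i);
     //     }
     //     return r;
     //   }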
2549 
2550 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2551   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2552   SIMD_Arrangement size = isQ ? T16B : T8B;
2553   switch (bt) {
2554     case T_BYTE:
2555       if (dst != src) {
2556         orr(dst, size, src, src);
2557       }
2558       break;
2559     case T_SHORT:
2560       rev16(dst, size, src);
2561       break;
2562     case T_INT:
2563       rev32(dst, size, src);
2564       break;
2565     case T_LONG:
2566       rev64(dst, size, src);
2567       break;
2568     default:
2569       assert(false, "unsupported");
2570       ShouldNotReachHere();
2571   }
2572 }
2573 
2574 // VectorRearrange implementation for short/int/float/long/double types with NEON
2575 // instructions. For VectorRearrange short/int/float, we use the NEON tbl instruction.
2576 // Since it supports byte-table lookups only, we look up 2/4 bytes as a group.
2577 // For VectorRearrange long/double, we compare the shuffle input with iota indices,
2578 // and use bsl to implement the operation.
2579 void C2_MacroAssembler::neon_rearrange_hsd(FloatRegister dst, FloatRegister src,
2580                                            FloatRegister shuffle, FloatRegister tmp,
2581                                            BasicType bt, bool isQ) {
2582   assert_different_registers(dst, src, shuffle, tmp);
2583   SIMD_Arrangement size1 = isQ ? T16B : T8B;
2584   SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2585 
2586   // Here is an example that rearranges a NEON vector with 4 ints:
2587   // Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1]
2588   //   1. We assume the shuffle input is Vi int[2, 3, 0, 1].
2589   //   2. Multiply Vi int[2, 3, 0, 1] with constant int vector
2590   //      [0x04040404, 0x04040404, 0x04040404, 0x04040404], and get
2591   //      tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404].
2592   //   3. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100],
2593   //      and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504]
2594   //   4. Use Vm as index register, and use V1 as table register.
2595   //      Then get V2 as the result by tbl NEON instructions.
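       //
       // In scalar form (documentation only), for an element size of esize bytes
       // the byte-level tbl index is:
       //   byte_index[esize * i + j] = esize * shuffle[i] + j,  for j in [0, esize)
       // which is exactly the mulv by esize followed by the addv of 0x03020100/0x0100.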
2596   switch (bt) {
2597     case T_SHORT:
2598       mov(tmp, size1, 0x02);
2599       mulv(dst, size2, shuffle, tmp);
2600       mov(tmp, size2, 0x0100);
2601       addv(dst, size1, dst, tmp);
2602       tbl(dst, size1, src, 1, dst);
2603       break;
2604     case T_INT:
2605     case T_FLOAT:
2606       mov(tmp, size1, 0x04);
2607       mulv(dst, size2, shuffle, tmp);
2608       mov(tmp, size2, 0x03020100);
2609       addv(dst, size1, dst, tmp);
2610       tbl(dst, size1, src, 1, dst);
2611       break;
2612     case T_LONG:
2613     case T_DOUBLE:
2614       // Load the iota indices for the Long type. The indices are ordered by
2615       // type B/S/I/L/F/D, and the offset between two types is 16; hence
2616       // the offset for L is 48.
2617       lea(rscratch1,
2618           ExternalAddress(StubRoutines::aarch64::vector_iota_indices() + 48));
2619       ldrq(tmp, rscratch1);
2620       // Check whether the input "shuffle" is the same as the iota indices.
2621       // Return "src" if so, otherwise swap the two elements of "src".
2622       cm(EQ, dst, size2, shuffle, tmp);
2623       ext(tmp, size1, src, src, 8);
2624       bsl(dst, size1, src, tmp);
2625       break;
2626     default:
2627       assert(false, "unsupported element type");
2628       ShouldNotReachHere();
2629   }
2630 }
2631 
2632 // Extract a scalar element from an SVE vector at position 'idx'.
2633 // The input elements in src are expected to be of integral type.
2634 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2635                                              int idx, FloatRegister vtmp) {
2636   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2637   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2638   if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
2639     if (bt == T_INT || bt == T_LONG) {
2640       umov(dst, src, size, idx);
2641     } else {
2642       smov(dst, src, size, idx);
2643     }
2644   } else {
2645     sve_orr(vtmp, src, src);
2646     sve_ext(vtmp, vtmp, idx << size);
2647     if (bt == T_INT || bt == T_LONG) {
2648       umov(dst, vtmp, size, 0);
2649     } else {
2650       smov(dst, vtmp, size, 0);
2651     }
2652   }
2653 }
2654 
2655 // java.lang.Math::round intrinsics
2656 
2657 // Clobbers: rscratch1, rflags
2658 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2659                                           FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
2660   assert_different_registers(tmp1, tmp2, tmp3, src, dst);
2661   switch (T) {
2662     case T2S:
2663     case T4S:
2664       fmovs(tmp1, T, 0.5f);
2665       mov(rscratch1, jint_cast(0x1.0p23f));
2666       break;
2667     case T2D:
2668       fmovd(tmp1, T, 0.5);
2669       mov(rscratch1, julong_cast(0x1.0p52));
2670       break;
2671     default:
2672       assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2673   }
2674   fadd(tmp1, T, tmp1, src);
2675   fcvtms(tmp1, T, tmp1);
2676   // tmp1 = floor(src + 0.5, ties to even)
2677 
2678   fcvtas(dst, T, src);
2679   // dst = round(src), ties to away
2680 
2681   fneg(tmp3, T, src);
2682   dup(tmp2, T, rscratch1);
2683   cm(HS, tmp3, T, tmp3, tmp2);
2684   // tmp3 is now a per-lane mask of all ones or all zeros
2685 
2686   bif(dst, T16B, tmp1, tmp3);
2687   // result in dst
2688 }
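
     // A scalar sketch of the selection above (documentation only; float_bits
     // and fcvtas are hypothetical stand-ins for the bit pattern and the
     // instruction):
     //
     //   int round_model(float x) {
     //     if (float_bits(-x) >= float_bits(0x1.0p23f)) {  // unsigned compare, as cm(HS)
     //       return fcvtas(x);            // round to nearest, ties away from zero
     //     }
     //     return (int)floorf(x + 0.5f);  // the fadd + fcvtms path
     //   }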
2689 
2690 // Clobbers: rscratch1, rflags
2691 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2692                                          FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
2693   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2694   assert_different_registers(tmp1, tmp2, src, dst);
2695 
2696   switch (T) {
2697     case S:
2698       mov(rscratch1, jint_cast(0x1.0p23f));
2699       break;
2700     case D:
2701       mov(rscratch1, julong_cast(0x1.0p52));
2702       break;
2703     default:
2704       assert(T == S || T == D, "invalid register variant");
2705   }
2706 
2707   sve_frinta(dst, T, ptrue, src);
2708   // dst = round(src), ties to away
2709 
2710   Label none;
2711 
2712   sve_fneg(tmp1, T, ptrue, src);
2713   sve_dup(tmp2, T, rscratch1);
2714   sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
2715   br(EQ, none);
2716   {
2717     sve_cpy(tmp1, T, pgtmp, 0.5);
2718     sve_fadd(tmp1, T, pgtmp, src);
2719     sve_frintm(dst, T, pgtmp, tmp1);
2720     // dst = floor(src + 0.5, ties to even)
2721   }
2722   bind(none);
2723 
2724   sve_fcvtzs(dst, T, ptrue, dst, T);
2725   // result in dst
2726 }
2727 
2728 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2729                                            FloatRegister one, SIMD_Arrangement T) {
2730   assert_different_registers(dst, src, zero, one);
2731   assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2732 
2733   facgt(dst, T, src, zero);
2734   ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
2735   bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2736 }
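
     // Scalar statement of the semantics implemented above (documentation only),
     // matching java.lang.Math::signum:
     //
     //   float signum_model(float x) {
     //     if (x != x || x == 0.0f) return x;  // NaN and +/-0.0 pass through
     //     return x > 0.0f ? 1.0f : -1.0f;     // otherwise +/-1.0 with x's sign
     //   }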
2737 
2738 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2739                                           FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
2740     assert_different_registers(dst, src, zero, one, vtmp);
2741     assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2742 
2743     sve_orr(vtmp, src, src);
2744     sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise
2745     switch (T) {
2746     case S:
2747       sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
2748       sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
2749                                         // on the sign of the float value
2750       break;
2751     case D:
2752       sve_and(vtmp, T, min_jlong);
2753       sve_orr(vtmp, T, jlong_cast(1.0));
2754       break;
2755     default:
2756       assert(false, "unsupported");
2757       ShouldNotReachHere();
2758     }
2759     sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
2760                                        // Result in dst
2761 }
2762 
2763 bool C2_MacroAssembler::in_scratch_emit_size() {
2764   if (ciEnv::current()->task() != nullptr) {
2765     PhaseOutput* phase_output = Compile::current()->output();
2766     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2767       return true;
2768     }
2769   }
2770   return MacroAssembler::in_scratch_emit_size();
2771 }