1 /*
   2  * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "asm/assembler.hpp"
  26 #include "asm/assembler.inline.hpp"
  27 #include "opto/c2_MacroAssembler.hpp"
  28 #include "opto/compile.hpp"
  29 #include "opto/intrinsicnode.hpp"
  30 #include "opto/matcher.hpp"
  31 #include "opto/output.hpp"
  32 #include "opto/subnode.hpp"
  33 #include "runtime/stubRoutines.hpp"
  34 #include "utilities/globalDefinitions.hpp"
  35 #include "utilities/powerOfTwo.hpp"
  36 
  37 #ifdef PRODUCT
  38 #define BLOCK_COMMENT(str) /* nothing */
  39 #define STOP(error) stop(error)
  40 #else
  41 #define BLOCK_COMMENT(str) block_comment(str)
  42 #define STOP(error) block_comment(error); stop(error)
  43 #endif
  44 
  45 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  46 
  47 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
  48 
  49 // jdk.internal.util.ArraysSupport.vectorizedHashCode
  50 address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
  51                                            FloatRegister vdata0, FloatRegister vdata1,
  52                                            FloatRegister vdata2, FloatRegister vdata3,
  53                                            FloatRegister vmul0, FloatRegister vmul1,
  54                                            FloatRegister vmul2, FloatRegister vmul3,
  55                                            FloatRegister vpow, FloatRegister vpowm,
  56                                            BasicType eltype) {
  57   ARRAYS_HASHCODE_REGISTERS;
  58 
  59   Register tmp1 = rscratch1, tmp2 = rscratch2;
  60 
  61   Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE;
  62 
  63   // Vectorization factor. Number of array elements loaded to one SIMD&FP registers by the stubs. We
  64   // use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible to
  65   // use 4H for chars and shorts instead, but using 8H gives better performance.
  66   const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8
  67                     : eltype == T_CHAR || eltype == T_SHORT ? 8
  68                     : eltype == T_INT                       ? 4
  69                                                             : 0;
  70   guarantee(vf, "unsupported eltype");
  71 
  72   // Unroll factor for the scalar loop below. The value is chosen based on performance analysis.
  73   const size_t unroll_factor = 4;
  74 
  75   switch (eltype) {
  76   case T_BOOLEAN:
  77     BLOCK_COMMENT("arrays_hashcode(unsigned byte) {");
  78     break;
  79   case T_CHAR:
  80     BLOCK_COMMENT("arrays_hashcode(char) {");
  81     break;
  82   case T_BYTE:
  83     BLOCK_COMMENT("arrays_hashcode(byte) {");
  84     break;
  85   case T_SHORT:
  86     BLOCK_COMMENT("arrays_hashcode(short) {");
  87     break;
  88   case T_INT:
  89     BLOCK_COMMENT("arrays_hashcode(int) {");
  90     break;
  91   default:
  92     ShouldNotReachHere();
  93   }
  94 
  95   // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop
  96   // implemented by the stub executes just once. Call the stub only if at least two iterations will
  97   // be executed.
  98   const size_t large_threshold = eltype == T_INT ? vf * 2 : vf;
  99   cmpw(cnt, large_threshold);
 100   br(Assembler::HS, LARGE);
 101 
 102   bind(TAIL);
 103 
 104   // The andr performs cnt % uf where uf = unroll_factor. The subtract shifted by 3 offsets past
 105   // uf - (cnt % uf) pairs of load + madd insns i.e. it only executes cnt % uf load + madd pairs.
 106   // Iteration eats up the remainder, uf elements at a time.
 107   assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC");
 108   andr(tmp2, cnt, unroll_factor - 1);
 109   adr(tmp1, BR_BASE);
 110   // For Cortex-A53 offset is 4 because 2 nops are generated.
 111   sub(tmp1, tmp1, tmp2, ext::sxtw, VM_Version::supports_a53mac() ? 4 : 3);
 112   movw(tmp2, 0x1f);
 113   br(tmp1);
 114 
 115   bind(LOOP);
 116   for (size_t i = 0; i < unroll_factor; ++i) {
 117     load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype);
 118     maddw(result, result, tmp2, tmp1);
 119     // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
 120     // Generate 2nd nop to have 4 instructions per iteration.
 121     if (VM_Version::supports_a53mac()) {
 122       nop();
 123     }
 124   }
 125   bind(BR_BASE);
 126   subsw(cnt, cnt, unroll_factor);
 127   br(Assembler::HS, LOOP);
 128 
 129   b(DONE);
 130 
 131   bind(LARGE);
 132 
 133   RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype));
 134   assert(stub.target() != nullptr, "array_hashcode stub has not been generated");
 135   address tpc = trampoline_call(stub);
 136   if (tpc == nullptr) {
 137     DEBUG_ONLY(reset_labels(TAIL, BR_BASE));
 138     postcond(pc() == badAddress);
 139     return nullptr;
 140   }
 141 
 142   bind(DONE);
 143 
 144   BLOCK_COMMENT("} // arrays_hashcode");
 145 
 146   postcond(pc() != badAddress);
 147   return pc();
 148 }
 149 
 150 void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg,
 151                                   Register tmp2Reg, Register tmp3Reg) {
 152   Register oop = objectReg;
 153   Register box = boxReg;
 154   Register disp_hdr = tmpReg;
 155   Register tmp = tmp2Reg;
 156   Label cont;
 157   Label object_has_monitor;
 158   Label count, no_count;
 159 
 160   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
 161   assert_different_registers(oop, box, tmp, disp_hdr, rscratch2);
 162 
 163   // Load markWord from object into displaced_header.
 164   ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));
 165 
 166   if (DiagnoseSyncOnValueBasedClasses != 0) {
 167     load_klass(tmp, oop);
 168     ldrb(tmp, Address(tmp, Klass::misc_flags_offset()));
 169     tst(tmp, KlassFlags::_misc_is_value_based_class);
 170     br(Assembler::NE, cont);
 171   }
 172 
 173   // Check for existing monitor
 174   tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor);
 175 
 176   if (LockingMode == LM_MONITOR) {
 177     tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
 178     b(cont);
 179   } else {
 180     assert(LockingMode == LM_LEGACY, "must be");
 181     // Set tmp to be (markWord of object | UNLOCK_VALUE).
 182     orr(tmp, disp_hdr, markWord::unlocked_value);
 183 
 184     // Initialize the box. (Must happen before we update the object mark!)
 185     str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 186 
 187     // Compare object markWord with an unlocked value (tmp) and if
 188     // equal exchange the stack address of our box with object markWord.
 189     // On failure disp_hdr contains the possibly locked markWord.
 190     cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true,
 191             /*release*/ true, /*weak*/ false, disp_hdr);
 192     br(Assembler::EQ, cont);
 193 
 194     assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
 195 
 196     // If the compare-and-exchange succeeded, then we found an unlocked
 197     // object, will have now locked it will continue at label cont
 198 
 199     // Check if the owner is self by comparing the value in the
 200     // markWord of object (disp_hdr) with the stack pointer.
 201     mov(rscratch1, sp);
 202     sub(disp_hdr, disp_hdr, rscratch1);
 203     mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place));
 204     // If condition is true we are cont and hence we can store 0 as the
 205     // displaced header in the box, which indicates that it is a recursive lock.
 206     ands(tmp/*==0?*/, disp_hdr, tmp);   // Sets flags for result
 207     str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 208     b(cont);
 209   }
 210 
 211   // Handle existing monitor.
 212   bind(object_has_monitor);
 213 
 214   // Try to CAS owner (no owner => current thread's _monitor_owner_id).
 215   ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
 216   add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value));
 217   cmpxchg(tmp, zr, rscratch2, Assembler::xword, /*acquire*/ true,
 218           /*release*/ true, /*weak*/ false, tmp3Reg); // Sets flags for result
 219 
 220   // Store a non-null value into the box to avoid looking like a re-entrant
 221   // lock. The fast-path monitor unlock code checks for
 222   // markWord::monitor_value so use markWord::unused_mark which has the
 223   // relevant bit set, and also matches ObjectSynchronizer::enter.
 224   mov(tmp, (address)markWord::unused_mark().value());
 225   str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 226 
 227   br(Assembler::EQ, cont); // CAS success means locking succeeded
 228 
 229   cmp(tmp3Reg, rscratch2);
 230   br(Assembler::NE, cont); // Check for recursive locking
 231 
 232   // Recursive lock case
 233   increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1);
 234   // flag == EQ still from the cmp above, checking if this is a reentrant lock
 235 
 236   bind(cont);
 237   // flag == EQ indicates success
 238   // flag == NE indicates failure
 239   br(Assembler::NE, no_count);
 240 
 241   bind(count);
 242   if (LockingMode == LM_LEGACY) {
 243     inc_held_monitor_count(rscratch1);
 244   }
 245 
 246   bind(no_count);
 247 }
 248 
 249 void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg,
 250                                     Register tmp2Reg) {
 251   Register oop = objectReg;
 252   Register box = boxReg;
 253   Register disp_hdr = tmpReg;
 254   Register owner_addr = tmpReg;
 255   Register tmp = tmp2Reg;
 256   Label cont;
 257   Label object_has_monitor;
 258   Label count, no_count;
 259   Label unlocked;
 260 
 261   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 262   assert_different_registers(oop, box, tmp, disp_hdr);
 263 
 264   if (LockingMode == LM_LEGACY) {
 265     // Find the lock address and load the displaced header from the stack.
 266     ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 267 
 268     // If the displaced header is 0, we have a recursive unlock.
 269     cmp(disp_hdr, zr);
 270     br(Assembler::EQ, cont);
 271   }
 272 
 273   // Handle existing monitor.
 274   ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
 275   tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor);
 276 
 277   if (LockingMode == LM_MONITOR) {
 278     tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
 279     b(cont);
 280   } else {
 281     assert(LockingMode == LM_LEGACY, "must be");
 282     // Check if it is still a light weight lock, this is is true if we
 283     // see the stack address of the basicLock in the markWord of the
 284     // object.
 285 
 286     cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false,
 287             /*release*/ true, /*weak*/ false, tmp);
 288     b(cont);
 289   }
 290 
 291   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
 292 
 293   // Handle existing monitor.
 294   bind(object_has_monitor);
 295   STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
 296   add(tmp, tmp, -(int)markWord::monitor_value); // monitor
 297 
 298   ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
 299 
 300   Label notRecursive;
 301   cbz(disp_hdr, notRecursive);
 302 
 303   // Recursive lock
 304   sub(disp_hdr, disp_hdr, 1u);
 305   str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
 306   cmp(disp_hdr, disp_hdr); // Sets flags for result
 307   b(cont);
 308 
 309   bind(notRecursive);
 310 
 311   // Compute owner address.
 312   lea(owner_addr, Address(tmp, ObjectMonitor::owner_offset()));
 313 
 314   // Set owner to null.
 315   // Release to satisfy the JMM
 316   stlr(zr, owner_addr);
 317   // We need a full fence after clearing owner to avoid stranding.
 318   // StoreLoad achieves this.
 319   membar(StoreLoad);
 320 
 321   // Check if the entry_list is empty.
 322   ldr(rscratch1, Address(tmp, ObjectMonitor::entry_list_offset()));
 323   cmp(rscratch1, zr);
 324   br(Assembler::EQ, cont);     // If so we are done.
 325 
 326   // Check if there is a successor.
 327   ldr(rscratch1, Address(tmp, ObjectMonitor::succ_offset()));
 328   cmp(rscratch1, zr);
 329   br(Assembler::NE, unlocked); // If so we are done.
 330 
 331   // Save the monitor pointer in the current thread, so we can try to
 332   // reacquire the lock in SharedRuntime::monitor_exit_helper().
 333   str(tmp, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));
 334 
 335   cmp(zr, rthread); // Set Flag to NE => slow path
 336   b(cont);
 337 
 338   bind(unlocked);
 339   cmp(zr, zr); // Set Flag to EQ => fast path
 340 
 341   // Intentional fall-through
 342 
 343   bind(cont);
 344   // flag == EQ indicates success
 345   // flag == NE indicates failure
 346   br(Assembler::NE, no_count);
 347 
 348   bind(count);
 349   if (LockingMode == LM_LEGACY) {
 350     dec_held_monitor_count(rscratch1);
 351   }
 352 
 353   bind(no_count);
 354 }
 355 
 356 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register t1,
 357                                               Register t2, Register t3) {
 358   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 359   assert_different_registers(obj, box, t1, t2, t3, rscratch2);
 360 
 361   // Handle inflated monitor.
 362   Label inflated;
 363   // Finish fast lock successfully. MUST branch to with flag == EQ
 364   Label locked;
 365   // Finish fast lock unsuccessfully. MUST branch to with flag == NE
 366   Label slow_path;
 367 
 368   if (UseObjectMonitorTable) {
 369     // Clear cache in case fast locking succeeds or we need to take the slow-path.
 370     str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 371   }
 372 
 373   if (DiagnoseSyncOnValueBasedClasses != 0) {
 374     load_klass(t1, obj);
 375     ldrb(t1, Address(t1, Klass::misc_flags_offset()));
 376     tst(t1, KlassFlags::_misc_is_value_based_class);
 377     br(Assembler::NE, slow_path);
 378   }
 379 
 380   const Register t1_mark = t1;
 381   const Register t3_t = t3;
 382 
 383   { // Lightweight locking
 384 
 385     // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ
 386     Label push;
 387 
 388     const Register t2_top = t2;
 389 
 390     // Check if lock-stack is full.
 391     ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 392     cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
 393     br(Assembler::GT, slow_path);
 394 
 395     // Check if recursive.
 396     subw(t3_t, t2_top, oopSize);
 397     ldr(t3_t, Address(rthread, t3_t));
 398     cmp(obj, t3_t);
 399     br(Assembler::EQ, push);
 400 
 401     // Relaxed normal load to check for monitor. Optimization for monitor case.
 402     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 403     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 404 
 405     // Not inflated
 406     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");
 407 
 408     // Try to lock. Transition lock-bits 0b01 => 0b00
 409     orr(t1_mark, t1_mark, markWord::unlocked_value);
 410     eor(t3_t, t1_mark, markWord::unlocked_value);
 411     cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
 412             /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
 413     br(Assembler::NE, slow_path);
 414 
 415     bind(push);
 416     // After successful lock, push object on lock-stack.
 417     str(obj, Address(rthread, t2_top));
 418     addw(t2_top, t2_top, oopSize);
 419     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 420     b(locked);
 421   }
 422 
 423   { // Handle inflated monitor.
 424     bind(inflated);
 425 
 426     const Register t1_monitor = t1;
 427 
 428     if (!UseObjectMonitorTable) {
 429       assert(t1_monitor == t1_mark, "should be the same here");
 430     } else {
 431       Label monitor_found;
 432 
 433       // Load cache address
 434       lea(t3_t, Address(rthread, JavaThread::om_cache_oops_offset()));
 435 
 436       const int num_unrolled = 2;
 437       for (int i = 0; i < num_unrolled; i++) {
 438         ldr(t1, Address(t3_t));
 439         cmp(obj, t1);
 440         br(Assembler::EQ, monitor_found);
 441         increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
 442       }
 443 
 444       Label loop;
 445 
 446       // Search for obj in cache.
 447       bind(loop);
 448 
 449       // Check for match.
 450       ldr(t1, Address(t3_t));
 451       cmp(obj, t1);
 452       br(Assembler::EQ, monitor_found);
 453 
 454       // Search until null encountered, guaranteed _null_sentinel at end.
 455       increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
 456       cbnz(t1, loop);
 457       // Cache Miss, NE set from cmp above, cbnz does not set flags
 458       b(slow_path);
 459 
 460       bind(monitor_found);
 461       ldr(t1_monitor, Address(t3_t, OMCache::oop_to_monitor_difference()));
 462     }
 463 
 464     const Register t2_owner_addr = t2;
 465     const Register t3_owner = t3;
 466     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 467     const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
 468     const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);
 469 
 470     Label monitor_locked;
 471 
 472     // Compute owner address.
 473     lea(t2_owner_addr, owner_address);
 474 
 475     // Try to CAS owner (no owner => current thread's _monitor_owner_id).
 476     ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
 477     cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, /*acquire*/ true,
 478             /*release*/ false, /*weak*/ false, t3_owner);
 479     br(Assembler::EQ, monitor_locked);
 480 
 481     // Check if recursive.
 482     cmp(t3_owner, rscratch2);
 483     br(Assembler::NE, slow_path);
 484 
 485     // Recursive.
 486     increment(recursions_address, 1);
 487 
 488     bind(monitor_locked);
 489     if (UseObjectMonitorTable) {
 490       str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 491     }
 492   }
 493 
 494   bind(locked);
 495 
 496 #ifdef ASSERT
 497   // Check that locked label is reached with Flags == EQ.
 498   Label flag_correct;
 499   br(Assembler::EQ, flag_correct);
 500   stop("Fast Lock Flag != EQ");
 501 #endif
 502 
 503   bind(slow_path);
 504 #ifdef ASSERT
 505   // Check that slow_path label is reached with Flags == NE.
 506   br(Assembler::NE, flag_correct);
 507   stop("Fast Lock Flag != NE");
 508   bind(flag_correct);
 509 #endif
 510   // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
 511 }
 512 
 513 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register box, Register t1,
 514                                                 Register t2, Register t3) {
 515   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 516   assert_different_registers(obj, box, t1, t2, t3);
 517 
 518   // Handle inflated monitor.
 519   Label inflated, inflated_load_mark;
 520   // Finish fast unlock successfully. MUST branch to with flag == EQ
 521   Label unlocked;
 522   // Finish fast unlock unsuccessfully. MUST branch to with flag == NE
 523   Label slow_path;
 524 
 525   const Register t1_mark = t1;
 526   const Register t2_top = t2;
 527   const Register t3_t = t3;
 528 
 529   { // Lightweight unlock
 530 
 531     Label push_and_slow_path;
 532 
 533     // Check if obj is top of lock-stack.
 534     ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 535     subw(t2_top, t2_top, oopSize);
 536     ldr(t3_t, Address(rthread, t2_top));
 537     cmp(obj, t3_t);
 538     // Top of lock stack was not obj. Must be monitor.
 539     br(Assembler::NE, inflated_load_mark);
 540 
 541     // Pop lock-stack.
 542     DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
 543     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 544 
 545     // Check if recursive.
 546     subw(t3_t, t2_top, oopSize);
 547     ldr(t3_t, Address(rthread, t3_t));
 548     cmp(obj, t3_t);
 549     br(Assembler::EQ, unlocked);
 550 
 551     // Not recursive.
 552     // Load Mark.
 553     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 554 
 555     // Check header for monitor (0b10).
 556     // Because we got here by popping (meaning we pushed in locked)
 557     // there will be no monitor in the box. So we need to push back the obj
 558     // so that the runtime can fix any potential anonymous owner.
 559     tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);
 560 
 561     // Try to unlock. Transition lock bits 0b00 => 0b01
 562     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
 563     orr(t3_t, t1_mark, markWord::unlocked_value);
 564     cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
 565             /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
 566     br(Assembler::EQ, unlocked);
 567 
 568     bind(push_and_slow_path);
 569     // Compare and exchange failed.
 570     // Restore lock-stack and handle the unlock in runtime.
 571     DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
 572     addw(t2_top, t2_top, oopSize);
 573     str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 574     b(slow_path);
 575   }
 576 
 577 
 578   { // Handle inflated monitor.
 579     bind(inflated_load_mark);
 580     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 581 #ifdef ASSERT
 582     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 583     stop("Fast Unlock not monitor");
 584 #endif
 585 
 586     bind(inflated);
 587 
 588 #ifdef ASSERT
 589     Label check_done;
 590     subw(t2_top, t2_top, oopSize);
 591     cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
 592     br(Assembler::LT, check_done);
 593     ldr(t3_t, Address(rthread, t2_top));
 594     cmp(obj, t3_t);
 595     br(Assembler::NE, inflated);
 596     stop("Fast Unlock lock on stack");
 597     bind(check_done);
 598 #endif
 599 
 600     const Register t1_monitor = t1;
 601 
 602     if (!UseObjectMonitorTable) {
 603       assert(t1_monitor == t1_mark, "should be the same here");
 604 
 605       // Untag the monitor.
 606       add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
 607     } else {
 608       ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 609       // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
 610       cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
 611       br(Assembler::LO, slow_path);
 612     }
 613 
 614     const Register t2_recursions = t2;
 615     Label not_recursive;
 616 
 617     // Check if recursive.
 618     ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
 619     cbz(t2_recursions, not_recursive);
 620 
 621     // Recursive unlock.
 622     sub(t2_recursions, t2_recursions, 1u);
 623     str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
 624     // Set flag == EQ
 625     cmp(t2_recursions, t2_recursions);
 626     b(unlocked);
 627 
 628     bind(not_recursive);
 629 
 630     const Register t2_owner_addr = t2;
 631 
 632     // Compute owner address.
 633     lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));
 634 
 635     // Set owner to null.
 636     // Release to satisfy the JMM
 637     stlr(zr, t2_owner_addr);
 638     // We need a full fence after clearing owner to avoid stranding.
 639     // StoreLoad achieves this.
 640     membar(StoreLoad);
 641 
 642     // Check if the entry_list is empty.
 643     ldr(rscratch1, Address(t1_monitor, ObjectMonitor::entry_list_offset()));
 644     cmp(rscratch1, zr);
 645     br(Assembler::EQ, unlocked);  // If so we are done.
 646 
 647     // Check if there is a successor.
 648     ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
 649     cmp(rscratch1, zr);
 650     br(Assembler::NE, unlocked);  // If so we are done.
 651 
 652     // Save the monitor pointer in the current thread, so we can try to
 653     // reacquire the lock in SharedRuntime::monitor_exit_helper().
 654     str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));
 655 
 656     cmp(zr, rthread); // Set Flag to NE => slow path
 657     b(slow_path);
 658   }
 659 
 660   bind(unlocked);
 661   cmp(zr, zr); // Set Flags to EQ => fast path
 662 
 663 #ifdef ASSERT
 664   // Check that unlocked label is reached with Flags == EQ.
 665   Label flag_correct;
 666   br(Assembler::EQ, flag_correct);
 667   stop("Fast Unlock Flag != EQ");
 668 #endif
 669 
 670   bind(slow_path);
 671 #ifdef ASSERT
 672   // Check that slow_path label is reached with Flags == NE.
 673   br(Assembler::NE, flag_correct);
 674   stop("Fast Unlock Flag != NE");
 675   bind(flag_correct);
 676 #endif
 677   // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
 678 }
 679 
 680 // Search for str1 in str2 and return index or -1
 681 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
 682 void C2_MacroAssembler::string_indexof(Register str2, Register str1,
 683                                        Register cnt2, Register cnt1,
 684                                        Register tmp1, Register tmp2,
 685                                        Register tmp3, Register tmp4,
 686                                        Register tmp5, Register tmp6,
 687                                        int icnt1, Register result, int ae) {
 688   // NOTE: tmp5, tmp6 can be zr depending on specific method version
 689   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
 690 
 691   Register ch1 = rscratch1;
 692   Register ch2 = rscratch2;
 693   Register cnt1tmp = tmp1;
 694   Register cnt2tmp = tmp2;
 695   Register cnt1_neg = cnt1;
 696   Register cnt2_neg = cnt2;
 697   Register result_tmp = tmp4;
 698 
 699   bool isL = ae == StrIntrinsicNode::LL;
 700 
 701   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
 702   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
 703   int str1_chr_shift = str1_isL ? 0:1;
 704   int str2_chr_shift = str2_isL ? 0:1;
 705   int str1_chr_size = str1_isL ? 1:2;
 706   int str2_chr_size = str2_isL ? 1:2;
 707   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
 708                                       (chr_insn)&MacroAssembler::ldrh;
 709   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
 710                                       (chr_insn)&MacroAssembler::ldrh;
 711   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
 712   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
 713 
 714   // Note, inline_string_indexOf() generates checks:
 715   // if (substr.count > string.count) return -1;
 716   // if (substr.count == 0) return 0;
 717 
 718   // We have two strings, a source string in str2, cnt2 and a pattern string
 719   // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
 720 
 721   // For larger pattern and source we use a simplified Boyer Moore algorithm.
 722   // With a small pattern and source we use linear scan.
 723 
 724   if (icnt1 == -1) {
 725     sub(result_tmp, cnt2, cnt1);
 726     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
 727     br(LT, LINEARSEARCH);
 728     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
 729     subs(zr, cnt1, 256);
 730     lsr(tmp1, cnt2, 2);
 731     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
 732     br(GE, LINEARSTUB);
 733   }
 734 
 735 // The Boyer Moore alogorithm is based on the description here:-
 736 //
 737 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
 738 //
 739 // This describes and algorithm with 2 shift rules. The 'Bad Character' rule
 740 // and the 'Good Suffix' rule.
 741 //
 742 // These rules are essentially heuristics for how far we can shift the
 743 // pattern along the search string.
 744 //
 745 // The implementation here uses the 'Bad Character' rule only because of the
 746 // complexity of initialisation for the 'Good Suffix' rule.
 747 //
 748 // This is also known as the Boyer-Moore-Horspool algorithm:-
 749 //
 750 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
 751 //
 752 // This particular implementation has few java-specific optimizations.
 753 //
 754 // #define ASIZE 256
 755 //
 756 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
 757 //       int i, j;
 758 //       unsigned c;
 759 //       unsigned char bc[ASIZE];
 760 //
 761 //       /* Preprocessing */
 762 //       for (i = 0; i < ASIZE; ++i)
 763 //          bc[i] = m;
 764 //       for (i = 0; i < m - 1; ) {
 765 //          c = x[i];
 766 //          ++i;
 767 //          // c < 256 for Latin1 string, so, no need for branch
 768 //          #ifdef PATTERN_STRING_IS_LATIN1
 769 //          bc[c] = m - i;
 770 //          #else
 771 //          if (c < ASIZE) bc[c] = m - i;
 772 //          #endif
 773 //       }
 774 //
 775 //       /* Searching */
 776 //       j = 0;
 777 //       while (j <= n - m) {
 778 //          c = y[i+j];
 779 //          if (x[m-1] == c)
 780 //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
 781 //          if (i < 0) return j;
 782 //          // c < 256 for Latin1 string, so, no need for branch
 783 //          #ifdef SOURCE_STRING_IS_LATIN1
 784 //          // LL case: (c< 256) always true. Remove branch
 785 //          j += bc[y[j+m-1]];
 786 //          #endif
 787 //          #ifndef PATTERN_STRING_IS_UTF
 788 //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
 789 //          if (c < ASIZE)
 790 //            j += bc[y[j+m-1]];
 791 //          else
 792 //            j += 1
 793 //          #endif
 794 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
 795 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
 796 //          if (c < ASIZE)
 797 //            j += bc[y[j+m-1]];
 798 //          else
 799 //            j += m
 800 //          #endif
 801 //       }
 802 //    }
 803 
 804   if (icnt1 == -1) {
 805     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
 806         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
 807     Register cnt1end = tmp2;
 808     Register str2end = cnt2;
 809     Register skipch = tmp2;
 810 
 811     // str1 length is >=8, so, we can read at least 1 register for cases when
 812     // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
 813     // UL case. We'll re-read last character in inner pre-loop code to have
 814     // single outer pre-loop load
 815     const int firstStep = isL ? 7 : 3;
 816 
 817     const int ASIZE = 256;
 818     const int STORED_BYTES = 32; // amount of bytes stored per instruction
 819     sub(sp, sp, ASIZE);
 820     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
 821     mov(ch1, sp);
 822     BIND(BM_INIT_LOOP);
 823       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
 824       subs(tmp5, tmp5, 1);
 825       br(GT, BM_INIT_LOOP);
 826 
 827       sub(cnt1tmp, cnt1, 1);
 828       mov(tmp5, str2);
 829       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
 830       sub(ch2, cnt1, 1);
 831       mov(tmp3, str1);
 832     BIND(BCLOOP);
 833       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
 834       if (!str1_isL) {
 835         subs(zr, ch1, ASIZE);
 836         br(HS, BCSKIP);
 837       }
 838       strb(ch2, Address(sp, ch1));
 839     BIND(BCSKIP);
 840       subs(ch2, ch2, 1);
 841       br(GT, BCLOOP);
 842 
 843       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
 844       if (str1_isL == str2_isL) {
 845         // load last 8 bytes (8LL/4UU symbols)
 846         ldr(tmp6, Address(tmp6, -wordSize));
 847       } else {
 848         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
 849         // convert Latin1 to UTF. We'll have to wait until load completed, but
 850         // it's still faster than per-character loads+checks
 851         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
 852         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
 853         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
 854         andr(tmp6, tmp6, 0xFF); // str1[N-4]
 855         orr(ch2, ch1, ch2, LSL, 16);
 856         orr(tmp6, tmp6, tmp3, LSL, 48);
 857         orr(tmp6, tmp6, ch2, LSL, 16);
 858       }
 859     BIND(BMLOOPSTR2);
 860       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 861       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
 862       if (str1_isL == str2_isL) {
 863         // re-init tmp3. It's for free because it's executed in parallel with
 864         // load above. Alternative is to initialize it before loop, but it'll
 865         // affect performance on in-order systems with 2 or more ld/st pipelines
 866         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
 867       }
 868       if (!isL) { // UU/UL case
 869         lsl(ch2, cnt1tmp, 1); // offset in bytes
 870       }
 871       cmp(tmp3, skipch);
 872       br(NE, BMSKIP);
 873       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
 874       mov(ch1, tmp6);
 875       if (isL) {
 876         b(BMLOOPSTR1_AFTER_LOAD);
 877       } else {
 878         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
 879         b(BMLOOPSTR1_CMP);
 880       }
 881     BIND(BMLOOPSTR1);
 882       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
 883       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 884     BIND(BMLOOPSTR1_AFTER_LOAD);
 885       subs(cnt1tmp, cnt1tmp, 1);
 886       br(LT, BMLOOPSTR1_LASTCMP);
 887     BIND(BMLOOPSTR1_CMP);
 888       cmp(ch1, ch2);
 889       br(EQ, BMLOOPSTR1);
 890     BIND(BMSKIP);
 891       if (!isL) {
 892         // if we've met UTF symbol while searching Latin1 pattern, then we can
 893         // skip cnt1 symbols
 894         if (str1_isL != str2_isL) {
 895           mov(result_tmp, cnt1);
 896         } else {
 897           mov(result_tmp, 1);
 898         }
 899         subs(zr, skipch, ASIZE);
 900         br(HS, BMADV);
 901       }
 902       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
 903     BIND(BMADV);
 904       sub(cnt1tmp, cnt1, 1);
 905       add(str2, str2, result_tmp, LSL, str2_chr_shift);
 906       cmp(str2, str2end);
 907       br(LE, BMLOOPSTR2);
 908       add(sp, sp, ASIZE);
 909       b(NOMATCH);
 910     BIND(BMLOOPSTR1_LASTCMP);
 911       cmp(ch1, ch2);
 912       br(NE, BMSKIP);
 913     BIND(BMMATCH);
 914       sub(result, str2, tmp5);
 915       if (!str2_isL) lsr(result, result, 1);
 916       add(sp, sp, ASIZE);
 917       b(DONE);
 918 
 919     BIND(LINEARSTUB);
 920     cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
 921     br(LT, LINEAR_MEDIUM);
 922     mov(result, zr);
 923     RuntimeAddress stub = nullptr;
 924     if (isL) {
 925       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
 926       assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
 927     } else if (str1_isL) {
 928       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
 929        assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
 930     } else {
 931       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
 932       assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
 933     }
 934     address call = trampoline_call(stub);
 935     if (call == nullptr) {
 936       DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
 937       ciEnv::current()->record_failure("CodeCache is full");
 938       return;
 939     }
 940     b(DONE);
 941   }
 942 
 943   BIND(LINEARSEARCH);
 944   {
 945     Label DO1, DO2, DO3;
 946 
 947     Register str2tmp = tmp2;
 948     Register first = tmp3;
 949 
 950     if (icnt1 == -1)
 951     {
 952         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
 953 
 954         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
 955         br(LT, DOSHORT);
 956       BIND(LINEAR_MEDIUM);
 957         (this->*str1_load_1chr)(first, Address(str1));
 958         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
 959         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
 960         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 961         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 962 
 963       BIND(FIRST_LOOP);
 964         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 965         cmp(first, ch2);
 966         br(EQ, STR1_LOOP);
 967       BIND(STR2_NEXT);
 968         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 969         br(LE, FIRST_LOOP);
 970         b(NOMATCH);
 971 
 972       BIND(STR1_LOOP);
 973         adds(cnt1tmp, cnt1_neg, str1_chr_size);
 974         add(cnt2tmp, cnt2_neg, str2_chr_size);
 975         br(GE, MATCH);
 976 
 977       BIND(STR1_NEXT);
 978         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
 979         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 980         cmp(ch1, ch2);
 981         br(NE, STR2_NEXT);
 982         adds(cnt1tmp, cnt1tmp, str1_chr_size);
 983         add(cnt2tmp, cnt2tmp, str2_chr_size);
 984         br(LT, STR1_NEXT);
 985         b(MATCH);
 986 
 987       BIND(DOSHORT);
 988       if (str1_isL == str2_isL) {
 989         cmp(cnt1, (u1)2);
 990         br(LT, DO1);
 991         br(GT, DO3);
 992       }
 993     }
 994 
 995     if (icnt1 == 4) {
 996       Label CH1_LOOP;
 997 
 998         (this->*load_4chr)(ch1, str1);
 999         sub(result_tmp, cnt2, 4);
1000         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
1001         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
1002 
1003       BIND(CH1_LOOP);
1004         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
1005         cmp(ch1, ch2);
1006         br(EQ, MATCH);
1007         adds(cnt2_neg, cnt2_neg, str2_chr_size);
1008         br(LE, CH1_LOOP);
1009         b(NOMATCH);
1010       }
1011 
1012     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
1013       Label CH1_LOOP;
1014 
1015       BIND(DO2);
1016         (this->*load_2chr)(ch1, str1);
1017         if (icnt1 == 2) {
1018           sub(result_tmp, cnt2, 2);
1019         }
1020         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
1021         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
1022       BIND(CH1_LOOP);
1023         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
1024         cmp(ch1, ch2);
1025         br(EQ, MATCH);
1026         adds(cnt2_neg, cnt2_neg, str2_chr_size);
1027         br(LE, CH1_LOOP);
1028         b(NOMATCH);
1029     }
1030 
1031     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
1032       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
1033 
1034       BIND(DO3);
1035         (this->*load_2chr)(first, str1);
1036         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
1037         if (icnt1 == 3) {
1038           sub(result_tmp, cnt2, 3);
1039         }
1040         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
1041         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
1042       BIND(FIRST_LOOP);
1043         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
1044         cmpw(first, ch2);
1045         br(EQ, STR1_LOOP);
1046       BIND(STR2_NEXT);
1047         adds(cnt2_neg, cnt2_neg, str2_chr_size);
1048         br(LE, FIRST_LOOP);
1049         b(NOMATCH);
1050 
1051       BIND(STR1_LOOP);
1052         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
1053         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
1054         cmp(ch1, ch2);
1055         br(NE, STR2_NEXT);
1056         b(MATCH);
1057     }
1058 
1059     if (icnt1 == -1 || icnt1 == 1) {
1060       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
1061 
1062       BIND(DO1);
1063         (this->*str1_load_1chr)(ch1, str1);
1064         cmp(cnt2, (u1)8);
1065         br(LT, DO1_SHORT);
1066 
1067         sub(result_tmp, cnt2, 8/str2_chr_size);
1068         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
1069         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
1070         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
1071 
1072         if (str2_isL) {
1073           orr(ch1, ch1, ch1, LSL, 8);
1074         }
1075         orr(ch1, ch1, ch1, LSL, 16);
1076         orr(ch1, ch1, ch1, LSL, 32);
1077       BIND(CH1_LOOP);
1078         ldr(ch2, Address(str2, cnt2_neg));
1079         eor(ch2, ch1, ch2);
1080         sub(tmp1, ch2, tmp3);
1081         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
1082         bics(tmp1, tmp1, tmp2);
1083         br(NE, HAS_ZERO);
1084         adds(cnt2_neg, cnt2_neg, 8);
1085         br(LT, CH1_LOOP);
1086 
1087         cmp(cnt2_neg, (u1)8);
1088         mov(cnt2_neg, 0);
1089         br(LT, CH1_LOOP);
1090         b(NOMATCH);
1091 
1092       BIND(HAS_ZERO);
1093         rev(tmp1, tmp1);
1094         clz(tmp1, tmp1);
1095         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
1096         b(MATCH);
1097 
1098       BIND(DO1_SHORT);
1099         mov(result_tmp, cnt2);
1100         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
1101         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
1102       BIND(DO1_LOOP);
1103         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
1104         cmpw(ch1, ch2);
1105         br(EQ, MATCH);
1106         adds(cnt2_neg, cnt2_neg, str2_chr_size);
1107         br(LT, DO1_LOOP);
1108     }
1109   }
1110   BIND(NOMATCH);
1111     mov(result, -1);
1112     b(DONE);
1113   BIND(MATCH);
1114     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
1115   BIND(DONE);
1116 }
1117 
1118 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
1119 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
1120 
1121 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
1122                                             Register ch, Register result,
1123                                             Register tmp1, Register tmp2, Register tmp3)
1124 {
1125   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
1126   Register cnt1_neg = cnt1;
1127   Register ch1 = rscratch1;
1128   Register result_tmp = rscratch2;
1129 
1130   cbz(cnt1, NOMATCH);
1131 
1132   cmp(cnt1, (u1)4);
1133   br(LT, DO1_SHORT);
1134 
1135   orr(ch, ch, ch, LSL, 16);
1136   orr(ch, ch, ch, LSL, 32);
1137 
1138   sub(cnt1, cnt1, 4);
1139   mov(result_tmp, cnt1);
1140   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
1141   sub(cnt1_neg, zr, cnt1, LSL, 1);
1142 
1143   mov(tmp3, 0x0001000100010001);
1144 
1145   BIND(CH1_LOOP);
1146     ldr(ch1, Address(str1, cnt1_neg));
1147     eor(ch1, ch, ch1);
1148     sub(tmp1, ch1, tmp3);
1149     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
1150     bics(tmp1, tmp1, tmp2);
1151     br(NE, HAS_ZERO);
1152     adds(cnt1_neg, cnt1_neg, 8);
1153     br(LT, CH1_LOOP);
1154 
1155     cmp(cnt1_neg, (u1)8);
1156     mov(cnt1_neg, 0);
1157     br(LT, CH1_LOOP);
1158     b(NOMATCH);
1159 
1160   BIND(HAS_ZERO);
1161     rev(tmp1, tmp1);
1162     clz(tmp1, tmp1);
1163     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
1164     b(MATCH);
1165 
1166   BIND(DO1_SHORT);
1167     mov(result_tmp, cnt1);
1168     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
1169     sub(cnt1_neg, zr, cnt1, LSL, 1);
1170   BIND(DO1_LOOP);
1171     ldrh(ch1, Address(str1, cnt1_neg));
1172     cmpw(ch, ch1);
1173     br(EQ, MATCH);
1174     adds(cnt1_neg, cnt1_neg, 2);
1175     br(LT, DO1_LOOP);
1176   BIND(NOMATCH);
1177     mov(result, -1);
1178     b(DONE);
1179   BIND(MATCH);
1180     add(result, result_tmp, cnt1_neg, ASR, 1);
1181   BIND(DONE);
1182 }
1183 
1184 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
1185                                                 Register ch, Register result,
1186                                                 FloatRegister ztmp1,
1187                                                 FloatRegister ztmp2,
1188                                                 PRegister tmp_pg,
1189                                                 PRegister tmp_pdn, bool isL)
1190 {
1191   // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
1192   assert(tmp_pg->is_governing(),
1193          "this register has to be a governing predicate register");
1194 
1195   Label LOOP, MATCH, DONE, NOMATCH;
1196   Register vec_len = rscratch1;
1197   Register idx = rscratch2;
1198 
1199   SIMD_RegVariant T = (isL == true) ? B : H;
1200 
1201   cbz(cnt1, NOMATCH);
1202 
1203   // Assign the particular char throughout the vector.
1204   sve_dup(ztmp2, T, ch);
1205   if (isL) {
1206     sve_cntb(vec_len);
1207   } else {
1208     sve_cnth(vec_len);
1209   }
1210   mov(idx, 0);
1211 
1212   // Generate a predicate to control the reading of input string.
1213   sve_whilelt(tmp_pg, T, idx, cnt1);
1214 
1215   BIND(LOOP);
1216     // Read a vector of 8- or 16-bit data depending on the string type. Note
1217     // that inactive elements indicated by the predicate register won't cause
1218     // a data read from memory to the destination vector.
1219     if (isL) {
1220       sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
1221     } else {
1222       sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
1223     }
1224     add(idx, idx, vec_len);
1225 
1226     // Perform the comparison. An element of the destination predicate is set
1227     // to active if the particular char is matched.
1228     sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);
1229 
1230     // Branch if the particular char is found.
1231     br(NE, MATCH);
1232 
1233     sve_whilelt(tmp_pg, T, idx, cnt1);
1234 
1235     // Loop back if the particular char not found.
1236     br(MI, LOOP);
1237 
1238   BIND(NOMATCH);
1239     mov(result, -1);
1240     b(DONE);
1241 
1242   BIND(MATCH);
1243     // Undo the index increment.
1244     sub(idx, idx, vec_len);
1245 
1246     // Crop the vector to find its location.
1247     sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
1248     add(result, idx, -1);
1249     sve_incp(result, T, tmp_pdn);
1250   BIND(DONE);
1251 }
1252 
1253 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
1254                                             Register ch, Register result,
1255                                             Register tmp1, Register tmp2, Register tmp3)
1256 {
1257   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
1258   Register cnt1_neg = cnt1;
1259   Register ch1 = rscratch1;
1260   Register result_tmp = rscratch2;
1261 
1262   cbz(cnt1, NOMATCH);
1263 
1264   cmp(cnt1, (u1)8);
1265   br(LT, DO1_SHORT);
1266 
1267   orr(ch, ch, ch, LSL, 8);
1268   orr(ch, ch, ch, LSL, 16);
1269   orr(ch, ch, ch, LSL, 32);
1270 
1271   sub(cnt1, cnt1, 8);
1272   mov(result_tmp, cnt1);
1273   lea(str1, Address(str1, cnt1));
1274   sub(cnt1_neg, zr, cnt1);
1275 
1276   mov(tmp3, 0x0101010101010101);
1277 
1278   BIND(CH1_LOOP);
1279     ldr(ch1, Address(str1, cnt1_neg));
1280     eor(ch1, ch, ch1);
1281     sub(tmp1, ch1, tmp3);
1282     orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
1283     bics(tmp1, tmp1, tmp2);
1284     br(NE, HAS_ZERO);
1285     adds(cnt1_neg, cnt1_neg, 8);
1286     br(LT, CH1_LOOP);
1287 
1288     cmp(cnt1_neg, (u1)8);
1289     mov(cnt1_neg, 0);
1290     br(LT, CH1_LOOP);
1291     b(NOMATCH);
1292 
1293   BIND(HAS_ZERO);
1294     rev(tmp1, tmp1);
1295     clz(tmp1, tmp1);
1296     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
1297     b(MATCH);
1298 
1299   BIND(DO1_SHORT);
1300     mov(result_tmp, cnt1);
1301     lea(str1, Address(str1, cnt1));
1302     sub(cnt1_neg, zr, cnt1);
1303   BIND(DO1_LOOP);
1304     ldrb(ch1, Address(str1, cnt1_neg));
1305     cmp(ch, ch1);
1306     br(EQ, MATCH);
1307     adds(cnt1_neg, cnt1_neg, 1);
1308     br(LT, DO1_LOOP);
1309   BIND(NOMATCH);
1310     mov(result, -1);
1311     b(DONE);
1312   BIND(MATCH);
1313     add(result, result_tmp, cnt1_neg);
1314   BIND(DONE);
1315 }
1316 
1317 // Compare strings.
1318 void C2_MacroAssembler::string_compare(Register str1, Register str2,
1319     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
1320     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
1321     PRegister pgtmp1, PRegister pgtmp2, int ae) {
1322   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
1323       DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
1324       SHORT_LOOP_START, TAIL_CHECK;
1325 
1326   bool isLL = ae == StrIntrinsicNode::LL;
1327   bool isLU = ae == StrIntrinsicNode::LU;
1328   bool isUL = ae == StrIntrinsicNode::UL;
1329 
1330   // The stub threshold for LL strings is: 72 (64 + 8) chars
1331   // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
1332   // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
1333   const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);
1334 
1335   bool str1_isL = isLL || isLU;
1336   bool str2_isL = isLL || isUL;
1337 
1338   int str1_chr_shift = str1_isL ? 0 : 1;
1339   int str2_chr_shift = str2_isL ? 0 : 1;
1340   int str1_chr_size = str1_isL ? 1 : 2;
1341   int str2_chr_size = str2_isL ? 1 : 2;
1342   int minCharsInWord = isLL ? wordSize : wordSize/2;
1343 
1344   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
1345   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
1346                                       (chr_insn)&MacroAssembler::ldrh;
1347   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
1348                                       (chr_insn)&MacroAssembler::ldrh;
1349   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
1350                             (uxt_insn)&MacroAssembler::uxthw;
1351 
1352   BLOCK_COMMENT("string_compare {");
1353 
1354   // Bizarrely, the counts are passed in bytes, regardless of whether they
1355   // are L or U strings, however the result is always in characters.
1356   if (!str1_isL) asrw(cnt1, cnt1, 1);
1357   if (!str2_isL) asrw(cnt2, cnt2, 1);
1358 
1359   // Compute the minimum of the string lengths and save the difference.
1360   subsw(result, cnt1, cnt2);
1361   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
1362 
1363   // A very short string
1364   cmpw(cnt2, minCharsInWord);
1365   br(Assembler::LE, SHORT_STRING);
1366 
1367   // Compare longwords
1368   // load first parts of strings and finish initialization while loading
1369   {
1370     if (str1_isL == str2_isL) { // LL or UU
1371       ldr(tmp1, Address(str1));
1372       cmp(str1, str2);
1373       br(Assembler::EQ, DONE);
1374       ldr(tmp2, Address(str2));
1375       cmp(cnt2, stub_threshold);
1376       br(GE, STUB);
1377       subsw(cnt2, cnt2, minCharsInWord);
1378       br(EQ, TAIL_CHECK);
1379       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1380       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1381       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1382     } else if (isLU) {
1383       ldrs(vtmp, Address(str1));
1384       ldr(tmp2, Address(str2));
1385       cmp(cnt2, stub_threshold);
1386       br(GE, STUB);
1387       subw(cnt2, cnt2, 4);
1388       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1389       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1390       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1391       zip1(vtmp, T8B, vtmp, vtmpZ);
1392       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1393       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1394       add(cnt1, cnt1, 4);
1395       fmovd(tmp1, vtmp);
1396     } else { // UL case
1397       ldr(tmp1, Address(str1));
1398       ldrs(vtmp, Address(str2));
1399       cmp(cnt2, stub_threshold);
1400       br(GE, STUB);
1401       subw(cnt2, cnt2, 4);
1402       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1403       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1404       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1405       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1406       zip1(vtmp, T8B, vtmp, vtmpZ);
1407       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1408       add(cnt1, cnt1, 8);
1409       fmovd(tmp2, vtmp);
1410     }
1411     adds(cnt2, cnt2, isUL ? 4 : 8);
1412     br(GE, TAIL);
1413     eor(rscratch2, tmp1, tmp2);
1414     cbnz(rscratch2, DIFF);
1415     // main loop
1416     bind(NEXT_WORD);
1417     if (str1_isL == str2_isL) {
1418       ldr(tmp1, Address(str1, cnt2));
1419       ldr(tmp2, Address(str2, cnt2));
1420       adds(cnt2, cnt2, 8);
1421     } else if (isLU) {
1422       ldrs(vtmp, Address(str1, cnt1));
1423       ldr(tmp2, Address(str2, cnt2));
1424       add(cnt1, cnt1, 4);
1425       zip1(vtmp, T8B, vtmp, vtmpZ);
1426       fmovd(tmp1, vtmp);
1427       adds(cnt2, cnt2, 8);
1428     } else { // UL
1429       ldrs(vtmp, Address(str2, cnt2));
1430       ldr(tmp1, Address(str1, cnt1));
1431       zip1(vtmp, T8B, vtmp, vtmpZ);
1432       add(cnt1, cnt1, 8);
1433       fmovd(tmp2, vtmp);
1434       adds(cnt2, cnt2, 4);
1435     }
1436     br(GE, TAIL);
1437 
1438     eor(rscratch2, tmp1, tmp2);
1439     cbz(rscratch2, NEXT_WORD);
1440     b(DIFF);
1441     bind(TAIL);
1442     eor(rscratch2, tmp1, tmp2);
1443     cbnz(rscratch2, DIFF);
1444     // Last longword.  In the case where length == 4 we compare the
1445     // same longword twice, but that's still faster than another
1446     // conditional branch.
1447     if (str1_isL == str2_isL) {
1448       ldr(tmp1, Address(str1));
1449       ldr(tmp2, Address(str2));
1450     } else if (isLU) {
1451       ldrs(vtmp, Address(str1));
1452       ldr(tmp2, Address(str2));
1453       zip1(vtmp, T8B, vtmp, vtmpZ);
1454       fmovd(tmp1, vtmp);
1455     } else { // UL
1456       ldrs(vtmp, Address(str2));
1457       ldr(tmp1, Address(str1));
1458       zip1(vtmp, T8B, vtmp, vtmpZ);
1459       fmovd(tmp2, vtmp);
1460     }
1461     bind(TAIL_CHECK);
1462     eor(rscratch2, tmp1, tmp2);
1463     cbz(rscratch2, DONE);
1464 
1465     // Find the first different characters in the longwords and
1466     // compute their difference.
1467     bind(DIFF);
1468     rev(rscratch2, rscratch2);
1469     clz(rscratch2, rscratch2);
1470     andr(rscratch2, rscratch2, isLL ? -8 : -16);
1471     lsrv(tmp1, tmp1, rscratch2);
1472     (this->*ext_chr)(tmp1, tmp1);
1473     lsrv(tmp2, tmp2, rscratch2);
1474     (this->*ext_chr)(tmp2, tmp2);
1475     subw(result, tmp1, tmp2);
1476     b(DONE);
1477   }
1478 
1479   bind(STUB);
1480     RuntimeAddress stub = nullptr;
1481     switch(ae) {
1482       case StrIntrinsicNode::LL:
1483         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
1484         break;
1485       case StrIntrinsicNode::UU:
1486         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
1487         break;
1488       case StrIntrinsicNode::LU:
1489         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
1490         break;
1491       case StrIntrinsicNode::UL:
1492         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
1493         break;
1494       default:
1495         ShouldNotReachHere();
1496      }
1497     assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
1498     address call = trampoline_call(stub);
1499     if (call == nullptr) {
1500       DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
1501       ciEnv::current()->record_failure("CodeCache is full");
1502       return;
1503     }
1504     b(DONE);
1505 
1506   bind(SHORT_STRING);
1507   // Is the minimum length zero?
1508   cbz(cnt2, DONE);
1509   // arrange code to do most branches while loading and loading next characters
1510   // while comparing previous
1511   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1512   subs(cnt2, cnt2, 1);
1513   br(EQ, SHORT_LAST_INIT);
1514   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1515   b(SHORT_LOOP_START);
1516   bind(SHORT_LOOP);
1517   subs(cnt2, cnt2, 1);
1518   br(EQ, SHORT_LAST);
1519   bind(SHORT_LOOP_START);
1520   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
1521   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
1522   cmp(tmp1, cnt1);
1523   br(NE, SHORT_LOOP_TAIL);
1524   subs(cnt2, cnt2, 1);
1525   br(EQ, SHORT_LAST2);
1526   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1527   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1528   cmp(tmp2, rscratch1);
1529   br(EQ, SHORT_LOOP);
1530   sub(result, tmp2, rscratch1);
1531   b(DONE);
1532   bind(SHORT_LOOP_TAIL);
1533   sub(result, tmp1, cnt1);
1534   b(DONE);
1535   bind(SHORT_LAST2);
1536   cmp(tmp2, rscratch1);
1537   br(EQ, DONE);
1538   sub(result, tmp2, rscratch1);
1539 
1540   b(DONE);
1541   bind(SHORT_LAST_INIT);
1542   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1543   bind(SHORT_LAST);
1544   cmp(tmp1, cnt1);
1545   br(EQ, DONE);
1546   sub(result, tmp1, cnt1);
1547 
1548   bind(DONE);
1549 
1550   BLOCK_COMMENT("} string_compare");
1551 }
1552 
1553 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
1554                                      FloatRegister src2, Condition cond, bool isQ) {
1555   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1556   FloatRegister zn = src1, zm = src2;
1557   bool needs_negation = false;
1558   switch (cond) {
1559     case LT: cond = GT; zn = src2; zm = src1; break;
1560     case LE: cond = GE; zn = src2; zm = src1; break;
1561     case LO: cond = HI; zn = src2; zm = src1; break;
1562     case LS: cond = HS; zn = src2; zm = src1; break;
1563     case NE: cond = EQ; needs_negation = true; break;
1564     default:
1565       break;
1566   }
1567 
1568   if (is_floating_point_type(bt)) {
1569     fcm(cond, dst, size, zn, zm);
1570   } else {
1571     cm(cond, dst, size, zn, zm);
1572   }
1573 
1574   if (needs_negation) {
1575     notr(dst, isQ ? T16B : T8B, dst);
1576   }
1577 }
1578 
1579 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1580                                           Condition cond, bool isQ) {
1581   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1582   if (bt == T_FLOAT || bt == T_DOUBLE) {
1583     if (cond == Assembler::NE) {
1584       fcm(Assembler::EQ, dst, size, src);
1585       notr(dst, isQ ? T16B : T8B, dst);
1586     } else {
1587       fcm(cond, dst, size, src);
1588     }
1589   } else {
1590     if (cond == Assembler::NE) {
1591       cm(Assembler::EQ, dst, size, src);
1592       notr(dst, isQ ? T16B : T8B, dst);
1593     } else {
1594       cm(cond, dst, size, src);
1595     }
1596   }
1597 }
1598 
1599 // Compress the least significant bit of each byte to the rightmost and clear
1600 // the higher garbage bits.
1601 void C2_MacroAssembler::bytemask_compress(Register dst) {
1602   // Example input, dst = 0x01 00 00 00 01 01 00 01
1603   // The "??" bytes are garbage.
1604   orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
1605   orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
1606   orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
1607   andr(dst, dst, 0xff);                   // dst = 0x8D
1608 }
1609 
1610 // Pack the lowest-numbered bit of each mask element in src into a long value
1611 // in dst, at most the first 64 lane elements.
1612 // Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM.
1613 void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
1614                                          FloatRegister vtmp1, FloatRegister vtmp2) {
1615   assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1616   assert_different_registers(dst, rscratch1);
1617   assert_different_registers(vtmp1, vtmp2);
1618 
1619   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1620   // Example:   src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
1621   // Expected:  dst = 0x658D
1622 
1623   // Convert the mask into vector with sequential bytes.
1624   // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
1625   sve_cpy(vtmp1, size, src, 1, false);
1626   if (bt != T_BYTE) {
1627     sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
1628   }
1629 
1630   if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
1631     // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
1632     // is to compress each significant bit of the byte in a cross-lane way. Due
1633     // to the lack of a cross-lane bit-compress instruction, we use BEXT
1634     // (bit-compress in each lane) with the biggest lane size (T = D) then
1635     // concatenate the results.
1636 
1637     // The second source input of BEXT, initialized with 0x01 in each byte.
1638     // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1639     sve_dup(vtmp2, B, 1);
1640 
1641     // BEXT vtmp1.D, vtmp1.D, vtmp2.D
1642     // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1643     // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1644     //         ---------------------------------------
1645     // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1646     sve_bext(vtmp1, D, vtmp1, vtmp2);
1647 
1648     // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the
1649     // result to dst.
1650     // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1651     // dst   = 0x658D
1652     if (lane_cnt <= 8) {
1653       // No need to concatenate.
1654       umov(dst, vtmp1, B, 0);
1655     } else if (lane_cnt <= 16) {
1656       ins(vtmp1, B, vtmp1, 1, 8);
1657       umov(dst, vtmp1, H, 0);
1658     } else {
1659       // As the lane count is 64 at most, the final expected value must be in
1660       // the lowest 64 bits after narrowing vtmp1 from D to B.
1661       sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
1662       umov(dst, vtmp1, D, 0);
1663     }
1664   } else if (UseSVE > 0) {
1665     // Compress the lowest 8 bytes.
1666     fmovd(dst, vtmp1);
1667     bytemask_compress(dst);
1668     if (lane_cnt <= 8) return;
1669 
1670     // Repeat on higher bytes and join the results.
1671     // Compress 8 bytes in each iteration.
1672     for (int idx = 1; idx < (lane_cnt / 8); idx++) {
1673       sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
1674       bytemask_compress(rscratch1);
1675       orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
1676     }
1677   } else {
1678     assert(false, "unsupported");
1679     ShouldNotReachHere();
1680   }
1681 }
1682 
1683 // Unpack the mask, a long value in src, into predicate register dst based on the
1684 // corresponding data type. Note that dst can support at most 64 lanes.
1685 // Below example gives the expected dst predicate register in different types, with
1686 // a valid src(0x658D) on a 1024-bit vector size machine.
1687 // BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
1688 // SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
1689 // INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
1690 // LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1691 //
1692 // The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D which
1693 // has 24 significant bits would be an invalid input if dst predicate register refers to
1694 // a LONG type 1024-bit vector, which has at most 16 lanes.
1695 void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
1696                                            FloatRegister vtmp1, FloatRegister vtmp2) {
1697   assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
1698          lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
1699   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1700   // Example:   src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
1701   // Expected:  dst = 0b01101001 10001101
1702 
1703   // Put long value from general purpose register into the first lane of vector.
1704   // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1705   sve_dup(vtmp1, B, 0);
1706   mov(vtmp1, D, 0, src);
1707 
1708   // As sve_cmp generates mask value with the minimum unit in byte, we should
1709   // transform the value in the first lane which is mask in bit now to the
1710   // mask in byte, which can be done by SVE2's BDEP instruction.
1711 
1712   // The first source input of BDEP instruction. Deposite each byte in every 8 bytes.
1713   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1714   if (lane_cnt <= 8) {
1715     // Nothing. As only one byte exsits.
1716   } else if (lane_cnt <= 16) {
1717     ins(vtmp1, B, vtmp1, 8, 1);
1718     mov(vtmp1, B, 1, zr);
1719   } else {
1720     sve_vector_extend(vtmp1, D, vtmp1, B);
1721   }
1722 
1723   // The second source input of BDEP instruction, initialized with 0x01 for each byte.
1724   // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1725   sve_dup(vtmp2, B, 1);
1726 
1727   // BDEP vtmp1.D, vtmp1.D, vtmp2.D
1728   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1729   // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1730   //         ---------------------------------------
1731   // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1732   sve_bdep(vtmp1, D, vtmp1, vtmp2);
1733 
1734   if (bt != T_BYTE) {
1735     sve_vector_extend(vtmp1, size, vtmp1, B);
1736   }
1737   // Generate mask according to the given vector, in which the elements have been
1738   // extended to expected type.
1739   // dst = 0b01101001 10001101
1740   sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
1741 }
1742 
1743 // Clobbers: rflags
1744 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1745                                     FloatRegister zn, FloatRegister zm, Condition cond) {
1746   assert(pg->is_governing(), "This register has to be a governing predicate register");
1747   FloatRegister z1 = zn, z2 = zm;
1748   switch (cond) {
1749     case LE: z1 = zm; z2 = zn; cond = GE; break;
1750     case LT: z1 = zm; z2 = zn; cond = GT; break;
1751     case LO: z1 = zm; z2 = zn; cond = HI; break;
1752     case LS: z1 = zm; z2 = zn; cond = HS; break;
1753     default:
1754       break;
1755   }
1756 
1757   SIMD_RegVariant size = elemType_to_regVariant(bt);
1758   if (is_floating_point_type(bt)) {
1759     sve_fcm(cond, pd, size, pg, z1, z2);
1760   } else {
1761     assert(is_integral_type(bt), "unsupported element type");
1762     sve_cmp(cond, pd, size, pg, z1, z2);
1763   }
1764 }
1765 
1766 // Get index of the last mask lane that is set
1767 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
1768   SIMD_RegVariant size = elemType_to_regVariant(bt);
1769   sve_rev(ptmp, size, src);
1770   sve_brkb(ptmp, ptrue, ptmp, false);
1771   sve_cntp(dst, size, ptrue, ptmp);
1772   movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
1773   subw(dst, rscratch1, dst);
1774 }
1775 
1776 // Extend integer vector src to dst with the same lane count
1777 // but larger element size, e.g. 4B -> 4I
1778 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1779                                            FloatRegister src, BasicType src_bt, bool is_unsigned) {
1780   if (src_bt == T_BYTE) {
1781     if (dst_bt == T_SHORT) {
1782       // 4B/8B to 4S/8S
1783       _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1784     } else {
1785       // 4B to 4I
1786       assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1787       _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1788       _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1789     }
1790   } else if (src_bt == T_SHORT) {
1791     // 4S to 4I
1792     assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1793     _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1794   } else if (src_bt == T_INT) {
1795     // 2I to 2L
1796     assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1797     _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1798   } else {
1799     ShouldNotReachHere();
1800   }
1801 }
1802 
1803 // Narrow integer vector src down to dst with the same lane count
1804 // but smaller element size, e.g. 4I -> 4B
1805 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1806                                            FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1807   if (src_bt == T_SHORT) {
1808     // 4S/8S to 4B/8B
1809     assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1810     assert(dst_bt == T_BYTE, "unsupported");
1811     xtn(dst, T8B, src, T8H);
1812   } else if (src_bt == T_INT) {
1813     // 4I to 4B/4S
1814     assert(src_vlen_in_bytes == 16, "unsupported");
1815     assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1816     xtn(dst, T4H, src, T4S);
1817     if (dst_bt == T_BYTE) {
1818       xtn(dst, T8B, dst, T8H);
1819     }
1820   } else if (src_bt == T_LONG) {
1821     // 2L to 2I
1822     assert(src_vlen_in_bytes == 16, "unsupported");
1823     assert(dst_bt == T_INT, "unsupported");
1824     xtn(dst, T2S, src, T2D);
1825   } else {
1826     ShouldNotReachHere();
1827   }
1828 }
1829 
1830 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1831                                           FloatRegister src, SIMD_RegVariant src_size,
1832                                           bool is_unsigned) {
1833   assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
1834 
1835   if (src_size == B) {
1836     switch (dst_size) {
1837     case H:
1838       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1839       break;
1840     case S:
1841       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1842       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1843       break;
1844     case D:
1845       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1846       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1847       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1848       break;
1849     default:
1850       ShouldNotReachHere();
1851     }
1852   } else if (src_size == H) {
1853     if (dst_size == S) {
1854       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1855     } else { // D
1856       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1857       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1858     }
1859   } else if (src_size == S) {
1860     _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
1861   }
1862 }
1863 
1864 // Vector narrow from src to dst with specified element sizes.
1865 // High part of dst vector will be filled with zero.
1866 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1867                                           FloatRegister src, SIMD_RegVariant src_size,
1868                                           FloatRegister tmp) {
1869   assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1870   assert_different_registers(src, tmp);
1871   sve_dup(tmp, src_size, 0);
1872   if (src_size == D) {
1873     switch (dst_size) {
1874     case S:
1875       sve_uzp1(dst, S, src, tmp);
1876       break;
1877     case H:
1878       assert_different_registers(dst, tmp);
1879       sve_uzp1(dst, S, src, tmp);
1880       sve_uzp1(dst, H, dst, tmp);
1881       break;
1882     case B:
1883       assert_different_registers(dst, tmp);
1884       sve_uzp1(dst, S, src, tmp);
1885       sve_uzp1(dst, H, dst, tmp);
1886       sve_uzp1(dst, B, dst, tmp);
1887       break;
1888     default:
1889       ShouldNotReachHere();
1890     }
1891   } else if (src_size == S) {
1892     if (dst_size == H) {
1893       sve_uzp1(dst, H, src, tmp);
1894     } else { // B
1895       assert_different_registers(dst, tmp);
1896       sve_uzp1(dst, H, src, tmp);
1897       sve_uzp1(dst, B, dst, tmp);
1898     }
1899   } else if (src_size == H) {
1900     sve_uzp1(dst, B, src, tmp);
1901   }
1902 }
1903 
1904 // Extend src predicate to dst predicate with the same lane count but larger
1905 // element size, e.g. 64Byte -> 512Long
1906 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1907                                              uint dst_element_length_in_bytes,
1908                                              uint src_element_length_in_bytes) {
1909   if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1910     sve_punpklo(dst, src);
1911   } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1912     sve_punpklo(dst, src);
1913     sve_punpklo(dst, dst);
1914   } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1915     sve_punpklo(dst, src);
1916     sve_punpklo(dst, dst);
1917     sve_punpklo(dst, dst);
1918   } else {
1919     assert(false, "unsupported");
1920     ShouldNotReachHere();
1921   }
1922 }
1923 
1924 // Narrow src predicate to dst predicate with the same lane count but
1925 // smaller element size, e.g. 512Long -> 64Byte
1926 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1927                                              uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
1928   // The insignificant bits in src predicate are expected to be zero.
1929   // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
1930   // passed as the second argument. An example narrowing operation with a given mask would be -
1931   // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I
1932   // Mask (for 2 Longs) : TF
1933   // Predicate register for the above mask (16 bits) : 00000001 00000000
1934   // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
1935   // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0)
1936   assert_different_registers(src, ptmp);
1937   assert_different_registers(dst, ptmp);
1938   sve_pfalse(ptmp);
1939   if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1940     sve_uzp1(dst, B, src, ptmp);
1941   } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1942     sve_uzp1(dst, H, src, ptmp);
1943     sve_uzp1(dst, B, dst, ptmp);
1944   } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1945     sve_uzp1(dst, S, src, ptmp);
1946     sve_uzp1(dst, H, dst, ptmp);
1947     sve_uzp1(dst, B, dst, ptmp);
1948   } else {
1949     assert(false, "unsupported");
1950     ShouldNotReachHere();
1951   }
1952 }
1953 
1954 // Vector reduction add for integral type with ASIMD instructions.
1955 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1956                                                  Register isrc, FloatRegister vsrc,
1957                                                  unsigned vector_length_in_bytes,
1958                                                  FloatRegister vtmp) {
1959   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1960   assert_different_registers(dst, isrc);
1961   bool isQ = vector_length_in_bytes == 16;
1962 
1963   BLOCK_COMMENT("neon_reduce_add_integral {");
1964     switch(bt) {
1965       case T_BYTE:
1966         addv(vtmp, isQ ? T16B : T8B, vsrc);
1967         smov(dst, vtmp, B, 0);
1968         addw(dst, dst, isrc, ext::sxtb);
1969         break;
1970       case T_SHORT:
1971         addv(vtmp, isQ ? T8H : T4H, vsrc);
1972         smov(dst, vtmp, H, 0);
1973         addw(dst, dst, isrc, ext::sxth);
1974         break;
1975       case T_INT:
1976         isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1977         umov(dst, vtmp, S, 0);
1978         addw(dst, dst, isrc);
1979         break;
1980       case T_LONG:
1981         assert(isQ, "unsupported");
1982         addpd(vtmp, vsrc);
1983         umov(dst, vtmp, D, 0);
1984         add(dst, dst, isrc);
1985         break;
1986       default:
1987         assert(false, "unsupported");
1988         ShouldNotReachHere();
1989     }
1990   BLOCK_COMMENT("} neon_reduce_add_integral");
1991 }
1992 
1993 // Vector reduction multiply for integral type with ASIMD instructions.
1994 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
1995 // Clobbers: rscratch1
1996 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
1997                                                  Register isrc, FloatRegister vsrc,
1998                                                  unsigned vector_length_in_bytes,
1999                                                  FloatRegister vtmp1, FloatRegister vtmp2) {
2000   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2001   bool isQ = vector_length_in_bytes == 16;
2002 
2003   BLOCK_COMMENT("neon_reduce_mul_integral {");
2004     switch(bt) {
2005       case T_BYTE:
2006         if (isQ) {
2007           // Multiply the lower half and higher half of vector iteratively.
2008           // vtmp1 = vsrc[8:15]
2009           ins(vtmp1, D, vsrc, 0, 1);
2010           // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
2011           mulv(vtmp1, T8B, vtmp1, vsrc);
2012           // vtmp2 = vtmp1[4:7]
2013           ins(vtmp2, S, vtmp1, 0, 1);
2014           // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
2015           mulv(vtmp1, T8B, vtmp2, vtmp1);
2016         } else {
2017           ins(vtmp1, S, vsrc, 0, 1);
2018           mulv(vtmp1, T8B, vtmp1, vsrc);
2019         }
2020         // vtmp2 = vtmp1[2:3]
2021         ins(vtmp2, H, vtmp1, 0, 1);
2022         // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
2023         mulv(vtmp2, T8B, vtmp2, vtmp1);
2024         // dst = vtmp2[0] * isrc * vtmp2[1]
2025         umov(rscratch1, vtmp2, B, 0);
2026         mulw(dst, rscratch1, isrc);
2027         sxtb(dst, dst);
2028         umov(rscratch1, vtmp2, B, 1);
2029         mulw(dst, rscratch1, dst);
2030         sxtb(dst, dst);
2031         break;
2032       case T_SHORT:
2033         if (isQ) {
2034           ins(vtmp2, D, vsrc, 0, 1);
2035           mulv(vtmp2, T4H, vtmp2, vsrc);
2036           ins(vtmp1, S, vtmp2, 0, 1);
2037           mulv(vtmp1, T4H, vtmp1, vtmp2);
2038         } else {
2039           ins(vtmp1, S, vsrc, 0, 1);
2040           mulv(vtmp1, T4H, vtmp1, vsrc);
2041         }
2042         umov(rscratch1, vtmp1, H, 0);
2043         mulw(dst, rscratch1, isrc);
2044         sxth(dst, dst);
2045         umov(rscratch1, vtmp1, H, 1);
2046         mulw(dst, rscratch1, dst);
2047         sxth(dst, dst);
2048         break;
2049       case T_INT:
2050         if (isQ) {
2051           ins(vtmp1, D, vsrc, 0, 1);
2052           mulv(vtmp1, T2S, vtmp1, vsrc);
2053         } else {
2054           vtmp1 = vsrc;
2055         }
2056         umov(rscratch1, vtmp1, S, 0);
2057         mul(dst, rscratch1, isrc);
2058         umov(rscratch1, vtmp1, S, 1);
2059         mul(dst, rscratch1, dst);
2060         break;
2061       case T_LONG:
2062         umov(rscratch1, vsrc, D, 0);
2063         mul(dst, isrc, rscratch1);
2064         umov(rscratch1, vsrc, D, 1);
2065         mul(dst, dst, rscratch1);
2066         break;
2067       default:
2068         assert(false, "unsupported");
2069         ShouldNotReachHere();
2070     }
2071   BLOCK_COMMENT("} neon_reduce_mul_integral");
2072 }
2073 
2074 // Vector reduction multiply for floating-point type with ASIMD instructions.
2075 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
2076                                            FloatRegister fsrc, FloatRegister vsrc,
2077                                            unsigned vector_length_in_bytes,
2078                                            FloatRegister vtmp) {
2079   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2080   bool isQ = vector_length_in_bytes == 16;
2081 
2082   BLOCK_COMMENT("neon_reduce_mul_fp {");
2083     switch(bt) {
2084       case T_FLOAT:
2085         fmuls(dst, fsrc, vsrc);
2086         ins(vtmp, S, vsrc, 0, 1);
2087         fmuls(dst, dst, vtmp);
2088         if (isQ) {
2089           ins(vtmp, S, vsrc, 0, 2);
2090           fmuls(dst, dst, vtmp);
2091           ins(vtmp, S, vsrc, 0, 3);
2092           fmuls(dst, dst, vtmp);
2093          }
2094         break;
2095       case T_DOUBLE:
2096         assert(isQ, "unsupported");
2097         fmuld(dst, fsrc, vsrc);
2098         ins(vtmp, D, vsrc, 0, 1);
2099         fmuld(dst, dst, vtmp);
2100         break;
2101       default:
2102         assert(false, "unsupported");
2103         ShouldNotReachHere();
2104     }
2105   BLOCK_COMMENT("} neon_reduce_mul_fp");
2106 }
2107 
2108 // Helper to select logical instruction
2109 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
2110                                                    Register Rn, Register Rm,
2111                                                    enum shift_kind kind, unsigned shift) {
2112   switch(opc) {
2113     case Op_AndReductionV:
2114       is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
2115       break;
2116     case Op_OrReductionV:
2117       is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
2118       break;
2119     case Op_XorReductionV:
2120       is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
2121       break;
2122     default:
2123       assert(false, "unsupported");
2124       ShouldNotReachHere();
2125   }
2126 }
2127 
2128 // Vector reduction logical operations And, Or, Xor
2129 // Clobbers: rscratch1
2130 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
2131                                             Register isrc, FloatRegister vsrc,
2132                                             unsigned vector_length_in_bytes) {
2133   assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
2134          "unsupported");
2135   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2136   assert_different_registers(dst, isrc);
2137   bool isQ = vector_length_in_bytes == 16;
2138 
2139   BLOCK_COMMENT("neon_reduce_logical {");
2140     umov(rscratch1, vsrc, isQ ? D : S, 0);
2141     umov(dst, vsrc, isQ ? D : S, 1);
2142     neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
2143     switch(bt) {
2144       case T_BYTE:
2145         if (isQ) {
2146           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2147         }
2148         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
2149         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
2150         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2151         sxtb(dst, dst);
2152         break;
2153       case T_SHORT:
2154         if (isQ) {
2155           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2156         }
2157         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
2158         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2159         sxth(dst, dst);
2160         break;
2161       case T_INT:
2162         if (isQ) {
2163           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2164         }
2165         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2166         break;
2167       case T_LONG:
2168         assert(isQ, "unsupported");
2169         neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
2170         break;
2171       default:
2172         assert(false, "unsupported");
2173         ShouldNotReachHere();
2174     }
2175   BLOCK_COMMENT("} neon_reduce_logical");
2176 }
2177 
2178 // Vector reduction min/max for integral type with ASIMD instructions.
2179 // Note: vtmp is not used and expected to be fnoreg for T_LONG case.
2180 // Clobbers: rscratch1, rflags
2181 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
2182                                                     Register isrc, FloatRegister vsrc,
2183                                                     unsigned vector_length_in_bytes,
2184                                                     FloatRegister vtmp) {
2185   assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
2186   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2187   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
2188   assert_different_registers(dst, isrc);
2189   bool isQ = vector_length_in_bytes == 16;
2190   bool is_min = opc == Op_MinReductionV;
2191 
2192   BLOCK_COMMENT("neon_reduce_minmax_integral {");
2193     if (bt == T_LONG) {
2194       assert(vtmp == fnoreg, "should be");
2195       assert(isQ, "should be");
2196       umov(rscratch1, vsrc, D, 0);
2197       cmp(isrc, rscratch1);
2198       csel(dst, isrc, rscratch1, is_min ? LT : GT);
2199       umov(rscratch1, vsrc, D, 1);
2200       cmp(dst, rscratch1);
2201       csel(dst, dst, rscratch1, is_min ? LT : GT);
2202     } else {
2203       SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
2204       if (size == T2S) {
2205         is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
2206       } else {
2207         is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
2208       }
2209       if (bt == T_INT) {
2210         umov(dst, vtmp, S, 0);
2211       } else {
2212         smov(dst, vtmp, elemType_to_regVariant(bt), 0);
2213       }
2214       cmpw(dst, isrc);
2215       cselw(dst, dst, isrc, is_min ? LT : GT);
2216     }
2217   BLOCK_COMMENT("} neon_reduce_minmax_integral");
2218 }
2219 
2220 // Vector reduction for integral type with SVE instruction.
2221 // Supported operations are Add, And, Or, Xor, Max, Min.
2222 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
2223 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
2224                                             FloatRegister src2, PRegister pg, FloatRegister tmp) {
2225   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2226   assert(pg->is_governing(), "This register has to be a governing predicate register");
2227   assert_different_registers(src1, dst);
2228   // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
2229   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2230   switch (opc) {
2231     case Op_AddReductionVI: {
2232       sve_uaddv(tmp, size, pg, src2);
2233       if (bt == T_BYTE) {
2234         smov(dst, tmp, size, 0);
2235         addw(dst, src1, dst, ext::sxtb);
2236       } else if (bt == T_SHORT) {
2237         smov(dst, tmp, size, 0);
2238         addw(dst, src1, dst, ext::sxth);
2239       } else {
2240         umov(dst, tmp, size, 0);
2241         addw(dst, dst, src1);
2242       }
2243       break;
2244     }
2245     case Op_AddReductionVL: {
2246       sve_uaddv(tmp, size, pg, src2);
2247       umov(dst, tmp, size, 0);
2248       add(dst, dst, src1);
2249       break;
2250     }
2251     case Op_AndReductionV: {
2252       sve_andv(tmp, size, pg, src2);
2253       if (bt == T_INT || bt == T_LONG) {
2254         umov(dst, tmp, size, 0);
2255       } else {
2256         smov(dst, tmp, size, 0);
2257       }
2258       if (bt == T_LONG) {
2259         andr(dst, dst, src1);
2260       } else {
2261         andw(dst, dst, src1);
2262       }
2263       break;
2264     }
2265     case Op_OrReductionV: {
2266       sve_orv(tmp, size, pg, src2);
2267       if (bt == T_INT || bt == T_LONG) {
2268         umov(dst, tmp, size, 0);
2269       } else {
2270         smov(dst, tmp, size, 0);
2271       }
2272       if (bt == T_LONG) {
2273         orr(dst, dst, src1);
2274       } else {
2275         orrw(dst, dst, src1);
2276       }
2277       break;
2278     }
2279     case Op_XorReductionV: {
2280       sve_eorv(tmp, size, pg, src2);
2281       if (bt == T_INT || bt == T_LONG) {
2282         umov(dst, tmp, size, 0);
2283       } else {
2284         smov(dst, tmp, size, 0);
2285       }
2286       if (bt == T_LONG) {
2287         eor(dst, dst, src1);
2288       } else {
2289         eorw(dst, dst, src1);
2290       }
2291       break;
2292     }
2293     case Op_MaxReductionV: {
2294       sve_smaxv(tmp, size, pg, src2);
2295       if (bt == T_INT || bt == T_LONG) {
2296         umov(dst, tmp, size, 0);
2297       } else {
2298         smov(dst, tmp, size, 0);
2299       }
2300       if (bt == T_LONG) {
2301         cmp(dst, src1);
2302         csel(dst, dst, src1, Assembler::GT);
2303       } else {
2304         cmpw(dst, src1);
2305         cselw(dst, dst, src1, Assembler::GT);
2306       }
2307       break;
2308     }
2309     case Op_MinReductionV: {
2310       sve_sminv(tmp, size, pg, src2);
2311       if (bt == T_INT || bt == T_LONG) {
2312         umov(dst, tmp, size, 0);
2313       } else {
2314         smov(dst, tmp, size, 0);
2315       }
2316       if (bt == T_LONG) {
2317         cmp(dst, src1);
2318         csel(dst, dst, src1, Assembler::LT);
2319       } else {
2320         cmpw(dst, src1);
2321         cselw(dst, dst, src1, Assembler::LT);
2322       }
2323       break;
2324     }
2325     default:
2326       assert(false, "unsupported");
2327       ShouldNotReachHere();
2328   }
2329 
2330   if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
2331     if (bt == T_BYTE) {
2332       sxtb(dst, dst);
2333     } else if (bt == T_SHORT) {
2334       sxth(dst, dst);
2335     }
2336   }
2337 }
2338 
2339 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
2340 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
2341 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
2342 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
2343   uint32_t max_vector_length = Matcher::max_vector_size(bt);
2344   assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
2345 
2346   // Set all elements to false if the input "lane_cnt" is zero.
2347   if (lane_cnt == 0) {
2348     sve_pfalse(dst);
2349     return;
2350   }
2351 
2352   SIMD_RegVariant size = elemType_to_regVariant(bt);
2353   assert(size != Q, "invalid size");
2354 
2355   // Set all true if "lane_cnt" equals to the max lane count.
2356   if (lane_cnt == max_vector_length) {
2357     sve_ptrue(dst, size, /* ALL */ 0b11111);
2358     return;
2359   }
2360 
2361   // Fixed numbers for "ptrue".
2362   switch(lane_cnt) {
2363   case 1: /* VL1 */
2364   case 2: /* VL2 */
2365   case 3: /* VL3 */
2366   case 4: /* VL4 */
2367   case 5: /* VL5 */
2368   case 6: /* VL6 */
2369   case 7: /* VL7 */
2370   case 8: /* VL8 */
2371     sve_ptrue(dst, size, lane_cnt);
2372     return;
2373   case 16:
2374     sve_ptrue(dst, size, /* VL16 */ 0b01001);
2375     return;
2376   case 32:
2377     sve_ptrue(dst, size, /* VL32 */ 0b01010);
2378     return;
2379   case 64:
2380     sve_ptrue(dst, size, /* VL64 */ 0b01011);
2381     return;
2382   case 128:
2383     sve_ptrue(dst, size, /* VL128 */ 0b01100);
2384     return;
2385   case 256:
2386     sve_ptrue(dst, size, /* VL256 */ 0b01101);
2387     return;
2388   default:
2389     break;
2390   }
2391 
2392   // Special patterns for "ptrue".
2393   if (lane_cnt == round_down_power_of_2(max_vector_length)) {
2394     sve_ptrue(dst, size, /* POW2 */ 0b00000);
2395   } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
2396     sve_ptrue(dst, size, /* MUL4 */ 0b11101);
2397   } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
2398     sve_ptrue(dst, size, /* MUL3 */ 0b11110);
2399   } else {
2400     // Encode to "whileltw" for the remaining cases.
2401     mov(rscratch1, lane_cnt);
2402     sve_whileltw(dst, size, zr, rscratch1);
2403   }
2404 }
2405 
2406 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
2407 // Any remaining elements of dst will be filled with zero.
2408 // Clobbers: rscratch1
2409 // Preserves: src, mask
2410 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
2411                                            FloatRegister vtmp1, FloatRegister vtmp2,
2412                                            PRegister pgtmp) {
2413   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2414   assert_different_registers(dst, src, vtmp1, vtmp2);
2415   assert_different_registers(mask, pgtmp);
2416 
2417   // Example input:   src   = 8888 7777 6666 5555 4444 3333 2222 1111
2418   //                  mask  = 0001 0000 0000 0001 0001 0000 0001 0001
2419   // Expected result: dst   = 0000 0000 0000 8888 5555 4444 2222 1111
2420   sve_dup(vtmp2, H, 0);
2421 
2422   // Extend lowest half to type INT.
2423   // dst = 00004444 00003333 00002222 00001111
2424   sve_uunpklo(dst, S, src);
2425   // pgtmp = 00000001 00000000 00000001 00000001
2426   sve_punpklo(pgtmp, mask);
2427   // Pack the active elements in size of type INT to the right,
2428   // and fill the remainings with zero.
2429   // dst = 00000000 00004444 00002222 00001111
2430   sve_compact(dst, S, dst, pgtmp);
2431   // Narrow the result back to type SHORT.
2432   // dst = 0000 0000 0000 0000 0000 4444 2222 1111
2433   sve_uzp1(dst, H, dst, vtmp2);
2434   // Count the active elements of lowest half.
2435   // rscratch1 = 3
2436   sve_cntp(rscratch1, S, ptrue, pgtmp);
2437 
2438   // Repeat to the highest half.
2439   // pgtmp = 00000001 00000000 00000000 00000001
2440   sve_punpkhi(pgtmp, mask);
2441   // vtmp1 = 00008888 00007777 00006666 00005555
2442   sve_uunpkhi(vtmp1, S, src);
2443   // vtmp1 = 00000000 00000000 00008888 00005555
2444   sve_compact(vtmp1, S, vtmp1, pgtmp);
2445   // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2446   sve_uzp1(vtmp1, H, vtmp1, vtmp2);
2447 
2448   // Compressed low:   dst   = 0000 0000 0000 0000 0000 4444 2222 1111
2449   // Compressed high:  vtmp1 = 0000 0000 0000 0000 0000 0000 8888  5555
2450   // Left shift(cross lane) compressed high with TRUE_CNT lanes,
2451   // TRUE_CNT is the number of active elements in the compressed low.
2452   neg(rscratch1, rscratch1);
2453   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2454   sve_index(vtmp2, H, rscratch1, 1);
2455   // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
2456   sve_tbl(vtmp1, H, vtmp1, vtmp2);
2457 
2458   // Combine the compressed high(after shifted) with the compressed low.
2459   // dst = 0000 0000 0000 8888 5555 4444 2222 1111
2460   sve_orr(dst, dst, vtmp1);
2461 }
2462 
2463 // Clobbers: rscratch1, rscratch2
2464 // Preserves: src, mask
2465 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
2466                                           FloatRegister vtmp1, FloatRegister vtmp2,
2467                                           FloatRegister vtmp3, FloatRegister vtmp4,
2468                                           PRegister ptmp, PRegister pgtmp) {
2469   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2470   assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
2471   assert_different_registers(mask, ptmp, pgtmp);
2472   // Example input:   src   = 88 77 66 55 44 33 22 11
2473   //                  mask  = 01 00 00 01 01 00 01 01
2474   // Expected result: dst   = 00 00 00 88 55 44 22 11
2475 
2476   sve_dup(vtmp4, B, 0);
2477   // Extend lowest half to type SHORT.
2478   // vtmp1 = 0044 0033 0022 0011
2479   sve_uunpklo(vtmp1, H, src);
2480   // ptmp = 0001 0000 0001 0001
2481   sve_punpklo(ptmp, mask);
2482   // Count the active elements of lowest half.
2483   // rscratch2 = 3
2484   sve_cntp(rscratch2, H, ptrue, ptmp);
2485   // Pack the active elements in size of type SHORT to the right,
2486   // and fill the remainings with zero.
2487   // dst = 0000 0044 0022 0011
2488   sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
2489   // Narrow the result back to type BYTE.
2490   // dst = 00 00 00 00 00 44 22 11
2491   sve_uzp1(dst, B, dst, vtmp4);
2492 
2493   // Repeat to the highest half.
2494   // ptmp = 0001 0000 0000 0001
2495   sve_punpkhi(ptmp, mask);
2496   // vtmp1 = 0088 0077 0066 0055
2497   sve_uunpkhi(vtmp2, H, src);
2498   // vtmp1 = 0000 0000 0088 0055
2499   sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);
2500 
2501   sve_dup(vtmp4, B, 0);
2502   // vtmp1 = 00 00 00 00 00 00 88 55
2503   sve_uzp1(vtmp1, B, vtmp1, vtmp4);
2504 
2505   // Compressed low:   dst   = 00 00 00 00 00 44 22 11
2506   // Compressed high:  vtmp1 = 00 00 00 00 00 00 88 55
2507   // Left shift(cross lane) compressed high with TRUE_CNT lanes,
2508   // TRUE_CNT is the number of active elements in the compressed low.
2509   neg(rscratch2, rscratch2);
2510   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2511   sve_index(vtmp2, B, rscratch2, 1);
2512   // vtmp1 = 00 00 00 88 55 00 00 00
2513   sve_tbl(vtmp1, B, vtmp1, vtmp2);
2514   // Combine the compressed high(after shifted) with the compressed low.
2515   // dst = 00 00 00 88 55 44 22 11
2516   sve_orr(dst, dst, vtmp1);
2517 }
2518 
2519 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2520   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2521   SIMD_Arrangement size = isQ ? T16B : T8B;
2522   if (bt == T_BYTE) {
2523     rbit(dst, size, src);
2524   } else {
2525     neon_reverse_bytes(dst, src, bt, isQ);
2526     rbit(dst, size, dst);
2527   }
2528 }
2529 
2530 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2531   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2532   SIMD_Arrangement size = isQ ? T16B : T8B;
2533   switch (bt) {
2534     case T_BYTE:
2535       if (dst != src) {
2536         orr(dst, size, src, src);
2537       }
2538       break;
2539     case T_SHORT:
2540       rev16(dst, size, src);
2541       break;
2542     case T_INT:
2543       rev32(dst, size, src);
2544       break;
2545     case T_LONG:
2546       rev64(dst, size, src);
2547       break;
2548     default:
2549       assert(false, "unsupported");
2550       ShouldNotReachHere();
2551   }
2552 }
2553 
2554 // VectorRearrange implementation for short/int/float/long/double types with NEON
2555 // instructions. For VectorRearrange short/int/float, we use NEON tbl instruction.
2556 // But since it supports bytes table only, we need to lookup 2/4 bytes as a group.
2557 // For VectorRearrange long/double, we compare the shuffle input with iota indices,
2558 // and use bsl to implement the operation.
2559 void C2_MacroAssembler::neon_rearrange_hsd(FloatRegister dst, FloatRegister src,
2560                                            FloatRegister shuffle, FloatRegister tmp,
2561                                            BasicType bt, bool isQ) {
2562   assert_different_registers(dst, src, shuffle, tmp);
2563   SIMD_Arrangement size1 = isQ ? T16B : T8B;
2564   SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2565 
2566   // Here is an example that rearranges a NEON vector with 4 ints:
2567   // Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1]
2568   //   1. We assume the shuffle input is Vi int[2, 3, 0, 1].
2569   //   2. Multiply Vi int[2, 3, 0, 1] with constant int vector
2570   //      [0x04040404, 0x04040404, 0x04040404, 0x04040404], and get
2571   //      tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404].
2572   //   3. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100],
2573   //      and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504]
2574   //   4. Use Vm as index register, and use V1 as table register.
2575   //      Then get V2 as the result by tbl NEON instructions.
2576   switch (bt) {
2577     case T_SHORT:
2578       mov(tmp, size1, 0x02);
2579       mulv(dst, size2, shuffle, tmp);
2580       mov(tmp, size2, 0x0100);
2581       addv(dst, size1, dst, tmp);
2582       tbl(dst, size1, src, 1, dst);
2583       break;
2584     case T_INT:
2585     case T_FLOAT:
2586       mov(tmp, size1, 0x04);
2587       mulv(dst, size2, shuffle, tmp);
2588       mov(tmp, size2, 0x03020100);
2589       addv(dst, size1, dst, tmp);
2590       tbl(dst, size1, src, 1, dst);
2591       break;
2592     case T_LONG:
2593     case T_DOUBLE:
2594       // Load the iota indices for Long type. The indices are ordered by
2595       // type B/S/I/L/F/D, and the offset between two types is 16; Hence
2596       // the offset for L is 48.
2597       lea(rscratch1,
2598           ExternalAddress(StubRoutines::aarch64::vector_iota_indices() + 48));
2599       ldrq(tmp, rscratch1);
2600       // Check whether the input "shuffle" is the same with iota indices.
2601       // Return "src" if true, otherwise swap the two elements of "src".
2602       cm(EQ, dst, size2, shuffle, tmp);
2603       ext(tmp, size1, src, src, 8);
2604       bsl(dst, size1, src, tmp);
2605       break;
2606     default:
2607       assert(false, "unsupported element type");
2608       ShouldNotReachHere();
2609   }
2610 }
2611 
2612 // Extract a scalar element from an sve vector at position 'idx'.
2613 // The input elements in src are expected to be of integral type.
2614 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2615                                              int idx, FloatRegister vtmp) {
2616   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2617   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2618   if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
2619     if (bt == T_INT || bt == T_LONG) {
2620       umov(dst, src, size, idx);
2621     } else {
2622       smov(dst, src, size, idx);
2623     }
2624   } else {
2625     sve_orr(vtmp, src, src);
2626     sve_ext(vtmp, vtmp, idx << size);
2627     if (bt == T_INT || bt == T_LONG) {
2628       umov(dst, vtmp, size, 0);
2629     } else {
2630       smov(dst, vtmp, size, 0);
2631     }
2632   }
2633 }
2634 
2635 // java.lang.Math::round intrinsics
2636 
2637 // Clobbers: rscratch1, rflags
2638 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2639                                           FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
2640   assert_different_registers(tmp1, tmp2, tmp3, src, dst);
2641   switch (T) {
2642     case T2S:
2643     case T4S:
2644       fmovs(tmp1, T, 0.5f);
2645       mov(rscratch1, jint_cast(0x1.0p23f));
2646       break;
2647     case T2D:
2648       fmovd(tmp1, T, 0.5);
2649       mov(rscratch1, julong_cast(0x1.0p52));
2650       break;
2651     default:
2652       assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2653   }
2654   fadd(tmp1, T, tmp1, src);
2655   fcvtms(tmp1, T, tmp1);
2656   // tmp1 = floor(src + 0.5, ties to even)
2657 
2658   fcvtas(dst, T, src);
2659   // dst = round(src), ties to away
2660 
2661   fneg(tmp3, T, src);
2662   dup(tmp2, T, rscratch1);
2663   cm(HS, tmp3, T, tmp3, tmp2);
2664   // tmp3 is now a set of flags
2665 
2666   bif(dst, T16B, tmp1, tmp3);
2667   // result in dst
2668 }
2669 
2670 // Clobbers: rscratch1, rflags
2671 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2672                                          FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
2673   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2674   assert_different_registers(tmp1, tmp2, src, dst);
2675 
2676   switch (T) {
2677     case S:
2678       mov(rscratch1, jint_cast(0x1.0p23f));
2679       break;
2680     case D:
2681       mov(rscratch1, julong_cast(0x1.0p52));
2682       break;
2683     default:
2684       assert(T == S || T == D, "invalid register variant");
2685   }
2686 
2687   sve_frinta(dst, T, ptrue, src);
2688   // dst = round(src), ties to away
2689 
2690   Label none;
2691 
2692   sve_fneg(tmp1, T, ptrue, src);
2693   sve_dup(tmp2, T, rscratch1);
2694   sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
2695   br(EQ, none);
2696   {
2697     sve_cpy(tmp1, T, pgtmp, 0.5);
2698     sve_fadd(tmp1, T, pgtmp, src);
2699     sve_frintm(dst, T, pgtmp, tmp1);
2700     // dst = floor(src + 0.5, ties to even)
2701   }
2702   bind(none);
2703 
2704   sve_fcvtzs(dst, T, ptrue, dst, T);
2705   // result in dst
2706 }
2707 
2708 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2709                                            FloatRegister one, SIMD_Arrangement T) {
2710   assert_different_registers(dst, src, zero, one);
2711   assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2712 
2713   facgt(dst, T, src, zero);
2714   ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
2715   bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2716 }
2717 
2718 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2719                                           FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
2720     assert_different_registers(dst, src, zero, one, vtmp);
2721     assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2722 
2723     sve_orr(vtmp, src, src);
2724     sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pmtp=0 for +-0.0 and NaN. 0x1 otherwise
2725     switch (T) {
2726     case S:
2727       sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
2728       sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
2729                                         // on the sign of the float value
2730       break;
2731     case D:
2732       sve_and(vtmp, T, min_jlong);
2733       sve_orr(vtmp, T, jlong_cast(1.0));
2734       break;
2735     default:
2736       assert(false, "unsupported");
2737       ShouldNotReachHere();
2738     }
2739     sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
2740                                        // Result in dst
2741 }
2742 
2743 bool C2_MacroAssembler::in_scratch_emit_size() {
2744   if (ciEnv::current()->task() != nullptr) {
2745     PhaseOutput* phase_output = Compile::current()->output();
2746     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2747       return true;
2748     }
2749   }
2750   return MacroAssembler::in_scratch_emit_size();
2751 }
2752 
2753 static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
2754   fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
2755 }
2756 
2757 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register rval, Register rtmp) {
2758   assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
2759   if (t == TypeInt::INT) {
2760     return;
2761   }
2762   BLOCK_COMMENT("verify_int_in_range {");
2763   Label L_success, L_failure;
2764 
2765   jint lo = t->_lo;
2766   jint hi = t->_hi;
2767 
2768   if (lo != min_jint && hi != max_jint) {
2769     subsw(rtmp, rval, lo);
2770     br(Assembler::LT, L_failure);
2771     subsw(rtmp, rval, hi);
2772     br(Assembler::LE, L_success);
2773   } else if (lo != min_jint) {
2774     subsw(rtmp, rval, lo);
2775     br(Assembler::GE, L_success);
2776   } else if (hi != max_jint) {
2777     subsw(rtmp, rval, hi);
2778     br(Assembler::LE, L_success);
2779   } else {
2780     ShouldNotReachHere();
2781   }
2782 
2783   bind(L_failure);
2784   movw(c_rarg0, idx);
2785   mov(c_rarg1, rval);
2786   movw(c_rarg2, lo);
2787   movw(c_rarg3, hi);
2788   reconstruct_frame_pointer(rtmp);
2789   rt_call(CAST_FROM_FN_PTR(address, abort_verify_int_in_range), rtmp);
2790   hlt(0);
2791 
2792   bind(L_success);
2793   BLOCK_COMMENT("} verify_int_in_range");
2794 }
2795 
2796 static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
2797   fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
2798 }
2799 
2800 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register rval, Register rtmp) {
2801   assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
2802   if (t == TypeLong::LONG) {
2803     return;
2804   }
2805   BLOCK_COMMENT("verify_long_in_range {");
2806   Label L_success, L_failure;
2807 
2808   jlong lo = t->_lo;
2809   jlong hi = t->_hi;
2810 
2811   if (lo != min_jlong && hi != max_jlong) {
2812     subs(rtmp, rval, lo);
2813     br(Assembler::LT, L_failure);
2814     subs(rtmp, rval, hi);
2815     br(Assembler::LE, L_success);
2816   } else if (lo != min_jlong) {
2817     subs(rtmp, rval, lo);
2818     br(Assembler::GE, L_success);
2819   } else if (hi != max_jlong) {
2820     subs(rtmp, rval, hi);
2821     br(Assembler::LE, L_success);
2822   } else {
2823     ShouldNotReachHere();
2824   }
2825 
2826   bind(L_failure);
2827   movw(c_rarg0, idx);
2828   mov(c_rarg1, rval);
2829   mov(c_rarg2, lo);
2830   mov(c_rarg3, hi);
2831   reconstruct_frame_pointer(rtmp);
2832   rt_call(CAST_FROM_FN_PTR(address, abort_verify_long_in_range), rtmp);
2833   hlt(0);
2834 
2835   bind(L_success);
2836   BLOCK_COMMENT("} verify_long_in_range");
2837 }
2838 
2839 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
2840   const int framesize = Compile::current()->output()->frame_size_in_bytes();
2841   if (PreserveFramePointer) {
2842     // frame pointer is valid
2843 #ifdef ASSERT
2844     // Verify frame pointer value in rfp.
2845     add(rtmp, sp, framesize - 2 * wordSize);
2846     Label L_success;
2847     cmp(rfp, rtmp);
2848     br(Assembler::EQ, L_success);
2849     stop("frame pointer mismatch");
2850     bind(L_success);
2851 #endif // ASSERT
2852   } else {
2853     add(rfp, sp, framesize - 2 * wordSize);
2854   }
2855 }