1 /*
   2  * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "opto/c2_MacroAssembler.hpp"
  29 #include "opto/compile.hpp"
  30 #include "opto/intrinsicnode.hpp"
  31 #include "opto/matcher.hpp"
  32 #include "opto/output.hpp"
  33 #include "opto/subnode.hpp"
  34 #include "runtime/stubRoutines.hpp"
  35 #include "utilities/globalDefinitions.hpp"
  36 #include "utilities/powerOfTwo.hpp"
  37 
  38 #ifdef PRODUCT
  39 #define BLOCK_COMMENT(str) /* nothing */
  40 #define STOP(error) stop(error)
  41 #else
  42 #define BLOCK_COMMENT(str) block_comment(str)
  43 #define STOP(error) block_comment(error); stop(error)
  44 #endif
  45 
  46 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  47 
  48 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
  49 
  50 // jdk.internal.util.ArraysSupport.vectorizedHashCode
  51 address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
  52                                            FloatRegister vdata0, FloatRegister vdata1,
  53                                            FloatRegister vdata2, FloatRegister vdata3,
  54                                            FloatRegister vmul0, FloatRegister vmul1,
  55                                            FloatRegister vmul2, FloatRegister vmul3,
  56                                            FloatRegister vpow, FloatRegister vpowm,
  57                                            BasicType eltype) {
  58   ARRAYS_HASHCODE_REGISTERS;
  59 
  60   Register tmp1 = rscratch1, tmp2 = rscratch2;
  61 
  62   Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE;
  63 
  // Vectorization factor. Number of array elements loaded into one SIMD&FP register by the stubs. We
  // use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible to
  // use 4H for chars and shorts instead, but using 8H gives better performance.
  67   const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8
  68                     : eltype == T_CHAR || eltype == T_SHORT ? 8
  69                     : eltype == T_INT                       ? 4
  70                                                             : 0;
  71   guarantee(vf, "unsupported eltype");
  72 
  73   // Unroll factor for the scalar loop below. The value is chosen based on performance analysis.
  74   const size_t unroll_factor = 4;
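
  // Both the stub and the scalar loop below compute the usual polynomial hash, i.e. the
  // equivalent of the following scalar form, starting from the incoming value in 'result':
  //
  //   for (int i = 0; i < cnt; i++) {
  //     result = 31 * result + ary[i];   // ary[i] read as boolean/byte/char/short/int per eltype
  //   }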
  75 
  76   switch (eltype) {
  77   case T_BOOLEAN:
  78     BLOCK_COMMENT("arrays_hashcode(unsigned byte) {");
  79     break;
  80   case T_CHAR:
  81     BLOCK_COMMENT("arrays_hashcode(char) {");
  82     break;
  83   case T_BYTE:
  84     BLOCK_COMMENT("arrays_hashcode(byte) {");
  85     break;
  86   case T_SHORT:
  87     BLOCK_COMMENT("arrays_hashcode(short) {");
  88     break;
  89   case T_INT:
  90     BLOCK_COMMENT("arrays_hashcode(int) {");
  91     break;
  92   default:
  93     ShouldNotReachHere();
  94   }
  95 
  96   // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop
  97   // implemented by the stub executes just once. Call the stub only if at least two iterations will
  98   // be executed.
  99   const size_t large_threshold = eltype == T_INT ? vf * 2 : vf;
 100   cmpw(cnt, large_threshold);
 101   br(Assembler::HS, LARGE);
 102 
 103   bind(TAIL);
 104 
  // The andr computes cnt % uf where uf = unroll_factor. The subtract shifted by 3 offsets past
  // uf - (cnt % uf) pairs of load + madd insns, i.e. only cnt % uf load + madd pairs are executed
  // before reaching BR_BASE. The loop then consumes the remainder, uf elements at a time.
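  // For example, with unroll_factor == 4 and cnt % 4 == 1 the branch target is BR_BASE - 1 * 8,
  // i.e. the address of the last load + madd pair (each pair is two 4-byte instructions), so
  // exactly one element is consumed before reaching BR_BASE.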
 108   assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC");
 109   andr(tmp2, cnt, unroll_factor - 1);
 110   adr(tmp1, BR_BASE);
 111   sub(tmp1, tmp1, tmp2, ext::sxtw, 3);
 112   movw(tmp2, 0x1f);
 113   br(tmp1);
 114 
 115   bind(LOOP);
 116   for (size_t i = 0; i < unroll_factor; ++i) {
 117     load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype);
 118     maddw(result, result, tmp2, tmp1);
 119   }
 120   bind(BR_BASE);
 121   subsw(cnt, cnt, unroll_factor);
 122   br(Assembler::HS, LOOP);
 123 
 124   b(DONE);
 125 
 126   bind(LARGE);
 127 
 128   RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype));
 129   assert(stub.target() != nullptr, "array_hashcode stub has not been generated");
 130   address tpc = trampoline_call(stub);
 131   if (tpc == nullptr) {
 132     DEBUG_ONLY(reset_labels(TAIL, BR_BASE));
 133     postcond(pc() == badAddress);
 134     return nullptr;
 135   }
 136 
 137   bind(DONE);
 138 
 139   BLOCK_COMMENT("} // arrays_hashcode");
 140 
 141   postcond(pc() != badAddress);
 142   return pc();
 143 }
 144 
 145 void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg,
 146                                   Register tmp2Reg, Register tmp3Reg) {
 147   Register oop = objectReg;
 148   Register box = boxReg;
 149   Register disp_hdr = tmpReg;
 150   Register tmp = tmp2Reg;
 151   Label cont;
 152   Label object_has_monitor;
 153   Label count, no_count;
 154 
 155   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
 156   assert_different_registers(oop, box, tmp, disp_hdr);
 157 
 158   // Load markWord from object into displaced_header.
 159   ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));
 160 
 161   if (DiagnoseSyncOnValueBasedClasses != 0) {
 162     load_klass(tmp, oop);
 163     ldrb(tmp, Address(tmp, Klass::misc_flags_offset()));
 164     tst(tmp, KlassFlags::_misc_is_value_based_class);
 165     br(Assembler::NE, cont);
 166   }
 167 
 168   // Check for existing monitor
 169   tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor);
 170 
 171   if (LockingMode == LM_MONITOR) {
 172     tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
 173     b(cont);
 174   } else {
 175     assert(LockingMode == LM_LEGACY, "must be");
 176     // Set tmp to be (markWord of object | UNLOCK_VALUE).
 177     orr(tmp, disp_hdr, markWord::unlocked_value);
 178 
 179     // Initialize the box. (Must happen before we update the object mark!)
 180     str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 181 
 182     // Compare object markWord with an unlocked value (tmp) and if
 183     // equal exchange the stack address of our box with object markWord.
 184     // On failure disp_hdr contains the possibly locked markWord.
 185     cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true,
 186             /*release*/ true, /*weak*/ false, disp_hdr);
 187     br(Assembler::EQ, cont);
 188 
 189     assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
 190 
    // If the compare-and-exchange succeeded, then we found an unlocked
    // object, have now locked it, and will continue at label cont.
 193 
 194     // Check if the owner is self by comparing the value in the
 195     // markWord of object (disp_hdr) with the stack pointer.
 196     mov(rscratch1, sp);
 197     sub(disp_hdr, disp_hdr, rscratch1);
 198     mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place));
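    // disp_hdr now holds markWord - sp. If this thread already owns the lock, the markWord
    // is the address of a BasicLock in one of our own frames, so the difference is small
    // (expected to be within one page) and the low lock bits are clear; the ands below then
    // yields zero. Anything else (another thread's stack, or a distance of more than a page)
    // leaves a non-zero value and we take the slow path.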
    // If the result of the ands is zero we own the lock and hence we can store 0 as the
    // displaced header in the box, which indicates that it is a recursive lock.
 201     ands(tmp/*==0?*/, disp_hdr, tmp);   // Sets flags for result
 202     str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 203     b(cont);
 204   }
 205 
 206   // Handle existing monitor.
 207   bind(object_has_monitor);
 208 
 209   // The object's monitor m is unlocked iff m->owner == nullptr,
 210   // otherwise m->owner may contain a thread or a stack address.
 211   //
 212   // Try to CAS m->owner from null to current thread.
 213   add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value));
 214   cmpxchg(tmp, zr, rthread, Assembler::xword, /*acquire*/ true,
 215           /*release*/ true, /*weak*/ false, tmp3Reg); // Sets flags for result
 216 
 217   // Store a non-null value into the box to avoid looking like a re-entrant
 218   // lock. The fast-path monitor unlock code checks for
 219   // markWord::monitor_value so use markWord::unused_mark which has the
 220   // relevant bit set, and also matches ObjectSynchronizer::enter.
 221   mov(tmp, (address)markWord::unused_mark().value());
 222   str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 223 
 224   br(Assembler::EQ, cont); // CAS success means locking succeeded
 225 
 226   cmp(tmp3Reg, rthread);
 227   br(Assembler::NE, cont); // Check for recursive locking
 228 
 229   // Recursive lock case
 230   increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1);
 231   // flag == EQ still from the cmp above, checking if this is a reentrant lock
 232 
 233   bind(cont);
 234   // flag == EQ indicates success
 235   // flag == NE indicates failure
 236   br(Assembler::NE, no_count);
 237 
 238   bind(count);
 239   increment(Address(rthread, JavaThread::held_monitor_count_offset()));
 240 
 241   bind(no_count);
 242 }
 243 
 244 void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg,
 245                                     Register tmp2Reg) {
 246   Register oop = objectReg;
 247   Register box = boxReg;
 248   Register disp_hdr = tmpReg;
 249   Register owner_addr = tmpReg;
 250   Register tmp = tmp2Reg;
 251   Label cont;
 252   Label object_has_monitor;
 253   Label count, no_count;
 254   Label unlocked;
 255 
 256   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 257   assert_different_registers(oop, box, tmp, disp_hdr);
 258 
 259   if (LockingMode == LM_LEGACY) {
 260     // Find the lock address and load the displaced header from the stack.
 261     ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 262 
 263     // If the displaced header is 0, we have a recursive unlock.
 264     cmp(disp_hdr, zr);
 265     br(Assembler::EQ, cont);
 266   }
 267 
 268   // Handle existing monitor.
 269   ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
 270   tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor);
 271 
 272   if (LockingMode == LM_MONITOR) {
 273     tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
 274     b(cont);
 275   } else {
 276     assert(LockingMode == LM_LEGACY, "must be");
    // Check if it is still a lightweight lock, which is true if we
    // see the stack address of the basicLock in the markWord of the
    // object.
 280 
 281     cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false,
 282             /*release*/ true, /*weak*/ false, tmp);
 283     b(cont);
 284   }
 285 
 286   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
 287 
 288   // Handle existing monitor.
 289   bind(object_has_monitor);
 290   STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
 291   add(tmp, tmp, -(int)markWord::monitor_value); // monitor
 292 
 293   ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
 294 
 295   Label notRecursive;
 296   cbz(disp_hdr, notRecursive);
 297 
 298   // Recursive lock
 299   sub(disp_hdr, disp_hdr, 1u);
 300   str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
 301   cmp(disp_hdr, disp_hdr); // Sets flags for result
 302   b(cont);
 303 
 304   bind(notRecursive);
 305 
 306   // Compute owner address.
 307   lea(owner_addr, Address(tmp, ObjectMonitor::owner_offset()));
 308 
 309   // Set owner to null.
 310   // Release to satisfy the JMM
 311   stlr(zr, owner_addr);
 312   // We need a full fence after clearing owner to avoid stranding.
 313   // StoreLoad achieves this.
 314   membar(StoreLoad);
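  // After the owner field has been cleared another thread may already have queued itself
  // on the monitor. If the entry lists turn out to be non-empty and nobody has been
  // nominated as successor, this thread has to take the slow path (via
  // unlocked_inflated_monitor_offset below) so that the waiting threads are not stranded.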
 315 
 316   // Check if the entry lists are empty (EntryList first - by convention).
 317   ldr(rscratch1, Address(tmp, ObjectMonitor::EntryList_offset()));
 318   ldr(tmpReg, Address(tmp, ObjectMonitor::cxq_offset()));
 319   orr(rscratch1, rscratch1, tmpReg);
 320   cmp(rscratch1, zr);
 321   br(Assembler::EQ, cont);     // If so we are done.
 322 
 323   // Check if there is a successor.
 324   ldr(rscratch1, Address(tmp, ObjectMonitor::succ_offset()));
 325   cmp(rscratch1, zr);
 326   br(Assembler::NE, unlocked); // If so we are done.
 327 
 328   // Save the monitor pointer in the current thread, so we can try to
 329   // reacquire the lock in SharedRuntime::monitor_exit_helper().
 330   str(tmp, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));
 331 
 332   cmp(zr, rthread); // Set Flag to NE => slow path
 333   b(cont);
 334 
 335   bind(unlocked);
 336   cmp(zr, zr); // Set Flag to EQ => fast path
 337 
 338   // Intentional fall-through
 339 
 340   bind(cont);
 341   // flag == EQ indicates success
 342   // flag == NE indicates failure
 343   br(Assembler::NE, no_count);
 344 
 345   bind(count);
 346   decrement(Address(rthread, JavaThread::held_monitor_count_offset()));
 347 
 348   bind(no_count);
 349 }
 350 
 351 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register t1,
 352                                               Register t2, Register t3) {
 353   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 354   assert_different_registers(obj, box, t1, t2, t3);
 355 
 356   // Handle inflated monitor.
 357   Label inflated;
  // Finish fast lock successfully. MUST be branched to with flag == EQ
  Label locked;
  // Finish fast lock unsuccessfully. MUST be branched to with flag == NE
  Label slow_path;
 362 
 363   if (UseObjectMonitorTable) {
 364     // Clear cache in case fast locking succeeds.
 365     str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 366   }
 367 
 368   if (DiagnoseSyncOnValueBasedClasses != 0) {
 369     load_klass(t1, obj);
 370     ldrb(t1, Address(t1, Klass::misc_flags_offset()));
 371     tst(t1, KlassFlags::_misc_is_value_based_class);
 372     br(Assembler::NE, slow_path);
 373   }
 374 
 375   const Register t1_mark = t1;
 376   const Register t3_t = t3;
 377 
 378   { // Lightweight locking
 379 
    // Push lock to the lock stack and finish successfully. MUST be branched to with flag == EQ
 381     Label push;
 382 
 383     const Register t2_top = t2;
 384 
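
    // The lock-stack is a small fixed-size array of oops inside JavaThread, and
    // lock_stack_top is kept as a byte offset from the thread base, so
    // Address(rthread, t2_top) addresses the slot directly and fullness is detected
    // by comparing the offset against LockStack::end_offset().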
 385     // Check if lock-stack is full.
 386     ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 387     cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
 388     br(Assembler::GT, slow_path);
 389 
 390     // Check if recursive.
 391     subw(t3_t, t2_top, oopSize);
 392     ldr(t3_t, Address(rthread, t3_t));
 393     cmp(obj, t3_t);
 394     br(Assembler::EQ, push);
 395 
 396     // Relaxed normal load to check for monitor. Optimization for monitor case.
 397     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 398     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 399 
 400     // Not inflated
 401     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");
 402 
 403     // Try to lock. Transition lock-bits 0b01 => 0b00
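    // e.g. for mark ...01 (unlocked) the orr leaves expected == ...01 and the eor
    // produces new == ...00 (fast-locked), with all other mark bits preserved.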
 404     orr(t1_mark, t1_mark, markWord::unlocked_value);
 405     eor(t3_t, t1_mark, markWord::unlocked_value);
 406     cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
 407             /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
 408     br(Assembler::NE, slow_path);
 409 
 410     bind(push);
 411     // After successful lock, push object on lock-stack.
 412     str(obj, Address(rthread, t2_top));
 413     addw(t2_top, t2_top, oopSize);
 414     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 415     b(locked);
 416   }
 417 
 418   { // Handle inflated monitor.
 419     bind(inflated);
 420 
 421     const Register t1_monitor = t1;
 422 
 423     if (!UseObjectMonitorTable) {
 424       assert(t1_monitor == t1_mark, "should be the same here");
 425     } else {
 426       Label monitor_found;
 427 
 428       // Load cache address
 429       lea(t3_t, Address(rthread, JavaThread::om_cache_oops_offset()));
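
      // The per-thread monitor cache is a small array of (oop, monitor) entries terminated
      // by a null sentinel. The first entries are probed with straight-line code and the
      // remainder with the loop below.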
 430 
 431       const int num_unrolled = 2;
 432       for (int i = 0; i < num_unrolled; i++) {
 433         ldr(t1, Address(t3_t));
 434         cmp(obj, t1);
 435         br(Assembler::EQ, monitor_found);
 436         increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
 437       }
 438 
 439       Label loop;
 440 
 441       // Search for obj in cache.
 442       bind(loop);
 443 
 444       // Check for match.
 445       ldr(t1, Address(t3_t));
 446       cmp(obj, t1);
 447       br(Assembler::EQ, monitor_found);
 448 
 449       // Search until null encountered, guaranteed _null_sentinel at end.
 450       increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
 451       cbnz(t1, loop);
 452       // Cache Miss, NE set from cmp above, cbnz does not set flags
 453       b(slow_path);
 454 
 455       bind(monitor_found);
 456       ldr(t1_monitor, Address(t3_t, OMCache::oop_to_monitor_difference()));
 457     }
 458 
 459     const Register t2_owner_addr = t2;
 460     const Register t3_owner = t3;
 461     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 462     const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
 463     const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);
 464 
 465     Label monitor_locked;
 466 
 467     // Compute owner address.
 468     lea(t2_owner_addr, owner_address);
 469 
 470     // CAS owner (null => current thread).
 471     cmpxchg(t2_owner_addr, zr, rthread, Assembler::xword, /*acquire*/ true,
 472             /*release*/ false, /*weak*/ false, t3_owner);
 473     br(Assembler::EQ, monitor_locked);
 474 
 475     // Check if recursive.
 476     cmp(t3_owner, rthread);
 477     br(Assembler::NE, slow_path);
 478 
 479     // Recursive.
 480     increment(recursions_address, 1);
 481 
 482     bind(monitor_locked);
 483     if (UseObjectMonitorTable) {
 484       str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 485     }
 486   }
 487 
 488   bind(locked);
 489   increment(Address(rthread, JavaThread::held_monitor_count_offset()));
 490 
 491 #ifdef ASSERT
 492   // Check that locked label is reached with Flags == EQ.
 493   Label flag_correct;
 494   br(Assembler::EQ, flag_correct);
 495   stop("Fast Lock Flag != EQ");
 496 #endif
 497 
 498   bind(slow_path);
 499 #ifdef ASSERT
 500   // Check that slow_path label is reached with Flags == NE.
 501   br(Assembler::NE, flag_correct);
 502   stop("Fast Lock Flag != NE");
 503   bind(flag_correct);
 504 #endif
 505   // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
 506 }
 507 
 508 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register box, Register t1,
 509                                                 Register t2, Register t3) {
 510   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 511   assert_different_registers(obj, box, t1, t2, t3);
 512 
 513   // Handle inflated monitor.
 514   Label inflated, inflated_load_mark;
  // Finish fast unlock successfully. MUST be branched to with flag == EQ
  Label unlocked;
  // Finish fast unlock unsuccessfully. MUST be branched to with flag == NE
  Label slow_path;
 519 
 520   const Register t1_mark = t1;
 521   const Register t2_top = t2;
 522   const Register t3_t = t3;
 523 
 524   { // Lightweight unlock
 525 
 526     Label push_and_slow_path;
 527 
 528     // Check if obj is top of lock-stack.
 529     ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 530     subw(t2_top, t2_top, oopSize);
 531     ldr(t3_t, Address(rthread, t2_top));
 532     cmp(obj, t3_t);
 533     // Top of lock stack was not obj. Must be monitor.
 534     br(Assembler::NE, inflated_load_mark);
 535 
 536     // Pop lock-stack.
 537     DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
 538     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 539 
 540     // Check if recursive.
 541     subw(t3_t, t2_top, oopSize);
 542     ldr(t3_t, Address(rthread, t3_t));
 543     cmp(obj, t3_t);
 544     br(Assembler::EQ, unlocked);
 545 
 546     // Not recursive.
 547     // Load Mark.
 548     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 549 
 550     // Check header for monitor (0b10).
    // Because we got here by popping (meaning we pushed this obj when locking)
    // there will be no monitor in the box. So we need to push the obj back
    // so that the runtime can fix any potential anonymous owner.
 554     tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);
 555 
 556     // Try to unlock. Transition lock bits 0b00 => 0b01
 557     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
 558     orr(t3_t, t1_mark, markWord::unlocked_value);
 559     cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
 560             /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
 561     br(Assembler::EQ, unlocked);
 562 
 563     bind(push_and_slow_path);
 564     // Compare and exchange failed.
 565     // Restore lock-stack and handle the unlock in runtime.
 566     DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
 567     addw(t2_top, t2_top, oopSize);
 568     str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 569     b(slow_path);
 570   }
 571 
 572 
 573   { // Handle inflated monitor.
 574     bind(inflated_load_mark);
 575     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 576 #ifdef ASSERT
 577     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 578     stop("Fast Unlock not monitor");
 579 #endif
 580 
 581     bind(inflated);
 582 
 583 #ifdef ASSERT
 584     Label check_done;
 585     subw(t2_top, t2_top, oopSize);
 586     cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
 587     br(Assembler::LT, check_done);
 588     ldr(t3_t, Address(rthread, t2_top));
 589     cmp(obj, t3_t);
 590     br(Assembler::NE, inflated);
 591     stop("Fast Unlock lock on stack");
 592     bind(check_done);
 593 #endif
 594 
 595     const Register t1_monitor = t1;
 596 
 597     if (!UseObjectMonitorTable) {
 598       assert(t1_monitor == t1_mark, "should be the same here");
 599 
 600       // Untag the monitor.
 601       add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
 602     } else {
 603       ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 604       // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
 605       cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
 606       br(Assembler::LO, slow_path);
 607     }
 608 
 609     const Register t2_recursions = t2;
 610     Label not_recursive;
 611 
 612     // Check if recursive.
 613     ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
 614     cbz(t2_recursions, not_recursive);
 615 
 616     // Recursive unlock.
 617     sub(t2_recursions, t2_recursions, 1u);
 618     str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
 619     // Set flag == EQ
 620     cmp(t2_recursions, t2_recursions);
 621     b(unlocked);
 622 
 623     bind(not_recursive);
 624 
 625     const Register t2_owner_addr = t2;
 626 
 627     // Compute owner address.
 628     lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));
 629 
 630     // Set owner to null.
 631     // Release to satisfy the JMM
 632     stlr(zr, t2_owner_addr);
 633     // We need a full fence after clearing owner to avoid stranding.
 634     // StoreLoad achieves this.
 635     membar(StoreLoad);
 636 
 637     // Check if the entry lists are empty (EntryList first - by convention).
 638     ldr(rscratch1, Address(t1_monitor, ObjectMonitor::EntryList_offset()));
 639     ldr(t3_t, Address(t1_monitor, ObjectMonitor::cxq_offset()));
 640     orr(rscratch1, rscratch1, t3_t);
 641     cmp(rscratch1, zr);
 642     br(Assembler::EQ, unlocked);  // If so we are done.
 643 
 644     // Check if there is a successor.
 645     ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
 646     cmp(rscratch1, zr);
 647     br(Assembler::NE, unlocked);  // If so we are done.
 648 
 649     // Save the monitor pointer in the current thread, so we can try to
 650     // reacquire the lock in SharedRuntime::monitor_exit_helper().
 651     str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));
 652 
 653     cmp(zr, rthread); // Set Flag to NE => slow path
 654     b(slow_path);
 655   }
 656 
 657   bind(unlocked);
 658   decrement(Address(rthread, JavaThread::held_monitor_count_offset()));
 659   cmp(zr, zr); // Set Flags to EQ => fast path
 660 
 661 #ifdef ASSERT
 662   // Check that unlocked label is reached with Flags == EQ.
 663   Label flag_correct;
 664   br(Assembler::EQ, flag_correct);
 665   stop("Fast Unlock Flag != EQ");
 666 #endif
 667 
 668   bind(slow_path);
 669 #ifdef ASSERT
 670   // Check that slow_path label is reached with Flags == NE.
 671   br(Assembler::NE, flag_correct);
 672   stop("Fast Unlock Flag != NE");
 673   bind(flag_correct);
 674 #endif
 675   // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
 676 }
 677 
 678 // Search for str1 in str2 and return index or -1
 679 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
 680 void C2_MacroAssembler::string_indexof(Register str2, Register str1,
 681                                        Register cnt2, Register cnt1,
 682                                        Register tmp1, Register tmp2,
 683                                        Register tmp3, Register tmp4,
 684                                        Register tmp5, Register tmp6,
 685                                        int icnt1, Register result, int ae) {
 686   // NOTE: tmp5, tmp6 can be zr depending on specific method version
 687   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
 688 
 689   Register ch1 = rscratch1;
 690   Register ch2 = rscratch2;
 691   Register cnt1tmp = tmp1;
 692   Register cnt2tmp = tmp2;
 693   Register cnt1_neg = cnt1;
 694   Register cnt2_neg = cnt2;
 695   Register result_tmp = tmp4;
 696 
 697   bool isL = ae == StrIntrinsicNode::LL;
 698 
 699   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
 700   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
 701   int str1_chr_shift = str1_isL ? 0:1;
 702   int str2_chr_shift = str2_isL ? 0:1;
 703   int str1_chr_size = str1_isL ? 1:2;
 704   int str2_chr_size = str2_isL ? 1:2;
 705   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
 706                                       (chr_insn)&MacroAssembler::ldrh;
 707   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
 708                                       (chr_insn)&MacroAssembler::ldrh;
 709   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
 710   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
 711 
 712   // Note, inline_string_indexOf() generates checks:
 713   // if (substr.count > string.count) return -1;
 714   // if (substr.count == 0) return 0;
 715 
 716   // We have two strings, a source string in str2, cnt2 and a pattern string
 717   // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
 718 
 719   // For larger pattern and source we use a simplified Boyer Moore algorithm.
 720   // With a small pattern and source we use linear scan.
 721 
 722   if (icnt1 == -1) {
 723     sub(result_tmp, cnt2, cnt1);
 724     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
 725     br(LT, LINEARSEARCH);
 726     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
 727     subs(zr, cnt1, 256);
 728     lsr(tmp1, cnt2, 2);
 729     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
 730     br(GE, LINEARSTUB);
 731   }
 732 
// The Boyer-Moore algorithm is based on the description here:
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// This describes an algorithm with two shift rules: the 'Bad Character' rule
// and the 'Good Suffix' rule.
 739 //
 740 // These rules are essentially heuristics for how far we can shift the
 741 // pattern along the search string.
 742 //
 743 // The implementation here uses the 'Bad Character' rule only because of the
 744 // complexity of initialisation for the 'Good Suffix' rule.
 745 //
// This is also known as the Boyer-Moore-Horspool algorithm:
 747 //
 748 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
 749 //
// This particular implementation has a few Java-specific optimizations.
 751 //
 752 // #define ASIZE 256
 753 //
 754 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
 755 //       int i, j;
 756 //       unsigned c;
 757 //       unsigned char bc[ASIZE];
 758 //
 759 //       /* Preprocessing */
 760 //       for (i = 0; i < ASIZE; ++i)
 761 //          bc[i] = m;
 762 //       for (i = 0; i < m - 1; ) {
 763 //          c = x[i];
 764 //          ++i;
 765 //          // c < 256 for Latin1 string, so, no need for branch
 766 //          #ifdef PATTERN_STRING_IS_LATIN1
 767 //          bc[c] = m - i;
 768 //          #else
 769 //          if (c < ASIZE) bc[c] = m - i;
 770 //          #endif
 771 //       }
 772 //
 773 //       /* Searching */
 774 //       j = 0;
 775 //       while (j <= n - m) {
//          c = y[j+m-1];
//          if (x[m-1] == c) {
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//            if (i < 0) return j;
//          }
 780 //          // c < 256 for Latin1 string, so, no need for branch
 781 //          #ifdef SOURCE_STRING_IS_LATIN1
 782 //          // LL case: (c< 256) always true. Remove branch
 783 //          j += bc[y[j+m-1]];
 784 //          #endif
 785 //          #ifndef PATTERN_STRING_IS_UTF
 786 //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
 787 //          if (c < ASIZE)
 788 //            j += bc[y[j+m-1]];
 789 //          else
//            j += 1;
 791 //          #endif
 792 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
 793 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
 794 //          if (c < ASIZE)
 795 //            j += bc[y[j+m-1]];
 796 //          else
//            j += m;
 798 //          #endif
 799 //       }
 800 //    }
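//
//    For example, for the pattern "abcd" (m == 4) the preprocessing loop leaves
//    bc['a'] == 3, bc['b'] == 2, bc['c'] == 1 and every other entry at m == 4, so a
//    mismatch on a character that does not occur in the pattern shifts the search
//    window by the full pattern length.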
 801 
 802   if (icnt1 == -1) {
 803     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
 804         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
 805     Register cnt1end = tmp2;
 806     Register str2end = cnt2;
 807     Register skipch = tmp2;
 808 
    // str1 length is >= 8, so we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed (8 for LL or 4 for UU) and half a
    // register for the UL case. We'll re-read the last character in the inner
    // pre-loop code to have a single outer pre-loop load.
 813     const int firstStep = isL ? 7 : 3;
 814 
 815     const int ASIZE = 256;
 816     const int STORED_BYTES = 32; // amount of bytes stored per instruction
 817     sub(sp, sp, ASIZE);
 818     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
 819     mov(ch1, sp);
 820     BIND(BM_INIT_LOOP);
 821       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
 822       subs(tmp5, tmp5, 1);
 823       br(GT, BM_INIT_LOOP);
 824 
 825       sub(cnt1tmp, cnt1, 1);
 826       mov(tmp5, str2);
 827       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
 828       sub(ch2, cnt1, 1);
 829       mov(tmp3, str1);
 830     BIND(BCLOOP);
 831       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
 832       if (!str1_isL) {
 833         subs(zr, ch1, ASIZE);
 834         br(HS, BCSKIP);
 835       }
 836       strb(ch2, Address(sp, ch1));
 837     BIND(BCSKIP);
 838       subs(ch2, ch2, 1);
 839       br(GT, BCLOOP);
 840 
 841       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
 842       if (str1_isL == str2_isL) {
 843         // load last 8 bytes (8LL/4UU symbols)
 844         ldr(tmp6, Address(tmp6, -wordSize));
 845       } else {
 846         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
 847         // convert Latin1 to UTF. We'll have to wait until load completed, but
 848         // it's still faster than per-character loads+checks
 849         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
 850         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
 851         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
 852         andr(tmp6, tmp6, 0xFF); // str1[N-4]
 853         orr(ch2, ch1, ch2, LSL, 16);
 854         orr(tmp6, tmp6, tmp3, LSL, 48);
 855         orr(tmp6, tmp6, ch2, LSL, 16);
 856       }
 857     BIND(BMLOOPSTR2);
 858       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 859       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
 860       if (str1_isL == str2_isL) {
 861         // re-init tmp3. It's for free because it's executed in parallel with
 862         // load above. Alternative is to initialize it before loop, but it'll
 863         // affect performance on in-order systems with 2 or more ld/st pipelines
 864         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
 865       }
 866       if (!isL) { // UU/UL case
 867         lsl(ch2, cnt1tmp, 1); // offset in bytes
 868       }
 869       cmp(tmp3, skipch);
 870       br(NE, BMSKIP);
 871       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
 872       mov(ch1, tmp6);
 873       if (isL) {
 874         b(BMLOOPSTR1_AFTER_LOAD);
 875       } else {
 876         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
 877         b(BMLOOPSTR1_CMP);
 878       }
 879     BIND(BMLOOPSTR1);
 880       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
 881       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 882     BIND(BMLOOPSTR1_AFTER_LOAD);
 883       subs(cnt1tmp, cnt1tmp, 1);
 884       br(LT, BMLOOPSTR1_LASTCMP);
 885     BIND(BMLOOPSTR1_CMP);
 886       cmp(ch1, ch2);
 887       br(EQ, BMLOOPSTR1);
 888     BIND(BMSKIP);
 889       if (!isL) {
        // if we've met a UTF symbol while searching a Latin1 pattern, then we can
        // skip cnt1 symbols
 892         if (str1_isL != str2_isL) {
 893           mov(result_tmp, cnt1);
 894         } else {
 895           mov(result_tmp, 1);
 896         }
 897         subs(zr, skipch, ASIZE);
 898         br(HS, BMADV);
 899       }
 900       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
 901     BIND(BMADV);
 902       sub(cnt1tmp, cnt1, 1);
 903       add(str2, str2, result_tmp, LSL, str2_chr_shift);
 904       cmp(str2, str2end);
 905       br(LE, BMLOOPSTR2);
 906       add(sp, sp, ASIZE);
 907       b(NOMATCH);
 908     BIND(BMLOOPSTR1_LASTCMP);
 909       cmp(ch1, ch2);
 910       br(NE, BMSKIP);
 911     BIND(BMMATCH);
 912       sub(result, str2, tmp5);
 913       if (!str2_isL) lsr(result, result, 1);
 914       add(sp, sp, ASIZE);
 915       b(DONE);
 916 
 917     BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
 919     br(LT, LINEAR_MEDIUM);
 920     mov(result, zr);
 921     RuntimeAddress stub = nullptr;
 922     if (isL) {
 923       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
 924       assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
 925     } else if (str1_isL) {
 926       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
 928     } else {
 929       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
 930       assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
 931     }
 932     address call = trampoline_call(stub);
 933     if (call == nullptr) {
 934       DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
 935       ciEnv::current()->record_failure("CodeCache is full");
 936       return;
 937     }
 938     b(DONE);
 939   }
 940 
 941   BIND(LINEARSEARCH);
 942   {
 943     Label DO1, DO2, DO3;
 944 
 945     Register str2tmp = tmp2;
 946     Register first = tmp3;
 947 
 948     if (icnt1 == -1)
 949     {
 950         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
 951 
 952         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
 953         br(LT, DOSHORT);
 954       BIND(LINEAR_MEDIUM);
 955         (this->*str1_load_1chr)(first, Address(str1));
 956         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
 957         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
 958         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 959         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
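
        // str1 now points one past the end of the pattern and str2 at the last candidate
        // starting position; cnt1_neg/cnt2_neg hold the matching negative byte offsets.
        // The loops below therefore walk forward by adding the element size and use the
        // flags of the adds to detect the end, avoiding a separate compare per iteration.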
 960 
 961       BIND(FIRST_LOOP);
 962         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 963         cmp(first, ch2);
 964         br(EQ, STR1_LOOP);
 965       BIND(STR2_NEXT);
 966         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 967         br(LE, FIRST_LOOP);
 968         b(NOMATCH);
 969 
 970       BIND(STR1_LOOP);
 971         adds(cnt1tmp, cnt1_neg, str1_chr_size);
 972         add(cnt2tmp, cnt2_neg, str2_chr_size);
 973         br(GE, MATCH);
 974 
 975       BIND(STR1_NEXT);
 976         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
 977         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 978         cmp(ch1, ch2);
 979         br(NE, STR2_NEXT);
 980         adds(cnt1tmp, cnt1tmp, str1_chr_size);
 981         add(cnt2tmp, cnt2tmp, str2_chr_size);
 982         br(LT, STR1_NEXT);
 983         b(MATCH);
 984 
 985       BIND(DOSHORT);
 986       if (str1_isL == str2_isL) {
 987         cmp(cnt1, (u1)2);
 988         br(LT, DO1);
 989         br(GT, DO3);
 990       }
 991     }
 992 
 993     if (icnt1 == 4) {
 994       Label CH1_LOOP;
 995 
 996         (this->*load_4chr)(ch1, str1);
 997         sub(result_tmp, cnt2, 4);
 998         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 999         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
1000 
1001       BIND(CH1_LOOP);
1002         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
1003         cmp(ch1, ch2);
1004         br(EQ, MATCH);
1005         adds(cnt2_neg, cnt2_neg, str2_chr_size);
1006         br(LE, CH1_LOOP);
1007         b(NOMATCH);
1008       }
1009 
1010     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
1011       Label CH1_LOOP;
1012 
1013       BIND(DO2);
1014         (this->*load_2chr)(ch1, str1);
1015         if (icnt1 == 2) {
1016           sub(result_tmp, cnt2, 2);
1017         }
1018         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
1019         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
1020       BIND(CH1_LOOP);
1021         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
1022         cmp(ch1, ch2);
1023         br(EQ, MATCH);
1024         adds(cnt2_neg, cnt2_neg, str2_chr_size);
1025         br(LE, CH1_LOOP);
1026         b(NOMATCH);
1027     }
1028 
1029     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
1030       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
1031 
1032       BIND(DO3);
1033         (this->*load_2chr)(first, str1);
1034         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
1035         if (icnt1 == 3) {
1036           sub(result_tmp, cnt2, 3);
1037         }
1038         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
1039         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
1040       BIND(FIRST_LOOP);
1041         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
1042         cmpw(first, ch2);
1043         br(EQ, STR1_LOOP);
1044       BIND(STR2_NEXT);
1045         adds(cnt2_neg, cnt2_neg, str2_chr_size);
1046         br(LE, FIRST_LOOP);
1047         b(NOMATCH);
1048 
1049       BIND(STR1_LOOP);
1050         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
1051         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
1052         cmp(ch1, ch2);
1053         br(NE, STR2_NEXT);
1054         b(MATCH);
1055     }
1056 
1057     if (icnt1 == -1 || icnt1 == 1) {
1058       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
1059 
1060       BIND(DO1);
1061         (this->*str1_load_1chr)(ch1, str1);
1062         cmp(cnt2, (u1)8);
1063         br(LT, DO1_SHORT);
1064 
1065         sub(result_tmp, cnt2, 8/str2_chr_size);
1066         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
1067         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
1068         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
1069 
1070         if (str2_isL) {
1071           orr(ch1, ch1, ch1, LSL, 8);
1072         }
1073         orr(ch1, ch1, ch1, LSL, 16);
1074         orr(ch1, ch1, ch1, LSL, 32);
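
        // SWAR search for the broadcast character: after the eor a matching character
        // becomes a zero lane in ch2. The sub/orr/bics sequence then produces a non-zero
        // result exactly when some lane is zero, and the position of the lowest matching
        // lane is recovered later with rev + clz (constants are byte-sized for Latin1,
        // halfword-sized for UTF-16).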
1075       BIND(CH1_LOOP);
1076         ldr(ch2, Address(str2, cnt2_neg));
1077         eor(ch2, ch1, ch2);
1078         sub(tmp1, ch2, tmp3);
1079         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
1080         bics(tmp1, tmp1, tmp2);
1081         br(NE, HAS_ZERO);
1082         adds(cnt2_neg, cnt2_neg, 8);
1083         br(LT, CH1_LOOP);
1084 
1085         cmp(cnt2_neg, (u1)8);
1086         mov(cnt2_neg, 0);
1087         br(LT, CH1_LOOP);
1088         b(NOMATCH);
1089 
1090       BIND(HAS_ZERO);
1091         rev(tmp1, tmp1);
1092         clz(tmp1, tmp1);
1093         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
1094         b(MATCH);
1095 
1096       BIND(DO1_SHORT);
1097         mov(result_tmp, cnt2);
1098         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
1099         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
1100       BIND(DO1_LOOP);
1101         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
1102         cmpw(ch1, ch2);
1103         br(EQ, MATCH);
1104         adds(cnt2_neg, cnt2_neg, str2_chr_size);
1105         br(LT, DO1_LOOP);
1106     }
1107   }
1108   BIND(NOMATCH);
1109     mov(result, -1);
1110     b(DONE);
1111   BIND(MATCH);
1112     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
1113   BIND(DONE);
1114 }
1115 
1116 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
1117 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
1118 
1119 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
1120                                             Register ch, Register result,
1121                                             Register tmp1, Register tmp2, Register tmp3)
1122 {
1123   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
1124   Register cnt1_neg = cnt1;
1125   Register ch1 = rscratch1;
1126   Register result_tmp = rscratch2;
1127 
1128   cbz(cnt1, NOMATCH);
1129 
1130   cmp(cnt1, (u1)4);
1131   br(LT, DO1_SHORT);
1132 
1133   orr(ch, ch, ch, LSL, 16);
1134   orr(ch, ch, ch, LSL, 32);
1135 
1136   sub(cnt1, cnt1, 4);
1137   mov(result_tmp, cnt1);
1138   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
1139   sub(cnt1_neg, zr, cnt1, LSL, 1);
1140 
1141   mov(tmp3, 0x0001000100010001);
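
  // The loop below uses the same SWAR zero-lane detection as the DO1 case of
  // string_indexof above: a matching character makes the corresponding 16-bit lane of
  // ch1 zero, so the sub/orr/bics sequence yields a non-zero result.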
1142 
1143   BIND(CH1_LOOP);
1144     ldr(ch1, Address(str1, cnt1_neg));
1145     eor(ch1, ch, ch1);
1146     sub(tmp1, ch1, tmp3);
1147     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
1148     bics(tmp1, tmp1, tmp2);
1149     br(NE, HAS_ZERO);
1150     adds(cnt1_neg, cnt1_neg, 8);
1151     br(LT, CH1_LOOP);
1152 
1153     cmp(cnt1_neg, (u1)8);
1154     mov(cnt1_neg, 0);
1155     br(LT, CH1_LOOP);
1156     b(NOMATCH);
1157 
1158   BIND(HAS_ZERO);
1159     rev(tmp1, tmp1);
1160     clz(tmp1, tmp1);
1161     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
1162     b(MATCH);
1163 
1164   BIND(DO1_SHORT);
1165     mov(result_tmp, cnt1);
1166     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
1167     sub(cnt1_neg, zr, cnt1, LSL, 1);
1168   BIND(DO1_LOOP);
1169     ldrh(ch1, Address(str1, cnt1_neg));
1170     cmpw(ch, ch1);
1171     br(EQ, MATCH);
1172     adds(cnt1_neg, cnt1_neg, 2);
1173     br(LT, DO1_LOOP);
1174   BIND(NOMATCH);
1175     mov(result, -1);
1176     b(DONE);
1177   BIND(MATCH);
1178     add(result, result_tmp, cnt1_neg, ASR, 1);
1179   BIND(DONE);
1180 }
1181 
1182 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
1183                                                 Register ch, Register result,
1184                                                 FloatRegister ztmp1,
1185                                                 FloatRegister ztmp2,
1186                                                 PRegister tmp_pg,
1187                                                 PRegister tmp_pdn, bool isL)
1188 {
1189   // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
1190   assert(tmp_pg->is_governing(),
1191          "this register has to be a governing predicate register");
1192 
1193   Label LOOP, MATCH, DONE, NOMATCH;
1194   Register vec_len = rscratch1;
1195   Register idx = rscratch2;
1196 
  SIMD_RegVariant T = isL ? B : H;
1198 
1199   cbz(cnt1, NOMATCH);
1200 
1201   // Assign the particular char throughout the vector.
1202   sve_dup(ztmp2, T, ch);
1203   if (isL) {
1204     sve_cntb(vec_len);
1205   } else {
1206     sve_cnth(vec_len);
1207   }
1208   mov(idx, 0);
1209 
1210   // Generate a predicate to control the reading of input string.
1211   sve_whilelt(tmp_pg, T, idx, cnt1);
1212 
1213   BIND(LOOP);
1214     // Read a vector of 8- or 16-bit data depending on the string type. Note
1215     // that inactive elements indicated by the predicate register won't cause
1216     // a data read from memory to the destination vector.
1217     if (isL) {
1218       sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
1219     } else {
1220       sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
1221     }
1222     add(idx, idx, vec_len);
1223 
1224     // Perform the comparison. An element of the destination predicate is set
1225     // to active if the particular char is matched.
1226     sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);
1227 
1228     // Branch if the particular char is found.
1229     br(NE, MATCH);
1230 
1231     sve_whilelt(tmp_pg, T, idx, cnt1);
1232 
1233     // Loop back if the particular char not found.
1234     br(MI, LOOP);
1235 
1236   BIND(NOMATCH);
1237     mov(result, -1);
1238     b(DONE);
1239 
1240   BIND(MATCH);
1241     // Undo the index increment.
1242     sub(idx, idx, vec_len);
1243 
1244     // Crop the vector to find its location.
1245     sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
1246     add(result, idx, -1);
1247     sve_incp(result, T, tmp_pdn);
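    // sve_brka keeps the predicate lanes up to and including the first matching element,
    // so sve_incp adds (lane index + 1) to (idx - 1), producing the element index of the
    // first occurrence of ch.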
1248   BIND(DONE);
1249 }
1250 
1251 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
1252                                             Register ch, Register result,
1253                                             Register tmp1, Register tmp2, Register tmp3)
1254 {
1255   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
1256   Register cnt1_neg = cnt1;
1257   Register ch1 = rscratch1;
1258   Register result_tmp = rscratch2;
1259 
1260   cbz(cnt1, NOMATCH);
1261 
1262   cmp(cnt1, (u1)8);
1263   br(LT, DO1_SHORT);
1264 
1265   orr(ch, ch, ch, LSL, 8);
1266   orr(ch, ch, ch, LSL, 16);
1267   orr(ch, ch, ch, LSL, 32);
1268 
1269   sub(cnt1, cnt1, 8);
1270   mov(result_tmp, cnt1);
1271   lea(str1, Address(str1, cnt1));
1272   sub(cnt1_neg, zr, cnt1);
1273 
1274   mov(tmp3, 0x0101010101010101);
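
  // Same SWAR zero-lane detection as above, operating on 8-bit lanes, hence the
  // 0x01... and 0x7f... constants.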
1275 
1276   BIND(CH1_LOOP);
1277     ldr(ch1, Address(str1, cnt1_neg));
1278     eor(ch1, ch, ch1);
1279     sub(tmp1, ch1, tmp3);
1280     orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
1281     bics(tmp1, tmp1, tmp2);
1282     br(NE, HAS_ZERO);
1283     adds(cnt1_neg, cnt1_neg, 8);
1284     br(LT, CH1_LOOP);
1285 
1286     cmp(cnt1_neg, (u1)8);
1287     mov(cnt1_neg, 0);
1288     br(LT, CH1_LOOP);
1289     b(NOMATCH);
1290 
1291   BIND(HAS_ZERO);
1292     rev(tmp1, tmp1);
1293     clz(tmp1, tmp1);
1294     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
1295     b(MATCH);
1296 
1297   BIND(DO1_SHORT);
1298     mov(result_tmp, cnt1);
1299     lea(str1, Address(str1, cnt1));
1300     sub(cnt1_neg, zr, cnt1);
1301   BIND(DO1_LOOP);
1302     ldrb(ch1, Address(str1, cnt1_neg));
1303     cmp(ch, ch1);
1304     br(EQ, MATCH);
1305     adds(cnt1_neg, cnt1_neg, 1);
1306     br(LT, DO1_LOOP);
1307   BIND(NOMATCH);
1308     mov(result, -1);
1309     b(DONE);
1310   BIND(MATCH);
1311     add(result, result_tmp, cnt1_neg);
1312   BIND(DONE);
1313 }
1314 
1315 // Compare strings.
1316 void C2_MacroAssembler::string_compare(Register str1, Register str2,
1317     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
1318     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
1319     PRegister pgtmp1, PRegister pgtmp2, int ae) {
1320   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
1321       DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
1322       SHORT_LOOP_START, TAIL_CHECK;
1323 
1324   bool isLL = ae == StrIntrinsicNode::LL;
1325   bool isLU = ae == StrIntrinsicNode::LU;
1326   bool isUL = ae == StrIntrinsicNode::UL;
1327 
1328   // The stub threshold for LL strings is: 72 (64 + 8) chars
1329   // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
1330   // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
1331   const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);
1332 
1333   bool str1_isL = isLL || isLU;
1334   bool str2_isL = isLL || isUL;
1335 
1336   int str1_chr_shift = str1_isL ? 0 : 1;
1337   int str2_chr_shift = str2_isL ? 0 : 1;
1338   int str1_chr_size = str1_isL ? 1 : 2;
1339   int str2_chr_size = str2_isL ? 1 : 2;
1340   int minCharsInWord = isLL ? wordSize : wordSize/2;
1341 
1342   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
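  // For the mixed LU/UL cases the Latin1 operand is widened on the fly: ldrs loads
  // 4 bytes into vtmp, zip1 with the zeroed vtmpZ interleaves them with zero bytes,
  // and fmovd moves the resulting four UTF-16 characters back into a general register
  // for the 64-bit comparisons below.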
1343   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
1344                                       (chr_insn)&MacroAssembler::ldrh;
1345   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
1346                                       (chr_insn)&MacroAssembler::ldrh;
1347   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
1348                             (uxt_insn)&MacroAssembler::uxthw;
1349 
1350   BLOCK_COMMENT("string_compare {");
1351 
  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; however, the result is always in characters.
1354   if (!str1_isL) asrw(cnt1, cnt1, 1);
1355   if (!str2_isL) asrw(cnt2, cnt2, 1);
1356 
1357   // Compute the minimum of the string lengths and save the difference.
1358   subsw(result, cnt1, cnt2);
1359   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
1360 
1361   // A very short string
1362   cmpw(cnt2, minCharsInWord);
1363   br(Assembler::LE, SHORT_STRING);
1364 
1365   // Compare longwords
1366   // load first parts of strings and finish initialization while loading
1367   {
1368     if (str1_isL == str2_isL) { // LL or UU
1369       ldr(tmp1, Address(str1));
1370       cmp(str1, str2);
1371       br(Assembler::EQ, DONE);
1372       ldr(tmp2, Address(str2));
1373       cmp(cnt2, stub_threshold);
1374       br(GE, STUB);
1375       subsw(cnt2, cnt2, minCharsInWord);
1376       br(EQ, TAIL_CHECK);
1377       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1378       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1379       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1380     } else if (isLU) {
1381       ldrs(vtmp, Address(str1));
1382       ldr(tmp2, Address(str2));
1383       cmp(cnt2, stub_threshold);
1384       br(GE, STUB);
1385       subw(cnt2, cnt2, 4);
1386       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1387       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1388       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1389       zip1(vtmp, T8B, vtmp, vtmpZ);
1390       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1391       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1392       add(cnt1, cnt1, 4);
1393       fmovd(tmp1, vtmp);
1394     } else { // UL case
1395       ldr(tmp1, Address(str1));
1396       ldrs(vtmp, Address(str2));
1397       cmp(cnt2, stub_threshold);
1398       br(GE, STUB);
1399       subw(cnt2, cnt2, 4);
1400       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1401       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1402       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1403       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1404       zip1(vtmp, T8B, vtmp, vtmpZ);
1405       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1406       add(cnt1, cnt1, 8);
1407       fmovd(tmp2, vtmp);
1408     }
1409     adds(cnt2, cnt2, isUL ? 4 : 8);
1410     br(GE, TAIL);
1411     eor(rscratch2, tmp1, tmp2);
1412     cbnz(rscratch2, DIFF);
1413     // main loop
1414     bind(NEXT_WORD);
1415     if (str1_isL == str2_isL) {
1416       ldr(tmp1, Address(str1, cnt2));
1417       ldr(tmp2, Address(str2, cnt2));
1418       adds(cnt2, cnt2, 8);
1419     } else if (isLU) {
1420       ldrs(vtmp, Address(str1, cnt1));
1421       ldr(tmp2, Address(str2, cnt2));
1422       add(cnt1, cnt1, 4);
1423       zip1(vtmp, T8B, vtmp, vtmpZ);
1424       fmovd(tmp1, vtmp);
1425       adds(cnt2, cnt2, 8);
1426     } else { // UL
1427       ldrs(vtmp, Address(str2, cnt2));
1428       ldr(tmp1, Address(str1, cnt1));
1429       zip1(vtmp, T8B, vtmp, vtmpZ);
1430       add(cnt1, cnt1, 8);
1431       fmovd(tmp2, vtmp);
1432       adds(cnt2, cnt2, 4);
1433     }
1434     br(GE, TAIL);
1435 
1436     eor(rscratch2, tmp1, tmp2);
1437     cbz(rscratch2, NEXT_WORD);
1438     b(DIFF);
1439     bind(TAIL);
1440     eor(rscratch2, tmp1, tmp2);
1441     cbnz(rscratch2, DIFF);
1442     // Last longword.  In the case where length == 4 we compare the
1443     // same longword twice, but that's still faster than another
1444     // conditional branch.
1445     if (str1_isL == str2_isL) {
1446       ldr(tmp1, Address(str1));
1447       ldr(tmp2, Address(str2));
1448     } else if (isLU) {
1449       ldrs(vtmp, Address(str1));
1450       ldr(tmp2, Address(str2));
1451       zip1(vtmp, T8B, vtmp, vtmpZ);
1452       fmovd(tmp1, vtmp);
1453     } else { // UL
1454       ldrs(vtmp, Address(str2));
1455       ldr(tmp1, Address(str1));
1456       zip1(vtmp, T8B, vtmp, vtmpZ);
1457       fmovd(tmp2, vtmp);
1458     }
1459     bind(TAIL_CHECK);
1460     eor(rscratch2, tmp1, tmp2);
1461     cbz(rscratch2, DONE);
1462 
1463     // Find the first different characters in the longwords and
1464     // compute their difference.
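    // rscratch2 = tmp1 ^ tmp2 is non-zero here. rev + clz locate the first differing
    // byte counted from the least significant end (the loads are little-endian, so that
    // is the first differing character), andr rounds the bit index down to a character
    // boundary, and the lsrv shifts bring that character to the bottom of each word
    // before the widening subtraction.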
1465     bind(DIFF);
1466     rev(rscratch2, rscratch2);
1467     clz(rscratch2, rscratch2);
1468     andr(rscratch2, rscratch2, isLL ? -8 : -16);
1469     lsrv(tmp1, tmp1, rscratch2);
1470     (this->*ext_chr)(tmp1, tmp1);
1471     lsrv(tmp2, tmp2, rscratch2);
1472     (this->*ext_chr)(tmp2, tmp2);
1473     subw(result, tmp1, tmp2);
1474     b(DONE);
1475   }
1476 
1477   bind(STUB);
1478     RuntimeAddress stub = nullptr;
1479     switch(ae) {
1480       case StrIntrinsicNode::LL:
1481         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
1482         break;
1483       case StrIntrinsicNode::UU:
1484         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
1485         break;
1486       case StrIntrinsicNode::LU:
1487         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
1488         break;
1489       case StrIntrinsicNode::UL:
1490         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
1491         break;
1492       default:
1493         ShouldNotReachHere();
1494     }
1495     assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
1496     address call = trampoline_call(stub);
1497     if (call == nullptr) {
1498       DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
1499       ciEnv::current()->record_failure("CodeCache is full");
1500       return;
1501     }
1502     b(DONE);
1503 
1504   bind(SHORT_STRING);
1505   // Is the minimum length zero?
1506   cbz(cnt2, DONE);
1507   // Arrange the code so that most branches are taken while characters are being
1508   // loaded, and the next characters are loaded while the previous ones are compared.
1509   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1510   subs(cnt2, cnt2, 1);
1511   br(EQ, SHORT_LAST_INIT);
1512   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1513   b(SHORT_LOOP_START);
1514   bind(SHORT_LOOP);
1515   subs(cnt2, cnt2, 1);
1516   br(EQ, SHORT_LAST);
1517   bind(SHORT_LOOP_START);
1518   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
1519   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
1520   cmp(tmp1, cnt1);
1521   br(NE, SHORT_LOOP_TAIL);
1522   subs(cnt2, cnt2, 1);
1523   br(EQ, SHORT_LAST2);
1524   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1525   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1526   cmp(tmp2, rscratch1);
1527   br(EQ, SHORT_LOOP);
1528   sub(result, tmp2, rscratch1);
1529   b(DONE);
1530   bind(SHORT_LOOP_TAIL);
1531   sub(result, tmp1, cnt1);
1532   b(DONE);
1533   bind(SHORT_LAST2);
1534   cmp(tmp2, rscratch1);
1535   br(EQ, DONE);
1536   sub(result, tmp2, rscratch1);
1537 
1538   b(DONE);
1539   bind(SHORT_LAST_INIT);
1540   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1541   bind(SHORT_LAST);
1542   cmp(tmp1, cnt1);
1543   br(EQ, DONE);
1544   sub(result, tmp1, cnt1);
1545 
1546   bind(DONE);
1547 
1548   BLOCK_COMMENT("} string_compare");
1549 }
1550 
1551 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
1552                                      FloatRegister src2, Condition cond, bool isQ) {
1553   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1554   FloatRegister zn = src1, zm = src2;
1555   bool needs_negation = false;
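       // ASIMD provides register-to-register compares only for EQ, GE, GT, HI and HS
       // (FCMEQ/FCMGE/FCMGT for floating point), so LT/LE/LO/LS are implemented by
       // swapping the two operands, and NE by inverting the result of EQ.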
1556   switch (cond) {
1557     case LT: cond = GT; zn = src2; zm = src1; break;
1558     case LE: cond = GE; zn = src2; zm = src1; break;
1559     case LO: cond = HI; zn = src2; zm = src1; break;
1560     case LS: cond = HS; zn = src2; zm = src1; break;
1561     case NE: cond = EQ; needs_negation = true; break;
1562     default:
1563       break;
1564   }
1565 
1566   if (is_floating_point_type(bt)) {
1567     fcm(cond, dst, size, zn, zm);
1568   } else {
1569     cm(cond, dst, size, zn, zm);
1570   }
1571 
1572   if (needs_negation) {
1573     notr(dst, isQ ? T16B : T8B, dst);
1574   }
1575 }
1576 
1577 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1578                                           Condition cond, bool isQ) {
1579   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1580   if (bt == T_FLOAT || bt == T_DOUBLE) {
1581     if (cond == Assembler::NE) {
1582       fcm(Assembler::EQ, dst, size, src);
1583       notr(dst, isQ ? T16B : T8B, dst);
1584     } else {
1585       fcm(cond, dst, size, src);
1586     }
1587   } else {
1588     if (cond == Assembler::NE) {
1589       cm(Assembler::EQ, dst, size, src);
1590       notr(dst, isQ ? T16B : T8B, dst);
1591     } else {
1592       cm(cond, dst, size, src);
1593     }
1594   }
1595 }
1596 
1597 // Compress the least significant bit of each byte into the lowest byte of dst
1598 // and clear the higher garbage bits.
1599 void C2_MacroAssembler::bytemask_compress(Register dst) {
1600   // Example input, dst = 0x01 00 00 00 01 01 00 01
1601   // The "??" bytes are garbage.
1602   orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
1603   orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x?? ?? ?? 08 ?? ?? ?? 0D
1604   orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x?? ?? ?? ?? ?? ?? ?? 8D
1605   andr(dst, dst, 0xff);                   // dst = 0x8D
1606 }
1607 
1608 // Pack the lowest-numbered bit of each mask element in src into a long value
1609 // in dst, at most the first 64 lane elements.
1610 // Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM.
1611 void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
1612                                          FloatRegister vtmp1, FloatRegister vtmp2) {
1613   assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1614   assert_different_registers(dst, rscratch1);
1615   assert_different_registers(vtmp1, vtmp2);
1616 
1617   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1618   // Example:   src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
1619   // Expected:  dst = 0x658D
1620 
1621   // Convert the mask into a vector of bytes: 0x01 for active lanes, 0x00 otherwise.
1622   // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
1623   sve_cpy(vtmp1, size, src, 1, false);
1624   if (bt != T_BYTE) {
1625     sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
1626   }
1627 
1628   if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
1629     // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
1630     // is to compress the significant bit of each byte in a cross-lane way. Due
1631     // to the lack of a cross-lane bit-compress instruction, we use BEXT
1632     // (bit-compress in each lane) with the biggest lane size (T = D) then
1633     // concatenate the results.
1634 
1635     // The second source input of BEXT, initialized with 0x01 in each byte.
1636     // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1637     sve_dup(vtmp2, B, 1);
1638 
1639     // BEXT vtmp1.D, vtmp1.D, vtmp2.D
1640     // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1641     // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1642     //         ---------------------------------------
1643     // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1644     sve_bext(vtmp1, D, vtmp1, vtmp2);
1645 
1646     // Concatenate the lowest significant 8 bits of each 8-byte lane, and extract the
1647     // result into dst.
1648     // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1649     // dst   = 0x658D
1650     if (lane_cnt <= 8) {
1651       // No need to concatenate.
1652       umov(dst, vtmp1, B, 0);
1653     } else if (lane_cnt <= 16) {
1654       ins(vtmp1, B, vtmp1, 1, 8);
1655       umov(dst, vtmp1, H, 0);
1656     } else {
1657       // As the lane count is 64 at most, the final expected value must be in
1658       // the lowest 64 bits after narrowing vtmp1 from D to B.
1659       sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
1660       umov(dst, vtmp1, D, 0);
1661     }
1662   } else if (UseSVE > 0) {
1663     // Compress the lowest 8 bytes.
1664     fmovd(dst, vtmp1);
1665     bytemask_compress(dst);
1666     if (lane_cnt <= 8) return;
1667 
1668     // Repeat on higher bytes and join the results.
1669     // Compress 8 bytes in each iteration.
1670     for (int idx = 1; idx < (lane_cnt / 8); idx++) {
1671       sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
1672       bytemask_compress(rscratch1);
1673       orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
1674     }
1675   } else {
1676     assert(false, "unsupported");
1677     ShouldNotReachHere();
1678   }
1679 }
1680 
1681 // Unpack the mask, a long value in src, into predicate register dst based on the
1682 // corresponding data type. Note that dst can support at most 64 lanes.
1683 // Below example gives the expected dst predicate register in different types, with
1684 // a valid src(0x658D) on a 1024-bit vector size machine.
1685 // BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
1686 // SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
1687 // INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
1688 // LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1689 //
1690 // The number of significant bits of src must not exceed lane_cnt. E.g., 0xFF658D, which
1691 // has 24 significant bits, would be an invalid input if the dst predicate register refers to
1692 // a LONG type 1024-bit vector, which has at most 16 lanes.
1693 void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
1694                                            FloatRegister vtmp1, FloatRegister vtmp2) {
1695   assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
1696          lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
1697   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1698   // Example:   src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
1699   // Expected:  dst = 0b01100101 10001101
1700 
1701   // Put long value from general purpose register into the first lane of vector.
1702   // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1703   sve_dup(vtmp1, B, 0);
1704   mov(vtmp1, D, 0, src);
1705 
1706   // As sve_cmp generates the mask with byte granularity at minimum, the value in
1707   // the first lane, which is currently a bit mask, has to be expanded into a byte
1708   // mask. This can be done with SVE2's BDEP instruction.
1709 
1710   // The first source input of the BDEP instruction. Deposit each mask byte into its own 8-byte lane.
1711   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1712   if (lane_cnt <= 8) {
1713     // Nothing to do, as only one byte exists.
1714   } else if (lane_cnt <= 16) {
1715     ins(vtmp1, B, vtmp1, 8, 1);
1716     mov(vtmp1, B, 1, zr);
1717   } else {
1718     sve_vector_extend(vtmp1, D, vtmp1, B);
1719   }
1720 
1721   // The second source input of BDEP instruction, initialized with 0x01 for each byte.
1722   // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1723   sve_dup(vtmp2, B, 1);
1724 
1725   // BDEP vtmp1.D, vtmp1.D, vtmp2.D
1726   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1727   // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1728   //         ---------------------------------------
1729   // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1730   sve_bdep(vtmp1, D, vtmp1, vtmp2);
1731 
1732   if (bt != T_BYTE) {
1733     sve_vector_extend(vtmp1, size, vtmp1, B);
1734   }
1735   // Generate the mask from the given vector, whose elements have been extended to
1736   // the expected type.
1737   // dst = 0b01100101 10001101
1738   sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
1739 }
1740 
1741 // Clobbers: rflags
1742 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1743                                     FloatRegister zn, FloatRegister zm, Condition cond) {
1744   assert(pg->is_governing(), "This register has to be a governing predicate register");
1745   FloatRegister z1 = zn, z2 = zm;
1746   switch (cond) {
1747     case LE: z1 = zm; z2 = zn; cond = GE; break;
1748     case LT: z1 = zm; z2 = zn; cond = GT; break;
1749     case LO: z1 = zm; z2 = zn; cond = HI; break;
1750     case LS: z1 = zm; z2 = zn; cond = HS; break;
1751     default:
1752       break;
1753   }
1754 
1755   SIMD_RegVariant size = elemType_to_regVariant(bt);
1756   if (is_floating_point_type(bt)) {
1757     sve_fcm(cond, pd, size, pg, z1, z2);
1758   } else {
1759     assert(is_integral_type(bt), "unsupported element type");
1760     sve_cmp(cond, pd, size, pg, z1, z2);
1761   }
1762 }
1763 
1764 // Get index of the last mask lane that is set
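     // The index is found by reversing the predicate, counting with BRKB + CNTP the
     // lanes that precede the first set lane of the reversed predicate, and
     // subtracting that count from (max_lane_count - 1).
     // Illustrative example, assuming bt == T_INT on a 128-bit vector (4 lanes):
     //   src = {lane3=1, lane2=0, lane1=1, lane0=0}   // the last set lane is 3
     //   rev = {lane3=0, lane2=1, lane1=0, lane0=1}   // first set lane of rev is 0
     //   brkb/cntp count 0 lanes before it, so dst = (4 - 1) - 0 = 3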
1765 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
1766   SIMD_RegVariant size = elemType_to_regVariant(bt);
1767   sve_rev(ptmp, size, src);
1768   sve_brkb(ptmp, ptrue, ptmp, false);
1769   sve_cntp(dst, size, ptrue, ptmp);
1770   movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
1771   subw(dst, rscratch1, dst);
1772 }
1773 
1774 // Extend integer vector src to dst with the same lane count
1775 // but larger element size, e.g. 4B -> 4I
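     // The extension is implemented with (u|s)shll using a shift amount of 0, applied
     // once per doubling of the element size, e.g. 4B -> 4I takes two steps (B -> H,
     // then H -> S).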
1776 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1777                                            FloatRegister src, BasicType src_bt, bool is_unsigned) {
1778   if (src_bt == T_BYTE) {
1779     if (dst_bt == T_SHORT) {
1780       // 4B/8B to 4S/8S
1781       _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1782     } else {
1783       // 4B to 4I
1784       assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1785       _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1786       _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1787     }
1788   } else if (src_bt == T_SHORT) {
1789     // 4S to 4I
1790     assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1791     _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1792   } else if (src_bt == T_INT) {
1793     // 2I to 2L
1794     assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1795     _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1796   } else {
1797     ShouldNotReachHere();
1798   }
1799 }
1800 
1801 // Narrow integer vector src down to dst with the same lane count
1802 // but smaller element size, e.g. 4I -> 4B
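     // The narrowing is implemented with xtn, applied once per halving of the element
     // size, e.g. 4I -> 4B takes two steps (S -> H, then H -> B).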
1803 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1804                                            FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1805   if (src_bt == T_SHORT) {
1806     // 4S/8S to 4B/8B
1807     assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1808     assert(dst_bt == T_BYTE, "unsupported");
1809     xtn(dst, T8B, src, T8H);
1810   } else if (src_bt == T_INT) {
1811     // 4I to 4B/4S
1812     assert(src_vlen_in_bytes == 16, "unsupported");
1813     assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1814     xtn(dst, T4H, src, T4S);
1815     if (dst_bt == T_BYTE) {
1816       xtn(dst, T8B, dst, T8H);
1817     }
1818   } else if (src_bt == T_LONG) {
1819     // 2L to 2I
1820     assert(src_vlen_in_bytes == 16, "unsupported");
1821     assert(dst_bt == T_INT, "unsupported");
1822     xtn(dst, T2S, src, T2D);
1823   } else {
1824     ShouldNotReachHere();
1825   }
1826 }
1827 
1828 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1829                                           FloatRegister src, SIMD_RegVariant src_size,
1830                                           bool is_unsigned) {
1831   assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
1832 
1833   if (src_size == B) {
1834     switch (dst_size) {
1835     case H:
1836       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1837       break;
1838     case S:
1839       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1840       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1841       break;
1842     case D:
1843       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1844       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1845       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1846       break;
1847     default:
1848       ShouldNotReachHere();
1849     }
1850   } else if (src_size == H) {
1851     if (dst_size == S) {
1852       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1853     } else { // D
1854       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1855       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1856     }
1857   } else if (src_size == S) {
1858     _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
1859   }
1860 }
1861 
1862 // Vector narrow from src to dst with specified element sizes.
1863 // The high part of the dst vector will be filled with zero.
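     // Illustrative example, assuming a 128-bit vector and a D -> S narrow:
     //   src = {d1, d0} (two 64-bit lanes), tmp = {0, 0}
     //   sve_uzp1(dst, S, src, tmp) concatenates the even-numbered S elements of src
     //   (placed in the low half) and of tmp (placed in the high half), giving
     //   dst = {0, 0, lo32(d1), lo32(d0)}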
1864 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1865                                           FloatRegister src, SIMD_RegVariant src_size,
1866                                           FloatRegister tmp) {
1867   assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1868   assert_different_registers(src, tmp);
1869   sve_dup(tmp, src_size, 0);
1870   if (src_size == D) {
1871     switch (dst_size) {
1872     case S:
1873       sve_uzp1(dst, S, src, tmp);
1874       break;
1875     case H:
1876       assert_different_registers(dst, tmp);
1877       sve_uzp1(dst, S, src, tmp);
1878       sve_uzp1(dst, H, dst, tmp);
1879       break;
1880     case B:
1881       assert_different_registers(dst, tmp);
1882       sve_uzp1(dst, S, src, tmp);
1883       sve_uzp1(dst, H, dst, tmp);
1884       sve_uzp1(dst, B, dst, tmp);
1885       break;
1886     default:
1887       ShouldNotReachHere();
1888     }
1889   } else if (src_size == S) {
1890     if (dst_size == H) {
1891       sve_uzp1(dst, H, src, tmp);
1892     } else { // B
1893       assert_different_registers(dst, tmp);
1894       sve_uzp1(dst, H, src, tmp);
1895       sve_uzp1(dst, B, dst, tmp);
1896     }
1897   } else if (src_size == H) {
1898     sve_uzp1(dst, B, src, tmp);
1899   }
1900 }
1901 
1902 // Extend src predicate to dst predicate with the same lane count but larger
1903 // element size, e.g. 64Byte -> 512Long
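     // Illustrative example, doubling the element size: if the low half of src holds
     // the mask bits 0b1101 (lanes 3, 2 and 0 active), punpklo widens each bit to two
     // bits, giving 0b01010001 in dst, i.e. the same lanes active at the next larger
     // element size.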
1904 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1905                                              uint dst_element_length_in_bytes,
1906                                              uint src_element_length_in_bytes) {
1907   if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1908     sve_punpklo(dst, src);
1909   } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1910     sve_punpklo(dst, src);
1911     sve_punpklo(dst, dst);
1912   } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1913     sve_punpklo(dst, src);
1914     sve_punpklo(dst, dst);
1915     sve_punpklo(dst, dst);
1916   } else {
1917     assert(false, "unsupported");
1918     ShouldNotReachHere();
1919   }
1920 }
1921 
1922 // Narrow src predicate to dst predicate with the same lane count but
1923 // smaller element size, e.g. 512Long -> 64Byte
1924 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1925                                              uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
1926   // The insignificant bits in src predicate are expected to be zero.
1927   // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
1928 // passed as the second argument. An example narrowing operation with a given mask:
1929 // 128Long -> 64Int on a 128-bit machine, i.e. 2L -> 2I
1930   // Mask (for 2 Longs) : TF
1931   // Predicate register for the above mask (16 bits) : 00000001 00000000
1932   // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
1933   // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0)
1934   assert_different_registers(src, ptmp);
1935   assert_different_registers(dst, ptmp);
1936   sve_pfalse(ptmp);
1937   if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1938     sve_uzp1(dst, B, src, ptmp);
1939   } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1940     sve_uzp1(dst, H, src, ptmp);
1941     sve_uzp1(dst, B, dst, ptmp);
1942   } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1943     sve_uzp1(dst, S, src, ptmp);
1944     sve_uzp1(dst, H, dst, ptmp);
1945     sve_uzp1(dst, B, dst, ptmp);
1946   } else {
1947     assert(false, "unsupported");
1948     ShouldNotReachHere();
1949   }
1950 }
1951 
1952 // Vector reduction add for integral type with ASIMD instructions.
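     // addv (or a pairwise addp for the 2S and 2D cases) sums all lanes into lane 0 of
     // vtmp; the result is then moved to a general purpose register and combined with
     // isrc, sign-extending sub-int elements.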
1953 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1954                                                  Register isrc, FloatRegister vsrc,
1955                                                  unsigned vector_length_in_bytes,
1956                                                  FloatRegister vtmp) {
1957   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1958   assert_different_registers(dst, isrc);
1959   bool isQ = vector_length_in_bytes == 16;
1960 
1961   BLOCK_COMMENT("neon_reduce_add_integral {");
1962     switch(bt) {
1963       case T_BYTE:
1964         addv(vtmp, isQ ? T16B : T8B, vsrc);
1965         smov(dst, vtmp, B, 0);
1966         addw(dst, dst, isrc, ext::sxtb);
1967         break;
1968       case T_SHORT:
1969         addv(vtmp, isQ ? T8H : T4H, vsrc);
1970         smov(dst, vtmp, H, 0);
1971         addw(dst, dst, isrc, ext::sxth);
1972         break;
1973       case T_INT:
1974         isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1975         umov(dst, vtmp, S, 0);
1976         addw(dst, dst, isrc);
1977         break;
1978       case T_LONG:
1979         assert(isQ, "unsupported");
1980         addpd(vtmp, vsrc);
1981         umov(dst, vtmp, D, 0);
1982         add(dst, dst, isrc);
1983         break;
1984       default:
1985         assert(false, "unsupported");
1986         ShouldNotReachHere();
1987     }
1988   BLOCK_COMMENT("} neon_reduce_add_integral");
1989 }
1990 
1991 // Vector reduction multiply for integral type with ASIMD instructions.
1992 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
1993 // Clobbers: rscratch1
1994 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
1995                                                  Register isrc, FloatRegister vsrc,
1996                                                  unsigned vector_length_in_bytes,
1997                                                  FloatRegister vtmp1, FloatRegister vtmp2) {
1998   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1999   bool isQ = vector_length_in_bytes == 16;
2000 
2001   BLOCK_COMMENT("neon_reduce_mul_integral {");
2002     switch(bt) {
2003       case T_BYTE:
2004         if (isQ) {
2005           // Multiply the lower half and higher half of vector iteratively.
2006           // vtmp1 = vsrc[8:15]
2007           ins(vtmp1, D, vsrc, 0, 1);
2008           // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
2009           mulv(vtmp1, T8B, vtmp1, vsrc);
2010           // vtmp2 = vtmp1[4:7]
2011           ins(vtmp2, S, vtmp1, 0, 1);
2012           // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
2013           mulv(vtmp1, T8B, vtmp2, vtmp1);
2014         } else {
2015           ins(vtmp1, S, vsrc, 0, 1);
2016           mulv(vtmp1, T8B, vtmp1, vsrc);
2017         }
2018         // vtmp2 = vtmp1[2:3]
2019         ins(vtmp2, H, vtmp1, 0, 1);
2020         // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
2021         mulv(vtmp2, T8B, vtmp2, vtmp1);
2022         // dst = vtmp2[0] * isrc * vtmp2[1]
2023         umov(rscratch1, vtmp2, B, 0);
2024         mulw(dst, rscratch1, isrc);
2025         sxtb(dst, dst);
2026         umov(rscratch1, vtmp2, B, 1);
2027         mulw(dst, rscratch1, dst);
2028         sxtb(dst, dst);
2029         break;
2030       case T_SHORT:
2031         if (isQ) {
2032           ins(vtmp2, D, vsrc, 0, 1);
2033           mulv(vtmp2, T4H, vtmp2, vsrc);
2034           ins(vtmp1, S, vtmp2, 0, 1);
2035           mulv(vtmp1, T4H, vtmp1, vtmp2);
2036         } else {
2037           ins(vtmp1, S, vsrc, 0, 1);
2038           mulv(vtmp1, T4H, vtmp1, vsrc);
2039         }
2040         umov(rscratch1, vtmp1, H, 0);
2041         mulw(dst, rscratch1, isrc);
2042         sxth(dst, dst);
2043         umov(rscratch1, vtmp1, H, 1);
2044         mulw(dst, rscratch1, dst);
2045         sxth(dst, dst);
2046         break;
2047       case T_INT:
2048         if (isQ) {
2049           ins(vtmp1, D, vsrc, 0, 1);
2050           mulv(vtmp1, T2S, vtmp1, vsrc);
2051         } else {
2052           vtmp1 = vsrc;
2053         }
2054         umov(rscratch1, vtmp1, S, 0);
2055         mul(dst, rscratch1, isrc);
2056         umov(rscratch1, vtmp1, S, 1);
2057         mul(dst, rscratch1, dst);
2058         break;
2059       case T_LONG:
2060         umov(rscratch1, vsrc, D, 0);
2061         mul(dst, isrc, rscratch1);
2062         umov(rscratch1, vsrc, D, 1);
2063         mul(dst, dst, rscratch1);
2064         break;
2065       default:
2066         assert(false, "unsupported");
2067         ShouldNotReachHere();
2068     }
2069   BLOCK_COMMENT("} neon_reduce_mul_integral");
2070 }
2071 
2072 // Vector reduction multiply for floating-point type with ASIMD instructions.
2073 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
2074                                            FloatRegister fsrc, FloatRegister vsrc,
2075                                            unsigned vector_length_in_bytes,
2076                                            FloatRegister vtmp) {
2077   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2078   bool isQ = vector_length_in_bytes == 16;
2079 
2080   BLOCK_COMMENT("neon_reduce_mul_fp {");
2081     switch(bt) {
2082       case T_FLOAT:
2083         fmuls(dst, fsrc, vsrc);
2084         ins(vtmp, S, vsrc, 0, 1);
2085         fmuls(dst, dst, vtmp);
2086         if (isQ) {
2087           ins(vtmp, S, vsrc, 0, 2);
2088           fmuls(dst, dst, vtmp);
2089           ins(vtmp, S, vsrc, 0, 3);
2090           fmuls(dst, dst, vtmp);
2091          }
2092         break;
2093       case T_DOUBLE:
2094         assert(isQ, "unsupported");
2095         fmuld(dst, fsrc, vsrc);
2096         ins(vtmp, D, vsrc, 0, 1);
2097         fmuld(dst, dst, vtmp);
2098         break;
2099       default:
2100         assert(false, "unsupported");
2101         ShouldNotReachHere();
2102     }
2103   BLOCK_COMMENT("} neon_reduce_mul_fp");
2104 }
2105 
2106 // Helper to select logical instruction
2107 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
2108                                                    Register Rn, Register Rm,
2109                                                    enum shift_kind kind, unsigned shift) {
2110   switch(opc) {
2111     case Op_AndReductionV:
2112       is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
2113       break;
2114     case Op_OrReductionV:
2115       is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
2116       break;
2117     case Op_XorReductionV:
2118       is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
2119       break;
2120     default:
2121       assert(false, "unsupported");
2122       ShouldNotReachHere();
2123   }
2124 }
2125 
2126 // Vector reduction logical operations And, Or, Xor
2127 // Clobbers: rscratch1
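     // The two 64-bit (or 32-bit) halves of the vector are first combined into one
     // general purpose register; that value is then folded in halves with the same
     // logical operation (shifted right by 32, 16 and 8 bits as the element size
     // requires) until its lowest element holds the reduction of all lanes, which is
     // finally combined with isrc and sign-extended where the element type needs it.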
2128 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
2129                                             Register isrc, FloatRegister vsrc,
2130                                             unsigned vector_length_in_bytes) {
2131   assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
2132          "unsupported");
2133   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2134   assert_different_registers(dst, isrc);
2135   bool isQ = vector_length_in_bytes == 16;
2136 
2137   BLOCK_COMMENT("neon_reduce_logical {");
2138     umov(rscratch1, vsrc, isQ ? D : S, 0);
2139     umov(dst, vsrc, isQ ? D : S, 1);
2140     neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
2141     switch(bt) {
2142       case T_BYTE:
2143         if (isQ) {
2144           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2145         }
2146         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
2147         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
2148         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2149         sxtb(dst, dst);
2150         break;
2151       case T_SHORT:
2152         if (isQ) {
2153           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2154         }
2155         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
2156         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2157         sxth(dst, dst);
2158         break;
2159       case T_INT:
2160         if (isQ) {
2161           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2162         }
2163         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2164         break;
2165       case T_LONG:
2166         assert(isQ, "unsupported");
2167         neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
2168         break;
2169       default:
2170         assert(false, "unsupported");
2171         ShouldNotReachHere();
2172     }
2173   BLOCK_COMMENT("} neon_reduce_logical");
2174 }
2175 
2176 // Vector reduction min/max for integral type with ASIMD instructions.
2177 // Note: vtmp is not used and expected to be fnoreg for T_LONG case.
2178 // Clobbers: rscratch1, rflags
2179 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
2180                                                     Register isrc, FloatRegister vsrc,
2181                                                     unsigned vector_length_in_bytes,
2182                                                     FloatRegister vtmp) {
2183   assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
2184   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2185   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
2186   assert_different_registers(dst, isrc);
2187   bool isQ = vector_length_in_bytes == 16;
2188   bool is_min = opc == Op_MinReductionV;
2189 
2190   BLOCK_COMMENT("neon_reduce_minmax_integral {");
2191     if (bt == T_LONG) {
2192       assert(vtmp == fnoreg, "should be");
2193       assert(isQ, "should be");
2194       umov(rscratch1, vsrc, D, 0);
2195       cmp(isrc, rscratch1);
2196       csel(dst, isrc, rscratch1, is_min ? LT : GT);
2197       umov(rscratch1, vsrc, D, 1);
2198       cmp(dst, rscratch1);
2199       csel(dst, dst, rscratch1, is_min ? LT : GT);
2200     } else {
2201       SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
2202       if (size == T2S) {
2203         is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
2204       } else {
2205         is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
2206       }
2207       if (bt == T_INT) {
2208         umov(dst, vtmp, S, 0);
2209       } else {
2210         smov(dst, vtmp, elemType_to_regVariant(bt), 0);
2211       }
2212       cmpw(dst, isrc);
2213       cselw(dst, dst, isrc, is_min ? LT : GT);
2214     }
2215   BLOCK_COMMENT("} neon_reduce_minmax_integral");
2216 }
2217 
2218 // Vector reduction for integral type with SVE instruction.
2219 // Supported operations are Add, And, Or, Xor, Max, Min.
2220 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
2221 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
2222                                             FloatRegister src2, PRegister pg, FloatRegister tmp) {
2223   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2224   assert(pg->is_governing(), "This register has to be a governing predicate register");
2225   assert_different_registers(src1, dst);
2226   // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
2227   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2228   switch (opc) {
2229     case Op_AddReductionVI: {
2230       sve_uaddv(tmp, size, pg, src2);
2231       if (bt == T_BYTE) {
2232         smov(dst, tmp, size, 0);
2233         addw(dst, src1, dst, ext::sxtb);
2234       } else if (bt == T_SHORT) {
2235         smov(dst, tmp, size, 0);
2236         addw(dst, src1, dst, ext::sxth);
2237       } else {
2238         umov(dst, tmp, size, 0);
2239         addw(dst, dst, src1);
2240       }
2241       break;
2242     }
2243     case Op_AddReductionVL: {
2244       sve_uaddv(tmp, size, pg, src2);
2245       umov(dst, tmp, size, 0);
2246       add(dst, dst, src1);
2247       break;
2248     }
2249     case Op_AndReductionV: {
2250       sve_andv(tmp, size, pg, src2);
2251       if (bt == T_INT || bt == T_LONG) {
2252         umov(dst, tmp, size, 0);
2253       } else {
2254         smov(dst, tmp, size, 0);
2255       }
2256       if (bt == T_LONG) {
2257         andr(dst, dst, src1);
2258       } else {
2259         andw(dst, dst, src1);
2260       }
2261       break;
2262     }
2263     case Op_OrReductionV: {
2264       sve_orv(tmp, size, pg, src2);
2265       if (bt == T_INT || bt == T_LONG) {
2266         umov(dst, tmp, size, 0);
2267       } else {
2268         smov(dst, tmp, size, 0);
2269       }
2270       if (bt == T_LONG) {
2271         orr(dst, dst, src1);
2272       } else {
2273         orrw(dst, dst, src1);
2274       }
2275       break;
2276     }
2277     case Op_XorReductionV: {
2278       sve_eorv(tmp, size, pg, src2);
2279       if (bt == T_INT || bt == T_LONG) {
2280         umov(dst, tmp, size, 0);
2281       } else {
2282         smov(dst, tmp, size, 0);
2283       }
2284       if (bt == T_LONG) {
2285         eor(dst, dst, src1);
2286       } else {
2287         eorw(dst, dst, src1);
2288       }
2289       break;
2290     }
2291     case Op_MaxReductionV: {
2292       sve_smaxv(tmp, size, pg, src2);
2293       if (bt == T_INT || bt == T_LONG) {
2294         umov(dst, tmp, size, 0);
2295       } else {
2296         smov(dst, tmp, size, 0);
2297       }
2298       if (bt == T_LONG) {
2299         cmp(dst, src1);
2300         csel(dst, dst, src1, Assembler::GT);
2301       } else {
2302         cmpw(dst, src1);
2303         cselw(dst, dst, src1, Assembler::GT);
2304       }
2305       break;
2306     }
2307     case Op_MinReductionV: {
2308       sve_sminv(tmp, size, pg, src2);
2309       if (bt == T_INT || bt == T_LONG) {
2310         umov(dst, tmp, size, 0);
2311       } else {
2312         smov(dst, tmp, size, 0);
2313       }
2314       if (bt == T_LONG) {
2315         cmp(dst, src1);
2316         csel(dst, dst, src1, Assembler::LT);
2317       } else {
2318         cmpw(dst, src1);
2319         cselw(dst, dst, src1, Assembler::LT);
2320       }
2321       break;
2322     }
2323     default:
2324       assert(false, "unsupported");
2325       ShouldNotReachHere();
2326   }
2327 
2328   if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
2329     if (bt == T_BYTE) {
2330       sxtb(dst, dst);
2331     } else if (bt == T_SHORT) {
2332       sxth(dst, dst);
2333     }
2334   }
2335 }
2336 
2337 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
2338 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
2339 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
2340 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
2341   uint32_t max_vector_length = Matcher::max_vector_size(bt);
2342   assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
2343 
2344   // Set all elements to false if the input "lane_cnt" is zero.
2345   if (lane_cnt == 0) {
2346     sve_pfalse(dst);
2347     return;
2348   }
2349 
2350   SIMD_RegVariant size = elemType_to_regVariant(bt);
2351   assert(size != Q, "invalid size");
2352 
2353   // Set all elements to true if "lane_cnt" equals the max lane count.
2354   if (lane_cnt == max_vector_length) {
2355     sve_ptrue(dst, size, /* ALL */ 0b11111);
2356     return;
2357   }
2358 
2359   // Fixed numbers for "ptrue".
2360   switch(lane_cnt) {
2361   case 1: /* VL1 */
2362   case 2: /* VL2 */
2363   case 3: /* VL3 */
2364   case 4: /* VL4 */
2365   case 5: /* VL5 */
2366   case 6: /* VL6 */
2367   case 7: /* VL7 */
2368   case 8: /* VL8 */
2369     sve_ptrue(dst, size, lane_cnt);
2370     return;
2371   case 16:
2372     sve_ptrue(dst, size, /* VL16 */ 0b01001);
2373     return;
2374   case 32:
2375     sve_ptrue(dst, size, /* VL32 */ 0b01010);
2376     return;
2377   case 64:
2378     sve_ptrue(dst, size, /* VL64 */ 0b01011);
2379     return;
2380   case 128:
2381     sve_ptrue(dst, size, /* VL128 */ 0b01100);
2382     return;
2383   case 256:
2384     sve_ptrue(dst, size, /* VL256 */ 0b01101);
2385     return;
2386   default:
2387     break;
2388   }
2389 
2390   // Special patterns for "ptrue".
2391   if (lane_cnt == round_down_power_of_2(max_vector_length)) {
2392     sve_ptrue(dst, size, /* POW2 */ 0b00000);
2393   } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
2394     sve_ptrue(dst, size, /* MUL4 */ 0b11101);
2395   } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
2396     sve_ptrue(dst, size, /* MUL3 */ 0b11110);
2397   } else {
2398     // Encode to "whileltw" for the remaining cases.
2399     mov(rscratch1, lane_cnt);
2400     sve_whileltw(dst, size, zr, rscratch1);
2401   }
2402 }
2403 
2404 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
2405 // Any remaining elements of dst will be filled with zero.
2406 // Clobbers: rscratch1
2407 // Preserves: src, mask
2408 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
2409                                            FloatRegister vtmp1, FloatRegister vtmp2,
2410                                            PRegister pgtmp) {
2411   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2412   assert_different_registers(dst, src, vtmp1, vtmp2);
2413   assert_different_registers(mask, pgtmp);
2414 
2415   // Example input:   src   = 8888 7777 6666 5555 4444 3333 2222 1111
2416   //                  mask  = 0001 0000 0000 0001 0001 0000 0001 0001
2417   // Expected result: dst   = 0000 0000 0000 8888 5555 4444 2222 1111
2418   sve_dup(vtmp2, H, 0);
2419 
2420   // Extend lowest half to type INT.
2421   // dst = 00004444 00003333 00002222 00001111
2422   sve_uunpklo(dst, S, src);
2423   // pgtmp = 00000001 00000000 00000001 00000001
2424   sve_punpklo(pgtmp, mask);
2425   // Pack the active elements, now of type INT, to the right,
2426   // and fill the remaining elements with zero.
2427   // dst = 00000000 00004444 00002222 00001111
2428   sve_compact(dst, S, dst, pgtmp);
2429   // Narrow the result back to type SHORT.
2430   // dst = 0000 0000 0000 0000 0000 4444 2222 1111
2431   sve_uzp1(dst, H, dst, vtmp2);
2432   // Count the active elements of lowest half.
2433   // rscratch1 = 3
2434   sve_cntp(rscratch1, S, ptrue, pgtmp);
2435 
2436   // Repeat to the highest half.
2437   // pgtmp = 00000001 00000000 00000000 00000001
2438   sve_punpkhi(pgtmp, mask);
2439   // vtmp1 = 00008888 00007777 00006666 00005555
2440   sve_uunpkhi(vtmp1, S, src);
2441   // vtmp1 = 00000000 00000000 00008888 00005555
2442   sve_compact(vtmp1, S, vtmp1, pgtmp);
2443   // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2444   sve_uzp1(vtmp1, H, vtmp1, vtmp2);
2445 
2446   // Compressed low:   dst   = 0000 0000 0000 0000 0000 4444 2222 1111
2447   // Compressed high:  vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2448   // Left shift (cross-lane) the compressed high by TRUE_CNT lanes, where TRUE_CNT
2449   // is the number of active elements in the compressed low.
2450   neg(rscratch1, rscratch1);
2451   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2452   sve_index(vtmp2, H, rscratch1, 1);
2453   // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
2454   sve_tbl(vtmp1, H, vtmp1, vtmp2);
2455 
2456   // Combine the compressed high (after the shift) with the compressed low.
2457   // dst = 0000 0000 0000 8888 5555 4444 2222 1111
2458   sve_orr(dst, dst, vtmp1);
2459 }
2460 
2461 // Clobbers: rscratch1, rscratch2
2462 // Preserves: src, mask
2463 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
2464                                           FloatRegister vtmp1, FloatRegister vtmp2,
2465                                           FloatRegister vtmp3, FloatRegister vtmp4,
2466                                           PRegister ptmp, PRegister pgtmp) {
2467   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2468   assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
2469   assert_different_registers(mask, ptmp, pgtmp);
2470   // Example input:   src   = 88 77 66 55 44 33 22 11
2471   //                  mask  = 01 00 00 01 01 00 01 01
2472   // Expected result: dst   = 00 00 00 88 55 44 22 11
2473 
2474   sve_dup(vtmp4, B, 0);
2475   // Extend lowest half to type SHORT.
2476   // vtmp1 = 0044 0033 0022 0011
2477   sve_uunpklo(vtmp1, H, src);
2478   // ptmp = 0001 0000 0001 0001
2479   sve_punpklo(ptmp, mask);
2480   // Count the active elements of lowest half.
2481   // rscratch2 = 3
2482   sve_cntp(rscratch2, H, ptrue, ptmp);
2483   // Pack the active elements, now of type SHORT, to the right,
2484   // and fill the remaining elements with zero.
2485   // dst = 0000 0044 0022 0011
2486   sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
2487   // Narrow the result back to type BYTE.
2488   // dst = 00 00 00 00 00 44 22 11
2489   sve_uzp1(dst, B, dst, vtmp4);
2490 
2491   // Repeat to the highest half.
2492   // ptmp = 0001 0000 0000 0001
2493   sve_punpkhi(ptmp, mask);
2494   // vtmp2 = 0088 0077 0066 0055
2495   sve_uunpkhi(vtmp2, H, src);
2496   // vtmp1 = 0000 0000 0088 0055
2497   sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);
2498 
2499   sve_dup(vtmp4, B, 0);
2500   // vtmp1 = 00 00 00 00 00 00 88 55
2501   sve_uzp1(vtmp1, B, vtmp1, vtmp4);
2502 
2503   // Compressed low:   dst   = 00 00 00 00 00 44 22 11
2504   // Compressed high:  vtmp1 = 00 00 00 00 00 00 88 55
2505   // Left shift (cross-lane) the compressed high by TRUE_CNT lanes, where TRUE_CNT
2506   // is the number of active elements in the compressed low.
2507   neg(rscratch2, rscratch2);
2508   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2509   sve_index(vtmp2, B, rscratch2, 1);
2510   // vtmp1 = 00 00 00 88 55 00 00 00
2511   sve_tbl(vtmp1, B, vtmp1, vtmp2);
2512   // Combine the compressed high (after the shift) with the compressed low.
2513   // dst = 00 00 00 88 55 44 22 11
2514   sve_orr(dst, dst, vtmp1);
2515 }
2516 
2517 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2518   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2519   SIMD_Arrangement size = isQ ? T16B : T8B;
2520   if (bt == T_BYTE) {
2521     rbit(dst, size, src);
2522   } else {
2523     neon_reverse_bytes(dst, src, bt, isQ);
2524     rbit(dst, size, dst);
2525   }
2526 }
2527 
2528 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2529   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2530   SIMD_Arrangement size = isQ ? T16B : T8B;
2531   switch (bt) {
2532     case T_BYTE:
2533       if (dst != src) {
2534         orr(dst, size, src, src);
2535       }
2536       break;
2537     case T_SHORT:
2538       rev16(dst, size, src);
2539       break;
2540     case T_INT:
2541       rev32(dst, size, src);
2542       break;
2543     case T_LONG:
2544       rev64(dst, size, src);
2545       break;
2546     default:
2547       assert(false, "unsupported");
2548       ShouldNotReachHere();
2549   }
2550 }
2551 
2552 // Extract a scalar element from an SVE vector at position 'idx'.
2553 // The input elements in src are expected to be of integral type.
2554 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2555                                              int idx, FloatRegister vtmp) {
2556   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2557   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2558   if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
2559     if (bt == T_INT || bt == T_LONG) {
2560       umov(dst, src, size, idx);
2561     } else {
2562       smov(dst, src, size, idx);
2563     }
2564   } else {
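         // The element lies beyond the 128 bits reachable by umov/smov, so shift it
         // down to lane 0 with sve_ext first. sve_ext takes a byte offset, and the
         // SIMD_RegVariant value is the log2 of the element size in bytes, so
         // (idx << size) equals idx * element_size_in_bytes.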
2565     sve_orr(vtmp, src, src);
2566     sve_ext(vtmp, vtmp, idx << size);
2567     if (bt == T_INT || bt == T_LONG) {
2568       umov(dst, vtmp, size, 0);
2569     } else {
2570       smov(dst, vtmp, size, 0);
2571     }
2572   }
2573 }
2574 
2575 // java.lang.Math::round intrinsics
2576 
2577 // Clobbers: rscratch1, rflags
2578 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2579                                           FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
2580   assert_different_registers(tmp1, tmp2, tmp3, src, dst);
2581   switch (T) {
2582     case T2S:
2583     case T4S:
2584       fmovs(tmp1, T, 0.5f);
2585       mov(rscratch1, jint_cast(0x1.0p23f));
2586       break;
2587     case T2D:
2588       fmovd(tmp1, T, 0.5);
2589       mov(rscratch1, julong_cast(0x1.0p52));
2590       break;
2591     default:
2592       assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2593   }
2594   fadd(tmp1, T, tmp1, src);
2595   fcvtms(tmp1, T, tmp1);
2596   // tmp1 = floor(src + 0.5, ties to even)
2597 
2598   fcvtas(dst, T, src);
2599   // dst = round(src), ties to away
2600 
2601   fneg(tmp3, T, src);
2602   dup(tmp2, T, rscratch1);
2603   cm(HS, tmp3, T, tmp3, tmp2);
2604   // tmp3 is now a set of flags
2605 
2606   bif(dst, T16B, tmp1, tmp3);
2607   // result in dst
2608 }
2609 
2610 // Clobbers: rscratch1, rflags
2611 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2612                                          FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
2613   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2614   assert_different_registers(tmp1, tmp2, src, dst);
2615 
2616   switch (T) {
2617     case S:
2618       mov(rscratch1, jint_cast(0x1.0p23f));
2619       break;
2620     case D:
2621       mov(rscratch1, julong_cast(0x1.0p52));
2622       break;
2623     default:
2624       assert(T == S || T == D, "invalid register variant");
2625   }
2626 
2627   sve_frinta(dst, T, ptrue, src);
2628   // dst = round(src), ties to away
2629 
2630   Label none;
2631 
2632   sve_fneg(tmp1, T, ptrue, src);
2633   sve_dup(tmp2, T, rscratch1);
2634   sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
2635   br(EQ, none);
2636   {
2637     sve_cpy(tmp1, T, pgtmp, 0.5);
2638     sve_fadd(tmp1, T, pgtmp, src);
2639     sve_frintm(dst, T, pgtmp, tmp1);
2640     // dst = floor(src + 0.5, ties to even)
2641   }
2642   bind(none);
2643 
2644   sve_fcvtzs(dst, T, ptrue, dst, T);
2645   // result in dst
2646 }
2647 
2648 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2649                                            FloatRegister one, SIMD_Arrangement T) {
2650   assert_different_registers(dst, src, zero, one);
2651   assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
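       // facgt yields an all-ones lane for |src| > 0 and an all-zero lane for +-0.0 and
       // NaN; after ushr the mask covers every bit except the sign bit. bsl then takes
       // the magnitude of 'one' and the sign bit of 'src', producing +-1.0, while +-0.0
       // and NaN lanes keep 'src' unchanged.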
2652 
2653   facgt(dst, T, src, zero);
2654   ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
2655   bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2656 }
2657 
2658 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2659                                           FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
2660     assert_different_registers(dst, src, zero, one, vtmp);
2661     assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2662 
2663     sve_orr(vtmp, src, src);
2664     sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise
2665     switch (T) {
2666     case S:
2667       sve_and(vtmp, T, min_jint); // Extract the sign bit of the float value in every lane of src
2668       sve_orr(vtmp, T, jint_cast(1.0)); // OR it with the bit pattern of +1.0, so the final result is
2669                                         // +1.0 or -1.0 depending on the sign of the float value
2670       break;
2671     case D:
2672       sve_and(vtmp, T, min_jlong);
2673       sve_orr(vtmp, T, jlong_cast(1.0));
2674       break;
2675     default:
2676       assert(false, "unsupported");
2677       ShouldNotReachHere();
2678     }
2679     sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
2680                                        // Result in dst
2681 }
2682 
2683 bool C2_MacroAssembler::in_scratch_emit_size() {
2684   if (ciEnv::current()->task() != nullptr) {
2685     PhaseOutput* phase_output = Compile::current()->output();
2686     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2687       return true;
2688     }
2689   }
2690   return MacroAssembler::in_scratch_emit_size();
2691 }