/*
 * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/matcher.hpp"
#include "opto/output.hpp"
#include "opto/subnode.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

// jdk.internal.util.ArraysSupport.vectorizedHashCode
address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
                                           FloatRegister vdata0, FloatRegister vdata1,
                                           FloatRegister vdata2, FloatRegister vdata3,
                                           FloatRegister vmul0, FloatRegister vmul1,
                                           FloatRegister vmul2, FloatRegister vmul3,
                                           FloatRegister vpow, FloatRegister vpowm,
                                           BasicType eltype) {
  ARRAYS_HASHCODE_REGISTERS;
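
  // Scalar reference for what this intrinsic computes (the semantics of
  // jdk.internal.util.ArraysSupport.vectorizedHashCode; 'result' is expected
  // to hold the initial value on entry):
  //
  //    int hash(T *ary, int cnt, int result) {
  //       for (int i = 0; i < cnt; ++i)
  //          result = 31 * result + (int)ary[i];
  //       return result;
  //    }
  //
  // The scalar loop below evaluates this directly; the large-array stub is
  // assumed to evaluate the same polynomial using per-lane powers of 31
  // (the vpow/vpowm inputs).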

  Register tmp1 = rscratch1, tmp2 = rscratch2;

  Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE;

  // Vectorization factor. Number of array elements loaded into one SIMD&FP register by the
  // stubs. We use 8H load arrangements for chars and shorts and 8B for booleans and bytes.
  // It's possible to use 4H for chars and shorts instead, but using 8H gives better performance.
  const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8
                    : eltype == T_CHAR || eltype == T_SHORT ? 8
                    : eltype == T_INT                       ? 4
                                                            : 0;
  guarantee(vf, "unsupported eltype");

  // Unroll factor for the scalar loop below. The value is chosen based on performance analysis.
  const size_t unroll_factor = 4;

  switch (eltype) {
  case T_BOOLEAN:
    BLOCK_COMMENT("arrays_hashcode(unsigned byte) {");
    break;
  case T_CHAR:
    BLOCK_COMMENT("arrays_hashcode(char) {");
    break;
  case T_BYTE:
    BLOCK_COMMENT("arrays_hashcode(byte) {");
    break;
  case T_SHORT:
    BLOCK_COMMENT("arrays_hashcode(short) {");
    break;
  case T_INT:
    BLOCK_COMMENT("arrays_hashcode(int) {");
    break;
  default:
    ShouldNotReachHere();
  }

  // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop
  // implemented by the stub executes just once. Call the stub only if at least two iterations will
  // be executed.
  const size_t large_threshold = eltype == T_INT ? vf * 2 : vf;
  cmpw(cnt, large_threshold);
  br(Assembler::HS, LARGE);

  bind(TAIL);

  // The andr performs cnt % uf where uf = unroll_factor. The subtract shifted by 3 offsets past
  // uf - (cnt % uf) pairs of load + madd insns, i.e. it only executes cnt % uf load + madd pairs.
  // Iteration eats up the remainder, uf elements at a time.
  assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC");
  andr(tmp2, cnt, unroll_factor - 1);
  adr(tmp1, BR_BASE);
  sub(tmp1, tmp1, tmp2, ext::sxtw, 3);
  movw(tmp2, 0x1f);
  br(tmp1);
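  // Example: with unroll_factor == 4 and cnt % 4 == 3, tmp1 ends up 3 * 8
  // bytes before BR_BASE, so exactly three load + madd pairs (two 4-byte
  // insns each, hence the shift by 3) execute before falling into BR_BASE.
  // tmp2 now holds 31 (0x1f), the hash multiplier used by maddw below.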

  bind(LOOP);
  for (size_t i = 0; i < unroll_factor; ++i) {
    load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype);
    maddw(result, result, tmp2, tmp1);
  }
  bind(BR_BASE);
  subsw(cnt, cnt, unroll_factor);
  br(Assembler::HS, LOOP);

  b(DONE);

  bind(LARGE);

  RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype));
  assert(stub.target() != nullptr, "array_hashcode stub has not been generated");
  address tpc = trampoline_call(stub);
  if (tpc == nullptr) {
    DEBUG_ONLY(reset_labels(TAIL, BR_BASE));
    postcond(pc() == badAddress);
    return nullptr;
  }

  bind(DONE);

  BLOCK_COMMENT("} // arrays_hashcode");

  postcond(pc() != badAddress);
  return pc();
}

void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg,
                                  Register tmp2Reg, Register tmp3Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;

  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  assert_different_registers(oop, box, tmp, disp_hdr, rscratch2);

  // Load markWord from object into displaced_header.
  ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmp, oop);
    ldrb(tmp, Address(tmp, Klass::misc_flags_offset()));
    tst(tmp, KlassFlags::_misc_is_value_based_class);
    br(Assembler::NE, cont);
  }

  // Check for existing monitor
  tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Set tmp to be (markWord of object | UNLOCK_VALUE).
    orr(tmp, disp_hdr, markWord::unlocked_value);

    // Initialize the box. (Must happen before we update the object mark!)
    str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // Compare object markWord with an unlocked value (tmp) and if
    // equal exchange the stack address of our box with object markWord.
    // On failure disp_hdr contains the possibly locked markWord.
    cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true,
            /*release*/ true, /*weak*/ false, disp_hdr);
    br(Assembler::EQ, cont);

    assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

    // If the compare-and-exchange succeeded, then we found an unlocked
    // object, have now locked it, and will continue at label cont.

    // Check if the owner is self by comparing the value in the
    // markWord of object (disp_hdr) with the stack pointer.
    mov(rscratch1, sp);
    sub(disp_hdr, disp_hdr, rscratch1);
    mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place));
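    // If this thread already owns the lock, disp_hdr (mark - sp) is a small
    // offset into our own stack, so masking it with ~(page_size - 1) plus the
    // lock bits yields zero; anything else leaves some high or low bits set.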
    // If condition is true we are done (will continue at label cont) and
    // hence we can store 0 as the displaced header in the box, which
    // indicates that it is a recursive lock.
    ands(tmp/*==0?*/, disp_hdr, tmp);   // Sets flags for result
    str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
    b(cont);
  }

  // Handle existing monitor.
  bind(object_has_monitor);

  // Try to CAS owner (no owner => current thread's _monitor_owner_id).
  ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
  add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value));
  cmpxchg(tmp, zr, rscratch2, Assembler::xword, /*acquire*/ true,
          /*release*/ true, /*weak*/ false, tmp3Reg); // Sets flags for result

  // Store a non-null value into the box to avoid looking like a re-entrant
  // lock. The fast-path monitor unlock code checks for
  // markWord::monitor_value so use markWord::unused_mark which has the
  // relevant bit set, and also matches ObjectSynchronizer::enter.
  mov(tmp, (address)markWord::unused_mark().value());
  str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

  br(Assembler::EQ, cont); // CAS success means locking succeeded

  cmp(tmp3Reg, rscratch2);
  br(Assembler::NE, cont); // Check for recursive locking

  // Recursive lock case
  increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1);
  // flag == EQ still from the cmp above, checking if this is a reentrant lock

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  if (LockingMode == LM_LEGACY) {
    inc_held_monitor_count(rscratch1);
  }

  bind(no_count);
}

void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg,
                                    Register tmp2Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register owner_addr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;
  Label unlocked;

  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert_different_registers(oop, box, tmp, disp_hdr);

  if (LockingMode == LM_LEGACY) {
    // Find the lock address and load the displaced header from the stack.
    ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // If the displaced header is 0, we have a recursive unlock.
    cmp(disp_hdr, zr);
    br(Assembler::EQ, cont);
  }

  // Handle existing monitor.
  ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
  tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Check if it is still a lightweight lock: this is true if we see
    // the stack address of the basicLock in the markWord of the object.

    cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false,
            /*release*/ true, /*weak*/ false, tmp);
    b(cont);
  }

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // Handle existing monitor.
  bind(object_has_monitor);
  STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
  add(tmp, tmp, -(int)markWord::monitor_value); // monitor

  ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));

  Label notRecursive;
  cbz(disp_hdr, notRecursive);

  // Recursive lock
  sub(disp_hdr, disp_hdr, 1u);
  str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
  cmp(disp_hdr, disp_hdr); // Sets flags for result
  b(cont);

  bind(notRecursive);

  // Compute owner address.
  lea(owner_addr, Address(tmp, ObjectMonitor::owner_offset()));

  // Set owner to null.
  // Release to satisfy the JMM
  stlr(zr, owner_addr);
  // We need a full fence after clearing owner to avoid stranding.
  // StoreLoad achieves this.
  membar(StoreLoad);
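  // ("Stranding" here: without the full fence, the owner-clearing store could
  // be ordered after the list loads below, so a thread that enqueued itself
  // concurrently might be observed by nobody and never get woken.)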

  // Check if the entry lists are empty (EntryList first - by convention).
  ldr(rscratch1, Address(tmp, ObjectMonitor::EntryList_offset()));
  ldr(tmpReg, Address(tmp, ObjectMonitor::cxq_offset()));
  orr(rscratch1, rscratch1, tmpReg);
  cmp(rscratch1, zr);
  br(Assembler::EQ, cont);     // If so we are done.

  // Check if there is a successor.
  ldr(rscratch1, Address(tmp, ObjectMonitor::succ_offset()));
  cmp(rscratch1, zr);
  br(Assembler::NE, unlocked); // If so we are done.

  // Save the monitor pointer in the current thread, so we can try to
  // reacquire the lock in SharedRuntime::monitor_exit_helper().
  str(tmp, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));

  cmp(zr, rthread); // Set Flag to NE => slow path
  b(cont);

  bind(unlocked);
  cmp(zr, zr); // Set Flag to EQ => fast path

  // Intentional fall-through

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  if (LockingMode == LM_LEGACY) {
    dec_held_monitor_count(rscratch1);
  }

  bind(no_count);
}

void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register t1,
                                              Register t2, Register t3) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert_different_registers(obj, box, t1, t2, t3, rscratch2);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. MUST branch here with flag == EQ
  Label locked;
  // Finish fast lock unsuccessfully. MUST branch here with flag == NE
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds.
    str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(t1, obj);
    ldrb(t1, Address(t1, Klass::misc_flags_offset()));
    tst(t1, KlassFlags::_misc_is_value_based_class);
    br(Assembler::NE, slow_path);
  }

  const Register t1_mark = t1;
  const Register t3_t = t3;

  { // Lightweight locking

    // Push lock to the lock stack and finish successfully. MUST branch here with flag == EQ
    Label push;

    const Register t2_top = t2;

    // Check if lock-stack is full.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
    br(Assembler::GT, slow_path);

    // Check if recursive.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, push);

    // Relaxed normal load to check for monitor. Optimization for monitor case.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);

    // Not inflated
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");

    // Try to lock. Transition lock-bits 0b01 => 0b00
    orr(t1_mark, t1_mark, markWord::unlocked_value);
    eor(t3_t, t1_mark, markWord::unlocked_value);
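    // E.g. if the mark is 0b...01 (unlocked), the orr leaves it unchanged and
    // the eor clears the bit, so we CAS 0b...01 -> 0b...00 (locked). If the
    // mark was not unlocked, the manufactured 'expected' value cannot match
    // the actual markWord and the CAS fails.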
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
    br(Assembler::NE, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    str(obj, Address(rthread, t2_top));
    addw(t2_top, t2_top, oopSize);
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register t1_monitor = t1;

    if (!UseObjectMonitorTable) {
      assert(t1_monitor == t1_mark, "should be the same here");
    } else {
      Label monitor_found;

      // Load cache address
      lea(t3_t, Address(rthread, JavaThread::om_cache_oops_offset()));

      const int num_unrolled = 2;
      for (int i = 0; i < num_unrolled; i++) {
        ldr(t1, Address(t3_t));
        cmp(obj, t1);
        br(Assembler::EQ, monitor_found);
        increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
      }

      Label loop;

      // Search for obj in cache.
      bind(loop);

      // Check for match.
      ldr(t1, Address(t3_t));
      cmp(obj, t1);
      br(Assembler::EQ, monitor_found);

      // Search until null encountered, guaranteed _null_sentinel at end.
      increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
      cbnz(t1, loop);
      // Cache Miss, NE set from cmp above, cbnz does not set flags
      b(slow_path);

      bind(monitor_found);
      ldr(t1_monitor, Address(t3_t, OMCache::oop_to_monitor_difference()));
    }

    const Register t2_owner_addr = t2;
    const Register t3_owner = t3;
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
    const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);

    Label monitor_locked;

    // Compute owner address.
    lea(t2_owner_addr, owner_address);

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
    cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, /*acquire*/ true,
            /*release*/ false, /*weak*/ false, t3_owner);
    br(Assembler::EQ, monitor_locked);

    // Check if recursive.
    cmp(t3_owner, rscratch2);
    br(Assembler::NE, slow_path);

    // Recursive.
    increment(recursions_address, 1);

    bind(monitor_locked);
    if (UseObjectMonitorTable) {
      str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
    }
  }

  bind(locked);

#ifdef ASSERT
  // Check that locked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Lock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Lock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}

void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register box, Register t1,
                                                Register t2, Register t3) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert_different_registers(obj, box, t1, t2, t3);

  // Handle inflated monitor.
  Label inflated, inflated_load_mark;
  // Finish fast unlock successfully. MUST branch here with flag == EQ
  Label unlocked;
  // Finish fast unlock unsuccessfully. MUST branch here with flag == NE
  Label slow_path;

  const Register t1_mark = t1;
  const Register t2_top = t2;
  const Register t3_t = t3;

  { // Lightweight unlock

    Label push_and_slow_path;

    // Check if obj is top of lock-stack.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    subw(t2_top, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    // Top of lock stack was not obj. Must be monitor.
    br(Assembler::NE, inflated_load_mark);

    // Pop lock-stack.
    DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));

    // Check if recursive.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, unlocked);

    // Not recursive.
    // Load Mark.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Check header for monitor (0b10).
    // Because we got here by popping (meaning we pushed when locking),
    // there will be no monitor in the box. So we need to push back the obj
    // so that the runtime can fix any potential anonymous owner.
    tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);

    // Try to unlock. Transition lock bits 0b00 => 0b01
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
    orr(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
    br(Assembler::EQ, unlocked);

    bind(push_and_slow_path);
    // Compare and exchange failed.
    // Restore lock-stack and handle the unlock in runtime.
    DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
    addw(t2_top, t2_top, oopSize);
    str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(slow_path);
  }


  { // Handle inflated monitor.
    bind(inflated_load_mark);
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
#ifdef ASSERT
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

#ifdef ASSERT
    Label check_done;
    subw(t2_top, t2_top, oopSize);
    cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
    br(Assembler::LT, check_done);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    br(Assembler::NE, inflated);
    stop("Fast Unlock lock on stack");
    bind(check_done);
#endif

    const Register t1_monitor = t1;

    if (!UseObjectMonitorTable) {
      assert(t1_monitor == t1_mark, "should be the same here");

      // Untag the monitor.
      add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
    } else {
      ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
      cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
      br(Assembler::LO, slow_path);
    }

    const Register t2_recursions = t2;
    Label not_recursive;

    // Check if recursive.
    ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
    cbz(t2_recursions, not_recursive);

    // Recursive unlock.
    sub(t2_recursions, t2_recursions, 1u);
    str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
    // Set flag == EQ
    cmp(t2_recursions, t2_recursions);
    b(unlocked);

    bind(not_recursive);

    const Register t2_owner_addr = t2;

    // Compute owner address.
    lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));

    // Set owner to null.
    // Release to satisfy the JMM
    stlr(zr, t2_owner_addr);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry lists are empty (EntryList first - by convention).
    ldr(rscratch1, Address(t1_monitor, ObjectMonitor::EntryList_offset()));
    ldr(t3_t, Address(t1_monitor, ObjectMonitor::cxq_offset()));
    orr(rscratch1, rscratch1, t3_t);
    cmp(rscratch1, zr);
    br(Assembler::EQ, unlocked);  // If so we are done.

    // Check if there is a successor.
    ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
    cmp(rscratch1, zr);
    br(Assembler::NE, unlocked);  // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));

    cmp(zr, rthread); // Set Flag to NE => slow path
    b(slow_path);
  }

  bind(unlocked);
  cmp(zr, zr); // Set Flags to EQ => fast path

#ifdef ASSERT
  // Check that unlocked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Unlock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Unlock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}

// Search for str1 in str2 and return index or -1
// Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer-Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

// The Boyer-Moore algorithm is based on the description here:
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// This describes an algorithm with 2 shift rules: the 'Bad Character' rule
// and the 'Good Suffix' rule.
//
// These rules are essentially heuristics for how far we can shift the
// pattern along the search string.
//
// The implementation here uses the 'Bad Character' rule only because of the
// complexity of initialisation for the 'Good Suffix' rule.
//
// This is also known as the Boyer-Moore-Horspool algorithm:
//
// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
//
// This particular implementation has a few Java-specific optimizations.
//
// #define ASIZE 256
//
//    int bm(unsigned char *x, int m, unsigned char *y, int n) {
//       int i, j;
//       unsigned c;
//       unsigned char bc[ASIZE];
//
//       /* Preprocessing */
//       for (i = 0; i < ASIZE; ++i)
//          bc[i] = m;
//       for (i = 0; i < m - 1; ) {
//          c = x[i];
//          ++i;
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef PATTERN_STRING_IS_LATIN1
//          bc[c] = m - i;
//          #else
//          if (c < ASIZE) bc[c] = m - i;
//          #endif
//       }
//
//       /* Searching */
//       j = 0;
//       while (j <= n - m) {
//          c = y[j + m - 1];
//          if (x[m-1] == c) {
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//            if (i < 0) return j;
//          }
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef SOURCE_STRING_IS_LATIN1
//          // LL case: (c < 256) always true. Remove branch
//          j += bc[y[j+m-1]];
//          #endif
//          #ifndef PATTERN_STRING_IS_UTF
//          // UU case: need if (c < ASIZE) check. Skip 1 character if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += 1
//          #endif
//          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
//          // UL case: need if (c < ASIZE) check. Skip <pattern length> if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += m
//          #endif
//       }
//       return -1;
//    }

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >= 8, so we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed (8LL or 4UU) and half a register
    // for the UL case. We'll re-read the last character in the inner pre-loop
    // code to have a single outer pre-loop load.
    const int firstStep = isL ? 7 : 3;

    const int ASIZE = 256;
    const int STORED_BYTES = 32; // amount of bytes stored per instruction
    sub(sp, sp, ASIZE);
    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
    mov(ch1, sp);
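    // Fill the 256-byte bc[] table on the stack with the pattern length:
    // v0 was pre-filled with cnt1 by the dup above, and each stpq below
    // writes 32 bytes of that default skip distance per iteration.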
    BIND(BM_INIT_LOOP);
      stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
      subs(tmp5, tmp5, 1);
      br(GT, BM_INIT_LOOP);

      sub(cnt1tmp, cnt1, 1);
      mov(tmp5, str2);
      add(str2end, str2, result_tmp, LSL, str2_chr_shift);
      sub(ch2, cnt1, 1);
      mov(tmp3, str1);
    BIND(BCLOOP);
      (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
      if (!str1_isL) {
        subs(zr, ch1, ASIZE);
        br(HS, BCSKIP);
      }
      strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
      subs(ch2, ch2, 1);
      br(GT, BCLOOP);

      add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
      if (str1_isL == str2_isL) {
        // load last 8 bytes (8LL/4UU symbols)
        ldr(tmp6, Address(tmp6, -wordSize));
      } else {
        ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes (4 symbols)
        // convert Latin1 to UTF. We'll have to wait until load completed, but
        // it's still faster than per-character loads+checks
        lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
        ubfx(ch1, tmp6, 8, 8); // str1[N-2]
        ubfx(ch2, tmp6, 16, 8); // str1[N-3]
        andr(tmp6, tmp6, 0xFF); // str1[N-4]
        orr(ch2, ch1, ch2, LSL, 16);
        orr(tmp6, tmp6, tmp3, LSL, 48);
        orr(tmp6, tmp6, ch2, LSL, 16);
      }
    BIND(BMLOOPSTR2);
      (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
      sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
      if (str1_isL == str2_isL) {
        // re-init tmp3. It's for free because it's executed in parallel with
        // the load above. The alternative is to initialize it before the loop,
        // but that would affect performance on in-order systems with 2 or
        // more ld/st pipelines.
        lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
      }
      if (!isL) { // UU/UL case
        lsl(ch2, cnt1tmp, 1); // offset in bytes
      }
      cmp(tmp3, skipch);
      br(NE, BMSKIP);
      ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
      mov(ch1, tmp6);
      if (isL) {
        b(BMLOOPSTR1_AFTER_LOAD);
      } else {
        sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
        b(BMLOOPSTR1_CMP);
      }
    BIND(BMLOOPSTR1);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    BIND(BMLOOPSTR1_AFTER_LOAD);
      subs(cnt1tmp, cnt1tmp, 1);
      br(LT, BMLOOPSTR1_LASTCMP);
    BIND(BMLOOPSTR1_CMP);
      cmp(ch1, ch2);
      br(EQ, BMLOOPSTR1);
    BIND(BMSKIP);
      if (!isL) {
        // if we've met a UTF symbol while searching a Latin1 pattern, then we
        // can skip cnt1 symbols
        if (str1_isL != str2_isL) {
          mov(result_tmp, cnt1);
        } else {
          mov(result_tmp, 1);
        }
        subs(zr, skipch, ASIZE);
        br(HS, BMADV);
      }
      ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
      sub(cnt1tmp, cnt1, 1);
      add(str2, str2, result_tmp, LSL, str2_chr_shift);
      cmp(str2, str2end);
      br(LE, BMLOOPSTR2);
      add(sp, sp, ASIZE);
      b(NOMATCH);
    BIND(BMLOOPSTR1_LASTCMP);
      cmp(ch1, ch2);
      br(NE, BMSKIP);
    BIND(BMMATCH);
      sub(result, str2, tmp5);
      if (!str2_isL) lsr(result, result, 1);
      add(sp, sp, ASIZE);
      b(DONE);

    BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
    br(LT, LINEAR_MEDIUM);
    mov(result, zr);
    RuntimeAddress stub = nullptr;
    if (isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
      assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
    } else if (str1_isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
    } else {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
      assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
    }
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);
  }

  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    Register str2tmp = tmp2;
    Register first = tmp3;

    if (icnt1 == -1)
    {
        Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

        cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
        br(LT, DOSHORT);
      BIND(LINEAR_MEDIUM);
        (this->*str1_load_1chr)(first, Address(str1));
        lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
        sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(FIRST_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmp(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        adds(cnt1tmp, cnt1_neg, str1_chr_size);
        add(cnt2tmp, cnt2_neg, str2_chr_size);
        br(GE, MATCH);

      BIND(STR1_NEXT);
        (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        adds(cnt1tmp, cnt1tmp, str1_chr_size);
        add(cnt2tmp, cnt2tmp, str2_chr_size);
        br(LT, STR1_NEXT);
        b(MATCH);

      BIND(DOSHORT);
      if (str1_isL == str2_isL) {
        cmp(cnt1, (u1)2);
        br(LT, DO1);
        br(GT, DO3);
      }
    }

    if (icnt1 == 4) {
      Label CH1_LOOP;

        (this->*load_4chr)(ch1, str1);
        sub(result_tmp, cnt2, 4);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(CH1_LOOP);
        (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
      Label CH1_LOOP;

      BIND(DO2);
        (this->*load_2chr)(ch1, str1);
        if (icnt1 == 2) {
          sub(result_tmp, cnt2, 2);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(CH1_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

      BIND(DO3);
        (this->*load_2chr)(first, str1);
        (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
        if (icnt1 == 3) {
          sub(result_tmp, cnt2, 3);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(FIRST_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmpw(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        b(MATCH);
    }

    if (icnt1 == -1 || icnt1 == 1) {
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;

      BIND(DO1);
        (this->*str1_load_1chr)(ch1, str1);
        cmp(cnt2, (u1)8);
        br(LT, DO1_SHORT);

        sub(result_tmp, cnt2, 8/str2_chr_size);
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
        mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

        if (str2_isL) {
          orr(ch1, ch1, ch1, LSL, 8);
        }
        orr(ch1, ch1, ch1, LSL, 16);
        orr(ch1, ch1, ch1, LSL, 32);
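      // The loop below uses the classic SWAR zero-detection: with
      // x = text ^ replicated_char, (x - 0x01..01) & ~(x | 0x7f..7f) has the
      // top bit set in exactly those bytes (halfwords for UTF-16) of x that
      // are zero, i.e. at the positions where the character matched.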
      BIND(CH1_LOOP);
        ldr(ch2, Address(str2, cnt2_neg));
        eor(ch2, ch1, ch2);
        sub(tmp1, ch2, tmp3);
        orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
        bics(tmp1, tmp1, tmp2);
        br(NE, HAS_ZERO);
        adds(cnt2_neg, cnt2_neg, 8);
        br(LT, CH1_LOOP);

        cmp(cnt2_neg, (u1)8);
        mov(cnt2_neg, 0);
        br(LT, CH1_LOOP);
        b(NOMATCH);

      BIND(HAS_ZERO);
        rev(tmp1, tmp1);
        clz(tmp1, tmp1);
        add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
        b(MATCH);

      BIND(DO1_SHORT);
        mov(result_tmp, cnt2);
        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
      BIND(DO1_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmpw(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);

void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);
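
  // The loop below applies the same SWAR zero-detection trick as in
  // string_indexof above, here on 16-bit lanes.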
  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7fff7fff7fff7fff);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1, Address::uxtw(1)));
    sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
    ldrh(ch1, Address(str1, cnt1_neg));
    cmpw(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 2);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}

void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
                                                Register ch, Register result,
                                                FloatRegister ztmp1,
                                                FloatRegister ztmp2,
                                                PRegister tmp_pg,
                                                PRegister tmp_pdn, bool isL)
{
  // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
  assert(tmp_pg->is_governing(),
         "this register has to be a governing predicate register");

  Label LOOP, MATCH, DONE, NOMATCH;
  Register vec_len = rscratch1;
  Register idx = rscratch2;

  SIMD_RegVariant T = isL ? B : H;

  cbz(cnt1, NOMATCH);

  // Assign the particular char throughout the vector.
  sve_dup(ztmp2, T, ch);
  if (isL) {
    sve_cntb(vec_len);
  } else {
    sve_cnth(vec_len);
  }
  mov(idx, 0);

  // Generate a predicate to control the reading of input string.
  sve_whilelt(tmp_pg, T, idx, cnt1);

  BIND(LOOP);
    // Read a vector of 8- or 16-bit data depending on the string type. Note
    // that inactive elements indicated by the predicate register won't cause
    // a data read from memory to the destination vector.
    if (isL) {
      sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
    } else {
      sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
    }
    add(idx, idx, vec_len);

    // Perform the comparison. An element of the destination predicate is set
    // to active if the particular char is matched.
    sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);

    // Branch if the particular char is found.
    br(NE, MATCH);

    sve_whilelt(tmp_pg, T, idx, cnt1);

    // Loop back if the particular char is not found.
    br(MI, LOOP);

  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);

  BIND(MATCH);
    // Undo the index increment.
    sub(idx, idx, vec_len);

    // Crop the vector to find its location.
    sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
    add(result, idx, -1);
    sve_incp(result, T, tmp_pdn);
  BIND(DONE);
}

void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
                                             Register ch, Register result,
                                             Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)8);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 8);
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 8);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);

  mov(tmp3, 0x0101010101010101);
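
  // Same SWAR zero-byte trick once more, this time on the 8-bit lanes of a
  // Latin1 string.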
  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1));
    sub(cnt1_neg, zr, cnt1);
  BIND(DO1_LOOP);
    ldrb(ch1, Address(str1, cnt1_neg));
    cmp(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 1);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg);
  BIND(DONE);
}

// Compare strings.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
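  // If one string is a prefix of the other, the signed length difference
  // already in 'result' is the final answer (String.compareTo semantics).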

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      ldr(tmp1, Address(str1));
      ldrs(vtmp, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFF);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFF);
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
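    // rev + clz yield the bit offset of the first differing byte of these
    // little-endian loads; rounding down to a multiple of 8 (Latin1) or 16
    // (UTF-16) aligns that to a character boundary, so the shifts below
    // bring the first differing characters of both words down to bit 0.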
1469     lsrv(tmp1, tmp1, rscratch2);
1470     (this->*ext_chr)(tmp1, tmp1);
1471     lsrv(tmp2, tmp2, rscratch2);
1472     (this->*ext_chr)(tmp2, tmp2);
1473     subw(result, tmp1, tmp2);
1474     b(DONE);
1475   }
1476 
1477   bind(STUB);
1478     RuntimeAddress stub = nullptr;
1479     switch(ae) {
1480       case StrIntrinsicNode::LL:
1481         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
1482         break;
1483       case StrIntrinsicNode::UU:
1484         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
1485         break;
1486       case StrIntrinsicNode::LU:
1487         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
1488         break;
1489       case StrIntrinsicNode::UL:
1490         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
1491         break;
1492       default:
1493         ShouldNotReachHere();
1494     }
1495     assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
1496     address call = trampoline_call(stub);
1497     if (call == nullptr) {
1498       DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
1499       ciEnv::current()->record_failure("CodeCache is full");
1500       return;
1501     }
1502     b(DONE);
1503 
1504   bind(SHORT_STRING);
1505   // Is the minimum length zero?
1506   cbz(cnt2, DONE);
1507   // Arrange the code so that most branches are taken while loading, and the next
1508   // characters are loaded while the previous ones are being compared.
1509   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1510   subs(cnt2, cnt2, 1);
1511   br(EQ, SHORT_LAST_INIT);
1512   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1513   b(SHORT_LOOP_START);
1514   bind(SHORT_LOOP);
1515   subs(cnt2, cnt2, 1);
1516   br(EQ, SHORT_LAST);
1517   bind(SHORT_LOOP_START);
1518   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
1519   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
1520   cmp(tmp1, cnt1);
1521   br(NE, SHORT_LOOP_TAIL);
1522   subs(cnt2, cnt2, 1);
1523   br(EQ, SHORT_LAST2);
1524   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1525   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1526   cmp(tmp2, rscratch1);
1527   br(EQ, SHORT_LOOP);
1528   sub(result, tmp2, rscratch1);
1529   b(DONE);
1530   bind(SHORT_LOOP_TAIL);
1531   sub(result, tmp1, cnt1);
1532   b(DONE);
1533   bind(SHORT_LAST2);
1534   cmp(tmp2, rscratch1);
1535   br(EQ, DONE);
1536   sub(result, tmp2, rscratch1);
1537 
1538   b(DONE);
1539   bind(SHORT_LAST_INIT);
1540   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1541   bind(SHORT_LAST);
1542   cmp(tmp1, cnt1);
1543   br(EQ, DONE);
1544   sub(result, tmp1, cnt1);
1545 
1546   bind(DONE);
1547 
1548   BLOCK_COMMENT("} string_compare");
1549 }
1550 
1551 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
1552                                      FloatRegister src2, Condition cond, bool isQ) {
1553   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1554   FloatRegister zn = src1, zm = src2;
1555   bool needs_negation = false;
1556   switch (cond) {
1557     case LT: cond = GT; zn = src2; zm = src1; break;
1558     case LE: cond = GE; zn = src2; zm = src1; break;
1559     case LO: cond = HI; zn = src2; zm = src1; break;
1560     case LS: cond = HS; zn = src2; zm = src1; break;
1561     case NE: cond = EQ; needs_negation = true; break;
1562     default:
1563       break;
1564   }
1565 
1566   if (is_floating_point_type(bt)) {
1567     fcm(cond, dst, size, zn, zm);
1568   } else {
1569     cm(cond, dst, size, zn, zm);
1570   }
1571 
1572   if (needs_negation) {
1573     notr(dst, isQ ? T16B : T8B, dst);
1574   }
1575 }
1576 
1577 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1578                                           Condition cond, bool isQ) {
1579   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1580   if (bt == T_FLOAT || bt == T_DOUBLE) {
1581     if (cond == Assembler::NE) {
1582       fcm(Assembler::EQ, dst, size, src);
1583       notr(dst, isQ ? T16B : T8B, dst);
1584     } else {
1585       fcm(cond, dst, size, src);
1586     }
1587   } else {
1588     if (cond == Assembler::NE) {
1589       cm(Assembler::EQ, dst, size, src);
1590       notr(dst, isQ ? T16B : T8B, dst);
1591     } else {
1592       cm(cond, dst, size, src);
1593     }
1594   }
1595 }
1596 
1597 // Compress the least significant bit of each byte in dst down into the lowest
1598 // byte, and clear the higher garbage bits.
1599 void C2_MacroAssembler::bytemask_compress(Register dst) {
1600   // Example input, dst = 0x01 00 00 00 01 01 00 01
1601   // The "??" bytes are garbage.
1602   orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
1603   orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x??????08 ??????0D
1604   orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x??????????????8D
1605   andr(dst, dst, 0xff);                   // dst = 0x8D
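  // For reference, a scalar sketch of the same fold (illustrative only; it
  // assumes every input byte is 0x00 or 0x01, and the helper name is
  // hypothetical):
  //
  //   uint64_t bytemask_compress_ref(uint64_t x) {
  //     x |= x >> 7;    // fold bit 0 of adjacent bytes into 2-bit pairs
  //     x |= x >> 14;   // fold pairs into 4-bit nibbles
  //     x |= x >> 28;   // fold nibbles into the low byte
  //     return x & 0xff;
  //   }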
1606 }
1607 
1608 // Pack the lowest-numbered bit of each mask element in src into a long value
1609 // in dst, covering at most the first 64 lanes.
1610 // Clobbers: rscratch1 if UseSVE == 1 or the hardware doesn't support FEAT_BITPERM.
1611 void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
1612                                          FloatRegister vtmp1, FloatRegister vtmp2) {
1613   assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1614   assert_different_registers(dst, rscratch1);
1615   assert_different_registers(vtmp1, vtmp2);
1616 
1617   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1618   // Example:   src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
1619   // Expected:  dst = 0x658D
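  // A scalar reference for the overall packing (illustrative sketch only; the
  // helper name and the bool-array view of the mask are hypothetical):
  //
  //   uint64_t vmask_tolong_ref(const bool* mask, int lane_cnt) {
  //     uint64_t dst = 0;
  //     for (int i = 0; i < lane_cnt; i++) {
  //       dst |= (mask[i] ? 1ULL : 0ULL) << i;
  //     }
  //     return dst;
  //   }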
1620 
1621   // Convert the mask into a vector of bytes, 0x01 for active lanes and 0x00 otherwise.
1622   // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
1623   sve_cpy(vtmp1, size, src, 1, false);
1624   if (bt != T_BYTE) {
1625     sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
1626   }
1627 
1628   if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
1629     // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
1630     // is to compress the significant bit of each byte in a cross-lane way. Due
1631     // to the lack of a cross-lane bit-compress instruction, we use BEXT
1632     // (bit-compress in each lane) with the biggest lane size (T = D) then
1633     // concatenate the results.
1634 
1635     // The second source input of BEXT, initialized with 0x01 in each byte.
1636     // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1637     sve_dup(vtmp2, B, 1);
1638 
1639     // BEXT vtmp1.D, vtmp1.D, vtmp2.D
1640     // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1641     // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1642     //         ---------------------------------------
1643     // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1644     sve_bext(vtmp1, D, vtmp1, vtmp2);
1645 
1646     // Concatenate the least significant 8 bits of each doubleword, and extract
1647     // the result to dst.
1648     // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1649     // dst   = 0x658D
1650     if (lane_cnt <= 8) {
1651       // No need to concatenate.
1652       umov(dst, vtmp1, B, 0);
1653     } else if (lane_cnt <= 16) {
1654       ins(vtmp1, B, vtmp1, 1, 8);
1655       umov(dst, vtmp1, H, 0);
1656     } else {
1657       // As the lane count is 64 at most, the final expected value must be in
1658       // the lowest 64 bits after narrowing vtmp1 from D to B.
1659       sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
1660       umov(dst, vtmp1, D, 0);
1661     }
1662   } else if (UseSVE > 0) {
1663     // Compress the lowest 8 bytes.
1664     fmovd(dst, vtmp1);
1665     bytemask_compress(dst);
1666     if (lane_cnt <= 8) return;
1667 
1668     // Repeat on higher bytes and join the results.
1669     // Compress 8 bytes in each iteration.
1670     for (int idx = 1; idx < (lane_cnt / 8); idx++) {
1671       sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
1672       bytemask_compress(rscratch1);
1673       orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
1674     }
1675   } else {
1676     assert(false, "unsupported");
1677     ShouldNotReachHere();
1678   }
1679 }
1680 
1681 // Unpack the mask, a long value in src, into predicate register dst based on the
1682 // corresponding data type. Note that dst can support at most 64 lanes.
1683 // The example below gives the expected dst predicate register for different types, with
1684 // a valid src(0x658D) on a 1024-bit vector size machine.
1685 // BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
1686 // SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
1687 // INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
1688 // LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1689 //
1690 // The bits of src above position (lane_cnt - 1) must be zero. E.g., 0xFF658D, which
1691 // has 24 significant bits, would be an invalid input if the dst predicate register
1692 // refers to a LONG-type 1024-bit vector, which has at most 16 lanes.
1693 void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
1694                                            FloatRegister vtmp1, FloatRegister vtmp2) {
1695   assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
1696          lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
1697   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1698   // Example:   src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
1699   // Expected:  dst = 0b01101001 10001101
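  // A scalar reference for the unpacking (illustrative sketch only; the
  // bool-array view of the predicate is hypothetical):
  //
  //   for (int i = 0; i < lane_cnt; i++) {
  //     pred[i] = (src >> i) & 1;
  //   }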
1700 
1701   // Put long value from general purpose register into the first lane of vector.
1702   // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1703   sve_dup(vtmp1, B, 0);
1704   mov(vtmp1, D, 0, src);
1705 
1706   // Since sve_cmp below generates the mask with byte granularity, the per-bit
1707   // mask currently held in the first lane must be widened to one byte per lane,
1708   // which can be done with SVE2's BDEP instruction.
1709 
1710   // The first source input of the BDEP instruction: deposit one mask byte into each 8-byte lane.
1711   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1712   if (lane_cnt <= 8) {
1713     // Nothing to do, as only one byte exists.
1714   } else if (lane_cnt <= 16) {
1715     ins(vtmp1, B, vtmp1, 8, 1);
1716     mov(vtmp1, B, 1, zr);
1717   } else {
1718     sve_vector_extend(vtmp1, D, vtmp1, B);
1719   }
1720 
1721   // The second source input of the BDEP instruction, initialized with 0x01 in each byte.
1722   // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1723   sve_dup(vtmp2, B, 1);
1724 
1725   // BDEP vtmp1.D, vtmp1.D, vtmp2.D
1726   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1727   // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1728   //         ---------------------------------------
1729   // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1730   sve_bdep(vtmp1, D, vtmp1, vtmp2);
1731 
1732   if (bt != T_BYTE) {
1733     sve_vector_extend(vtmp1, size, vtmp1, B);
1734   }
1735   // Generate the mask from the given vector, whose elements have been extended
1736   // to the expected type.
1737   // dst = 0b01101001 10001101
1738   sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
1739 }
1740 
1741 // Clobbers: rflags
1742 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1743                                     FloatRegister zn, FloatRegister zm, Condition cond) {
1744   assert(pg->is_governing(), "This register has to be a governing predicate register");
1745   FloatRegister z1 = zn, z2 = zm;
1746   switch (cond) {
1747     case LE: z1 = zm; z2 = zn; cond = GE; break;
1748     case LT: z1 = zm; z2 = zn; cond = GT; break;
1749     case LO: z1 = zm; z2 = zn; cond = HI; break;
1750     case LS: z1 = zm; z2 = zn; cond = HS; break;
1751     default:
1752       break;
1753   }
1754 
1755   SIMD_RegVariant size = elemType_to_regVariant(bt);
1756   if (is_floating_point_type(bt)) {
1757     sve_fcm(cond, pd, size, pg, z1, z2);
1758   } else {
1759     assert(is_integral_type(bt), "unsupported element type");
1760     sve_cmp(cond, pd, size, pg, z1, z2);
1761   }
1762 }
1763 
1764 // Get index of the last mask lane that is set
1765 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
1766   SIMD_RegVariant size = elemType_to_regVariant(bt);
1767   sve_rev(ptmp, size, src);
1768   sve_brkb(ptmp, ptrue, ptmp, false);
1769   sve_cntp(dst, size, ptrue, ptmp);
1770   movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
1771   subw(dst, rscratch1, dst);
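  // Why this works (illustrative): after sve_rev, the last true lane of src
  // becomes the first true lane of ptmp; sve_brkb then keeps only the lanes
  // before that first true lane, so sve_cntp yields (lanes - 1 - lasttrue).
  // E.g., with 16 byte lanes and the last true at lane 13, the reversed mask
  // has its first true at lane 2, cntp counts 2, and dst = 15 - 2 = 13.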
1772 }
1773 
1774 // Extend integer vector src to dst with the same lane count
1775 // but larger element size, e.g. 4B -> 4I
1776 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1777                                            FloatRegister src, BasicType src_bt, bool is_unsigned) {
1778   if (src_bt == T_BYTE) {
1779     if (dst_bt == T_SHORT) {
1780       // 4B/8B to 4S/8S
1781       _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1782     } else {
1783       // 4B to 4I
1784       assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1785       _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1786       _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1787     }
1788   } else if (src_bt == T_SHORT) {
1789     // 4S to 4I
1790     assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1791     _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1792   } else if (src_bt == T_INT) {
1793     // 2I to 2L
1794     assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1795     _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1796   } else {
1797     ShouldNotReachHere();
1798   }
1799 }
1800 
1801 // Narrow integer vector src down to dst with the same lane count
1802 // but smaller element size, e.g. 4I -> 4B
1803 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1804                                            FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1805   if (src_bt == T_SHORT) {
1806     // 4S/8S to 4B/8B
1807     assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1808     assert(dst_bt == T_BYTE, "unsupported");
1809     xtn(dst, T8B, src, T8H);
1810   } else if (src_bt == T_INT) {
1811     // 4I to 4B/4S
1812     assert(src_vlen_in_bytes == 16, "unsupported");
1813     assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1814     xtn(dst, T4H, src, T4S);
1815     if (dst_bt == T_BYTE) {
1816       xtn(dst, T8B, dst, T8H);
1817     }
1818   } else if (src_bt == T_LONG) {
1819     // 2L to 2I
1820     assert(src_vlen_in_bytes == 16, "unsupported");
1821     assert(dst_bt == T_INT, "unsupported");
1822     xtn(dst, T2S, src, T2D);
1823   } else {
1824     ShouldNotReachHere();
1825   }
1826 }
1827 
1828 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1829                                           FloatRegister src, SIMD_RegVariant src_size,
1830                                           bool is_unsigned) {
1831   assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
1832 
1833   if (src_size == B) {
1834     switch (dst_size) {
1835     case H:
1836       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1837       break;
1838     case S:
1839       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1840       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1841       break;
1842     case D:
1843       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1844       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1845       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1846       break;
1847     default:
1848       ShouldNotReachHere();
1849     }
1850   } else if (src_size == H) {
1851     if (dst_size == S) {
1852       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1853     } else { // D
1854       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1855       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1856     }
1857   } else if (src_size == S) {
1858     _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
1859   }
1860 }
1861 
1862 // Vector narrow from src to dst with the specified element sizes.
1863 // The high part of the dst vector is filled with zero.
1864 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1865                                           FloatRegister src, SIMD_RegVariant src_size,
1866                                           FloatRegister tmp) {
1867   assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1868   assert_different_registers(src, tmp);
1869   sve_dup(tmp, src_size, 0);
1870   if (src_size == D) {
1871     switch (dst_size) {
1872     case S:
1873       sve_uzp1(dst, S, src, tmp);
1874       break;
1875     case H:
1876       assert_different_registers(dst, tmp);
1877       sve_uzp1(dst, S, src, tmp);
1878       sve_uzp1(dst, H, dst, tmp);
1879       break;
1880     case B:
1881       assert_different_registers(dst, tmp);
1882       sve_uzp1(dst, S, src, tmp);
1883       sve_uzp1(dst, H, dst, tmp);
1884       sve_uzp1(dst, B, dst, tmp);
1885       break;
1886     default:
1887       ShouldNotReachHere();
1888     }
1889   } else if (src_size == S) {
1890     if (dst_size == H) {
1891       sve_uzp1(dst, H, src, tmp);
1892     } else { // B
1893       assert_different_registers(dst, tmp);
1894       sve_uzp1(dst, H, src, tmp);
1895       sve_uzp1(dst, B, dst, tmp);
1896     }
1897   } else if (src_size == H) {
1898     sve_uzp1(dst, B, src, tmp);
1899   }
1900 }
1901 
1902 // Extend src predicate to dst predicate with the same lane count but larger
1903 // element size, e.g. 64Byte -> 512Long
1904 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1905                                              uint dst_element_length_in_bytes,
1906                                              uint src_element_length_in_bytes) {
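  // Illustrative example (assumed byte-lane mask 0b0101, lanes 0 and 2 active):
  // an SVE predicate holds one bit per byte, so sve_punpklo zero-interleaves
  // the low-half bits, 0b0101 -> 0b00010001, i.e. one bit per 2-byte lane.
  // Each further sve_punpklo doubles the element size again, giving the
  // 4x and 8x chains below.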
1907   if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1908     sve_punpklo(dst, src);
1909   } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1910     sve_punpklo(dst, src);
1911     sve_punpklo(dst, dst);
1912   } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1913     sve_punpklo(dst, src);
1914     sve_punpklo(dst, dst);
1915     sve_punpklo(dst, dst);
1916   } else {
1917     assert(false, "unsupported");
1918     ShouldNotReachHere();
1919   }
1920 }
1921 
1922 // Narrow src predicate to dst predicate with the same lane count but
1923 // smaller element size, e.g. 512Long -> 64Byte
1924 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1925                                              uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
1926   // The insignificant bits in src predicate are expected to be zero.
1927   // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
1928   // passed as the second argument. An example narrowing operation with a given mask would be:
1929   // 128Long -> 64Int on a 128-bit machine, i.e. 2L -> 2I.
1930   // Mask (for 2 Longs) : TF
1931   // Predicate register for the above mask (16 bits) : 00000001 00000000
1932   // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
1933   // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0)
1934   assert_different_registers(src, ptmp);
1935   assert_different_registers(dst, ptmp);
1936   sve_pfalse(ptmp);
1937   if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1938     sve_uzp1(dst, B, src, ptmp);
1939   } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1940     sve_uzp1(dst, H, src, ptmp);
1941     sve_uzp1(dst, B, dst, ptmp);
1942   } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1943     sve_uzp1(dst, S, src, ptmp);
1944     sve_uzp1(dst, H, dst, ptmp);
1945     sve_uzp1(dst, B, dst, ptmp);
1946   } else {
1947     assert(false, "unsupported");
1948     ShouldNotReachHere();
1949   }
1950 }
1951 
1952 // Vector reduction add for integral type with ASIMD instructions.
1953 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1954                                                  Register isrc, FloatRegister vsrc,
1955                                                  unsigned vector_length_in_bytes,
1956                                                  FloatRegister vtmp) {
1957   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1958   assert_different_registers(dst, isrc);
1959   bool isQ = vector_length_in_bytes == 16;
1960 
1961   BLOCK_COMMENT("neon_reduce_add_integral {");
1962     switch(bt) {
1963       case T_BYTE:
1964         addv(vtmp, isQ ? T16B : T8B, vsrc);
1965         smov(dst, vtmp, B, 0);
1966         addw(dst, dst, isrc, ext::sxtb);
1967         break;
1968       case T_SHORT:
1969         addv(vtmp, isQ ? T8H : T4H, vsrc);
1970         smov(dst, vtmp, H, 0);
1971         addw(dst, dst, isrc, ext::sxth);
1972         break;
1973       case T_INT:
1974         isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1975         umov(dst, vtmp, S, 0);
1976         addw(dst, dst, isrc);
1977         break;
1978       case T_LONG:
1979         assert(isQ, "unsupported");
1980         addpd(vtmp, vsrc);
1981         umov(dst, vtmp, D, 0);
1982         add(dst, dst, isrc);
1983         break;
1984       default:
1985         assert(false, "unsupported");
1986         ShouldNotReachHere();
1987     }
1988   BLOCK_COMMENT("} neon_reduce_add_integral");
1989 }
1990 
1991 // Vector reduction multiply for integral type with ASIMD instructions.
1992 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
1993 // Clobbers: rscratch1
1994 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
1995                                                  Register isrc, FloatRegister vsrc,
1996                                                  unsigned vector_length_in_bytes,
1997                                                  FloatRegister vtmp1, FloatRegister vtmp2) {
1998   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1999   bool isQ = vector_length_in_bytes == 16;
2000 
2001   BLOCK_COMMENT("neon_reduce_mul_integral {");
2002     switch(bt) {
2003       case T_BYTE:
2004         if (isQ) {
2005           // Multiply the lower half and higher half of the vector iteratively.
2006           // vtmp1 = vsrc[8:15]
2007           ins(vtmp1, D, vsrc, 0, 1);
2008           // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
2009           mulv(vtmp1, T8B, vtmp1, vsrc);
2010           // vtmp2 = vtmp1[4:7]
2011           ins(vtmp2, S, vtmp1, 0, 1);
2012           // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
2013           mulv(vtmp1, T8B, vtmp2, vtmp1);
2014         } else {
2015           ins(vtmp1, S, vsrc, 0, 1);
2016           mulv(vtmp1, T8B, vtmp1, vsrc);
2017         }
2018         // vtmp2 = vtmp1[2:3]
2019         ins(vtmp2, H, vtmp1, 0, 1);
2020         // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
2021         mulv(vtmp2, T8B, vtmp2, vtmp1);
2022         // dst = vtmp2[0] * isrc * vtmp2[1]
2023         umov(rscratch1, vtmp2, B, 0);
2024         mulw(dst, rscratch1, isrc);
2025         sxtb(dst, dst);
2026         umov(rscratch1, vtmp2, B, 1);
2027         mulw(dst, rscratch1, dst);
2028         sxtb(dst, dst);
2029         break;
2030       case T_SHORT:
2031         if (isQ) {
2032           ins(vtmp2, D, vsrc, 0, 1);
2033           mulv(vtmp2, T4H, vtmp2, vsrc);
2034           ins(vtmp1, S, vtmp2, 0, 1);
2035           mulv(vtmp1, T4H, vtmp1, vtmp2);
2036         } else {
2037           ins(vtmp1, S, vsrc, 0, 1);
2038           mulv(vtmp1, T4H, vtmp1, vsrc);
2039         }
2040         umov(rscratch1, vtmp1, H, 0);
2041         mulw(dst, rscratch1, isrc);
2042         sxth(dst, dst);
2043         umov(rscratch1, vtmp1, H, 1);
2044         mulw(dst, rscratch1, dst);
2045         sxth(dst, dst);
2046         break;
2047       case T_INT:
2048         if (isQ) {
2049           ins(vtmp1, D, vsrc, 0, 1);
2050           mulv(vtmp1, T2S, vtmp1, vsrc);
2051         } else {
2052           vtmp1 = vsrc;
2053         }
2054         umov(rscratch1, vtmp1, S, 0);
2055         mul(dst, rscratch1, isrc);
2056         umov(rscratch1, vtmp1, S, 1);
2057         mul(dst, rscratch1, dst);
2058         break;
2059       case T_LONG:
2060         umov(rscratch1, vsrc, D, 0);
2061         mul(dst, isrc, rscratch1);
2062         umov(rscratch1, vsrc, D, 1);
2063         mul(dst, dst, rscratch1);
2064         break;
2065       default:
2066         assert(false, "unsupported");
2067         ShouldNotReachHere();
2068     }
2069   BLOCK_COMMENT("} neon_reduce_mul_integral");
2070 }
2071 
2072 // Vector reduction multiply for floating-point type with ASIMD instructions.
2073 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
2074                                            FloatRegister fsrc, FloatRegister vsrc,
2075                                            unsigned vector_length_in_bytes,
2076                                            FloatRegister vtmp) {
2077   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2078   bool isQ = vector_length_in_bytes == 16;
2079 
2080   BLOCK_COMMENT("neon_reduce_mul_fp {");
2081     switch(bt) {
2082       case T_FLOAT:
2083         fmuls(dst, fsrc, vsrc);
2084         ins(vtmp, S, vsrc, 0, 1);
2085         fmuls(dst, dst, vtmp);
2086         if (isQ) {
2087           ins(vtmp, S, vsrc, 0, 2);
2088           fmuls(dst, dst, vtmp);
2089           ins(vtmp, S, vsrc, 0, 3);
2090           fmuls(dst, dst, vtmp);
2091         }
2092         break;
2093       case T_DOUBLE:
2094         assert(isQ, "unsupported");
2095         fmuld(dst, fsrc, vsrc);
2096         ins(vtmp, D, vsrc, 0, 1);
2097         fmuld(dst, dst, vtmp);
2098         break;
2099       default:
2100         assert(false, "unsupported");
2101         ShouldNotReachHere();
2102     }
2103   BLOCK_COMMENT("} neon_reduce_mul_fp");
2104 }
2105 
2106 // Helper to select logical instruction
2107 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
2108                                                    Register Rn, Register Rm,
2109                                                    enum shift_kind kind, unsigned shift) {
2110   switch(opc) {
2111     case Op_AndReductionV:
2112       is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
2113       break;
2114     case Op_OrReductionV:
2115       is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
2116       break;
2117     case Op_XorReductionV:
2118       is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
2119       break;
2120     default:
2121       assert(false, "unsupported");
2122       ShouldNotReachHere();
2123   }
2124 }
2125 
2126 // Vector reduction logical operations And, Or, Xor
2127 // Clobbers: rscratch1
2128 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
2129                                             Register isrc, FloatRegister vsrc,
2130                                             unsigned vector_length_in_bytes) {
2131   assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
2132          "unsupported");
2133   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2134   assert_different_registers(dst, isrc);
2135   bool isQ = vector_length_in_bytes == 16;
2136 
2137   BLOCK_COMMENT("neon_reduce_logical {");
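    // Folding pattern (illustrative, for T_BYTE with isQ == true): the two
    // 64-bit halves are combined first, then the scalar is repeatedly folded
    // with a logical-shift-right of itself:
    //   d   = hi64 op lo64
    //   d   = d op (d >> 32)
    //   d   = d op (d >> 16)
    //   d   = d op (d >> 8)
    //   dst = sign_extend_byte(d op isrc)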
2138     umov(rscratch1, vsrc, isQ ? D : S, 0);
2139     umov(dst, vsrc, isQ ? D : S, 1);
2140     neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
2141     switch(bt) {
2142       case T_BYTE:
2143         if (isQ) {
2144           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2145         }
2146         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
2147         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
2148         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2149         sxtb(dst, dst);
2150         break;
2151       case T_SHORT:
2152         if (isQ) {
2153           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2154         }
2155         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
2156         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2157         sxth(dst, dst);
2158         break;
2159       case T_INT:
2160         if (isQ) {
2161           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2162         }
2163         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2164         break;
2165       case T_LONG:
2166         assert(isQ, "unsupported");
2167         neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
2168         break;
2169       default:
2170         assert(false, "unsupported");
2171         ShouldNotReachHere();
2172     }
2173   BLOCK_COMMENT("} neon_reduce_logical");
2174 }
2175 
2176 // Vector reduction min/max for integral type with ASIMD instructions.
2177 // Note: vtmp is not used and is expected to be fnoreg for the T_LONG case.
2178 // Clobbers: rscratch1, rflags
2179 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
2180                                                     Register isrc, FloatRegister vsrc,
2181                                                     unsigned vector_length_in_bytes,
2182                                                     FloatRegister vtmp) {
2183   assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
2184   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2185   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
2186   assert_different_registers(dst, isrc);
2187   bool isQ = vector_length_in_bytes == 16;
2188   bool is_min = opc == Op_MinReductionV;
2189 
2190   BLOCK_COMMENT("neon_reduce_minmax_integral {");
2191     if (bt == T_LONG) {
2192       assert(vtmp == fnoreg, "should be");
2193       assert(isQ, "should be");
2194       umov(rscratch1, vsrc, D, 0);
2195       cmp(isrc, rscratch1);
2196       csel(dst, isrc, rscratch1, is_min ? LT : GT);
2197       umov(rscratch1, vsrc, D, 1);
2198       cmp(dst, rscratch1);
2199       csel(dst, dst, rscratch1, is_min ? LT : GT);
2200     } else {
2201       SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
2202       if (size == T2S) {
2203         is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
2204       } else {
2205         is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
2206       }
2207       if (bt == T_INT) {
2208         umov(dst, vtmp, S, 0);
2209       } else {
2210         smov(dst, vtmp, elemType_to_regVariant(bt), 0);
2211       }
2212       cmpw(dst, isrc);
2213       cselw(dst, dst, isrc, is_min ? LT : GT);
2214     }
2215   BLOCK_COMMENT("} neon_reduce_minmax_integral");
2216 }
2217 
2218 // Vector reduction for integral type with SVE instruction.
2219 // Supported operations are Add, And, Or, Xor, Max, Min.
2220 // rflags is clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
2221 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
2222                                             FloatRegister src2, PRegister pg, FloatRegister tmp) {
2223   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2224   assert(pg->is_governing(), "This register has to be a governing predicate register");
2225   assert_different_registers(src1, dst);
2226   // Registers "dst" and "tmp" are clobbered; "src1" and "src2" are preserved.
2227   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2228   switch (opc) {
2229     case Op_AddReductionVI: {
2230       sve_uaddv(tmp, size, pg, src2);
2231       if (bt == T_BYTE) {
2232         smov(dst, tmp, size, 0);
2233         addw(dst, src1, dst, ext::sxtb);
2234       } else if (bt == T_SHORT) {
2235         smov(dst, tmp, size, 0);
2236         addw(dst, src1, dst, ext::sxth);
2237       } else {
2238         umov(dst, tmp, size, 0);
2239         addw(dst, dst, src1);
2240       }
2241       break;
2242     }
2243     case Op_AddReductionVL: {
2244       sve_uaddv(tmp, size, pg, src2);
2245       umov(dst, tmp, size, 0);
2246       add(dst, dst, src1);
2247       break;
2248     }
2249     case Op_AndReductionV: {
2250       sve_andv(tmp, size, pg, src2);
2251       if (bt == T_INT || bt == T_LONG) {
2252         umov(dst, tmp, size, 0);
2253       } else {
2254         smov(dst, tmp, size, 0);
2255       }
2256       if (bt == T_LONG) {
2257         andr(dst, dst, src1);
2258       } else {
2259         andw(dst, dst, src1);
2260       }
2261       break;
2262     }
2263     case Op_OrReductionV: {
2264       sve_orv(tmp, size, pg, src2);
2265       if (bt == T_INT || bt == T_LONG) {
2266         umov(dst, tmp, size, 0);
2267       } else {
2268         smov(dst, tmp, size, 0);
2269       }
2270       if (bt == T_LONG) {
2271         orr(dst, dst, src1);
2272       } else {
2273         orrw(dst, dst, src1);
2274       }
2275       break;
2276     }
2277     case Op_XorReductionV: {
2278       sve_eorv(tmp, size, pg, src2);
2279       if (bt == T_INT || bt == T_LONG) {
2280         umov(dst, tmp, size, 0);
2281       } else {
2282         smov(dst, tmp, size, 0);
2283       }
2284       if (bt == T_LONG) {
2285         eor(dst, dst, src1);
2286       } else {
2287         eorw(dst, dst, src1);
2288       }
2289       break;
2290     }
2291     case Op_MaxReductionV: {
2292       sve_smaxv(tmp, size, pg, src2);
2293       if (bt == T_INT || bt == T_LONG) {
2294         umov(dst, tmp, size, 0);
2295       } else {
2296         smov(dst, tmp, size, 0);
2297       }
2298       if (bt == T_LONG) {
2299         cmp(dst, src1);
2300         csel(dst, dst, src1, Assembler::GT);
2301       } else {
2302         cmpw(dst, src1);
2303         cselw(dst, dst, src1, Assembler::GT);
2304       }
2305       break;
2306     }
2307     case Op_MinReductionV: {
2308       sve_sminv(tmp, size, pg, src2);
2309       if (bt == T_INT || bt == T_LONG) {
2310         umov(dst, tmp, size, 0);
2311       } else {
2312         smov(dst, tmp, size, 0);
2313       }
2314       if (bt == T_LONG) {
2315         cmp(dst, src1);
2316         csel(dst, dst, src1, Assembler::LT);
2317       } else {
2318         cmpw(dst, src1);
2319         cselw(dst, dst, src1, Assembler::LT);
2320       }
2321       break;
2322     }
2323     default:
2324       assert(false, "unsupported");
2325       ShouldNotReachHere();
2326   }
2327 
2328   if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
2329     if (bt == T_BYTE) {
2330       sxtb(dst, dst);
2331     } else if (bt == T_SHORT) {
2332       sxth(dst, dst);
2333     }
2334   }
2335 }
2336 
2337 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
2338 // to false otherwise. The input "lane_cnt" must not exceed the supported max vector
2339 // length of the basic type. Clobbers: rscratch1 and rflags.
2340 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
2341   uint32_t max_vector_length = Matcher::max_vector_size(bt);
2342   assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
2343 
2344   // Set all elements to false if the input "lane_cnt" is zero.
2345   if (lane_cnt == 0) {
2346     sve_pfalse(dst);
2347     return;
2348   }
2349 
2350   SIMD_RegVariant size = elemType_to_regVariant(bt);
2351   assert(size != Q, "invalid size");
2352 
2353   // Set all elements to true if "lane_cnt" equals the max lane count.
2354   if (lane_cnt == max_vector_length) {
2355     sve_ptrue(dst, size, /* ALL */ 0b11111);
2356     return;
2357   }
2358 
2359   // Fixed numbers for "ptrue".
2360   switch(lane_cnt) {
2361   case 1: /* VL1 */
2362   case 2: /* VL2 */
2363   case 3: /* VL3 */
2364   case 4: /* VL4 */
2365   case 5: /* VL5 */
2366   case 6: /* VL6 */
2367   case 7: /* VL7 */
2368   case 8: /* VL8 */
2369     sve_ptrue(dst, size, lane_cnt);
2370     return;
2371   case 16:
2372     sve_ptrue(dst, size, /* VL16 */ 0b01001);
2373     return;
2374   case 32:
2375     sve_ptrue(dst, size, /* VL32 */ 0b01010);
2376     return;
2377   case 64:
2378     sve_ptrue(dst, size, /* VL64 */ 0b01011);
2379     return;
2380   case 128:
2381     sve_ptrue(dst, size, /* VL128 */ 0b01100);
2382     return;
2383   case 256:
2384     sve_ptrue(dst, size, /* VL256 */ 0b01101);
2385     return;
2386   default:
2387     break;
2388   }
2389 
2390   // Special patterns for "ptrue".
2391   if (lane_cnt == round_down_power_of_2(max_vector_length)) {
2392     sve_ptrue(dst, size, /* POW2 */ 0b00000);
2393   } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
2394     sve_ptrue(dst, size, /* MUL4 */ 0b11101);
2395   } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
2396     sve_ptrue(dst, size, /* MUL3 */ 0b11110);
2397   } else {
2398     // Encode to "whileltw" for the remaining cases.
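    // E.g. (illustrative): lane_cnt == 17 matches no fixed pattern above, so
    // "whilelt" with bounds 0 and 17 sets exactly the first 17 lanes of dst.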
2399     mov(rscratch1, lane_cnt);
2400     sve_whileltw(dst, size, zr, rscratch1);
2401   }
2402 }
2403 
2404 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
2405 // Any remaining elements of dst will be filled with zero.
2406 // Clobbers: rscratch1
2407 // Preserves: src, mask
2408 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
2409                                            FloatRegister vtmp1, FloatRegister vtmp2,
2410                                            PRegister pgtmp) {
2411   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2412   assert_different_registers(dst, src, vtmp1, vtmp2);
2413   assert_different_registers(mask, pgtmp);
2414 
2415   // Example input:   src   = 8888 7777 6666 5555 4444 3333 2222 1111
2416   //                  mask  = 0001 0000 0000 0001 0001 0000 0001 0001
2417   // Expected result: dst   = 0000 0000 0000 8888 5555 4444 2222 1111
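  // A scalar reference for the whole compression (illustrative sketch only;
  // the array views of the vector and mask are hypothetical):
  //
  //   int j = 0;
  //   for (int i = 0; i < lanes; i++) {
  //     if (mask[i]) dst[j++] = src[i];
  //   }
  //   for (; j < lanes; j++) dst[j] = 0;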
2418   sve_dup(vtmp2, H, 0);
2419 
2420   // Extend lowest half to type INT.
2421   // dst = 00004444 00003333 00002222 00001111
2422   sve_uunpklo(dst, S, src);
2423   // pgtmp = 00000001 00000000 00000001 00000001
2424   sve_punpklo(pgtmp, mask);
2425   // Pack the active elements, as INT-sized lanes, to the right-hand side,
2426   // and fill the remaining lanes with zero.
2427   // dst = 00000000 00004444 00002222 00001111
2428   sve_compact(dst, S, dst, pgtmp);
2429   // Narrow the result back to type SHORT.
2430   // dst = 0000 0000 0000 0000 0000 4444 2222 1111
2431   sve_uzp1(dst, H, dst, vtmp2);
2432   // Count the active elements of the lowest half.
2433   // rscratch1 = 3
2434   sve_cntp(rscratch1, S, ptrue, pgtmp);
2435 
2436   // Repeat to the highest half.
2437   // pgtmp = 00000001 00000000 00000000 00000001
2438   sve_punpkhi(pgtmp, mask);
2439   // vtmp1 = 00008888 00007777 00006666 00005555
2440   sve_uunpkhi(vtmp1, S, src);
2441   // vtmp1 = 00000000 00000000 00008888 00005555
2442   sve_compact(vtmp1, S, vtmp1, pgtmp);
2443   // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2444   sve_uzp1(vtmp1, H, vtmp1, vtmp2);
2445 
2446   // Compressed low:   dst   = 0000 0000 0000 0000 0000 4444 2222 1111
2447   // Compressed high:  vtmp1 = 0000 0000 0000 0000 0000 0000 8888  5555
2448   // Shift the compressed high part left across lanes by TRUE_CNT lanes, where
2449   // TRUE_CNT is the number of active elements in the compressed low part.
2450   neg(rscratch1, rscratch1);
2451   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2452   sve_index(vtmp2, H, rscratch1, 1);
2453   // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
2454   sve_tbl(vtmp1, H, vtmp1, vtmp2);
2455 
2456   // Combine the compressed high part (after the shift) with the compressed low part.
2457   // dst = 0000 0000 0000 8888 5555 4444 2222 1111
2458   sve_orr(dst, dst, vtmp1);
2459 }
2460 
2461 // Clobbers: rscratch1, rscratch2
2462 // Preserves: src, mask
2463 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
2464                                           FloatRegister vtmp1, FloatRegister vtmp2,
2465                                           FloatRegister vtmp3, FloatRegister vtmp4,
2466                                           PRegister ptmp, PRegister pgtmp) {
2467   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2468   assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
2469   assert_different_registers(mask, ptmp, pgtmp);
2470   // Example input:   src   = 88 77 66 55 44 33 22 11
2471   //                  mask  = 01 00 00 01 01 00 01 01
2472   // Expected result: dst   = 00 00 00 88 55 44 22 11
2473 
2474   sve_dup(vtmp4, B, 0);
2475   // Extend lowest half to type SHORT.
2476   // vtmp1 = 0044 0033 0022 0011
2477   sve_uunpklo(vtmp1, H, src);
2478   // ptmp = 0001 0000 0001 0001
2479   sve_punpklo(ptmp, mask);
2480   // Count the active elements of the lowest half.
2481   // rscratch2 = 3
2482   sve_cntp(rscratch2, H, ptrue, ptmp);
2483   // Pack the active elements, as SHORT-sized lanes, to the right-hand side,
2484   // and fill the remaining lanes with zero.
2485   // dst = 0000 0044 0022 0011
2486   sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
2487   // Narrow the result back to type BYTE.
2488   // dst = 00 00 00 00 00 44 22 11
2489   sve_uzp1(dst, B, dst, vtmp4);
2490 
2491   // Repeat to the highest half.
2492   // ptmp = 0001 0000 0000 0001
2493   sve_punpkhi(ptmp, mask);
2494   // vtmp2 = 0088 0077 0066 0055
2495   sve_uunpkhi(vtmp2, H, src);
2496   // vtmp1 = 0000 0000 0088 0055
2497   sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);
2498 
2499   sve_dup(vtmp4, B, 0);
2500   // vtmp1 = 00 00 00 00 00 00 88 55
2501   sve_uzp1(vtmp1, B, vtmp1, vtmp4);
2502 
2503   // Compressed low:   dst   = 00 00 00 00 00 44 22 11
2504   // Compressed high:  vtmp1 = 00 00 00 00 00 00 88 55
2505   // Shift the compressed high part left across lanes by TRUE_CNT lanes, where
2506   // TRUE_CNT is the number of active elements in the compressed low part.
2507   neg(rscratch2, rscratch2);
2508   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2509   sve_index(vtmp2, B, rscratch2, 1);
2510   // vtmp1 = 00 00 00 88 55 00 00 00
2511   sve_tbl(vtmp1, B, vtmp1, vtmp2);
2512   // Combine the compressed high part (after the shift) with the compressed low part.
2513   // dst = 00 00 00 88 55 44 22 11
2514   sve_orr(dst, dst, vtmp1);
2515 }
2516 
2517 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2518   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2519   SIMD_Arrangement size = isQ ? T16B : T8B;
2520   if (bt == T_BYTE) {
2521     rbit(dst, size, src);
2522   } else {
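    // rbit reverses the bits within each byte only, so multi-byte elements
    // first get their bytes reversed to yield a full element-wide reversal.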
2523     neon_reverse_bytes(dst, src, bt, isQ);
2524     rbit(dst, size, dst);
2525   }
2526 }
2527 
2528 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2529   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2530   SIMD_Arrangement size = isQ ? T16B : T8B;
2531   switch (bt) {
2532     case T_BYTE:
2533       if (dst != src) {
2534         orr(dst, size, src, src);
2535       }
2536       break;
2537     case T_SHORT:
2538       rev16(dst, size, src);
2539       break;
2540     case T_INT:
2541       rev32(dst, size, src);
2542       break;
2543     case T_LONG:
2544       rev64(dst, size, src);
2545       break;
2546     default:
2547       assert(false, "unsupported");
2548       ShouldNotReachHere();
2549   }
2550 }
2551 
2552 // Extract a scalar element from an SVE vector at position 'idx'.
2553 // The input elements in src are expected to be of integral type.
2554 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2555                                              int idx, FloatRegister vtmp) {
2556   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2557   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2558   if (regVariant_to_elemBits(size) * idx < 128) { // generate a lower-cost NEON instruction
2559     if (bt == T_INT || bt == T_LONG) {
2560       umov(dst, src, size, idx);
2561     } else {
2562       smov(dst, src, size, idx);
2563     }
2564   } else {
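    // The element lies beyond the NEON-accessible low 128 bits: rotate it down
    // to lane 0 with EXT. Since 'size' is log2 of the element size in bytes,
    // (idx << size) is the element's byte offset within the vector.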
2565     sve_orr(vtmp, src, src);
2566     sve_ext(vtmp, vtmp, idx << size);
2567     if (bt == T_INT || bt == T_LONG) {
2568       umov(dst, vtmp, size, 0);
2569     } else {
2570       smov(dst, vtmp, size, 0);
2571     }
2572   }
2573 }
2574 
2575 // java.lang.Math::round intrinsics
2576 
2577 // Clobbers: rscratch1, rflags
2578 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2579                                           FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
2580   assert_different_registers(tmp1, tmp2, tmp3, src, dst);
2581   switch (T) {
2582     case T2S:
2583     case T4S:
2584       fmovs(tmp1, T, 0.5f);
2585       mov(rscratch1, jint_cast(0x1.0p23f));
2586       break;
2587     case T2D:
2588       fmovd(tmp1, T, 0.5);
2589       mov(rscratch1, julong_cast(0x1.0p52));
2590       break;
2591     default:
2592       assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2593   }
2594   fadd(tmp1, T, tmp1, src);
2595   fcvtms(tmp1, T, tmp1);
2596   // tmp1 = floor(src + 0.5, ties to even)
2597 
2598   fcvtas(dst, T, src);
2599   // dst = round(src), ties to away
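  // Selection rationale (illustrative summary): for non-negative inputs,
  // rounding ties away from zero matches Math.round's floor(x + 0.5); the two
  // differ only for negative ties (e.g. Math.round(-2.5) == -2, while
  // ties-away gives -3). The compare below therefore selects the
  // floor(src + 0.5) result in tmp1 for negative lanes of small magnitude,
  // and keeps the fcvtas result for the rest (positives, NaNs, and magnitudes
  // >= 2^23 or 2^52, which are already integral).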
2600 
2601   fneg(tmp3, T, src);
2602   dup(tmp2, T, rscratch1);
2603   cm(HS, tmp3, T, tmp3, tmp2);
2604   // tmp3 is now a set of flags
2605 
2606   bif(dst, T16B, tmp1, tmp3);
2607   // result in dst
2608 }
2609 
2610 // Clobbers: rscratch1, rflags
2611 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2612                                          FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
2613   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2614   assert_different_registers(tmp1, tmp2, src, dst);
2615 
2616   switch (T) {
2617     case S:
2618       mov(rscratch1, jint_cast(0x1.0p23f));
2619       break;
2620     case D:
2621       mov(rscratch1, julong_cast(0x1.0p52));
2622       break;
2623     default:
2624       assert(T == S || T == D, "invalid register variant");
2625   }
2626 
2627   sve_frinta(dst, T, ptrue, src);
2628   // dst = round(src), ties to away
2629 
2630   Label none;
2631 
2632   sve_fneg(tmp1, T, ptrue, src);
2633   sve_dup(tmp2, T, rscratch1);
2634   sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
2635   br(EQ, none);
2636   {
2637     sve_cpy(tmp1, T, pgtmp, 0.5);
2638     sve_fadd(tmp1, T, pgtmp, src);
2639     sve_frintm(dst, T, pgtmp, tmp1);
2640     // dst = floor(src + 0.5, ties to even)
2641   }
2642   bind(none);
2643 
2644   sve_fcvtzs(dst, T, ptrue, dst, T);
2645   // result in dst
2646 }
2647 
2648 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2649                                            FloatRegister one, SIMD_Arrangement T) {
2650   assert_different_registers(dst, src, zero, one);
2651   assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2652 
2653   facgt(dst, T, src, zero);
2654   ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
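  // With that mask, bsl below takes every bit except the sign from 'one' and
  // the sign bit from 'src', i.e. copysign(1.0, src), for ordinary lanes;
  // lanes whose mask is zero keep src unchanged (+-0.0 and NaN), matching
  // Math.signum semantics.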
2655   bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2656 }
2657 
2658 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2659                                           FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
2660   assert_different_registers(dst, src, zero, one, vtmp);
2661   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2662 
2663   sve_orr(vtmp, src, src);
2664   sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise
2665   switch (T) {
2666   case S:
2667     sve_and(vtmp, T, min_jint);       // Extract the sign bit of the float value in every lane of src
2668     sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
2669                                       // on the sign of the float value
2670     break;
2671   case D:
2672     sve_and(vtmp, T, min_jlong);
2673     sve_orr(vtmp, T, jlong_cast(1.0));
2674     break;
2675   default:
2676     assert(false, "unsupported");
2677     ShouldNotReachHere();
2678   }
2679   sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp.
2680                                      // Result in dst
2681 }
2682 
2683 bool C2_MacroAssembler::in_scratch_emit_size() {
2684   if (ciEnv::current()->task() != nullptr) {
2685     PhaseOutput* phase_output = Compile::current()->output();
2686     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2687       return true;
2688     }
2689   }
2690   return MacroAssembler::in_scratch_emit_size();
2691 }