1 /*
   2  * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "asm/assembler.hpp"
  26 #include "asm/assembler.inline.hpp"
  27 #include "opto/c2_MacroAssembler.hpp"
  28 #include "opto/compile.hpp"
  29 #include "opto/intrinsicnode.hpp"
  30 #include "opto/matcher.hpp"
  31 #include "opto/output.hpp"
  32 #include "opto/subnode.hpp"
  33 #include "runtime/stubRoutines.hpp"
  34 #include "utilities/globalDefinitions.hpp"
  35 #include "utilities/powerOfTwo.hpp"
  36 
  37 #ifdef PRODUCT
  38 #define BLOCK_COMMENT(str) /* nothing */
  39 #define STOP(error) stop(error)
  40 #else
  41 #define BLOCK_COMMENT(str) block_comment(str)
  42 #define STOP(error) block_comment(error); stop(error)
  43 #endif
  44 
  45 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  46 
  47 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
  48 
  49 // jdk.internal.util.ArraysSupport.vectorizedHashCode
  50 address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
  51                                            FloatRegister vdata0, FloatRegister vdata1,
  52                                            FloatRegister vdata2, FloatRegister vdata3,
  53                                            FloatRegister vmul0, FloatRegister vmul1,
  54                                            FloatRegister vmul2, FloatRegister vmul3,
  55                                            FloatRegister vpow, FloatRegister vpowm,
  56                                            BasicType eltype) {
  57   ARRAYS_HASHCODE_REGISTERS;
  58 
  59   Register tmp1 = rscratch1, tmp2 = rscratch2;
  60 
  61   Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE;
  62 
  // Vectorization factor. Number of array elements loaded into one SIMD&FP register by the stubs.
  // We use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible
  // to use 4H for chars and shorts instead, but using 8H gives better performance.
  66   const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8
  67                     : eltype == T_CHAR || eltype == T_SHORT ? 8
  68                     : eltype == T_INT                       ? 4
  69                                                             : 0;
  70   guarantee(vf, "unsupported eltype");
  71 
  72   // Unroll factor for the scalar loop below. The value is chosen based on performance analysis.
  73   const size_t unroll_factor = 4;
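  // For reference, the scalar loop below and the vectorized stubs both compute the usual
  // polynomial hash. A minimal scalar sketch of the whole operation (illustrative only, not
  // the exact code generated here):
  //
  //   jint h = result;
  //   for (size_t i = 0; i < cnt; i++) {
  //     h = 31 * h + ary[i];   // elements widened/sign-extended according to eltype
  //   }
  //   result = h;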
  74 
  75   switch (eltype) {
  76   case T_BOOLEAN:
  77     BLOCK_COMMENT("arrays_hashcode(unsigned byte) {");
  78     break;
  79   case T_CHAR:
  80     BLOCK_COMMENT("arrays_hashcode(char) {");
  81     break;
  82   case T_BYTE:
  83     BLOCK_COMMENT("arrays_hashcode(byte) {");
  84     break;
  85   case T_SHORT:
  86     BLOCK_COMMENT("arrays_hashcode(short) {");
  87     break;
  88   case T_INT:
  89     BLOCK_COMMENT("arrays_hashcode(int) {");
  90     break;
  91   default:
  92     ShouldNotReachHere();
  93   }
  94 
  95   // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop
  96   // implemented by the stub executes just once. Call the stub only if at least two iterations will
  97   // be executed.
  98   const size_t large_threshold = eltype == T_INT ? vf * 2 : vf;
  99   cmpw(cnt, large_threshold);
 100   br(Assembler::HS, LARGE);
 101 
 102   bind(TAIL);
 103 
  // The andr computes cnt % uf where uf = unroll_factor. The subtract, shifted left by 3, moves
  // the jump target past uf - (cnt % uf) pairs of load + madd insns, i.e. only cnt % uf load +
  // madd pairs execute on the first pass. The loop then eats up the remainder, uf elements at a time.
 107   assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC");
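  // The computed branch below is a Duff's-device style entry into the unrolled loop: each
  // load + madd pair is two 4-byte insns, hence the shift by 3. A C-like sketch of the effect
  // (illustrative only):
  //
  //   switch (cnt % unroll_factor) {   // intentional fall-through
  //     case 3: h = 31 * h + ary[k++];
  //     case 2: h = 31 * h + ary[k++];
  //     case 1: h = 31 * h + ary[k++];
  //     case 0: break;                 // full unroll_factor-sized iterations follow
  //   }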
 108   andr(tmp2, cnt, unroll_factor - 1);
 109   adr(tmp1, BR_BASE);
 110   sub(tmp1, tmp1, tmp2, ext::sxtw, 3);
  movw(tmp2, 0x1f);  // the multiplier 31 used in h = 31 * h + ary[i]
 112   br(tmp1);
 113 
 114   bind(LOOP);
 115   for (size_t i = 0; i < unroll_factor; ++i) {
 116     load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype);
 117     maddw(result, result, tmp2, tmp1);
 118   }
 119   bind(BR_BASE);
 120   subsw(cnt, cnt, unroll_factor);
 121   br(Assembler::HS, LOOP);
 122 
 123   b(DONE);
 124 
 125   bind(LARGE);
 126 
 127   RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype));
 128   assert(stub.target() != nullptr, "array_hashcode stub has not been generated");
 129   address tpc = trampoline_call(stub);
 130   if (tpc == nullptr) {
 131     DEBUG_ONLY(reset_labels(TAIL, BR_BASE));
 132     postcond(pc() == badAddress);
 133     return nullptr;
 134   }
 135 
 136   bind(DONE);
 137 
 138   BLOCK_COMMENT("} // arrays_hashcode");
 139 
 140   postcond(pc() != badAddress);
 141   return pc();
 142 }
 143 
 144 void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg,
 145                                   Register tmp2Reg, Register tmp3Reg) {
 146   Register oop = objectReg;
 147   Register box = boxReg;
 148   Register disp_hdr = tmpReg;
 149   Register tmp = tmp2Reg;
 150   Label cont;
 151   Label object_has_monitor;
 152   Label count, no_count;
 153 
 154   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
 155   assert_different_registers(oop, box, tmp, disp_hdr, rscratch2);
 156 
 157   // Load markWord from object into displaced_header.
 158   ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));
 159 
 160   if (DiagnoseSyncOnValueBasedClasses != 0) {
 161     load_klass(tmp, oop);
 162     ldrb(tmp, Address(tmp, Klass::misc_flags_offset()));
 163     tst(tmp, KlassFlags::_misc_is_value_based_class);
 164     br(Assembler::NE, cont);
 165   }
 166 
 167   // Check for existing monitor
 168   tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor);
 169 
 170   if (LockingMode == LM_MONITOR) {
 171     tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
 172     b(cont);
 173   } else {
 174     assert(LockingMode == LM_LEGACY, "must be");
 175     // Set tmp to be (markWord of object | UNLOCK_VALUE).
 176     orr(tmp, disp_hdr, markWord::unlocked_value);
 177 
 178     // Initialize the box. (Must happen before we update the object mark!)
 179     str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 180 
 181     // Compare object markWord with an unlocked value (tmp) and if
 182     // equal exchange the stack address of our box with object markWord.
 183     // On failure disp_hdr contains the possibly locked markWord.
 184     cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true,
 185             /*release*/ true, /*weak*/ false, disp_hdr);
 186     br(Assembler::EQ, cont);
 187 
 188     assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
 189 
    // If the compare-and-exchange succeeded, then we found an unlocked
    // object, have now locked it, and will continue at label cont.
 192 
 193     // Check if the owner is self by comparing the value in the
 194     // markWord of object (disp_hdr) with the stack pointer.
 195     mov(rscratch1, sp);
 196     sub(disp_hdr, disp_hdr, rscratch1);
 197     mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place));
    // If the condition is true (the result is zero) this is a recursive stack-lock, and hence
    // we can store 0 as the displaced header in the box.
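    // A C-like sketch of the check below (assumption: a stack-lock owned by the current
    // thread lies within one page of its sp):
    //
    //   if (((mark - sp) & (~(page_size - 1) | lock_mask)) == 0) {
    //     // recursive stack-lock: store 0 as the displaced header
    //   }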
 200     ands(tmp/*==0?*/, disp_hdr, tmp);   // Sets flags for result
 201     str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 202     b(cont);
 203   }
 204 
 205   // Handle existing monitor.
 206   bind(object_has_monitor);
 207 
 208   // Try to CAS owner (no owner => current thread's _monitor_owner_id).
 209   ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
 210   add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value));
 211   cmpxchg(tmp, zr, rscratch2, Assembler::xword, /*acquire*/ true,
 212           /*release*/ true, /*weak*/ false, tmp3Reg); // Sets flags for result
 213 
 214   // Store a non-null value into the box to avoid looking like a re-entrant
 215   // lock. The fast-path monitor unlock code checks for
 216   // markWord::monitor_value so use markWord::unused_mark which has the
 217   // relevant bit set, and also matches ObjectSynchronizer::enter.
 218   mov(tmp, (address)markWord::unused_mark().value());
 219   str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 220 
 221   br(Assembler::EQ, cont); // CAS success means locking succeeded
 222 
 223   cmp(tmp3Reg, rscratch2);
 224   br(Assembler::NE, cont); // Check for recursive locking
 225 
 226   // Recursive lock case
 227   increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1);
 228   // flag == EQ still from the cmp above, checking if this is a reentrant lock
 229 
 230   bind(cont);
 231   // flag == EQ indicates success
 232   // flag == NE indicates failure
 233   br(Assembler::NE, no_count);
 234 
 235   bind(count);
 236   if (LockingMode == LM_LEGACY) {
 237     inc_held_monitor_count(rscratch1);
 238   }
 239 
 240   bind(no_count);
 241 }
 242 
 243 void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg,
 244                                     Register tmp2Reg) {
 245   Register oop = objectReg;
 246   Register box = boxReg;
 247   Register disp_hdr = tmpReg;
 248   Register owner_addr = tmpReg;
 249   Register tmp = tmp2Reg;
 250   Label cont;
 251   Label object_has_monitor;
 252   Label count, no_count;
 253   Label unlocked;
 254 
 255   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 256   assert_different_registers(oop, box, tmp, disp_hdr);
 257 
 258   if (LockingMode == LM_LEGACY) {
 259     // Find the lock address and load the displaced header from the stack.
 260     ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 261 
 262     // If the displaced header is 0, we have a recursive unlock.
 263     cmp(disp_hdr, zr);
 264     br(Assembler::EQ, cont);
 265   }
 266 
 267   // Handle existing monitor.
 268   ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
 269   tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor);
 270 
 271   if (LockingMode == LM_MONITOR) {
 272     tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
 273     b(cont);
 274   } else {
 275     assert(LockingMode == LM_LEGACY, "must be");
    // Check if it is still a lightweight lock, which is true if we
    // see the stack address of the basicLock in the markWord of the
    // object.
 279 
 280     cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false,
 281             /*release*/ true, /*weak*/ false, tmp);
 282     b(cont);
 283   }
 284 
 285   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
 286 
 287   // Handle existing monitor.
 288   bind(object_has_monitor);
 289   STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
 290   add(tmp, tmp, -(int)markWord::monitor_value); // monitor
 291 
 292   ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
 293 
 294   Label notRecursive;
 295   cbz(disp_hdr, notRecursive);
 296 
 297   // Recursive lock
 298   sub(disp_hdr, disp_hdr, 1u);
 299   str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
 300   cmp(disp_hdr, disp_hdr); // Sets flags for result
 301   b(cont);
 302 
 303   bind(notRecursive);
 304 
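  // The exit protocol implemented below, as a C-like sketch (illustrative only):
  //
  //   m->owner = nullptr;                       // release store
  //   full_fence();                             // StoreLoad; see the stranding comment below
  //   if (m->entry_list == nullptr) goto done;  // no waiting threads
  //   if (m->succ != nullptr) goto done;        // a successor will take over
  //   thread->unlocked_inflated_monitor = m;    // let the runtime re-check and wake a waiter
  //   goto slow_path;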
 305   // Compute owner address.
 306   lea(owner_addr, Address(tmp, ObjectMonitor::owner_offset()));
 307 
 308   // Set owner to null.
 309   // Release to satisfy the JMM
 310   stlr(zr, owner_addr);
 311   // We need a full fence after clearing owner to avoid stranding.
 312   // StoreLoad achieves this.
 313   membar(StoreLoad);
 314 
 315   // Check if the entry_list is empty.
 316   ldr(rscratch1, Address(tmp, ObjectMonitor::entry_list_offset()));
 317   cmp(rscratch1, zr);
 318   br(Assembler::EQ, cont);     // If so we are done.
 319 
 320   // Check if there is a successor.
 321   ldr(rscratch1, Address(tmp, ObjectMonitor::succ_offset()));
 322   cmp(rscratch1, zr);
 323   br(Assembler::NE, unlocked); // If so we are done.
 324 
 325   // Save the monitor pointer in the current thread, so we can try to
 326   // reacquire the lock in SharedRuntime::monitor_exit_helper().
 327   str(tmp, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));
 328 
 329   cmp(zr, rthread); // Set Flag to NE => slow path
 330   b(cont);
 331 
 332   bind(unlocked);
 333   cmp(zr, zr); // Set Flag to EQ => fast path
 334 
 335   // Intentional fall-through
 336 
 337   bind(cont);
 338   // flag == EQ indicates success
 339   // flag == NE indicates failure
 340   br(Assembler::NE, no_count);
 341 
 342   bind(count);
 343   if (LockingMode == LM_LEGACY) {
 344     dec_held_monitor_count(rscratch1);
 345   }
 346 
 347   bind(no_count);
 348 }
 349 
 350 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register t1,
 351                                               Register t2, Register t3) {
 352   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 353   assert_different_registers(obj, box, t1, t2, t3, rscratch2);
 354 
 355   // Handle inflated monitor.
 356   Label inflated;
  // Finish fast lock successfully. MUST branch to this label with flag == EQ
  Label locked;
  // Finish fast lock unsuccessfully. MUST branch to this label with flag == NE
 360   Label slow_path;
 361 
 362   if (UseObjectMonitorTable) {
 363     // Clear cache in case fast locking succeeds.
 364     str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 365   }
 366 
 367   if (DiagnoseSyncOnValueBasedClasses != 0) {
 368     load_klass(t1, obj);
 369     ldrb(t1, Address(t1, Klass::misc_flags_offset()));
 370     tst(t1, KlassFlags::_misc_is_value_based_class);
 371     br(Assembler::NE, slow_path);
 372   }
 373 
 374   const Register t1_mark = t1;
 375   const Register t3_t = t3;
 376 
 377   { // Lightweight locking
 378 
 379     // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ
 380     Label push;
 381 
 382     const Register t2_top = t2;
 383 
 384     // Check if lock-stack is full.
 385     ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 386     cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
 387     br(Assembler::GT, slow_path);
 388 
 389     // Check if recursive.
 390     subw(t3_t, t2_top, oopSize);
 391     ldr(t3_t, Address(rthread, t3_t));
 392     cmp(obj, t3_t);
 393     br(Assembler::EQ, push);
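    // In pseudocode (sketch): the lock is recursive if the previous lock-stack entry
    // already holds obj, i.e.  if (lock_stack[top - 1] == obj) goto push;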
 394 
 395     // Relaxed normal load to check for monitor. Optimization for monitor case.
 396     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 397     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 398 
 399     // Not inflated
 400     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");
 401 
 402     // Try to lock. Transition lock-bits 0b01 => 0b00
 403     orr(t1_mark, t1_mark, markWord::unlocked_value);
 404     eor(t3_t, t1_mark, markWord::unlocked_value);
 405     cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
 406             /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
 407     br(Assembler::NE, slow_path);
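    // The orr/eor above build the CAS operands (sketch):
    //   expected = mark | unlocked_value;       // lock bits 0b01 (unlocked)
    //   new_mark = expected ^ unlocked_value;   // lock bits 0b00 (fast-locked)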
 408 
 409     bind(push);
 410     // After successful lock, push object on lock-stack.
 411     str(obj, Address(rthread, t2_top));
 412     addw(t2_top, t2_top, oopSize);
 413     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 414     b(locked);
 415   }
 416 
 417   { // Handle inflated monitor.
 418     bind(inflated);
 419 
 420     const Register t1_monitor = t1;
 421 
 422     if (!UseObjectMonitorTable) {
 423       assert(t1_monitor == t1_mark, "should be the same here");
 424     } else {
 425       Label monitor_found;
 426 
 427       // Load cache address
 428       lea(t3_t, Address(rthread, JavaThread::om_cache_oops_offset()));
 429 
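      // The lookup below, as a C-like sketch (illustrative; "entry" is a hypothetical name
      // for an oop/monitor pair, and the cache ends with a null sentinel):
      //
      //   for (entry* p = thread->om_cache; ; p++) {
      //     if (p->oop == obj)     { monitor = p->monitor; goto monitor_found; }
      //     if (p->oop == nullptr) goto slow_path;   // cache miss
      //   }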
 430       const int num_unrolled = 2;
 431       for (int i = 0; i < num_unrolled; i++) {
 432         ldr(t1, Address(t3_t));
 433         cmp(obj, t1);
 434         br(Assembler::EQ, monitor_found);
 435         increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
 436       }
 437 
 438       Label loop;
 439 
 440       // Search for obj in cache.
 441       bind(loop);
 442 
 443       // Check for match.
 444       ldr(t1, Address(t3_t));
 445       cmp(obj, t1);
 446       br(Assembler::EQ, monitor_found);
 447 
 448       // Search until null encountered, guaranteed _null_sentinel at end.
 449       increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
 450       cbnz(t1, loop);
 451       // Cache Miss, NE set from cmp above, cbnz does not set flags
 452       b(slow_path);
 453 
 454       bind(monitor_found);
 455       ldr(t1_monitor, Address(t3_t, OMCache::oop_to_monitor_difference()));
 456     }
 457 
 458     const Register t2_owner_addr = t2;
 459     const Register t3_owner = t3;
 460     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 461     const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
 462     const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);
 463 
 464     Label monitor_locked;
 465 
 466     // Compute owner address.
 467     lea(t2_owner_addr, owner_address);
 468 
 469     // Try to CAS owner (no owner => current thread's _monitor_owner_id).
 470     ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
 471     cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, /*acquire*/ true,
 472             /*release*/ false, /*weak*/ false, t3_owner);
 473     br(Assembler::EQ, monitor_locked);
 474 
 475     // Check if recursive.
 476     cmp(t3_owner, rscratch2);
 477     br(Assembler::NE, slow_path);
 478 
 479     // Recursive.
 480     increment(recursions_address, 1);
 481 
 482     bind(monitor_locked);
 483     if (UseObjectMonitorTable) {
 484       str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 485     }
 486   }
 487 
 488   bind(locked);
 489 
 490 #ifdef ASSERT
 491   // Check that locked label is reached with Flags == EQ.
 492   Label flag_correct;
 493   br(Assembler::EQ, flag_correct);
 494   stop("Fast Lock Flag != EQ");
 495 #endif
 496 
 497   bind(slow_path);
 498 #ifdef ASSERT
 499   // Check that slow_path label is reached with Flags == NE.
 500   br(Assembler::NE, flag_correct);
 501   stop("Fast Lock Flag != NE");
 502   bind(flag_correct);
 503 #endif
 504   // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
 505 }
 506 
 507 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register box, Register t1,
 508                                                 Register t2, Register t3) {
 509   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 510   assert_different_registers(obj, box, t1, t2, t3);
 511 
 512   // Handle inflated monitor.
 513   Label inflated, inflated_load_mark;
  // Finish fast unlock successfully. MUST branch to this label with flag == EQ
  Label unlocked;
  // Finish fast unlock unsuccessfully. MUST branch to this label with flag == NE
 517   Label slow_path;
 518 
 519   const Register t1_mark = t1;
 520   const Register t2_top = t2;
 521   const Register t3_t = t3;
 522 
 523   { // Lightweight unlock
 524 
 525     Label push_and_slow_path;
 526 
 527     // Check if obj is top of lock-stack.
 528     ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 529     subw(t2_top, t2_top, oopSize);
 530     ldr(t3_t, Address(rthread, t2_top));
 531     cmp(obj, t3_t);
 532     // Top of lock stack was not obj. Must be monitor.
 533     br(Assembler::NE, inflated_load_mark);
 534 
 535     // Pop lock-stack.
 536     DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
 537     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 538 
 539     // Check if recursive.
 540     subw(t3_t, t2_top, oopSize);
 541     ldr(t3_t, Address(rthread, t3_t));
 542     cmp(obj, t3_t);
 543     br(Assembler::EQ, unlocked);
 544 
 545     // Not recursive.
 546     // Load Mark.
 547     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 548 
 549     // Check header for monitor (0b10).
    // Because we got here by popping (meaning we pushed it in the locked path)
    // there will be no monitor in the box. So we need to push the obj back
    // so that the runtime can fix any potential anonymous owner.
 553     tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);
 554 
 555     // Try to unlock. Transition lock bits 0b00 => 0b01
 556     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
 557     orr(t3_t, t1_mark, markWord::unlocked_value);
 558     cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
 559             /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
 560     br(Assembler::EQ, unlocked);
 561 
 562     bind(push_and_slow_path);
 563     // Compare and exchange failed.
 564     // Restore lock-stack and handle the unlock in runtime.
 565     DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
 566     addw(t2_top, t2_top, oopSize);
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 568     b(slow_path);
 569   }
 570 
 571 
 572   { // Handle inflated monitor.
 573     bind(inflated_load_mark);
 574     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 575 #ifdef ASSERT
 576     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 577     stop("Fast Unlock not monitor");
 578 #endif
 579 
 580     bind(inflated);
 581 
 582 #ifdef ASSERT
 583     Label check_done;
 584     subw(t2_top, t2_top, oopSize);
 585     cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
 586     br(Assembler::LT, check_done);
 587     ldr(t3_t, Address(rthread, t2_top));
 588     cmp(obj, t3_t);
 589     br(Assembler::NE, inflated);
 590     stop("Fast Unlock lock on stack");
 591     bind(check_done);
 592 #endif
 593 
 594     const Register t1_monitor = t1;
 595 
 596     if (!UseObjectMonitorTable) {
 597       assert(t1_monitor == t1_mark, "should be the same here");
 598 
 599       // Untag the monitor.
 600       add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
 601     } else {
 602       ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 603       // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
 604       cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
 605       br(Assembler::LO, slow_path);
 606     }
 607 
 608     const Register t2_recursions = t2;
 609     Label not_recursive;
 610 
 611     // Check if recursive.
 612     ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
 613     cbz(t2_recursions, not_recursive);
 614 
 615     // Recursive unlock.
 616     sub(t2_recursions, t2_recursions, 1u);
 617     str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
 618     // Set flag == EQ
 619     cmp(t2_recursions, t2_recursions);
 620     b(unlocked);
 621 
 622     bind(not_recursive);
 623 
 624     const Register t2_owner_addr = t2;
 625 
 626     // Compute owner address.
 627     lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));
 628 
 629     // Set owner to null.
 630     // Release to satisfy the JMM
 631     stlr(zr, t2_owner_addr);
 632     // We need a full fence after clearing owner to avoid stranding.
 633     // StoreLoad achieves this.
 634     membar(StoreLoad);
 635 
 636     // Check if the entry_list is empty.
 637     ldr(rscratch1, Address(t1_monitor, ObjectMonitor::entry_list_offset()));
 638     cmp(rscratch1, zr);
 639     br(Assembler::EQ, unlocked);  // If so we are done.
 640 
 641     // Check if there is a successor.
 642     ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
 643     cmp(rscratch1, zr);
 644     br(Assembler::NE, unlocked);  // If so we are done.
 645 
 646     // Save the monitor pointer in the current thread, so we can try to
 647     // reacquire the lock in SharedRuntime::monitor_exit_helper().
 648     str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));
 649 
 650     cmp(zr, rthread); // Set Flag to NE => slow path
 651     b(slow_path);
 652   }
 653 
 654   bind(unlocked);
 655   cmp(zr, zr); // Set Flags to EQ => fast path
 656 
 657 #ifdef ASSERT
 658   // Check that unlocked label is reached with Flags == EQ.
 659   Label flag_correct;
 660   br(Assembler::EQ, flag_correct);
 661   stop("Fast Unlock Flag != EQ");
 662 #endif
 663 
 664   bind(slow_path);
 665 #ifdef ASSERT
 666   // Check that slow_path label is reached with Flags == NE.
 667   br(Assembler::NE, flag_correct);
 668   stop("Fast Unlock Flag != NE");
 669   bind(flag_correct);
 670 #endif
 671   // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
 672 }
 673 
 674 // Search for str1 in str2 and return index or -1
 675 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
 676 void C2_MacroAssembler::string_indexof(Register str2, Register str1,
 677                                        Register cnt2, Register cnt1,
 678                                        Register tmp1, Register tmp2,
 679                                        Register tmp3, Register tmp4,
 680                                        Register tmp5, Register tmp6,
 681                                        int icnt1, Register result, int ae) {
 682   // NOTE: tmp5, tmp6 can be zr depending on specific method version
 683   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
 684 
 685   Register ch1 = rscratch1;
 686   Register ch2 = rscratch2;
 687   Register cnt1tmp = tmp1;
 688   Register cnt2tmp = tmp2;
 689   Register cnt1_neg = cnt1;
 690   Register cnt2_neg = cnt2;
 691   Register result_tmp = tmp4;
 692 
 693   bool isL = ae == StrIntrinsicNode::LL;
 694 
 695   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
 696   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
 697   int str1_chr_shift = str1_isL ? 0:1;
 698   int str2_chr_shift = str2_isL ? 0:1;
 699   int str1_chr_size = str1_isL ? 1:2;
 700   int str2_chr_size = str2_isL ? 1:2;
 701   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
 702                                       (chr_insn)&MacroAssembler::ldrh;
 703   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
 704                                       (chr_insn)&MacroAssembler::ldrh;
 705   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
 706   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
 707 
 708   // Note, inline_string_indexOf() generates checks:
 709   // if (substr.count > string.count) return -1;
 710   // if (substr.count == 0) return 0;
 711 
 712   // We have two strings, a source string in str2, cnt2 and a pattern string
 713   // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
 714 
 715   // For larger pattern and source we use a simplified Boyer Moore algorithm.
 716   // With a small pattern and source we use linear scan.
 717 
 718   if (icnt1 == -1) {
 719     sub(result_tmp, cnt2, cnt1);
 720     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
 721     br(LT, LINEARSEARCH);
 722     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
 723     subs(zr, cnt1, 256);
 724     lsr(tmp1, cnt2, 2);
 725     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
 726     br(GE, LINEARSTUB);
 727   }
 728 
// The Boyer-Moore algorithm is based on the description here:-
 730 //
 731 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
 732 //
// This describes an algorithm with 2 shift rules: the 'Bad Character' rule
 734 // and the 'Good Suffix' rule.
 735 //
 736 // These rules are essentially heuristics for how far we can shift the
 737 // pattern along the search string.
 738 //
 739 // The implementation here uses the 'Bad Character' rule only because of the
 740 // complexity of initialisation for the 'Good Suffix' rule.
 741 //
 742 // This is also known as the Boyer-Moore-Horspool algorithm:-
 743 //
 744 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
 745 //
// This particular implementation has a few Java-specific optimizations.
 747 //
 748 // #define ASIZE 256
 749 //
 750 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
 751 //       int i, j;
 752 //       unsigned c;
 753 //       unsigned char bc[ASIZE];
 754 //
 755 //       /* Preprocessing */
 756 //       for (i = 0; i < ASIZE; ++i)
 757 //          bc[i] = m;
 758 //       for (i = 0; i < m - 1; ) {
 759 //          c = x[i];
 760 //          ++i;
 761 //          // c < 256 for Latin1 string, so, no need for branch
 762 //          #ifdef PATTERN_STRING_IS_LATIN1
 763 //          bc[c] = m - i;
 764 //          #else
 765 //          if (c < ASIZE) bc[c] = m - i;
 766 //          #endif
 767 //       }
 768 //
 769 //       /* Searching */
 770 //       j = 0;
 771 //       while (j <= n - m) {
 772 //          c = y[i+j];
 773 //          if (x[m-1] == c)
 774 //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
 775 //          if (i < 0) return j;
 776 //          // c < 256 for Latin1 string, so, no need for branch
 777 //          #ifdef SOURCE_STRING_IS_LATIN1
 778 //          // LL case: (c< 256) always true. Remove branch
 779 //          j += bc[y[j+m-1]];
 780 //          #endif
 781 //          #ifndef PATTERN_STRING_IS_UTF
 782 //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
 783 //          if (c < ASIZE)
 784 //            j += bc[y[j+m-1]];
 785 //          else
 786 //            j += 1
 787 //          #endif
 788 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
 789 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
 790 //          if (c < ASIZE)
 791 //            j += bc[y[j+m-1]];
 792 //          else
 793 //            j += m
 794 //          #endif
 795 //       }
 796 //    }
 797 
 798   if (icnt1 == -1) {
 799     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
 800         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
 801     Register cnt1end = tmp2;
 802     Register str2end = cnt2;
 803     Register skipch = tmp2;
 804 
    // str1 length is >= 8, so we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed (8 LL or 4 UU symbols) and half a register for
    // the UL case. We'll re-read the last character in the inner pre-loop code to have a
    // single outer pre-loop load.
 809     const int firstStep = isL ? 7 : 3;
 810 
 811     const int ASIZE = 256;
 812     const int STORED_BYTES = 32; // amount of bytes stored per instruction
 813     sub(sp, sp, ASIZE);
 814     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
 815     mov(ch1, sp);
 816     BIND(BM_INIT_LOOP);
 817       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
 818       subs(tmp5, tmp5, 1);
 819       br(GT, BM_INIT_LOOP);
 820 
 821       sub(cnt1tmp, cnt1, 1);
 822       mov(tmp5, str2);
 823       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
 824       sub(ch2, cnt1, 1);
 825       mov(tmp3, str1);
 826     BIND(BCLOOP);
 827       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
 828       if (!str1_isL) {
 829         subs(zr, ch1, ASIZE);
 830         br(HS, BCSKIP);
 831       }
 832       strb(ch2, Address(sp, ch1));
 833     BIND(BCSKIP);
 834       subs(ch2, ch2, 1);
 835       br(GT, BCLOOP);
 836 
 837       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
 838       if (str1_isL == str2_isL) {
 839         // load last 8 bytes (8LL/4UU symbols)
 840         ldr(tmp6, Address(tmp6, -wordSize));
 841       } else {
 842         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
 843         // convert Latin1 to UTF. We'll have to wait until load completed, but
 844         // it's still faster than per-character loads+checks
 845         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
 846         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
 847         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
 848         andr(tmp6, tmp6, 0xFF); // str1[N-4]
 849         orr(ch2, ch1, ch2, LSL, 16);
 850         orr(tmp6, tmp6, tmp3, LSL, 48);
 851         orr(tmp6, tmp6, ch2, LSL, 16);
 852       }
 853     BIND(BMLOOPSTR2);
 854       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 855       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
 856       if (str1_isL == str2_isL) {
        // re-init tmp3. It's free because it's executed in parallel with the
        // load above. The alternative is to initialize it before the loop, but that would
        // affect performance on in-order systems with 2 or more ld/st pipelines.
 860         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
 861       }
 862       if (!isL) { // UU/UL case
 863         lsl(ch2, cnt1tmp, 1); // offset in bytes
 864       }
 865       cmp(tmp3, skipch);
 866       br(NE, BMSKIP);
 867       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
 868       mov(ch1, tmp6);
 869       if (isL) {
 870         b(BMLOOPSTR1_AFTER_LOAD);
 871       } else {
 872         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
 873         b(BMLOOPSTR1_CMP);
 874       }
 875     BIND(BMLOOPSTR1);
 876       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
 877       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 878     BIND(BMLOOPSTR1_AFTER_LOAD);
 879       subs(cnt1tmp, cnt1tmp, 1);
 880       br(LT, BMLOOPSTR1_LASTCMP);
 881     BIND(BMLOOPSTR1_CMP);
 882       cmp(ch1, ch2);
 883       br(EQ, BMLOOPSTR1);
 884     BIND(BMSKIP);
 885       if (!isL) {
        // if we've met a UTF symbol while searching for a Latin1 pattern, then we can
        // skip cnt1 symbols
 888         if (str1_isL != str2_isL) {
 889           mov(result_tmp, cnt1);
 890         } else {
 891           mov(result_tmp, 1);
 892         }
 893         subs(zr, skipch, ASIZE);
 894         br(HS, BMADV);
 895       }
 896       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
 897     BIND(BMADV);
 898       sub(cnt1tmp, cnt1, 1);
 899       add(str2, str2, result_tmp, LSL, str2_chr_shift);
 900       cmp(str2, str2end);
 901       br(LE, BMLOOPSTR2);
 902       add(sp, sp, ASIZE);
 903       b(NOMATCH);
 904     BIND(BMLOOPSTR1_LASTCMP);
 905       cmp(ch1, ch2);
 906       br(NE, BMSKIP);
 907     BIND(BMMATCH);
 908       sub(result, str2, tmp5);
 909       if (!str2_isL) lsr(result, result, 1);
 910       add(sp, sp, ASIZE);
 911       b(DONE);
 912 
 913     BIND(LINEARSTUB);
 914     cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
 915     br(LT, LINEAR_MEDIUM);
 916     mov(result, zr);
 917     RuntimeAddress stub = nullptr;
 918     if (isL) {
 919       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
 920       assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
 921     } else if (str1_isL) {
 922       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
 924     } else {
 925       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
 926       assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
 927     }
 928     address call = trampoline_call(stub);
 929     if (call == nullptr) {
 930       DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
 931       ciEnv::current()->record_failure("CodeCache is full");
 932       return;
 933     }
 934     b(DONE);
 935   }
 936 
 937   BIND(LINEARSEARCH);
 938   {
 939     Label DO1, DO2, DO3;
 940 
 941     Register str2tmp = tmp2;
 942     Register first = tmp3;
 943 
 944     if (icnt1 == -1)
 945     {
 946         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
 947 
 948         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
 949         br(LT, DOSHORT);
 950       BIND(LINEAR_MEDIUM);
 951         (this->*str1_load_1chr)(first, Address(str1));
 952         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
 953         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
 954         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 955         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 956 
 957       BIND(FIRST_LOOP);
 958         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 959         cmp(first, ch2);
 960         br(EQ, STR1_LOOP);
 961       BIND(STR2_NEXT);
 962         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 963         br(LE, FIRST_LOOP);
 964         b(NOMATCH);
 965 
 966       BIND(STR1_LOOP);
 967         adds(cnt1tmp, cnt1_neg, str1_chr_size);
 968         add(cnt2tmp, cnt2_neg, str2_chr_size);
 969         br(GE, MATCH);
 970 
 971       BIND(STR1_NEXT);
 972         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
 973         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 974         cmp(ch1, ch2);
 975         br(NE, STR2_NEXT);
 976         adds(cnt1tmp, cnt1tmp, str1_chr_size);
 977         add(cnt2tmp, cnt2tmp, str2_chr_size);
 978         br(LT, STR1_NEXT);
 979         b(MATCH);
 980 
 981       BIND(DOSHORT);
 982       if (str1_isL == str2_isL) {
 983         cmp(cnt1, (u1)2);
 984         br(LT, DO1);
 985         br(GT, DO3);
 986       }
 987     }
 988 
 989     if (icnt1 == 4) {
 990       Label CH1_LOOP;
 991 
 992         (this->*load_4chr)(ch1, str1);
 993         sub(result_tmp, cnt2, 4);
 994         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 995         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 996 
 997       BIND(CH1_LOOP);
 998         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
 999         cmp(ch1, ch2);
1000         br(EQ, MATCH);
1001         adds(cnt2_neg, cnt2_neg, str2_chr_size);
1002         br(LE, CH1_LOOP);
1003         b(NOMATCH);
1004       }
1005 
1006     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
1007       Label CH1_LOOP;
1008 
1009       BIND(DO2);
1010         (this->*load_2chr)(ch1, str1);
1011         if (icnt1 == 2) {
1012           sub(result_tmp, cnt2, 2);
1013         }
1014         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
1015         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
1016       BIND(CH1_LOOP);
1017         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
1018         cmp(ch1, ch2);
1019         br(EQ, MATCH);
1020         adds(cnt2_neg, cnt2_neg, str2_chr_size);
1021         br(LE, CH1_LOOP);
1022         b(NOMATCH);
1023     }
1024 
1025     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
1026       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
1027 
1028       BIND(DO3);
1029         (this->*load_2chr)(first, str1);
1030         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
1031         if (icnt1 == 3) {
1032           sub(result_tmp, cnt2, 3);
1033         }
1034         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
1035         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
1036       BIND(FIRST_LOOP);
1037         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
1038         cmpw(first, ch2);
1039         br(EQ, STR1_LOOP);
1040       BIND(STR2_NEXT);
1041         adds(cnt2_neg, cnt2_neg, str2_chr_size);
1042         br(LE, FIRST_LOOP);
1043         b(NOMATCH);
1044 
1045       BIND(STR1_LOOP);
1046         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
1047         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
1048         cmp(ch1, ch2);
1049         br(NE, STR2_NEXT);
1050         b(MATCH);
1051     }
1052 
1053     if (icnt1 == -1 || icnt1 == 1) {
1054       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
1055 
1056       BIND(DO1);
1057         (this->*str1_load_1chr)(ch1, str1);
1058         cmp(cnt2, (u1)8);
1059         br(LT, DO1_SHORT);
1060 
1061         sub(result_tmp, cnt2, 8/str2_chr_size);
1062         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
1063         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
1064         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
1065 
1066         if (str2_isL) {
1067           orr(ch1, ch1, ch1, LSL, 8);
1068         }
1069         orr(ch1, ch1, ch1, LSL, 16);
1070         orr(ch1, ch1, ch1, LSL, 32);
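        // The loop below is a branchless SWAR scan: xor-ing a word of characters with the
        // broadcast char turns a matching lane into zero, and the classic zero-lane test
        // (sketch, byte constants shown; halfword constants are used for UTF-16)
        //   hit = (x - 0x0101..01) & ~(x | 0x7f7f..7f)
        // is non-zero iff some lane is zero; rev + clz then locate the first match.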
1071       BIND(CH1_LOOP);
1072         ldr(ch2, Address(str2, cnt2_neg));
1073         eor(ch2, ch1, ch2);
1074         sub(tmp1, ch2, tmp3);
1075         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
1076         bics(tmp1, tmp1, tmp2);
1077         br(NE, HAS_ZERO);
1078         adds(cnt2_neg, cnt2_neg, 8);
1079         br(LT, CH1_LOOP);
1080 
1081         cmp(cnt2_neg, (u1)8);
1082         mov(cnt2_neg, 0);
1083         br(LT, CH1_LOOP);
1084         b(NOMATCH);
1085 
1086       BIND(HAS_ZERO);
1087         rev(tmp1, tmp1);
1088         clz(tmp1, tmp1);
1089         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
1090         b(MATCH);
1091 
1092       BIND(DO1_SHORT);
1093         mov(result_tmp, cnt2);
1094         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
1095         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
1096       BIND(DO1_LOOP);
1097         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
1098         cmpw(ch1, ch2);
1099         br(EQ, MATCH);
1100         adds(cnt2_neg, cnt2_neg, str2_chr_size);
1101         br(LT, DO1_LOOP);
1102     }
1103   }
1104   BIND(NOMATCH);
1105     mov(result, -1);
1106     b(DONE);
1107   BIND(MATCH);
1108     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
1109   BIND(DONE);
1110 }
1111 
1112 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
1113 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
1114 
1115 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
1116                                             Register ch, Register result,
1117                                             Register tmp1, Register tmp2, Register tmp3)
1118 {
1119   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
1120   Register cnt1_neg = cnt1;
1121   Register ch1 = rscratch1;
1122   Register result_tmp = rscratch2;
1123 
1124   cbz(cnt1, NOMATCH);
1125 
1126   cmp(cnt1, (u1)4);
1127   br(LT, DO1_SHORT);
1128 
1129   orr(ch, ch, ch, LSL, 16);
1130   orr(ch, ch, ch, LSL, 32);
1131 
1132   sub(cnt1, cnt1, 4);
1133   mov(result_tmp, cnt1);
1134   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
1135   sub(cnt1_neg, zr, cnt1, LSL, 1);
1136 
1137   mov(tmp3, 0x0001000100010001);
1138 
1139   BIND(CH1_LOOP);
1140     ldr(ch1, Address(str1, cnt1_neg));
1141     eor(ch1, ch, ch1);
1142     sub(tmp1, ch1, tmp3);
1143     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
1144     bics(tmp1, tmp1, tmp2);
1145     br(NE, HAS_ZERO);
1146     adds(cnt1_neg, cnt1_neg, 8);
1147     br(LT, CH1_LOOP);
1148 
1149     cmp(cnt1_neg, (u1)8);
1150     mov(cnt1_neg, 0);
1151     br(LT, CH1_LOOP);
1152     b(NOMATCH);
1153 
1154   BIND(HAS_ZERO);
1155     rev(tmp1, tmp1);
1156     clz(tmp1, tmp1);
1157     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
1158     b(MATCH);
1159 
1160   BIND(DO1_SHORT);
1161     mov(result_tmp, cnt1);
1162     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
1163     sub(cnt1_neg, zr, cnt1, LSL, 1);
1164   BIND(DO1_LOOP);
1165     ldrh(ch1, Address(str1, cnt1_neg));
1166     cmpw(ch, ch1);
1167     br(EQ, MATCH);
1168     adds(cnt1_neg, cnt1_neg, 2);
1169     br(LT, DO1_LOOP);
1170   BIND(NOMATCH);
1171     mov(result, -1);
1172     b(DONE);
1173   BIND(MATCH);
1174     add(result, result_tmp, cnt1_neg, ASR, 1);
1175   BIND(DONE);
1176 }
1177 
1178 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
1179                                                 Register ch, Register result,
1180                                                 FloatRegister ztmp1,
1181                                                 FloatRegister ztmp2,
1182                                                 PRegister tmp_pg,
1183                                                 PRegister tmp_pdn, bool isL)
1184 {
1185   // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
1186   assert(tmp_pg->is_governing(),
1187          "this register has to be a governing predicate register");
1188 
1189   Label LOOP, MATCH, DONE, NOMATCH;
1190   Register vec_len = rscratch1;
1191   Register idx = rscratch2;
1192 
1193   SIMD_RegVariant T = (isL == true) ? B : H;
1194 
1195   cbz(cnt1, NOMATCH);
1196 
1197   // Assign the particular char throughout the vector.
1198   sve_dup(ztmp2, T, ch);
1199   if (isL) {
1200     sve_cntb(vec_len);
1201   } else {
1202     sve_cnth(vec_len);
1203   }
1204   mov(idx, 0);
1205 
1206   // Generate a predicate to control the reading of input string.
1207   sve_whilelt(tmp_pg, T, idx, cnt1);
1208 
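  // A C-like sketch of the predicated loop below (illustrative; "lanes" stands for the
  // number of elements per vector, i.e. vec_len):
  //
  //   for (idx = 0; whilelt(idx, cnt1) has an active lane; idx += lanes) {
  //     pg = whilelt(idx, cnt1);            // mask off lanes past the end of the string
  //     v  = load(str1 + idx, pg);          // inactive lanes are not read
  //     if (any(v == ch, pg)) goto MATCH;   // the first active match wins
  //   }
  //   result = -1;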
1209   BIND(LOOP);
1210     // Read a vector of 8- or 16-bit data depending on the string type. Note
1211     // that inactive elements indicated by the predicate register won't cause
1212     // a data read from memory to the destination vector.
1213     if (isL) {
1214       sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
1215     } else {
1216       sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
1217     }
1218     add(idx, idx, vec_len);
1219 
1220     // Perform the comparison. An element of the destination predicate is set
1221     // to active if the particular char is matched.
1222     sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);
1223 
1224     // Branch if the particular char is found.
1225     br(NE, MATCH);
1226 
1227     sve_whilelt(tmp_pg, T, idx, cnt1);
1228 
    // Loop back if the particular char is not found.
1230     br(MI, LOOP);
1231 
1232   BIND(NOMATCH);
1233     mov(result, -1);
1234     b(DONE);
1235 
1236   BIND(MATCH);
1237     // Undo the index increment.
1238     sub(idx, idx, vec_len);
1239 
1240     // Crop the vector to find its location.
1241     sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
1242     add(result, idx, -1);
1243     sve_incp(result, T, tmp_pdn);
1244   BIND(DONE);
1245 }
1246 
1247 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
1248                                             Register ch, Register result,
1249                                             Register tmp1, Register tmp2, Register tmp3)
1250 {
1251   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
1252   Register cnt1_neg = cnt1;
1253   Register ch1 = rscratch1;
1254   Register result_tmp = rscratch2;
1255 
1256   cbz(cnt1, NOMATCH);
1257 
1258   cmp(cnt1, (u1)8);
1259   br(LT, DO1_SHORT);
1260 
1261   orr(ch, ch, ch, LSL, 8);
1262   orr(ch, ch, ch, LSL, 16);
1263   orr(ch, ch, ch, LSL, 32);
1264 
1265   sub(cnt1, cnt1, 8);
1266   mov(result_tmp, cnt1);
1267   lea(str1, Address(str1, cnt1));
1268   sub(cnt1_neg, zr, cnt1);
1269 
1270   mov(tmp3, 0x0101010101010101);
1271 
1272   BIND(CH1_LOOP);
1273     ldr(ch1, Address(str1, cnt1_neg));
1274     eor(ch1, ch, ch1);
1275     sub(tmp1, ch1, tmp3);
1276     orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
1277     bics(tmp1, tmp1, tmp2);
1278     br(NE, HAS_ZERO);
1279     adds(cnt1_neg, cnt1_neg, 8);
1280     br(LT, CH1_LOOP);
1281 
1282     cmp(cnt1_neg, (u1)8);
1283     mov(cnt1_neg, 0);
1284     br(LT, CH1_LOOP);
1285     b(NOMATCH);
1286 
1287   BIND(HAS_ZERO);
1288     rev(tmp1, tmp1);
1289     clz(tmp1, tmp1);
1290     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
1291     b(MATCH);
1292 
1293   BIND(DO1_SHORT);
1294     mov(result_tmp, cnt1);
1295     lea(str1, Address(str1, cnt1));
1296     sub(cnt1_neg, zr, cnt1);
1297   BIND(DO1_LOOP);
1298     ldrb(ch1, Address(str1, cnt1_neg));
1299     cmp(ch, ch1);
1300     br(EQ, MATCH);
1301     adds(cnt1_neg, cnt1_neg, 1);
1302     br(LT, DO1_LOOP);
1303   BIND(NOMATCH);
1304     mov(result, -1);
1305     b(DONE);
1306   BIND(MATCH);
1307     add(result, result_tmp, cnt1_neg);
1308   BIND(DONE);
1309 }
1310 
1311 // Compare strings.
1312 void C2_MacroAssembler::string_compare(Register str1, Register str2,
1313     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
1314     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
1315     PRegister pgtmp1, PRegister pgtmp2, int ae) {
1316   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
1317       DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
1318       SHORT_LOOP_START, TAIL_CHECK;
1319 
1320   bool isLL = ae == StrIntrinsicNode::LL;
1321   bool isLU = ae == StrIntrinsicNode::LU;
1322   bool isUL = ae == StrIntrinsicNode::UL;
1323 
1324   // The stub threshold for LL strings is: 72 (64 + 8) chars
1325   // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
1326   // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
1327   const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);
1328 
1329   bool str1_isL = isLL || isLU;
1330   bool str2_isL = isLL || isUL;
1331 
1332   int str1_chr_shift = str1_isL ? 0 : 1;
1333   int str2_chr_shift = str2_isL ? 0 : 1;
1334   int str1_chr_size = str1_isL ? 1 : 2;
1335   int str2_chr_size = str2_isL ? 1 : 2;
1336   int minCharsInWord = isLL ? wordSize : wordSize/2;
1337 
1338   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
1339   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
1340                                       (chr_insn)&MacroAssembler::ldrh;
1341   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
1342                                       (chr_insn)&MacroAssembler::ldrh;
1343   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
1344                             (uxt_insn)&MacroAssembler::uxthw;
1345 
1346   BLOCK_COMMENT("string_compare {");
1347 
  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; however, the result is always in characters.
1350   if (!str1_isL) asrw(cnt1, cnt1, 1);
1351   if (!str2_isL) asrw(cnt2, cnt2, 1);
1352 
1353   // Compute the minimum of the string lengths and save the difference.
1354   subsw(result, cnt1, cnt2);
1355   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
1356 
1357   // A very short string
1358   cmpw(cnt2, minCharsInWord);
1359   br(Assembler::LE, SHORT_STRING);
1360 
1361   // Compare longwords
1362   // load first parts of strings and finish initialization while loading
1363   {
1364     if (str1_isL == str2_isL) { // LL or UU
1365       ldr(tmp1, Address(str1));
1366       cmp(str1, str2);
1367       br(Assembler::EQ, DONE);
1368       ldr(tmp2, Address(str2));
1369       cmp(cnt2, stub_threshold);
1370       br(GE, STUB);
1371       subsw(cnt2, cnt2, minCharsInWord);
1372       br(EQ, TAIL_CHECK);
1373       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1374       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1375       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1376     } else if (isLU) {
1377       ldrs(vtmp, Address(str1));
1378       ldr(tmp2, Address(str2));
1379       cmp(cnt2, stub_threshold);
1380       br(GE, STUB);
1381       subw(cnt2, cnt2, 4);
1382       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1383       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1384       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1385       zip1(vtmp, T8B, vtmp, vtmpZ);
1386       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1387       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1388       add(cnt1, cnt1, 4);
1389       fmovd(tmp1, vtmp);
1390     } else { // UL case
1391       ldr(tmp1, Address(str1));
1392       ldrs(vtmp, Address(str2));
1393       cmp(cnt2, stub_threshold);
1394       br(GE, STUB);
1395       subw(cnt2, cnt2, 4);
1396       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1397       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1398       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1399       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1400       zip1(vtmp, T8B, vtmp, vtmpZ);
1401       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1402       add(cnt1, cnt1, 8);
1403       fmovd(tmp2, vtmp);
1404     }
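    // In the mixed-encoding cases above, zip1 with the zero vector vtmpZ widens Latin1 to
    // UTF-16 in a single instruction (little-endian sketch):
    //
    //   vtmp = [b0 b1 b2 b3 ....]                // 4 Latin1 bytes loaded by ldrs
    //   zip1 => [b0 00 b1 00 b2 00 b3 00]        // 4 UTF-16 code units, moved to tmp via fmovd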
1405     adds(cnt2, cnt2, isUL ? 4 : 8);
1406     br(GE, TAIL);
1407     eor(rscratch2, tmp1, tmp2);
1408     cbnz(rscratch2, DIFF);
1409     // main loop
1410     bind(NEXT_WORD);
1411     if (str1_isL == str2_isL) {
1412       ldr(tmp1, Address(str1, cnt2));
1413       ldr(tmp2, Address(str2, cnt2));
1414       adds(cnt2, cnt2, 8);
1415     } else if (isLU) {
1416       ldrs(vtmp, Address(str1, cnt1));
1417       ldr(tmp2, Address(str2, cnt2));
1418       add(cnt1, cnt1, 4);
1419       zip1(vtmp, T8B, vtmp, vtmpZ);
1420       fmovd(tmp1, vtmp);
1421       adds(cnt2, cnt2, 8);
1422     } else { // UL
1423       ldrs(vtmp, Address(str2, cnt2));
1424       ldr(tmp1, Address(str1, cnt1));
1425       zip1(vtmp, T8B, vtmp, vtmpZ);
1426       add(cnt1, cnt1, 8);
1427       fmovd(tmp2, vtmp);
1428       adds(cnt2, cnt2, 4);
1429     }
1430     br(GE, TAIL);
1431 
1432     eor(rscratch2, tmp1, tmp2);
1433     cbz(rscratch2, NEXT_WORD);
1434     b(DIFF);
1435     bind(TAIL);
1436     eor(rscratch2, tmp1, tmp2);
1437     cbnz(rscratch2, DIFF);
1438     // Last longword.  In the case where length == 4 we compare the
1439     // same longword twice, but that's still faster than another
1440     // conditional branch.
1441     if (str1_isL == str2_isL) {
1442       ldr(tmp1, Address(str1));
1443       ldr(tmp2, Address(str2));
1444     } else if (isLU) {
1445       ldrs(vtmp, Address(str1));
1446       ldr(tmp2, Address(str2));
1447       zip1(vtmp, T8B, vtmp, vtmpZ);
1448       fmovd(tmp1, vtmp);
1449     } else { // UL
1450       ldrs(vtmp, Address(str2));
1451       ldr(tmp1, Address(str1));
1452       zip1(vtmp, T8B, vtmp, vtmpZ);
1453       fmovd(tmp2, vtmp);
1454     }
1455     bind(TAIL_CHECK);
1456     eor(rscratch2, tmp1, tmp2);
1457     cbz(rscratch2, DONE);
1458 
1459     // Find the first different characters in the longwords and
1460     // compute their difference.
1461     bind(DIFF);
1462     rev(rscratch2, rscratch2);
1463     clz(rscratch2, rscratch2);
1464     andr(rscratch2, rscratch2, isLL ? -8 : -16);
1465     lsrv(tmp1, tmp1, rscratch2);
1466     (this->*ext_chr)(tmp1, tmp1);
1467     lsrv(tmp2, tmp2, rscratch2);
1468     (this->*ext_chr)(tmp2, tmp2);
1469     subw(result, tmp1, tmp2);
1470     b(DONE);
1471   }
1472 
1473   bind(STUB);
1474     RuntimeAddress stub = nullptr;
1475     switch(ae) {
1476       case StrIntrinsicNode::LL:
1477         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
1478         break;
1479       case StrIntrinsicNode::UU:
1480         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
1481         break;
1482       case StrIntrinsicNode::LU:
1483         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
1484         break;
1485       case StrIntrinsicNode::UL:
1486         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
1487         break;
1488       default:
1489         ShouldNotReachHere();
1490     }
1491     assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
1492     address call = trampoline_call(stub);
1493     if (call == nullptr) {
1494       DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
1495       ciEnv::current()->record_failure("CodeCache is full");
1496       return;
1497     }
1498     b(DONE);
1499 
1500   bind(SHORT_STRING);
1501   // Is the minimum length zero?
1502   cbz(cnt2, DONE);
1503   // Arrange the code so that most branching happens while characters are loading,
1504   // and the next characters are loaded while the previous ones are being compared.
1505   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1506   subs(cnt2, cnt2, 1);
1507   br(EQ, SHORT_LAST_INIT);
1508   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1509   b(SHORT_LOOP_START);
1510   bind(SHORT_LOOP);
1511   subs(cnt2, cnt2, 1);
1512   br(EQ, SHORT_LAST);
1513   bind(SHORT_LOOP_START);
1514   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
1515   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
1516   cmp(tmp1, cnt1);
1517   br(NE, SHORT_LOOP_TAIL);
1518   subs(cnt2, cnt2, 1);
1519   br(EQ, SHORT_LAST2);
1520   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1521   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1522   cmp(tmp2, rscratch1);
1523   br(EQ, SHORT_LOOP);
1524   sub(result, tmp2, rscratch1);
1525   b(DONE);
1526   bind(SHORT_LOOP_TAIL);
1527   sub(result, tmp1, cnt1);
1528   b(DONE);
1529   bind(SHORT_LAST2);
1530   cmp(tmp2, rscratch1);
1531   br(EQ, DONE);
1532   sub(result, tmp2, rscratch1);
1533 
1534   b(DONE);
1535   bind(SHORT_LAST_INIT);
1536   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1537   bind(SHORT_LAST);
1538   cmp(tmp1, cnt1);
1539   br(EQ, DONE);
1540   sub(result, tmp1, cnt1);
1541 
1542   bind(DONE);
1543 
1544   BLOCK_COMMENT("} string_compare");
1545 }
1546 
1547 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
1548                                      FloatRegister src2, Condition cond, bool isQ) {
1549   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1550   FloatRegister zn = src1, zm = src2;
1551   bool needs_negation = false;
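       // NEON register-register compares only encode EQ/GE/GT/HI/HS: LT/LE/LO/LS are
       // handled by swapping the operands, and NE by negating the EQ result afterwards.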
1552   switch (cond) {
1553     case LT: cond = GT; zn = src2; zm = src1; break;
1554     case LE: cond = GE; zn = src2; zm = src1; break;
1555     case LO: cond = HI; zn = src2; zm = src1; break;
1556     case LS: cond = HS; zn = src2; zm = src1; break;
1557     case NE: cond = EQ; needs_negation = true; break;
1558     default:
1559       break;
1560   }
1561 
1562   if (is_floating_point_type(bt)) {
1563     fcm(cond, dst, size, zn, zm);
1564   } else {
1565     cm(cond, dst, size, zn, zm);
1566   }
1567 
1568   if (needs_negation) {
1569     notr(dst, isQ ? T16B : T8B, dst);
1570   }
1571 }
1572 
1573 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1574                                           Condition cond, bool isQ) {
1575   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1576   if (bt == T_FLOAT || bt == T_DOUBLE) {
1577     if (cond == Assembler::NE) {
1578       fcm(Assembler::EQ, dst, size, src);
1579       notr(dst, isQ ? T16B : T8B, dst);
1580     } else {
1581       fcm(cond, dst, size, src);
1582     }
1583   } else {
1584     if (cond == Assembler::NE) {
1585       cm(Assembler::EQ, dst, size, src);
1586       notr(dst, isQ ? T16B : T8B, dst);
1587     } else {
1588       cm(cond, dst, size, src);
1589     }
1590   }
1591 }
1592 
1593 // Compress the least significant bit of each byte into the lowest byte of dst
1594 // and clear the higher garbage bits.
1595 void C2_MacroAssembler::bytemask_compress(Register dst) {
1596   // Example input, dst = 0x01 00 00 00 01 01 00 01
1597   // The "??" bytes are garbage.
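       // Each orr with a logical shift right folds the significant low bits of
       // neighbouring byte groups together, halving their spacing each step until
       // all eight bits sit in the lowest byte.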
1598   orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
1599   orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
1600   orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
1601   andr(dst, dst, 0xff);                   // dst = 0x8D
1602 }
1603 
1604 // Pack the lowest-numbered bit of each mask element in src into a long value
1605 // in dst, covering at most the first 64 lanes.
1606 // Clobbers: rscratch1 if UseSVE=1 or the hardware doesn't support FEAT_BITPERM.
1607 void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
1608                                          FloatRegister vtmp1, FloatRegister vtmp2) {
1609   assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1610   assert_different_registers(dst, rscratch1);
1611   assert_different_registers(vtmp1, vtmp2);
1612 
1613   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1614   // Example:   src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
1615   // Expected:  dst = 0x658D
1616 
1617   // Convert the mask into a vector of 0x00/0x01 bytes.
1618   // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
1619   sve_cpy(vtmp1, size, src, 1, false);
1620   if (bt != T_BYTE) {
1621     sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
1622   }
1623 
1624   if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
1625     // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
1626     // is to compress each significant bit of the byte in a cross-lane way. Due
1627     // to the lack of a cross-lane bit-compress instruction, we use BEXT
1628     // (bit-compress in each lane) with the biggest lane size (T = D) then
1629     // concatenate the results.
1630 
1631     // The second source input of BEXT, initialized with 0x01 in each byte.
1632     // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1633     sve_dup(vtmp2, B, 1);
1634 
1635     // BEXT vtmp1.D, vtmp1.D, vtmp2.D
1636     // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1637     // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1638     //         ---------------------------------------
1639     // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1640     sve_bext(vtmp1, D, vtmp1, vtmp2);
1641 
1642     // Concatenate the least significant 8 bits of each doubleword, and extract
1643     // the result to dst.
1644     // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1645     // dst   = 0x658D
1646     if (lane_cnt <= 8) {
1647       // No need to concatenate.
1648       umov(dst, vtmp1, B, 0);
1649     } else if (lane_cnt <= 16) {
1650       ins(vtmp1, B, vtmp1, 1, 8);
1651       umov(dst, vtmp1, H, 0);
1652     } else {
1653       // As the lane count is 64 at most, the final expected value must be in
1654       // the lowest 64 bits after narrowing vtmp1 from D to B.
1655       sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
1656       umov(dst, vtmp1, D, 0);
1657     }
1658   } else if (UseSVE > 0) {
1659     // Compress the lowest 8 bytes.
1660     fmovd(dst, vtmp1);
1661     bytemask_compress(dst);
1662     if (lane_cnt <= 8) return;
1663 
1664     // Repeat on higher bytes and join the results.
1665     // Compress 8 bytes in each iteration.
1666     for (int idx = 1; idx < (lane_cnt / 8); idx++) {
1667       sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
1668       bytemask_compress(rscratch1);
1669       orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
1670     }
1671   } else {
1672     assert(false, "unsupported");
1673     ShouldNotReachHere();
1674   }
1675 }
1676 
1677 // Unpack the mask, a long value in src, into predicate register dst based on the
1678 // corresponding data type. Note that dst can support at most 64 lanes.
1679 // The example below gives the expected dst predicate register for different types,
1680 // given a valid src (0x658D) on a machine with a 1024-bit vector size.
1681 // BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
1682 // SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
1683 // INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
1684 // LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1685 //
1686 // The number of significant bits of src must equal lane_cnt. E.g., 0xFF658D, which
1687 // has 24 significant bits, would be an invalid input if the dst predicate register
1688 // refers to a LONG type 1024-bit vector, which has at most 16 lanes.
1689 void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
1690                                            FloatRegister vtmp1, FloatRegister vtmp2) {
1691   assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
1692          lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
1693   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1694   // Example:   src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
1695   // Expected:  dst = 0b01100101 10001101
1696 
1697   // Put long value from general purpose register into the first lane of vector.
1698   // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1699   sve_dup(vtmp1, B, 0);
1700   mov(vtmp1, D, 0, src);
1701 
1702   // sve_cmp generates its mask with a minimum granularity of one byte, so the
1703   // bit mask currently held in the first lane has to be expanded into a byte
1704   // mask. This is done with SVE2's BDEP instruction.
1705 
1706   // The first source input of the BDEP instruction. Deposit each mask byte into its own 8 bytes.
1707   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1708   if (lane_cnt <= 8) {
1709     // Nothing to do, as only one byte exists.
1710   } else if (lane_cnt <= 16) {
1711     ins(vtmp1, B, vtmp1, 8, 1);
1712     mov(vtmp1, B, 1, zr);
1713   } else {
1714     sve_vector_extend(vtmp1, D, vtmp1, B);
1715   }
1716 
1717   // The second source input of BDEP instruction, initialized with 0x01 for each byte.
1718   // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1719   sve_dup(vtmp2, B, 1);
1720 
1721   // BDEP vtmp1.D, vtmp1.D, vtmp2.D
1722   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1723   // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1724   //         ---------------------------------------
1725   // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1726   sve_bdep(vtmp1, D, vtmp1, vtmp2);
1727 
1728   if (bt != T_BYTE) {
1729     sve_vector_extend(vtmp1, size, vtmp1, B);
1730   }
1731   // Generate mask according to the given vector, in which the elements have been
1732   // extended to expected type.
1733   // dst = 0b01100101 10001101
1734   sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
1735 }
1736 
1737 // Clobbers: rflags
1738 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1739                                     FloatRegister zn, FloatRegister zm, Condition cond) {
1740   assert(pg->is_governing(), "This register has to be a governing predicate register");
1741   FloatRegister z1 = zn, z2 = zm;
1742   switch (cond) {
1743     case LE: z1 = zm; z2 = zn; cond = GE; break;
1744     case LT: z1 = zm; z2 = zn; cond = GT; break;
1745     case LO: z1 = zm; z2 = zn; cond = HI; break;
1746     case LS: z1 = zm; z2 = zn; cond = HS; break;
1747     default:
1748       break;
1749   }
1750 
1751   SIMD_RegVariant size = elemType_to_regVariant(bt);
1752   if (is_floating_point_type(bt)) {
1753     sve_fcm(cond, pd, size, pg, z1, z2);
1754   } else {
1755     assert(is_integral_type(bt), "unsupported element type");
1756     sve_cmp(cond, pd, size, pg, z1, z2);
1757   }
1758 }
1759 
1760 // Get index of the last mask lane that is set
1761 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
1762   SIMD_RegVariant size = elemType_to_regVariant(bt);
1763   sve_rev(ptmp, size, src);
1764   sve_brkb(ptmp, ptrue, ptmp, false);
1765   sve_cntp(dst, size, ptrue, ptmp);
1766   movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
1767   subw(dst, rscratch1, dst);
1768 }
1769 
1770 // Extend integer vector src to dst with the same lane count
1771 // but larger element size, e.g. 4B -> 4I
1772 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1773                                            FloatRegister src, BasicType src_bt, bool is_unsigned) {
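       // _xshll selects between sshll and ushll based on is_unsigned; with a shift
       // amount of 0 it is simply a sign- or zero-extending widening move.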
1774   if (src_bt == T_BYTE) {
1775     if (dst_bt == T_SHORT) {
1776       // 4B/8B to 4S/8S
1777       _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1778     } else {
1779       // 4B to 4I
1780       assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1781       _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1782       _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1783     }
1784   } else if (src_bt == T_SHORT) {
1785     // 4S to 4I
1786     assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1787     _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1788   } else if (src_bt == T_INT) {
1789     // 2I to 2L
1790     assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1791     _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1792   } else {
1793     ShouldNotReachHere();
1794   }
1795 }
1796 
1797 // Narrow integer vector src down to dst with the same lane count
1798 // but smaller element size, e.g. 4I -> 4B
1799 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1800                                            FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1801   if (src_bt == T_SHORT) {
1802     // 4S/8S to 4B/8B
1803     assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1804     assert(dst_bt == T_BYTE, "unsupported");
1805     xtn(dst, T8B, src, T8H);
1806   } else if (src_bt == T_INT) {
1807     // 4I to 4B/4S
1808     assert(src_vlen_in_bytes == 16, "unsupported");
1809     assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1810     xtn(dst, T4H, src, T4S);
1811     if (dst_bt == T_BYTE) {
1812       xtn(dst, T8B, dst, T8H);
1813     }
1814   } else if (src_bt == T_LONG) {
1815     // 2L to 2I
1816     assert(src_vlen_in_bytes == 16, "unsupported");
1817     assert(dst_bt == T_INT, "unsupported");
1818     xtn(dst, T2S, src, T2D);
1819   } else {
1820     ShouldNotReachHere();
1821   }
1822 }
1823 
1824 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1825                                           FloatRegister src, SIMD_RegVariant src_size,
1826                                           bool is_unsigned) {
1827   assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
1828 
1829   if (src_size == B) {
1830     switch (dst_size) {
1831     case H:
1832       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1833       break;
1834     case S:
1835       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1836       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1837       break;
1838     case D:
1839       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1840       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1841       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1842       break;
1843     default:
1844       ShouldNotReachHere();
1845     }
1846   } else if (src_size == H) {
1847     if (dst_size == S) {
1848       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1849     } else { // D
1850       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1851       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1852     }
1853   } else if (src_size == S) {
1854     _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
1855   }
1856 }
1857 
1858 // Vector narrow from src to dst with specified element sizes.
1859 // The high part of the dst vector will be filled with zeros.
1860 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1861                                           FloatRegister src, SIMD_RegVariant src_size,
1862                                           FloatRegister tmp) {
1863   assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1864   assert_different_registers(src, tmp);
1865   sve_dup(tmp, src_size, 0);
1866   if (src_size == D) {
1867     switch (dst_size) {
1868     case S:
1869       sve_uzp1(dst, S, src, tmp);
1870       break;
1871     case H:
1872       assert_different_registers(dst, tmp);
1873       sve_uzp1(dst, S, src, tmp);
1874       sve_uzp1(dst, H, dst, tmp);
1875       break;
1876     case B:
1877       assert_different_registers(dst, tmp);
1878       sve_uzp1(dst, S, src, tmp);
1879       sve_uzp1(dst, H, dst, tmp);
1880       sve_uzp1(dst, B, dst, tmp);
1881       break;
1882     default:
1883       ShouldNotReachHere();
1884     }
1885   } else if (src_size == S) {
1886     if (dst_size == H) {
1887       sve_uzp1(dst, H, src, tmp);
1888     } else { // B
1889       assert_different_registers(dst, tmp);
1890       sve_uzp1(dst, H, src, tmp);
1891       sve_uzp1(dst, B, dst, tmp);
1892     }
1893   } else if (src_size == H) {
1894     sve_uzp1(dst, B, src, tmp);
1895   }
1896 }
1897 
1898 // Extend src predicate to dst predicate with the same lane count but larger
1899 // element size, e.g. 64Byte -> 512Long
1900 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1901                                              uint dst_element_length_in_bytes,
1902                                              uint src_element_length_in_bytes) {
1903   if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1904     sve_punpklo(dst, src);
1905   } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1906     sve_punpklo(dst, src);
1907     sve_punpklo(dst, dst);
1908   } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1909     sve_punpklo(dst, src);
1910     sve_punpklo(dst, dst);
1911     sve_punpklo(dst, dst);
1912   } else {
1913     assert(false, "unsupported");
1914     ShouldNotReachHere();
1915   }
1916 }
1917 
1918 // Narrow src predicate to dst predicate with the same lane count but
1919 // smaller element size, e.g. 512Long -> 64Byte
1920 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1921                                              uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
1922   // The insignificant bits in src predicate are expected to be zero.
1923   // To ensure the higher-order bits of the resulting narrowed vector are 0, an all-zero
1924   // predicate is passed as the second argument. An example narrowing operation with a given mask:
1925   // 128Long -> 64Int on a 128-bit machine, i.e. 2L -> 2I
1926   // Mask (for 2 Longs) : TF
1927   // Predicate register for the above mask (16 bits) : 00000001 00000000
1928   // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
1929   // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0)
1930   assert_different_registers(src, ptmp);
1931   assert_different_registers(dst, ptmp);
1932   sve_pfalse(ptmp);
1933   if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1934     sve_uzp1(dst, B, src, ptmp);
1935   } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1936     sve_uzp1(dst, H, src, ptmp);
1937     sve_uzp1(dst, B, dst, ptmp);
1938   } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1939     sve_uzp1(dst, S, src, ptmp);
1940     sve_uzp1(dst, H, dst, ptmp);
1941     sve_uzp1(dst, B, dst, ptmp);
1942   } else {
1943     assert(false, "unsupported");
1944     ShouldNotReachHere();
1945   }
1946 }
1947 
1948 // Vector reduction add for integral type with ASIMD instructions.
1949 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1950                                                  Register isrc, FloatRegister vsrc,
1951                                                  unsigned vector_length_in_bytes,
1952                                                  FloatRegister vtmp) {
1953   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1954   assert_different_registers(dst, isrc);
1955   bool isQ = vector_length_in_bytes == 16;
1956 
1957   BLOCK_COMMENT("neon_reduce_add_integral {");
1958     switch(bt) {
1959       case T_BYTE:
1960         addv(vtmp, isQ ? T16B : T8B, vsrc);
1961         smov(dst, vtmp, B, 0);
1962         addw(dst, dst, isrc, ext::sxtb);
1963         break;
1964       case T_SHORT:
1965         addv(vtmp, isQ ? T8H : T4H, vsrc);
1966         smov(dst, vtmp, H, 0);
1967         addw(dst, dst, isrc, ext::sxth);
1968         break;
1969       case T_INT:
1970         isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1971         umov(dst, vtmp, S, 0);
1972         addw(dst, dst, isrc);
1973         break;
1974       case T_LONG:
1975         assert(isQ, "unsupported");
1976         addpd(vtmp, vsrc);
1977         umov(dst, vtmp, D, 0);
1978         add(dst, dst, isrc);
1979         break;
1980       default:
1981         assert(false, "unsupported");
1982         ShouldNotReachHere();
1983     }
1984   BLOCK_COMMENT("} neon_reduce_add_integral");
1985 }
1986 
1987 // Vector reduction multiply for integral type with ASIMD instructions.
1988 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
1989 // Clobbers: rscratch1
1990 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
1991                                                  Register isrc, FloatRegister vsrc,
1992                                                  unsigned vector_length_in_bytes,
1993                                                  FloatRegister vtmp1, FloatRegister vtmp2) {
1994   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1995   bool isQ = vector_length_in_bytes == 16;
1996 
1997   BLOCK_COMMENT("neon_reduce_mul_integral {");
1998     switch(bt) {
1999       case T_BYTE:
2000         if (isQ) {
2001           // Multiply the lower and upper halves of the vector iteratively.
2002           // vtmp1 = vsrc[8:15]
2003           ins(vtmp1, D, vsrc, 0, 1);
2004           // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
2005           mulv(vtmp1, T8B, vtmp1, vsrc);
2006           // vtmp2 = vtmp1[4:7]
2007           ins(vtmp2, S, vtmp1, 0, 1);
2008           // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
2009           mulv(vtmp1, T8B, vtmp2, vtmp1);
2010         } else {
2011           ins(vtmp1, S, vsrc, 0, 1);
2012           mulv(vtmp1, T8B, vtmp1, vsrc);
2013         }
2014         // vtmp2 = vtmp1[2:3]
2015         ins(vtmp2, H, vtmp1, 0, 1);
2016         // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
2017         mulv(vtmp2, T8B, vtmp2, vtmp1);
2018         // dst = vtmp2[0] * isrc * vtmp2[1]
2019         umov(rscratch1, vtmp2, B, 0);
2020         mulw(dst, rscratch1, isrc);
2021         sxtb(dst, dst);
2022         umov(rscratch1, vtmp2, B, 1);
2023         mulw(dst, rscratch1, dst);
2024         sxtb(dst, dst);
2025         break;
2026       case T_SHORT:
2027         if (isQ) {
2028           ins(vtmp2, D, vsrc, 0, 1);
2029           mulv(vtmp2, T4H, vtmp2, vsrc);
2030           ins(vtmp1, S, vtmp2, 0, 1);
2031           mulv(vtmp1, T4H, vtmp1, vtmp2);
2032         } else {
2033           ins(vtmp1, S, vsrc, 0, 1);
2034           mulv(vtmp1, T4H, vtmp1, vsrc);
2035         }
2036         umov(rscratch1, vtmp1, H, 0);
2037         mulw(dst, rscratch1, isrc);
2038         sxth(dst, dst);
2039         umov(rscratch1, vtmp1, H, 1);
2040         mulw(dst, rscratch1, dst);
2041         sxth(dst, dst);
2042         break;
2043       case T_INT:
2044         if (isQ) {
2045           ins(vtmp1, D, vsrc, 0, 1);
2046           mulv(vtmp1, T2S, vtmp1, vsrc);
2047         } else {
2048           vtmp1 = vsrc;
2049         }
2050         umov(rscratch1, vtmp1, S, 0);
2051         mul(dst, rscratch1, isrc);
2052         umov(rscratch1, vtmp1, S, 1);
2053         mul(dst, rscratch1, dst);
2054         break;
2055       case T_LONG:
2056         umov(rscratch1, vsrc, D, 0);
2057         mul(dst, isrc, rscratch1);
2058         umov(rscratch1, vsrc, D, 1);
2059         mul(dst, dst, rscratch1);
2060         break;
2061       default:
2062         assert(false, "unsupported");
2063         ShouldNotReachHere();
2064     }
2065   BLOCK_COMMENT("} neon_reduce_mul_integral");
2066 }
2067 
2068 // Vector reduction multiply for floating-point type with ASIMD instructions.
2069 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
2070                                            FloatRegister fsrc, FloatRegister vsrc,
2071                                            unsigned vector_length_in_bytes,
2072                                            FloatRegister vtmp) {
2073   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2074   bool isQ = vector_length_in_bytes == 16;
2075 
2076   BLOCK_COMMENT("neon_reduce_mul_fp {");
2077     switch(bt) {
2078       case T_FLOAT:
2079         fmuls(dst, fsrc, vsrc);
2080         ins(vtmp, S, vsrc, 0, 1);
2081         fmuls(dst, dst, vtmp);
2082         if (isQ) {
2083           ins(vtmp, S, vsrc, 0, 2);
2084           fmuls(dst, dst, vtmp);
2085           ins(vtmp, S, vsrc, 0, 3);
2086           fmuls(dst, dst, vtmp);
2087         }
2088         break;
2089       case T_DOUBLE:
2090         assert(isQ, "unsupported");
2091         fmuld(dst, fsrc, vsrc);
2092         ins(vtmp, D, vsrc, 0, 1);
2093         fmuld(dst, dst, vtmp);
2094         break;
2095       default:
2096         assert(false, "unsupported");
2097         ShouldNotReachHere();
2098     }
2099   BLOCK_COMMENT("} neon_reduce_mul_fp");
2100 }
2101 
2102 // Helper to select logical instruction
2103 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
2104                                                    Register Rn, Register Rm,
2105                                                    enum shift_kind kind, unsigned shift) {
2106   switch(opc) {
2107     case Op_AndReductionV:
2108       is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
2109       break;
2110     case Op_OrReductionV:
2111       is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
2112       break;
2113     case Op_XorReductionV:
2114       is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
2115       break;
2116     default:
2117       assert(false, "unsupported");
2118       ShouldNotReachHere();
2119   }
2120 }
2121 
2122 // Vector reduction logical operations And, Or, Xor
2123 // Clobbers: rscratch1
2124 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
2125                                             Register isrc, FloatRegister vsrc,
2126                                             unsigned vector_length_in_bytes) {
2127   assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
2128          "unsupported");
2129   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2130   assert_different_registers(dst, isrc);
2131   bool isQ = vector_length_in_bytes == 16;
2132 
2133   BLOCK_COMMENT("neon_reduce_logical {");
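         // Fold the two halves of the vector into a single general purpose register,
         // then keep combining the upper and lower halves with shifted logical ops
         // until one element remains, and finally merge that with isrc.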
2134     umov(rscratch1, vsrc, isQ ? D : S, 0);
2135     umov(dst, vsrc, isQ ? D : S, 1);
2136     neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
2137     switch(bt) {
2138       case T_BYTE:
2139         if (isQ) {
2140           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2141         }
2142         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
2143         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
2144         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2145         sxtb(dst, dst);
2146         break;
2147       case T_SHORT:
2148         if (isQ) {
2149           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2150         }
2151         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
2152         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2153         sxth(dst, dst);
2154         break;
2155       case T_INT:
2156         if (isQ) {
2157           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2158         }
2159         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2160         break;
2161       case T_LONG:
2162         assert(isQ, "unsupported");
2163         neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
2164         break;
2165       default:
2166         assert(false, "unsupported");
2167         ShouldNotReachHere();
2168     }
2169   BLOCK_COMMENT("} neon_reduce_logical");
2170 }
2171 
2172 // Vector reduction min/max for integral type with ASIMD instructions.
2173 // Note: vtmp is not used and is expected to be fnoreg for the T_LONG case.
2174 // Clobbers: rscratch1, rflags
2175 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
2176                                                     Register isrc, FloatRegister vsrc,
2177                                                     unsigned vector_length_in_bytes,
2178                                                     FloatRegister vtmp) {
2179   assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
2180   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2181   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
2182   assert_different_registers(dst, isrc);
2183   bool isQ = vector_length_in_bytes == 16;
2184   bool is_min = opc == Op_MinReductionV;
2185 
2186   BLOCK_COMMENT("neon_reduce_minmax_integral {");
2187     if (bt == T_LONG) {
2188       assert(vtmp == fnoreg, "should be");
2189       assert(isQ, "should be");
2190       umov(rscratch1, vsrc, D, 0);
2191       cmp(isrc, rscratch1);
2192       csel(dst, isrc, rscratch1, is_min ? LT : GT);
2193       umov(rscratch1, vsrc, D, 1);
2194       cmp(dst, rscratch1);
2195       csel(dst, dst, rscratch1, is_min ? LT : GT);
2196     } else {
2197       SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
2198       if (size == T2S) {
2199         is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
2200       } else {
2201         is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
2202       }
2203       if (bt == T_INT) {
2204         umov(dst, vtmp, S, 0);
2205       } else {
2206         smov(dst, vtmp, elemType_to_regVariant(bt), 0);
2207       }
2208       cmpw(dst, isrc);
2209       cselw(dst, dst, isrc, is_min ? LT : GT);
2210     }
2211   BLOCK_COMMENT("} neon_reduce_minmax_integral");
2212 }
2213 
2214 // Vector reduction for integral type with SVE instruction.
2215 // Supported operations are Add, And, Or, Xor, Max, Min.
2216 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
2217 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
2218                                             FloatRegister src2, PRegister pg, FloatRegister tmp) {
2219   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2220   assert(pg->is_governing(), "This register has to be a governing predicate register");
2221   assert_different_registers(src1, dst);
2222   // Registers "dst" and "tmp" are clobbered; "src1" and "src2" are preserved.
2223   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
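       // Sub-word results are extracted with smov (sign-extending) and int/long
       // results with umov, matching the signedness expected by the scalar combine
       // and compare instructions below.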
2224   switch (opc) {
2225     case Op_AddReductionVI: {
2226       sve_uaddv(tmp, size, pg, src2);
2227       if (bt == T_BYTE) {
2228         smov(dst, tmp, size, 0);
2229         addw(dst, src1, dst, ext::sxtb);
2230       } else if (bt == T_SHORT) {
2231         smov(dst, tmp, size, 0);
2232         addw(dst, src1, dst, ext::sxth);
2233       } else {
2234         umov(dst, tmp, size, 0);
2235         addw(dst, dst, src1);
2236       }
2237       break;
2238     }
2239     case Op_AddReductionVL: {
2240       sve_uaddv(tmp, size, pg, src2);
2241       umov(dst, tmp, size, 0);
2242       add(dst, dst, src1);
2243       break;
2244     }
2245     case Op_AndReductionV: {
2246       sve_andv(tmp, size, pg, src2);
2247       if (bt == T_INT || bt == T_LONG) {
2248         umov(dst, tmp, size, 0);
2249       } else {
2250         smov(dst, tmp, size, 0);
2251       }
2252       if (bt == T_LONG) {
2253         andr(dst, dst, src1);
2254       } else {
2255         andw(dst, dst, src1);
2256       }
2257       break;
2258     }
2259     case Op_OrReductionV: {
2260       sve_orv(tmp, size, pg, src2);
2261       if (bt == T_INT || bt == T_LONG) {
2262         umov(dst, tmp, size, 0);
2263       } else {
2264         smov(dst, tmp, size, 0);
2265       }
2266       if (bt == T_LONG) {
2267         orr(dst, dst, src1);
2268       } else {
2269         orrw(dst, dst, src1);
2270       }
2271       break;
2272     }
2273     case Op_XorReductionV: {
2274       sve_eorv(tmp, size, pg, src2);
2275       if (bt == T_INT || bt == T_LONG) {
2276         umov(dst, tmp, size, 0);
2277       } else {
2278         smov(dst, tmp, size, 0);
2279       }
2280       if (bt == T_LONG) {
2281         eor(dst, dst, src1);
2282       } else {
2283         eorw(dst, dst, src1);
2284       }
2285       break;
2286     }
2287     case Op_MaxReductionV: {
2288       sve_smaxv(tmp, size, pg, src2);
2289       if (bt == T_INT || bt == T_LONG) {
2290         umov(dst, tmp, size, 0);
2291       } else {
2292         smov(dst, tmp, size, 0);
2293       }
2294       if (bt == T_LONG) {
2295         cmp(dst, src1);
2296         csel(dst, dst, src1, Assembler::GT);
2297       } else {
2298         cmpw(dst, src1);
2299         cselw(dst, dst, src1, Assembler::GT);
2300       }
2301       break;
2302     }
2303     case Op_MinReductionV: {
2304       sve_sminv(tmp, size, pg, src2);
2305       if (bt == T_INT || bt == T_LONG) {
2306         umov(dst, tmp, size, 0);
2307       } else {
2308         smov(dst, tmp, size, 0);
2309       }
2310       if (bt == T_LONG) {
2311         cmp(dst, src1);
2312         csel(dst, dst, src1, Assembler::LT);
2313       } else {
2314         cmpw(dst, src1);
2315         cselw(dst, dst, src1, Assembler::LT);
2316       }
2317       break;
2318     }
2319     default:
2320       assert(false, "unsupported");
2321       ShouldNotReachHere();
2322   }
2323 
2324   if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
2325     if (bt == T_BYTE) {
2326       sxtb(dst, dst);
2327     } else if (bt == T_SHORT) {
2328       sxth(dst, dst);
2329     }
2330   }
2331 }
2332 
2333 // Set the elements of the dst predicate to true for lanes in the range [0, lane_cnt)
2334 // and to false otherwise. The input "lane_cnt" must be smaller than or equal to the
2335 // supported max vector length of the basic type. Clobbers: rscratch1, rflags.
2336 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
2337   uint32_t max_vector_length = Matcher::max_vector_size(bt);
2338   assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
2339 
2340   // Set all elements to false if the input "lane_cnt" is zero.
2341   if (lane_cnt == 0) {
2342     sve_pfalse(dst);
2343     return;
2344   }
2345 
2346   SIMD_RegVariant size = elemType_to_regVariant(bt);
2347   assert(size != Q, "invalid size");
2348 
2349   // Set all true if "lane_cnt" equals to the max lane count.
2350   if (lane_cnt == max_vector_length) {
2351     sve_ptrue(dst, size, /* ALL */ 0b11111);
2352     return;
2353   }
2354 
2355   // Fixed numbers for "ptrue".
2356   switch(lane_cnt) {
2357   case 1: /* VL1 */
2358   case 2: /* VL2 */
2359   case 3: /* VL3 */
2360   case 4: /* VL4 */
2361   case 5: /* VL5 */
2362   case 6: /* VL6 */
2363   case 7: /* VL7 */
2364   case 8: /* VL8 */
2365     sve_ptrue(dst, size, lane_cnt);
2366     return;
2367   case 16:
2368     sve_ptrue(dst, size, /* VL16 */ 0b01001);
2369     return;
2370   case 32:
2371     sve_ptrue(dst, size, /* VL32 */ 0b01010);
2372     return;
2373   case 64:
2374     sve_ptrue(dst, size, /* VL64 */ 0b01011);
2375     return;
2376   case 128:
2377     sve_ptrue(dst, size, /* VL128 */ 0b01100);
2378     return;
2379   case 256:
2380     sve_ptrue(dst, size, /* VL256 */ 0b01101);
2381     return;
2382   default:
2383     break;
2384   }
2385 
2386   // Special patterns for "ptrue".
2387   if (lane_cnt == round_down_power_of_2(max_vector_length)) {
2388     sve_ptrue(dst, size, /* POW2 */ 0b00000);
2389   } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
2390     sve_ptrue(dst, size, /* MUL4 */ 0b11101);
2391   } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
2392     sve_ptrue(dst, size, /* MUL3 */ 0b11110);
2393   } else {
2394     // Encode to "whileltw" for the remaining cases.
2395     mov(rscratch1, lane_cnt);
2396     sve_whileltw(dst, size, zr, rscratch1);
2397   }
2398 }
2399 
2400 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
2401 // Any remaining elements of dst will be filled with zero.
2402 // Clobbers: rscratch1
2403 // Preserves: src, mask
2404 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
2405                                            FloatRegister vtmp1, FloatRegister vtmp2,
2406                                            PRegister pgtmp) {
2407   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2408   assert_different_registers(dst, src, vtmp1, vtmp2);
2409   assert_different_registers(mask, pgtmp);
2410 
2411   // Example input:   src   = 8888 7777 6666 5555 4444 3333 2222 1111
2412   //                  mask  = 0001 0000 0000 0001 0001 0000 0001 0001
2413   // Expected result: dst   = 0000 0000 0000 8888 5555 4444 2222 1111
2414   sve_dup(vtmp2, H, 0);
2415 
2416   // Extend lowest half to type INT.
2417   // dst = 00004444 00003333 00002222 00001111
2418   sve_uunpklo(dst, S, src);
2419   // pgtmp = 00000001 00000000 00000001 00000001
2420   sve_punpklo(pgtmp, mask);
2421   // Pack the active elements of type INT to the right,
2422   // and fill the remaining lanes with zero.
2423   // dst = 00000000 00004444 00002222 00001111
2424   sve_compact(dst, S, dst, pgtmp);
2425   // Narrow the result back to type SHORT.
2426   // dst = 0000 0000 0000 0000 0000 4444 2222 1111
2427   sve_uzp1(dst, H, dst, vtmp2);
2428   // Count the active elements of lowest half.
2429   // rscratch1 = 3
2430   sve_cntp(rscratch1, S, ptrue, pgtmp);
2431 
2432   // Repeat to the highest half.
2433   // pgtmp = 00000001 00000000 00000000 00000001
2434   sve_punpkhi(pgtmp, mask);
2435   // vtmp1 = 00008888 00007777 00006666 00005555
2436   sve_uunpkhi(vtmp1, S, src);
2437   // vtmp1 = 00000000 00000000 00008888 00005555
2438   sve_compact(vtmp1, S, vtmp1, pgtmp);
2439   // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2440   sve_uzp1(vtmp1, H, vtmp1, vtmp2);
2441 
2442   // Compressed low:   dst   = 0000 0000 0000 0000 0000 4444 2222 1111
2443   // Compressed high:  vtmp1 = 0000 0000 0000 0000 0000 0000 8888  5555
2444   // Shift the compressed high part left (cross-lane) by TRUE_CNT lanes, where
2445   // TRUE_CNT is the number of active elements in the compressed low part.
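       // The shift is done with sve_tbl using an index vector that starts at -TRUE_CNT:
       // out-of-range indices read as zero, so the lanes shifted in are cleared.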
2446   neg(rscratch1, rscratch1);
2447   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2448   sve_index(vtmp2, H, rscratch1, 1);
2449   // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
2450   sve_tbl(vtmp1, H, vtmp1, vtmp2);
2451 
2452   // Combine the compressed high(after shifted) with the compressed low.
2453   // dst = 0000 0000 0000 8888 5555 4444 2222 1111
2454   sve_orr(dst, dst, vtmp1);
2455 }
2456 
2457 // Clobbers: rscratch1, rscratch2
2458 // Preserves: src, mask
2459 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
2460                                           FloatRegister vtmp1, FloatRegister vtmp2,
2461                                           FloatRegister vtmp3, FloatRegister vtmp4,
2462                                           PRegister ptmp, PRegister pgtmp) {
2463   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2464   assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
2465   assert_different_registers(mask, ptmp, pgtmp);
2466   // Example input:   src   = 88 77 66 55 44 33 22 11
2467   //                  mask  = 01 00 00 01 01 00 01 01
2468   // Expected result: dst   = 00 00 00 88 55 44 22 11
2469 
2470   sve_dup(vtmp4, B, 0);
2471   // Extend lowest half to type SHORT.
2472   // vtmp1 = 0044 0033 0022 0011
2473   sve_uunpklo(vtmp1, H, src);
2474   // ptmp = 0001 0000 0001 0001
2475   sve_punpklo(ptmp, mask);
2476   // Count the active elements of lowest half.
2477   // rscratch2 = 3
2478   sve_cntp(rscratch2, H, ptrue, ptmp);
2479   // Pack the active elements of type SHORT to the right,
2480   // and fill the remaining lanes with zero.
2481   // dst = 0000 0044 0022 0011
2482   sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
2483   // Narrow the result back to type BYTE.
2484   // dst = 00 00 00 00 00 44 22 11
2485   sve_uzp1(dst, B, dst, vtmp4);
2486 
2487   // Repeat to the highest half.
2488   // ptmp = 0001 0000 0000 0001
2489   sve_punpkhi(ptmp, mask);
2490   // vtmp2 = 0088 0077 0066 0055
2491   sve_uunpkhi(vtmp2, H, src);
2492   // vtmp1 = 0000 0000 0088 0055
2493   sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);
2494 
2495   sve_dup(vtmp4, B, 0);
2496   // vtmp1 = 00 00 00 00 00 00 88 55
2497   sve_uzp1(vtmp1, B, vtmp1, vtmp4);
2498 
2499   // Compressed low:   dst   = 00 00 00 00 00 44 22 11
2500   // Compressed high:  vtmp1 = 00 00 00 00 00 00 88 55
2501   // Shift the compressed high part left (cross-lane) by TRUE_CNT lanes, where
2502   // TRUE_CNT is the number of active elements in the compressed low part.
2503   neg(rscratch2, rscratch2);
2504   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2505   sve_index(vtmp2, B, rscratch2, 1);
2506   // vtmp1 = 00 00 00 88 55 00 00 00
2507   sve_tbl(vtmp1, B, vtmp1, vtmp2);
2508   // Combine the compressed high(after shifted) with the compressed low.
2509   // dst = 00 00 00 88 55 44 22 11
2510   sve_orr(dst, dst, vtmp1);
2511 }
2512 
2513 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2514   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2515   SIMD_Arrangement size = isQ ? T16B : T8B;
2516   if (bt == T_BYTE) {
2517     rbit(dst, size, src);
2518   } else {
2519     neon_reverse_bytes(dst, src, bt, isQ);
2520     rbit(dst, size, dst);
2521   }
2522 }
2523 
2524 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2525   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2526   SIMD_Arrangement size = isQ ? T16B : T8B;
2527   switch (bt) {
2528     case T_BYTE:
2529       if (dst != src) {
2530         orr(dst, size, src, src);
2531       }
2532       break;
2533     case T_SHORT:
2534       rev16(dst, size, src);
2535       break;
2536     case T_INT:
2537       rev32(dst, size, src);
2538       break;
2539     case T_LONG:
2540       rev64(dst, size, src);
2541       break;
2542     default:
2543       assert(false, "unsupported");
2544       ShouldNotReachHere();
2545   }
2546 }
2547 
2548 // VectorRearrange implementation for short/int/float/long/double types with NEON
2549 // instructions. For VectorRearrange short/int/float, we use the NEON tbl instruction.
2550 // But since it only supports byte table lookups, we need to look up 2/4 bytes as a group.
2551 // For VectorRearrange long/double, we compare the shuffle input with iota indices,
2552 // and use bsl to implement the operation.
2553 void C2_MacroAssembler::neon_rearrange_hsd(FloatRegister dst, FloatRegister src,
2554                                            FloatRegister shuffle, FloatRegister tmp,
2555                                            BasicType bt, bool isQ) {
2556   assert_different_registers(dst, src, shuffle, tmp);
2557   SIMD_Arrangement size1 = isQ ? T16B : T8B;
2558   SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2559 
2560   // Here is an example that rearranges a NEON vector with 4 ints:
2561   // Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1]
2562   //   1. We assume the shuffle input is Vi int[2, 3, 0, 1].
2563   //   2. Multiply Vi int[2, 3, 0, 1] with constant int vector
2564   //      [0x04040404, 0x04040404, 0x04040404, 0x04040404], and get
2565   //      tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404].
2566   //   3. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100],
2567   //      and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504]
2568   //   4. Use Vm as index register, and use V1 as table register.
2569   //      Then get V2 as the result by tbl NEON instructions.
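       // The short case is analogous with a group size of two bytes: the byte-wise 0x02
       // multiplier turns each shuffle index s into 0x0202 * s, and adding 0x0100 yields
       // the two byte-level tbl indices (2s, 2s + 1) for element s.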
2570   switch (bt) {
2571     case T_SHORT:
2572       mov(tmp, size1, 0x02);
2573       mulv(dst, size2, shuffle, tmp);
2574       mov(tmp, size2, 0x0100);
2575       addv(dst, size1, dst, tmp);
2576       tbl(dst, size1, src, 1, dst);
2577       break;
2578     case T_INT:
2579     case T_FLOAT:
2580       mov(tmp, size1, 0x04);
2581       mulv(dst, size2, shuffle, tmp);
2582       mov(tmp, size2, 0x03020100);
2583       addv(dst, size1, dst, tmp);
2584       tbl(dst, size1, src, 1, dst);
2585       break;
2586     case T_LONG:
2587     case T_DOUBLE:
2588       // Load the iota indices for the long type. The indices are ordered by
2589       // type B/S/I/L/F/D, and the offset between two consecutive types is 16;
2590       // hence the offset for L is 48.
2591       lea(rscratch1,
2592           ExternalAddress(StubRoutines::aarch64::vector_iota_indices() + 48));
2593       ldrq(tmp, rscratch1);
2594       // Check whether the input "shuffle" equals the iota indices.
2595       // If so the result is "src" itself, otherwise the two elements of "src" are swapped.
2596       cm(EQ, dst, size2, shuffle, tmp);
2597       ext(tmp, size1, src, src, 8);
2598       bsl(dst, size1, src, tmp);
2599       break;
2600     default:
2601       assert(false, "unsupported element type");
2602       ShouldNotReachHere();
2603   }
2604 }
2605 
2606 // Extract a scalar element from an SVE vector at position 'idx'.
2607 // The input elements in src are expected to be of integral type.
2608 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2609                                              int idx, FloatRegister vtmp) {
2610   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2611   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2612   if (regVariant_to_elemBits(size) * idx < 128) { // generate a lower-cost NEON instruction
2613     if (bt == T_INT || bt == T_LONG) {
2614       umov(dst, src, size, idx);
2615     } else {
2616       smov(dst, src, size, idx);
2617     }
2618   } else {
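         // The element lies beyond the NEON-addressable low 128 bits: copy src and
         // shift it down by idx elements (idx << size is the byte offset) so that the
         // wanted element ends up in lane 0.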
2619     sve_orr(vtmp, src, src);
2620     sve_ext(vtmp, vtmp, idx << size);
2621     if (bt == T_INT || bt == T_LONG) {
2622       umov(dst, vtmp, size, 0);
2623     } else {
2624       smov(dst, vtmp, size, 0);
2625     }
2626   }
2627 }
2628 
2629 // java.lang.Math::round intrinsics
2630 
2631 // Clobbers: rscratch1, rflags
2632 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2633                                           FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
2634   assert_different_registers(tmp1, tmp2, tmp3, src, dst);
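       // Math.round is floor(src + 0.5). The bif at the end keeps the fcvtas result
       // (round to nearest, ties away from zero) for non-negative inputs, NaNs and
       // magnitudes of at least 2^23 (float) / 2^52 (double), and takes the
       // floor(src + 0.5) result for the remaining small negative inputs.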
2635   switch (T) {
2636     case T2S:
2637     case T4S:
2638       fmovs(tmp1, T, 0.5f);
2639       mov(rscratch1, jint_cast(0x1.0p23f));
2640       break;
2641     case T2D:
2642       fmovd(tmp1, T, 0.5);
2643       mov(rscratch1, julong_cast(0x1.0p52));
2644       break;
2645     default:
2646       assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2647   }
2648   fadd(tmp1, T, tmp1, src);
2649   fcvtms(tmp1, T, tmp1);
2650   // tmp1 = floor(src + 0.5, ties to even)
2651 
2652   fcvtas(dst, T, src);
2653   // dst = round(src), ties to away
2654 
2655   fneg(tmp3, T, src);
2656   dup(tmp2, T, rscratch1);
2657   cm(HS, tmp3, T, tmp3, tmp2);
2658   // tmp3 is now a set of flags
2659 
2660   bif(dst, T16B, tmp1, tmp3);
2661   // result in dst
2662 }
2663 
2664 // Clobbers: rscratch1, rflags
2665 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2666                                          FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
2667   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2668   assert_different_registers(tmp1, tmp2, src, dst);
2669 
2670   switch (T) {
2671     case S:
2672       mov(rscratch1, jint_cast(0x1.0p23f));
2673       break;
2674     case D:
2675       mov(rscratch1, julong_cast(0x1.0p52));
2676       break;
2677     default:
2678       assert(T == S || T == D, "invalid register variant");
2679   }
2680 
2681   sve_frinta(dst, T, ptrue, src);
2682   // dst = round(src), ties to away
2683 
2684   Label none;
2685 
2686   sve_fneg(tmp1, T, ptrue, src);
2687   sve_dup(tmp2, T, rscratch1);
2688   sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
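       // The SVE compare sets the NZCV flags; EQ (the SVE "none" condition) means no
       // lane needs the floor(src + 0.5) fixup, so it is skipped entirely.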
2689   br(EQ, none);
2690   {
2691     sve_cpy(tmp1, T, pgtmp, 0.5);
2692     sve_fadd(tmp1, T, pgtmp, src);
2693     sve_frintm(dst, T, pgtmp, tmp1);
2694     // dst = floor(src + 0.5, ties to even)
2695   }
2696   bind(none);
2697 
2698   sve_fcvtzs(dst, T, ptrue, dst, T);
2699   // result in dst
2700 }
2701 
2702 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2703                                            FloatRegister one, SIMD_Arrangement T) {
2704   assert_different_registers(dst, src, zero, one);
2705   assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2706 
2707   facgt(dst, T, src, zero);
2708   ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
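       // bsl takes the bits of "one" where dst is set (the magnitude bits) and the bits
       // of src elsewhere (the sign bit, or the whole value for +-0.0/NaN), giving
       // +-1.0 for nonzero non-NaN inputs and src itself otherwise.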
2709   bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2710 }
2711 
2712 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2713                                           FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
2714     assert_different_registers(dst, src, zero, one, vtmp);
2715     assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2716 
2717     sve_orr(vtmp, src, src);
2718     sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise
2719     switch (T) {
2720     case S:
2721       sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
2722       sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
2723                                         // on the sign of the float value
2724       break;
2725     case D:
2726       sve_and(vtmp, T, min_jlong);
2727       sve_orr(vtmp, T, jlong_cast(1.0));
2728       break;
2729     default:
2730       assert(false, "unsupported");
2731       ShouldNotReachHere();
2732     }
2733     sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
2734                                        // Result in dst
2735 }
2736 
2737 bool C2_MacroAssembler::in_scratch_emit_size() {
2738   if (ciEnv::current()->task() != nullptr) {
2739     PhaseOutput* phase_output = Compile::current()->output();
2740     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2741       return true;
2742     }
2743   }
2744   return MacroAssembler::in_scratch_emit_size();
2745 }