/*
 * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/matcher.hpp"
#include "opto/output.hpp"
#include "opto/subnode.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/globalDefinitions.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

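// Member-function pointer type used to select byte (Latin1) vs halfword
// (UTF-16) character loads at code-generation time.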
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg,
                                  Register tmp2Reg, Register tmp3Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;

  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  assert_different_registers(oop, box, tmp, disp_hdr);

  // Load markWord from object into displaced_header.
  ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmp, oop);
    ldrb(tmp, Address(tmp, Klass::misc_flags_offset()));
    tst(tmp, KlassFlags::_misc_is_value_based_class);
    br(Assembler::NE, cont);
  }

  // Check for existing monitor
  tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Set tmp to be (markWord of object | UNLOCK_VALUE).
    orr(tmp, disp_hdr, markWord::unlocked_value);

    // Initialize the box. (Must happen before we update the object mark!)
    str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // Compare object markWord with an unlocked value (tmp) and if
    // equal exchange the stack address of our box with object markWord.
    // On failure disp_hdr contains the possibly locked markWord.
    cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true,
            /*release*/ true, /*weak*/ false, disp_hdr);
    br(Assembler::EQ, cont);

    assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

    // If the compare-and-exchange succeeded, then we found an unlocked
    // object, have now locked it, and will continue at label cont.

    // Check if the owner is self by comparing the value in the
    // markWord of object (disp_hdr) with the stack pointer.
    mov(rscratch1, sp);
    sub(disp_hdr, disp_hdr, rscratch1);
    mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place));
    // The test below is (mark - sp) & (~(page_size - 1) | lock_mask) == 0:
    // it succeeds iff the mark is an address on this thread's stack (within
    // one page of sp) with lock bits 0b00, i.e. we already hold the lock.
    // In that case we store 0 as the displaced header in the box, which
    // indicates that it is a recursive lock.
    ands(tmp/*==0?*/, disp_hdr, tmp);   // Sets flags for result
    str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
    b(cont);
  }

  // Handle existing monitor.
  bind(object_has_monitor);

  // The object's monitor m is unlocked iff m->owner == nullptr,
  // otherwise m->owner may contain a thread or a stack address.
  //
  // Try to CAS m->owner from null to current thread.
  add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value));
  cmpxchg(tmp, zr, rthread, Assembler::xword, /*acquire*/ true,
          /*release*/ true, /*weak*/ false, tmp3Reg); // Sets flags for result

  // Store a non-null value into the box to avoid looking like a re-entrant
  // lock. The fast-path monitor unlock code checks for
  // markWord::monitor_value so use markWord::unused_mark which has the
  // relevant bit set, and also matches ObjectSynchronizer::enter.
  mov(tmp, (address)markWord::unused_mark().value());
  str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

  br(Assembler::EQ, cont); // CAS success means locking succeeded

  cmp(tmp3Reg, rthread);
  br(Assembler::NE, cont); // Check for recursive locking

  // Recursive lock case
  increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1);
  // flag == EQ still from the cmp above, checking if this is a reentrant lock

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  increment(Address(rthread, JavaThread::held_monitor_count_offset()));

  bind(no_count);
}

void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg,
                                    Register tmp2Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register owner_addr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;
  Label unlocked;

  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert_different_registers(oop, box, tmp, disp_hdr);

  if (LockingMode == LM_LEGACY) {
    // Find the lock address and load the displaced header from the stack.
    ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // If the displaced header is 0, we have a recursive unlock.
    cmp(disp_hdr, zr);
    br(Assembler::EQ, cont);
  }

  // Handle existing monitor.
  ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
  tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Check if it is still a lightweight lock, which is true if we
    // see the stack address of the basicLock in the markWord of the
    // object.

    cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false,
            /*release*/ true, /*weak*/ false, tmp);
    b(cont);
  }

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // Handle existing monitor.
  bind(object_has_monitor);
  STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
  add(tmp, tmp, -(int)markWord::monitor_value); // monitor

  ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));

  Label notRecursive;
  cbz(disp_hdr, notRecursive);

  // Recursive lock
  sub(disp_hdr, disp_hdr, 1u);
  str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
  cmp(disp_hdr, disp_hdr); // Sets flags for result
  b(cont);

  bind(notRecursive);

  // Compute owner address.
  lea(owner_addr, Address(tmp, ObjectMonitor::owner_offset()));

  // Set owner to null.
  // Release to satisfy the JMM
  stlr(zr, owner_addr);
  // We need a full fence after clearing owner to avoid stranding.
  // StoreLoad achieves this.
  membar(StoreLoad);

  // Check if the entry lists are empty.
  ldr(rscratch1, Address(tmp, ObjectMonitor::EntryList_offset()));
  ldr(tmpReg, Address(tmp, ObjectMonitor::cxq_offset()));
  orr(rscratch1, rscratch1, tmpReg);
  cmp(rscratch1, zr);
  br(Assembler::EQ, cont);     // If so we are done.

  // Check if there is a successor.
  ldr(rscratch1, Address(tmp, ObjectMonitor::succ_offset()));
  cmp(rscratch1, zr);
  br(Assembler::NE, unlocked); // If so we are done.

  // Save the monitor pointer in the current thread, so we can try to
  // reacquire the lock in SharedRuntime::monitor_exit_helper().
  str(tmp, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));

  cmp(zr, rthread); // Set Flag to NE => slow path
  b(cont);

  bind(unlocked);
  cmp(zr, zr); // Set Flag to EQ => fast path

  // Intentional fall-through

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  decrement(Address(rthread, JavaThread::held_monitor_count_offset()));

  bind(no_count);
}

void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register t1,
                                              Register t2, Register t3) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert_different_registers(obj, box, t1, t2, t3);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. MUST branch to this with flag == EQ.
  Label locked;
  // Finish fast lock unsuccessfully. MUST branch to this with flag == NE.
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds.
    str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(t1, obj);
    ldrb(t1, Address(t1, Klass::misc_flags_offset()));
    tst(t1, KlassFlags::_misc_is_value_based_class);
    br(Assembler::NE, slow_path);
  }

  const Register t1_mark = t1;
  const Register t3_t = t3;

  { // Lightweight locking

    // Push lock to the lock stack and finish successfully. MUST branch to this with flag == EQ.
    Label push;

    const Register t2_top = t2;

    // Check if lock-stack is full.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
    br(Assembler::GT, slow_path);

    // Check if recursive.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, push);

    // Relaxed normal load to check for monitor. Optimization for monitor case.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);

    // Not inflated
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");

    // Try to lock. Transition lock-bits 0b01 => 0b00
    orr(t1_mark, t1_mark, markWord::unlocked_value);
    eor(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
    br(Assembler::NE, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    str(obj, Address(rthread, t2_top));
    addw(t2_top, t2_top, oopSize);
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register t1_monitor = t1;

    if (!UseObjectMonitorTable) {
      assert(t1_monitor == t1_mark, "should be the same here");
    } else {
      Label monitor_found;

      // Load cache address
      lea(t3_t, Address(rthread, JavaThread::om_cache_oops_offset()));

      const int num_unrolled = 2;
      for (int i = 0; i < num_unrolled; i++) {
        ldr(t1, Address(t3_t));
        cmp(obj, t1);
        br(Assembler::EQ, monitor_found);
        increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
      }

      Label loop;

      // Search for obj in cache.
      bind(loop);

      // Check for match.
      ldr(t1, Address(t3_t));
      cmp(obj, t1);
      br(Assembler::EQ, monitor_found);

      // Search until null encountered, guaranteed _null_sentinel at end.
      increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
      cbnz(t1, loop);
      // Cache Miss, NE set from cmp above, cbnz does not set flags
      b(slow_path);

      bind(monitor_found);
      ldr(t1_monitor, Address(t3_t, OMCache::oop_to_monitor_difference()));
    }

    const Register t2_owner_addr = t2;
    const Register t3_owner = t3;
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
    const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);

    Label monitor_locked;

    // Compute owner address.
    lea(t2_owner_addr, owner_address);

    // CAS owner (null => current thread).
    cmpxchg(t2_owner_addr, zr, rthread, Assembler::xword, /*acquire*/ true,
            /*release*/ false, /*weak*/ false, t3_owner);
    br(Assembler::EQ, monitor_locked);

    // Check if recursive.
    cmp(t3_owner, rthread);
    br(Assembler::NE, slow_path);

    // Recursive.
    increment(recursions_address, 1);

    bind(monitor_locked);
    if (UseObjectMonitorTable) {
      str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
    }
  }

  bind(locked);
  increment(Address(rthread, JavaThread::held_monitor_count_offset()));

#ifdef ASSERT
  // Check that locked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Lock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Lock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}

void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register box, Register t1,
                                                Register t2, Register t3) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert_different_registers(obj, box, t1, t2, t3);

  // Handle inflated monitor.
  Label inflated, inflated_load_mark;
  // Finish fast unlock successfully. MUST branch to this with flag == EQ.
  Label unlocked;
  // Finish fast unlock unsuccessfully. MUST branch to this with flag == NE.
  Label slow_path;

  const Register t1_mark = t1;
  const Register t2_top = t2;
  const Register t3_t = t3;

  { // Lightweight unlock

    Label push_and_slow_path;

    // Check if obj is top of lock-stack.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    subw(t2_top, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    // Top of lock stack was not obj. Must be monitor.
    br(Assembler::NE, inflated_load_mark);

    // Pop lock-stack.
    DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));

    // Check if recursive.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, unlocked);

    // Not recursive.
    // Load Mark.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Check header for monitor (0b10).
    // Because we got here by popping (meaning we pushed this obj when
    // locking it), there is no monitor in the box. So we push the obj
    // back so that the runtime can fix any potential anonymous owner.
    tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);

    // Try to unlock. Transition lock bits 0b00 => 0b01
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
    orr(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
    br(Assembler::EQ, unlocked);

    bind(push_and_slow_path);
    // Compare and exchange failed.
    // Restore lock-stack and handle the unlock in runtime.
    DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
    addw(t2_top, t2_top, oopSize);
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(slow_path);
  }


  { // Handle inflated monitor.
    bind(inflated_load_mark);
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
#ifdef ASSERT
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

#ifdef ASSERT
    Label check_done;
    subw(t2_top, t2_top, oopSize);
    cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
    br(Assembler::LT, check_done);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    br(Assembler::NE, inflated);
    stop("Fast Unlock lock on stack");
    bind(check_done);
#endif

    const Register t1_monitor = t1;

    if (!UseObjectMonitorTable) {
      assert(t1_monitor == t1_mark, "should be the same here");

      // Untag the monitor.
      add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
    } else {
      ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
      cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
      br(Assembler::LO, slow_path);
    }

    const Register t2_recursions = t2;
    Label not_recursive;

    // Check if recursive.
    ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
    cbz(t2_recursions, not_recursive);

    // Recursive unlock.
    sub(t2_recursions, t2_recursions, 1u);
    str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
    // Set flag == EQ
    cmp(t2_recursions, t2_recursions);
    b(unlocked);

    bind(not_recursive);

    const Register t2_owner_addr = t2;

    // Compute owner address.
    lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));

    // Set owner to null.
    // Release to satisfy the JMM
    stlr(zr, t2_owner_addr);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry lists are empty.
    ldr(rscratch1, Address(t1_monitor, ObjectMonitor::EntryList_offset()));
    ldr(t3_t, Address(t1_monitor, ObjectMonitor::cxq_offset()));
    orr(rscratch1, rscratch1, t3_t);
    cmp(rscratch1, zr);
    br(Assembler::EQ, unlocked);  // If so we are done.

    // Check if there is a successor.
    ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
    cmp(rscratch1, zr);
    br(Assembler::NE, unlocked);  // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));

    cmp(zr, rthread); // Set Flag to NE => slow path
    b(slow_path);
  }

  bind(unlocked);
  decrement(Address(rthread, JavaThread::held_monitor_count_offset()));
  cmp(zr, zr); // Set Flags to EQ => fast path

#ifdef ASSERT
  // Check that unlocked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Unlock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Unlock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}

// Search for str1 in str2 and return index or -1
// Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1 == -1.
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

// The Boyer Moore algorithm is based on the description here:-
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// This describes an algorithm with 2 shift rules: the 'Bad Character' rule
// and the 'Good Suffix' rule.
//
// These rules are essentially heuristics for how far we can shift the
// pattern along the search string.
//
// The implementation here uses the 'Bad Character' rule only because of the
// complexity of initialisation for the 'Good Suffix' rule.
//
// This is also known as the Boyer-Moore-Horspool algorithm:-
//
// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
//
// This particular implementation has a few Java-specific optimizations.
//
// #define ASIZE 256
//
//    int bm(unsigned char *x, int m, unsigned char *y, int n) {
//       int i, j;
//       unsigned c;
//       unsigned char bc[ASIZE];
//
//       /* Preprocessing */
//       for (i = 0; i < ASIZE; ++i)
//          bc[i] = m;
//       for (i = 0; i < m - 1; ) {
//          c = x[i];
//          ++i;
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef PATTERN_STRING_IS_LATIN1
//          bc[c] = m - i;
//          #else
//          if (c < ASIZE) bc[c] = m - i;
//          #endif
//       }
//
//       /* Searching */
//       j = 0;
//       while (j <= n - m) {
//          c = y[j + m - 1];
//          if (x[m-1] == c) {
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//            if (i < 0) return j;
//          }
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef SOURCE_STRING_IS_LATIN1
//          // LL case: (c < 256) always true. Remove branch
//          j += bc[y[j+m-1]];
//          #endif
//          #ifdef PATTERN_IS_UTF_AND_SOURCE_IS_UTF
//          // UU case: need if (c < ASIZE) check. Skip 1 character if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += 1;
//          #endif
//          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
//          // UL case: need if (c < ASIZE) check. Skip <pattern length> if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += m;
//          #endif
//       }
//       return -1;
//    }

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >= 8, so we can read at least 1 register for the cases
    // when UTF->Latin1 conversion is not needed (8 chars for LL, 4 for UU)
    // and half a register for the UL case. We'll re-read the last character
    // in the inner pre-loop code to keep the outer pre-loop to a single load.
    const int firstStep = isL ? 7 : 3;

    const int ASIZE = 256;
    const int STORED_BYTES = 32; // amount of bytes stored per instruction
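    // v0 was broadcast-filled with cnt1 (the pattern length, guaranteed to
    // be < 256 by the dispatch above), so the loop below initializes all
    // 256 bc[] skip-table entries on the stack to the default skip
    // distance, 32 bytes per store.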
    sub(sp, sp, ASIZE);
    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
    mov(ch1, sp);
    BIND(BM_INIT_LOOP);
      stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
      subs(tmp5, tmp5, 1);
      br(GT, BM_INIT_LOOP);

      sub(cnt1tmp, cnt1, 1);
      mov(tmp5, str2);
      add(str2end, str2, result_tmp, LSL, str2_chr_shift);
      sub(ch2, cnt1, 1);
      mov(tmp3, str1);
    BIND(BCLOOP);
      (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
      if (!str1_isL) {
        subs(zr, ch1, ASIZE);
        br(HS, BCSKIP);
      }
      strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
      subs(ch2, ch2, 1);
      br(GT, BCLOOP);

      add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
      if (str1_isL == str2_isL) {
        // load last 8 bytes (8LL/4UU symbols)
        ldr(tmp6, Address(tmp6, -wordSize));
      } else {
        ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes (4 symbols)
        // convert Latin1 to UTF. We'll have to wait until load completed, but
        // it's still faster than per-character loads+checks
        lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
        ubfx(ch1, tmp6, 8, 8); // str1[N-2]
        ubfx(ch2, tmp6, 16, 8); // str1[N-3]
        andr(tmp6, tmp6, 0xFF); // str1[N-4]
        orr(ch2, ch1, ch2, LSL, 16);
        orr(tmp6, tmp6, tmp3, LSL, 48);
        orr(tmp6, tmp6, ch2, LSL, 16);
      }
    BIND(BMLOOPSTR2);
      (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
      sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
      if (str1_isL == str2_isL) {
        // re-init tmp3. It's for free because it's executed in parallel with
        // load above. Alternative is to initialize it before loop, but it'll
        // affect performance on in-order systems with 2 or more ld/st pipelines
        lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
      }
      if (!isL) { // UU/UL case
        lsl(ch2, cnt1tmp, 1); // offset in bytes
      }
      cmp(tmp3, skipch);
      br(NE, BMSKIP);
      ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
      mov(ch1, tmp6);
      if (isL) {
        b(BMLOOPSTR1_AFTER_LOAD);
      } else {
        sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
        b(BMLOOPSTR1_CMP);
      }
    BIND(BMLOOPSTR1);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    BIND(BMLOOPSTR1_AFTER_LOAD);
      subs(cnt1tmp, cnt1tmp, 1);
      br(LT, BMLOOPSTR1_LASTCMP);
    BIND(BMLOOPSTR1_CMP);
      cmp(ch1, ch2);
      br(EQ, BMLOOPSTR1);
    BIND(BMSKIP);
      if (!isL) {
        // if we've met a UTF symbol while searching the Latin1 pattern,
        // then we can skip cnt1 symbols
        if (str1_isL != str2_isL) {
          mov(result_tmp, cnt1);
        } else {
          mov(result_tmp, 1);
        }
        subs(zr, skipch, ASIZE);
        br(HS, BMADV);
      }
      ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
      sub(cnt1tmp, cnt1, 1);
      add(str2, str2, result_tmp, LSL, str2_chr_shift);
      cmp(str2, str2end);
      br(LE, BMLOOPSTR2);
      add(sp, sp, ASIZE);
      b(NOMATCH);
    BIND(BMLOOPSTR1_LASTCMP);
      cmp(ch1, ch2);
      br(NE, BMSKIP);
    BIND(BMMATCH);
      sub(result, str2, tmp5);
      if (!str2_isL) lsr(result, result, 1);
      add(sp, sp, ASIZE);
      b(DONE);

    BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
    br(LT, LINEAR_MEDIUM);
    mov(result, zr);
    RuntimeAddress stub = nullptr;
    if (isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
      assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
    } else if (str1_isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
    } else {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
      assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
    }
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);
  }

  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    Register str2tmp = tmp2;
    Register first = tmp3;

    if (icnt1 == -1)
    {
        Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

        cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
        br(LT, DOSHORT);
      BIND(LINEAR_MEDIUM);
        (this->*str1_load_1chr)(first, Address(str1));
        lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
        sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(FIRST_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmp(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        adds(cnt1tmp, cnt1_neg, str1_chr_size);
        add(cnt2tmp, cnt2_neg, str2_chr_size);
        br(GE, MATCH);

      BIND(STR1_NEXT);
        (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        adds(cnt1tmp, cnt1tmp, str1_chr_size);
        add(cnt2tmp, cnt2tmp, str2_chr_size);
        br(LT, STR1_NEXT);
        b(MATCH);

      BIND(DOSHORT);
      if (str1_isL == str2_isL) {
        cmp(cnt1, (u1)2);
        br(LT, DO1);
        br(GT, DO3);
      }
    }

    if (icnt1 == 4) {
      Label CH1_LOOP;

        (this->*load_4chr)(ch1, str1);
        sub(result_tmp, cnt2, 4);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(CH1_LOOP);
        (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
      Label CH1_LOOP;

      BIND(DO2);
        (this->*load_2chr)(ch1, str1);
        if (icnt1 == 2) {
          sub(result_tmp, cnt2, 2);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(CH1_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

      BIND(DO3);
        (this->*load_2chr)(first, str1);
        (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
        if (icnt1 == 3) {
          sub(result_tmp, cnt2, 3);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(FIRST_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmpw(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        b(MATCH);
    }

    if (icnt1 == -1 || icnt1 == 1) {
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;

      BIND(DO1);
        (this->*str1_load_1chr)(ch1, str1);
        cmp(cnt2, (u1)8);
        br(LT, DO1_SHORT);

        sub(result_tmp, cnt2, 8/str2_chr_size);
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
        mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

        if (str2_isL) {
          orr(ch1, ch1, ch1, LSL, 8);
        }
        orr(ch1, ch1, ch1, LSL, 16);
        orr(ch1, ch1, ch1, LSL, 32);
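        // The loop below uses a SWAR (SIMD-within-a-register) zero-lane
        // test to scan 8 bytes (or 4 chars) per iteration. A scalar sketch
        // of what the eor/sub/orr/bics sequence computes (byte lanes shown;
        // the 16-bit variant uses 0x0001... and 0x7fff... constants):
        //
        //   x = chunk ^ ch_repeated;  // lane is zero where chunk matches ch1
        //   t = (x - 0x0101010101010101) & ~(x | 0x7f7f7f7f7f7f7f7f);
        //   // t is non-zero iff some lane of x is zero, i.e. ch1 occurs
        //
        // bics computes the final and-not and sets flags, so one branch
        // tests all lanes at once.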
      BIND(CH1_LOOP);
        ldr(ch2, Address(str2, cnt2_neg));
        eor(ch2, ch1, ch2);
        sub(tmp1, ch2, tmp3);
        orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
        bics(tmp1, tmp1, tmp2);
        br(NE, HAS_ZERO);
        adds(cnt2_neg, cnt2_neg, 8);
        br(LT, CH1_LOOP);

        cmp(cnt2_neg, (u1)8);
        mov(cnt2_neg, 0);
        br(LT, CH1_LOOP);
        b(NOMATCH);

      BIND(HAS_ZERO);
        rev(tmp1, tmp1);
        clz(tmp1, tmp1);
        add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
        b(MATCH);

      BIND(DO1_SHORT);
        mov(result_tmp, cnt2);
        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
      BIND(DO1_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmpw(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);

void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);
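
  // The same SWAR zero-lane test as in string_indexof above, with 16-bit
  // lanes: (x - 0x0001...) & ~(x | 0x7fff...) is non-zero iff some
  // halfword of x is zero, i.e. some character position matched ch.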

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7fff7fff7fff7fff);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1, Address::uxtw(1)));
    sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
    ldrh(ch1, Address(str1, cnt1_neg));
    cmpw(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 2);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}

void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
                                                Register ch, Register result,
                                                FloatRegister ztmp1,
                                                FloatRegister ztmp2,
                                                PRegister tmp_pg,
                                                PRegister tmp_pdn, bool isL)
{
  // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
  assert(tmp_pg->is_governing(),
         "this register has to be a governing predicate register");

  Label LOOP, MATCH, DONE, NOMATCH;
  Register vec_len = rscratch1;
  Register idx = rscratch2;

  SIMD_RegVariant T = isL ? B : H;

  cbz(cnt1, NOMATCH);

  // Assign the particular char throughout the vector.
  sve_dup(ztmp2, T, ch);
  if (isL) {
    sve_cntb(vec_len);
  } else {
    sve_cnth(vec_len);
  }
  mov(idx, 0);

  // Generate a predicate to control the reading of input string.
  sve_whilelt(tmp_pg, T, idx, cnt1);

  BIND(LOOP);
    // Read a vector of 8- or 16-bit data depending on the string type. Note
    // that inactive elements indicated by the predicate register won't cause
    // a data read from memory to the destination vector.
    if (isL) {
      sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
    } else {
      sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
    }
    add(idx, idx, vec_len);

    // Perform the comparison. An element of the destination predicate is set
    // to active if the particular char is matched.
    sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);

    // Branch if the particular char is found.
    br(NE, MATCH);

    sve_whilelt(tmp_pg, T, idx, cnt1);

    // Loop back if the particular char not found.
    br(MI, LOOP);

  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);

  BIND(MATCH);
    // Undo the index increment.
    sub(idx, idx, vec_len);

    // Crop the vector to find its location.
    sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
    add(result, idx, -1);
    sve_incp(result, T, tmp_pdn);
  BIND(DONE);
}

void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)8);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 8);
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 8);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);

  mov(tmp3, 0x0101010101010101);
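
  // Byte-lane variant of the SWAR zero-lane test used above:
  // (x - 0x0101...) & ~(x | 0x7f7f...) is non-zero iff some byte of x
  // is zero, i.e. some Latin1 character matched ch.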

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1));
    sub(cnt1_neg, zr, cnt1);
  BIND(DO1_LOOP);
    ldrb(ch1, Address(str1, cnt1_neg));
    cmp(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 1);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg);
  BIND(DONE);
}

// Compare strings.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      ldr(tmp1, Address(str1));
      ldrs(vtmp, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFF);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFF);
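    // rscratch2 = tmp1 ^ tmp2, so its lowest set bit marks the first
    // differing character (the words were loaded little-endian). clz of
    // the byte-reversed value counts that bit's offset from the low end,
    // and the andr rounds it down to a character boundary so both words
    // can be shifted right to expose the differing characters.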
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
    RuntimeAddress stub = nullptr;
    switch(ae) {
      case StrIntrinsicNode::LL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
        break;
      case StrIntrinsicNode::UU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
        break;
      case StrIntrinsicNode::LU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
        break;
      case StrIntrinsicNode::UL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
        break;
      default:
        ShouldNotReachHere();
    }
    assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // Arrange the code so that most branches are taken while the next
  // characters are loading, and loads of the next characters overlap the
  // comparison of the previous ones.
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}

void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
                                     FloatRegister src2, Condition cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
  FloatRegister zn = src1, zm = src2;
  bool needs_negation = false;
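  // NEON provides register-register compares only for GT, GE, HI, HS and
  // EQ. Synthesize LT/LE/LO/LS by swapping the operands of the
  // complementary compare, and NE by negating the EQ result.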
1460   switch (cond) {
1461     case LT: cond = GT; zn = src2; zm = src1; break;
1462     case LE: cond = GE; zn = src2; zm = src1; break;
1463     case LO: cond = HI; zn = src2; zm = src1; break;
1464     case LS: cond = HS; zn = src2; zm = src1; break;
1465     case NE: cond = EQ; needs_negation = true; break;
1466     default:
1467       break;
1468   }
1469 
1470   if (is_floating_point_type(bt)) {
1471     fcm(cond, dst, size, zn, zm);
1472   } else {
1473     cm(cond, dst, size, zn, zm);
1474   }
1475 
1476   if (needs_negation) {
1477     notr(dst, isQ ? T16B : T8B, dst);
1478   }
1479 }
1480 
1481 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1482                                           Condition cond, bool isQ) {
1483   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1484   if (bt == T_FLOAT || bt == T_DOUBLE) {
1485     if (cond == Assembler::NE) {
1486       fcm(Assembler::EQ, dst, size, src);
1487       notr(dst, isQ ? T16B : T8B, dst);
1488     } else {
1489       fcm(cond, dst, size, src);
1490     }
1491   } else {
1492     if (cond == Assembler::NE) {
1493       cm(Assembler::EQ, dst, size, src);
1494       notr(dst, isQ ? T16B : T8B, dst);
1495     } else {
1496       cm(cond, dst, size, src);
1497     }
1498   }
1499 }
1500 
1501 // Compress the least significant bit of each byte in dst into the lowest byte
1502 // of dst, and clear the higher garbage bits.
1503 void C2_MacroAssembler::bytemask_compress(Register dst) {
1504   // Example input, dst = 0x01 00 00 00 01 01 00 01
1505   // The "??" bytes are garbage.
1506   orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
1507   orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
1508   orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
1509   andr(dst, dst, 0xff);                   // dst = 0x8D
1510 }
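     // A scalar C model of the OR-shift cascade above (illustrative only; the
     // helper name is made up). With mask bit i held in the LSB of byte i of x,
     // the three ORs funnel all eight bits into the low byte:
     //   uint64_t bytemask_compress_model(uint64_t x) {
     //     x |= x >> 7;   // adjacent pairs of mask bits meet
     //     x |= x >> 14;  // groups of four bits meet
     //     x |= x >> 28;  // all eight bits reach bits [7:0]
     //     return x & 0xff;
     //   }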
1511 
1512 // Pack the lowest-numbered bit of each mask element in src into a long value
1513 // in dst, covering at most the first 64 lanes.
1514 // Clobbers: rscratch1 if UseSVE == 1 or the hardware does not support FEAT_BITPERM.
1515 void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
1516                                          FloatRegister vtmp1, FloatRegister vtmp2) {
1517   assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1518   assert_different_registers(dst, rscratch1);
1519   assert_different_registers(vtmp1, vtmp2);
1520 
1521   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1522   // Example:   src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
1523   // Expected:  dst = 0x658D
1524 
1525   // Convert the mask into a vector of 0x00/0x01 bytes, one per lane.
1526   // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
1527   sve_cpy(vtmp1, size, src, 1, false);
1528   if (bt != T_BYTE) {
1529     sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
1530   }
1531 
1532   if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
1533     // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
1534     // is to compress the significant bit of each byte in a cross-lane way. As
1535     // there is no cross-lane bit-compress instruction, we use BEXT (bit-compress
1536     // within each lane) with the largest lane size (T = D) and then concatenate
1537     // the results.
1538 
1539     // The second source input of BEXT, initialized with 0x01 in each byte.
1540     // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1541     sve_dup(vtmp2, B, 1);
1542 
1543     // BEXT vtmp1.D, vtmp1.D, vtmp2.D
1544     // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1545     // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1546     //         ---------------------------------------
1547     // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1548     sve_bext(vtmp1, D, vtmp1, vtmp2);
1549 
1550     // Concatenate the least significant 8 bits of each 8-byte group, and
1551     // extract the result into dst.
1552     // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1553     // dst   = 0x658D
1554     if (lane_cnt <= 8) {
1555       // No need to concatenate.
1556       umov(dst, vtmp1, B, 0);
1557     } else if (lane_cnt <= 16) {
1558       ins(vtmp1, B, vtmp1, 1, 8);
1559       umov(dst, vtmp1, H, 0);
1560     } else {
1561       // As the lane count is 64 at most, the final expected value must be in
1562       // the lowest 64 bits after narrowing vtmp1 from D to B.
1563       sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
1564       umov(dst, vtmp1, D, 0);
1565     }
1566   } else if (UseSVE > 0) {
1567     // Compress the lowest 8 bytes.
1568     fmovd(dst, vtmp1);
1569     bytemask_compress(dst);
1570     if (lane_cnt <= 8) return;
1571 
1572     // Repeat on higher bytes and join the results.
1573     // Compress 8 bytes in each iteration.
1574     for (int idx = 1; idx < (lane_cnt / 8); idx++) {
1575       sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
1576       bytemask_compress(rscratch1);
1577       orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
1578     }
1579   } else {
1580     assert(false, "unsupported");
1581     ShouldNotReachHere();
1582   }
1583 }
1584 
1585 // Unpack the mask, a long value in src, into predicate register dst based on the
1586 // corresponding data type. Note that dst can support at most 64 lanes.
1587 // The examples below give the expected dst predicate register for different
1588 // types, with a valid src (0x658D) on a machine with a 1024-bit vector size.
1589 // BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
1590 // SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
1591 // INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
1592 // LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1593 //
1594 // The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D,
1595 // which has 24 significant bits, would be an invalid input if the dst predicate
1596 // register refers to a LONG type 1024-bit vector, which has at most 16 lanes.
1597 void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
1598                                            FloatRegister vtmp1, FloatRegister vtmp2) {
1599   assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
1600          lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
1601   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1602   // Example:   src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
1603   // Expected:  dst = 0b01100101 10001101
1604 
1605   // Put long value from general purpose register into the first lane of vector.
1606   // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1607   sve_dup(vtmp1, B, 0);
1608   mov(vtmp1, D, 0, src);
1609 
1610   // sve_cmp below generates the mask with byte as the minimum unit, so the
1611   // bit mask now held in the first lane must first be expanded into a byte
1612   // mask, which is what SVE2's BDEP instruction does.
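       // For reference: BDEP(x, m) deposits the low bits of x into the positions
       // of the set bits of m within each lane. With m = 0x0101010101010101,
       // bit i of x lands in byte i, e.g. BDEP(0x8D, m) == 0x0100000001010001.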
1613 
1614   // The first source input of BDEP. Place each significant byte in its own 8-byte group.
1615   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1616   if (lane_cnt <= 8) {
1617     // Nothing to do, as only one significant byte exists.
1618   } else if (lane_cnt <= 16) {
1619     ins(vtmp1, B, vtmp1, 8, 1);
1620     mov(vtmp1, B, 1, zr);
1621   } else {
1622     sve_vector_extend(vtmp1, D, vtmp1, B);
1623   }
1624 
1625   // The second source input of BDEP instruction, initialized with 0x01 for each byte.
1626   // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1627   sve_dup(vtmp2, B, 1);
1628 
1629   // BDEP vtmp1.D, vtmp1.D, vtmp2.D
1630   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1631   // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1632   //         ---------------------------------------
1633   // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1634   sve_bdep(vtmp1, D, vtmp1, vtmp2);
1635 
1636   if (bt != T_BYTE) {
1637     sve_vector_extend(vtmp1, size, vtmp1, B);
1638   }
1639   // Generate mask according to the given vector, in which the elements have been
1640   // extended to expected type.
1641   // dst = 0b01100101 10001101
1642   sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
1643 }
1644 
1645 // Clobbers: rflags
1646 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1647                                     FloatRegister zn, FloatRegister zm, Condition cond) {
1648   assert(pg->is_governing(), "This register has to be a governing predicate register");
1649   FloatRegister z1 = zn, z2 = zm;
1650   switch (cond) {
1651     case LE: z1 = zm; z2 = zn; cond = GE; break;
1652     case LT: z1 = zm; z2 = zn; cond = GT; break;
1653     case LO: z1 = zm; z2 = zn; cond = HI; break;
1654     case LS: z1 = zm; z2 = zn; cond = HS; break;
1655     default:
1656       break;
1657   }
1658 
1659   SIMD_RegVariant size = elemType_to_regVariant(bt);
1660   if (is_floating_point_type(bt)) {
1661     sve_fcm(cond, pd, size, pg, z1, z2);
1662   } else {
1663     assert(is_integral_type(bt), "unsupported element type");
1664     sve_cmp(cond, pd, size, pg, z1, z2);
1665   }
1666 }
1667 
1668 // Get index of the last mask lane that is set
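     // For example, with T_INT lanes on a 128-bit machine (4 lanes) and src
     // lanes {0, 1} active:
     //   sve_rev  -> lanes {2, 3} active
     //   sve_brkb -> lanes {0, 1} active (all lanes before the first active one)
     //   sve_cntp -> 2, so dst = (4 - 1) - 2 = 1, the index of the last set lane.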
1669 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
1670   SIMD_RegVariant size = elemType_to_regVariant(bt);
1671   sve_rev(ptmp, size, src);
1672   sve_brkb(ptmp, ptrue, ptmp, false);
1673   sve_cntp(dst, size, ptrue, ptmp);
1674   movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
1675   subw(dst, rscratch1, dst);
1676 }
1677 
1678 // Extend integer vector src to dst with the same lane count
1679 // but larger element size, e.g. 4B -> 4I
1680 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1681                                            FloatRegister src, BasicType src_bt, bool is_unsigned) {
1682   if (src_bt == T_BYTE) {
1683     if (dst_bt == T_SHORT) {
1684       // 4B/8B to 4S/8S
1685       _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1686     } else {
1687       // 4B to 4I
1688       assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1689       _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1690       _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1691     }
1692   } else if (src_bt == T_SHORT) {
1693     // 4S to 4I
1694     assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1695     _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1696   } else if (src_bt == T_INT) {
1697     // 2I to 2L
1698     assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1699     _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1700   } else {
1701     ShouldNotReachHere();
1702   }
1703 }
1704 
1705 // Narrow integer vector src down to dst with the same lane count
1706 // but smaller element size, e.g. 4I -> 4B
1707 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1708                                            FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1709   if (src_bt == T_SHORT) {
1710     // 4S/8S to 4B/8B
1711     assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1712     assert(dst_bt == T_BYTE, "unsupported");
1713     xtn(dst, T8B, src, T8H);
1714   } else if (src_bt == T_INT) {
1715     // 4I to 4B/4S
1716     assert(src_vlen_in_bytes == 16, "unsupported");
1717     assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1718     xtn(dst, T4H, src, T4S);
1719     if (dst_bt == T_BYTE) {
1720       xtn(dst, T8B, dst, T8H);
1721     }
1722   } else if (src_bt == T_LONG) {
1723     // 2L to 2I
1724     assert(src_vlen_in_bytes == 16, "unsupported");
1725     assert(dst_bt == T_INT, "unsupported");
1726     xtn(dst, T2S, src, T2D);
1727   } else {
1728     ShouldNotReachHere();
1729   }
1730 }
1731 
1732 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1733                                           FloatRegister src, SIMD_RegVariant src_size,
1734                                           bool is_unsigned) {
1735   assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
1736 
1737   if (src_size == B) {
1738     switch (dst_size) {
1739     case H:
1740       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1741       break;
1742     case S:
1743       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1744       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1745       break;
1746     case D:
1747       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1748       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1749       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1750       break;
1751     default:
1752       ShouldNotReachHere();
1753     }
1754   } else if (src_size == H) {
1755     if (dst_size == S) {
1756       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1757     } else { // D
1758       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1759       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1760     }
1761   } else if (src_size == S) {
1762     _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
1763   }
1764 }
1765 
1766 // Vector narrow from src to dst with specified element sizes.
1767 // The high part of the dst vector is filled with zero.
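     // For example, narrowing D -> S with sve_uzp1(dst, S, src, tmp) concatenates
     // the even-numbered S-sized elements of src (the low half of each D element)
     // into the low half of dst, and those of the all-zero tmp into the high half.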
1768 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1769                                           FloatRegister src, SIMD_RegVariant src_size,
1770                                           FloatRegister tmp) {
1771   assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1772   assert_different_registers(src, tmp);
1773   sve_dup(tmp, src_size, 0);
1774   if (src_size == D) {
1775     switch (dst_size) {
1776     case S:
1777       sve_uzp1(dst, S, src, tmp);
1778       break;
1779     case H:
1780       assert_different_registers(dst, tmp);
1781       sve_uzp1(dst, S, src, tmp);
1782       sve_uzp1(dst, H, dst, tmp);
1783       break;
1784     case B:
1785       assert_different_registers(dst, tmp);
1786       sve_uzp1(dst, S, src, tmp);
1787       sve_uzp1(dst, H, dst, tmp);
1788       sve_uzp1(dst, B, dst, tmp);
1789       break;
1790     default:
1791       ShouldNotReachHere();
1792     }
1793   } else if (src_size == S) {
1794     if (dst_size == H) {
1795       sve_uzp1(dst, H, src, tmp);
1796     } else { // B
1797       assert_different_registers(dst, tmp);
1798       sve_uzp1(dst, H, src, tmp);
1799       sve_uzp1(dst, B, dst, tmp);
1800     }
1801   } else if (src_size == H) {
1802     sve_uzp1(dst, B, src, tmp);
1803   }
1804 }
1805 
1806 // Extend src predicate to dst predicate with the same lane count but larger
1807 // element size, e.g. a mask for a 64-bit byte vector -> one for a 512-bit long vector
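     // Each sve_punpklo widens the predicate lanes in the low half to twice the
     // element size, so every doubling costs one instruction and an 8x extension
     // (e.g. a byte mask to a long mask) takes three applications, as below.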
1808 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1809                                              uint dst_element_length_in_bytes,
1810                                              uint src_element_length_in_bytes) {
1811   if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1812     sve_punpklo(dst, src);
1813   } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1814     sve_punpklo(dst, src);
1815     sve_punpklo(dst, dst);
1816   } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1817     sve_punpklo(dst, src);
1818     sve_punpklo(dst, dst);
1819     sve_punpklo(dst, dst);
1820   } else {
1821     assert(false, "unsupported");
1822     ShouldNotReachHere();
1823   }
1824 }
1825 
1826 // Narrow src predicate to dst predicate with the same lane count but
1827 // smaller element size, e.g. a mask for a 512-bit long vector -> one for a 64-bit byte vector
1828 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1829                                              uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
1830   // The insignificant bits in the src predicate are expected to be zero.
1831   // To ensure that the higher-order bits of the narrowed result are zero, an
1832   // all-false predicate is passed as the second argument. An example narrowing
1833   // with a given mask, 128Long -> 64Int on a 128-bit machine, i.e. 2L -> 2I:
1834   // Mask (for 2 longs):                              TF
1835   // Predicate register for the above mask (16 bits): 00000001 00000000
1836   // After narrowing (uzp1 dst.b, src.b, ptmp.b):     0000 0000 0001 0000
1837   // which is the mask TF for 2 ints (the lower half is significant; the upper half is 0).
1838   assert_different_registers(src, ptmp);
1839   assert_different_registers(dst, ptmp);
1840   sve_pfalse(ptmp);
1841   if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1842     sve_uzp1(dst, B, src, ptmp);
1843   } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1844     sve_uzp1(dst, H, src, ptmp);
1845     sve_uzp1(dst, B, dst, ptmp);
1846   } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1847     sve_uzp1(dst, S, src, ptmp);
1848     sve_uzp1(dst, H, dst, ptmp);
1849     sve_uzp1(dst, B, dst, ptmp);
1850   } else {
1851     assert(false, "unsupported");
1852     ShouldNotReachHere();
1853   }
1854 }
1855 
1856 // Vector reduction add for integral type with ASIMD instructions.
1857 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1858                                                  Register isrc, FloatRegister vsrc,
1859                                                  unsigned vector_length_in_bytes,
1860                                                  FloatRegister vtmp) {
1861   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1862   assert_different_registers(dst, isrc);
1863   bool isQ = vector_length_in_bytes == 16;
1864 
1865   BLOCK_COMMENT("neon_reduce_add_integral {");
1866     switch(bt) {
1867       case T_BYTE:
1868         addv(vtmp, isQ ? T16B : T8B, vsrc);
1869         smov(dst, vtmp, B, 0);
1870         addw(dst, dst, isrc, ext::sxtb);
1871         break;
1872       case T_SHORT:
1873         addv(vtmp, isQ ? T8H : T4H, vsrc);
1874         smov(dst, vtmp, H, 0);
1875         addw(dst, dst, isrc, ext::sxth);
1876         break;
1877       case T_INT:
1878         isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1879         umov(dst, vtmp, S, 0);
1880         addw(dst, dst, isrc);
1881         break;
1882       case T_LONG:
1883         assert(isQ, "unsupported");
1884         addpd(vtmp, vsrc);
1885         umov(dst, vtmp, D, 0);
1886         add(dst, dst, isrc);
1887         break;
1888       default:
1889         assert(false, "unsupported");
1890         ShouldNotReachHere();
1891     }
1892   BLOCK_COMMENT("} neon_reduce_add_integral");
1893 }
1894 
1895 // Vector reduction multiply for integral type with ASIMD instructions.
1896 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
1897 // Clobbers: rscratch1
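     // A sketch of the folding below for T_BYTE with 16 lanes (illustrative only):
     //   v[0..7] *= v[8..15];         // fold the two 8-byte halves
     //   v[0..3] *= v[4..7];
     //   v[0..1] *= v[2..3];
     //   dst = (int8_t)(v[0] * isrc); // the last two lanes are multiplied
     //   dst = (int8_t)(v[1] * dst);  // in scalar registers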
1898 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
1899                                                  Register isrc, FloatRegister vsrc,
1900                                                  unsigned vector_length_in_bytes,
1901                                                  FloatRegister vtmp1, FloatRegister vtmp2) {
1902   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1903   bool isQ = vector_length_in_bytes == 16;
1904 
1905   BLOCK_COMMENT("neon_reduce_mul_integral {");
1906     switch(bt) {
1907       case T_BYTE:
1908         if (isQ) {
1909           // Iteratively multiply the lower and upper halves of the vector.
1910           // vtmp1 = vsrc[8:15]
1911           ins(vtmp1, D, vsrc, 0, 1);
1912           // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
1913           mulv(vtmp1, T8B, vtmp1, vsrc);
1914           // vtmp2 = vtmp1[4:7]
1915           ins(vtmp2, S, vtmp1, 0, 1);
1916           // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
1917           mulv(vtmp1, T8B, vtmp2, vtmp1);
1918         } else {
1919           ins(vtmp1, S, vsrc, 0, 1);
1920           mulv(vtmp1, T8B, vtmp1, vsrc);
1921         }
1922         // vtmp2 = vtmp1[2:3]
1923         ins(vtmp2, H, vtmp1, 0, 1);
1924         // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
1925         mulv(vtmp2, T8B, vtmp2, vtmp1);
1926         // dst = vtmp2[0] * isrc * vtmp2[1]
1927         umov(rscratch1, vtmp2, B, 0);
1928         mulw(dst, rscratch1, isrc);
1929         sxtb(dst, dst);
1930         umov(rscratch1, vtmp2, B, 1);
1931         mulw(dst, rscratch1, dst);
1932         sxtb(dst, dst);
1933         break;
1934       case T_SHORT:
1935         if (isQ) {
1936           ins(vtmp2, D, vsrc, 0, 1);
1937           mulv(vtmp2, T4H, vtmp2, vsrc);
1938           ins(vtmp1, S, vtmp2, 0, 1);
1939           mulv(vtmp1, T4H, vtmp1, vtmp2);
1940         } else {
1941           ins(vtmp1, S, vsrc, 0, 1);
1942           mulv(vtmp1, T4H, vtmp1, vsrc);
1943         }
1944         umov(rscratch1, vtmp1, H, 0);
1945         mulw(dst, rscratch1, isrc);
1946         sxth(dst, dst);
1947         umov(rscratch1, vtmp1, H, 1);
1948         mulw(dst, rscratch1, dst);
1949         sxth(dst, dst);
1950         break;
1951       case T_INT:
1952         if (isQ) {
1953           ins(vtmp1, D, vsrc, 0, 1);
1954           mulv(vtmp1, T2S, vtmp1, vsrc);
1955         } else {
1956           vtmp1 = vsrc;
1957         }
1958         umov(rscratch1, vtmp1, S, 0);
1959         mul(dst, rscratch1, isrc);
1960         umov(rscratch1, vtmp1, S, 1);
1961         mul(dst, rscratch1, dst);
1962         break;
1963       case T_LONG:
1964         umov(rscratch1, vsrc, D, 0);
1965         mul(dst, isrc, rscratch1);
1966         umov(rscratch1, vsrc, D, 1);
1967         mul(dst, dst, rscratch1);
1968         break;
1969       default:
1970         assert(false, "unsupported");
1971         ShouldNotReachHere();
1972     }
1973   BLOCK_COMMENT("} neon_reduce_mul_integral");
1974 }
1975 
1976 // Vector reduction multiply for floating-point type with ASIMD instructions.
1977 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
1978                                            FloatRegister fsrc, FloatRegister vsrc,
1979                                            unsigned vector_length_in_bytes,
1980                                            FloatRegister vtmp) {
1981   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1982   bool isQ = vector_length_in_bytes == 16;
1983 
1984   BLOCK_COMMENT("neon_reduce_mul_fp {");
1985     switch(bt) {
1986       case T_FLOAT:
1987         fmuls(dst, fsrc, vsrc);
1988         ins(vtmp, S, vsrc, 0, 1);
1989         fmuls(dst, dst, vtmp);
1990         if (isQ) {
1991           ins(vtmp, S, vsrc, 0, 2);
1992           fmuls(dst, dst, vtmp);
1993           ins(vtmp, S, vsrc, 0, 3);
1994           fmuls(dst, dst, vtmp);
1995         }
1996         break;
1997       case T_DOUBLE:
1998         assert(isQ, "unsupported");
1999         fmuld(dst, fsrc, vsrc);
2000         ins(vtmp, D, vsrc, 0, 1);
2001         fmuld(dst, dst, vtmp);
2002         break;
2003       default:
2004         assert(false, "unsupported");
2005         ShouldNotReachHere();
2006     }
2007   BLOCK_COMMENT("} neon_reduce_mul_fp");
2008 }
2009 
2010 // Helper to select logical instruction
2011 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
2012                                                    Register Rn, Register Rm,
2013                                                    enum shift_kind kind, unsigned shift) {
2014   switch(opc) {
2015     case Op_AndReductionV:
2016       is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
2017       break;
2018     case Op_OrReductionV:
2019       is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
2020       break;
2021     case Op_XorReductionV:
2022       is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
2023       break;
2024     default:
2025       assert(false, "unsupported");
2026       ShouldNotReachHere();
2027   }
2028 }
2029 
2030 // Vector reduction logical operations And, Or, Xor
2031 // Clobbers: rscratch1
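     // A sketch for T_INT with a 128-bit vsrc (illustrative only; `op` is the
     // selected And/Or/Xor):
     //   t   = v64[1] op v64[0];   // fold the two 64-bit halves
     //   t   = t op (t >> 32);     // fold the two 32-bit halves
     //   dst = (int32_t)t op isrc;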
2032 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
2033                                             Register isrc, FloatRegister vsrc,
2034                                             unsigned vector_length_in_bytes) {
2035   assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
2036          "unsupported");
2037   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2038   assert_different_registers(dst, isrc);
2039   bool isQ = vector_length_in_bytes == 16;
2040 
2041   BLOCK_COMMENT("neon_reduce_logical {");
2042     umov(rscratch1, vsrc, isQ ? D : S, 0);
2043     umov(dst, vsrc, isQ ? D : S, 1);
2044     neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
2045     switch(bt) {
2046       case T_BYTE:
2047         if (isQ) {
2048           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2049         }
2050         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
2051         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
2052         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2053         sxtb(dst, dst);
2054         break;
2055       case T_SHORT:
2056         if (isQ) {
2057           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2058         }
2059         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
2060         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2061         sxth(dst, dst);
2062         break;
2063       case T_INT:
2064         if (isQ) {
2065           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2066         }
2067         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2068         break;
2069       case T_LONG:
2070         assert(isQ, "unsupported");
2071         neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
2072         break;
2073       default:
2074         assert(false, "unsupported");
2075         ShouldNotReachHere();
2076     }
2077   BLOCK_COMMENT("} neon_reduce_logical");
2078 }
2079 
2080 // Vector reduction min/max for integral type with ASIMD instructions.
2081 // Note: vtmp is not used and is expected to be fnoreg for the T_LONG case.
2082 // Clobbers: rscratch1, rflags
2083 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
2084                                                     Register isrc, FloatRegister vsrc,
2085                                                     unsigned vector_length_in_bytes,
2086                                                     FloatRegister vtmp) {
2087   assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
2088   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2089   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
2090   assert_different_registers(dst, isrc);
2091   bool isQ = vector_length_in_bytes == 16;
2092   bool is_min = opc == Op_MinReductionV;
2093 
2094   BLOCK_COMMENT("neon_reduce_minmax_integral {");
2095     if (bt == T_LONG) {
2096       assert(vtmp == fnoreg, "should be");
2097       assert(isQ, "should be");
2098       umov(rscratch1, vsrc, D, 0);
2099       cmp(isrc, rscratch1);
2100       csel(dst, isrc, rscratch1, is_min ? LT : GT);
2101       umov(rscratch1, vsrc, D, 1);
2102       cmp(dst, rscratch1);
2103       csel(dst, dst, rscratch1, is_min ? LT : GT);
2104     } else {
2105       SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
2106       if (size == T2S) {
2107         is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
2108       } else {
2109         is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
2110       }
2111       if (bt == T_INT) {
2112         umov(dst, vtmp, S, 0);
2113       } else {
2114         smov(dst, vtmp, elemType_to_regVariant(bt), 0);
2115       }
2116       cmpw(dst, isrc);
2117       cselw(dst, dst, isrc, is_min ? LT : GT);
2118     }
2119   BLOCK_COMMENT("} neon_reduce_minmax_integral");
2120 }
2121 
2122 // Vector reduction for integral type with SVE instruction.
2123 // Supported operations are Add, And, Or, Xor, Max, Min.
2124 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
2125 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
2126                                             FloatRegister src2, PRegister pg, FloatRegister tmp) {
2127   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2128   assert(pg->is_governing(), "This register has to be a governing predicate register");
2129   assert_different_registers(src1, dst);
2130   // Registers "dst" and "tmp" are clobbered, while "src1" and "src2" are preserved.
2131   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2132   switch (opc) {
2133     case Op_AddReductionVI: {
2134       sve_uaddv(tmp, size, pg, src2);
2135       if (bt == T_BYTE) {
2136         smov(dst, tmp, size, 0);
2137         addw(dst, src1, dst, ext::sxtb);
2138       } else if (bt == T_SHORT) {
2139         smov(dst, tmp, size, 0);
2140         addw(dst, src1, dst, ext::sxth);
2141       } else {
2142         umov(dst, tmp, size, 0);
2143         addw(dst, dst, src1);
2144       }
2145       break;
2146     }
2147     case Op_AddReductionVL: {
2148       sve_uaddv(tmp, size, pg, src2);
2149       umov(dst, tmp, size, 0);
2150       add(dst, dst, src1);
2151       break;
2152     }
2153     case Op_AndReductionV: {
2154       sve_andv(tmp, size, pg, src2);
2155       if (bt == T_INT || bt == T_LONG) {
2156         umov(dst, tmp, size, 0);
2157       } else {
2158         smov(dst, tmp, size, 0);
2159       }
2160       if (bt == T_LONG) {
2161         andr(dst, dst, src1);
2162       } else {
2163         andw(dst, dst, src1);
2164       }
2165       break;
2166     }
2167     case Op_OrReductionV: {
2168       sve_orv(tmp, size, pg, src2);
2169       if (bt == T_INT || bt == T_LONG) {
2170         umov(dst, tmp, size, 0);
2171       } else {
2172         smov(dst, tmp, size, 0);
2173       }
2174       if (bt == T_LONG) {
2175         orr(dst, dst, src1);
2176       } else {
2177         orrw(dst, dst, src1);
2178       }
2179       break;
2180     }
2181     case Op_XorReductionV: {
2182       sve_eorv(tmp, size, pg, src2);
2183       if (bt == T_INT || bt == T_LONG) {
2184         umov(dst, tmp, size, 0);
2185       } else {
2186         smov(dst, tmp, size, 0);
2187       }
2188       if (bt == T_LONG) {
2189         eor(dst, dst, src1);
2190       } else {
2191         eorw(dst, dst, src1);
2192       }
2193       break;
2194     }
2195     case Op_MaxReductionV: {
2196       sve_smaxv(tmp, size, pg, src2);
2197       if (bt == T_INT || bt == T_LONG) {
2198         umov(dst, tmp, size, 0);
2199       } else {
2200         smov(dst, tmp, size, 0);
2201       }
2202       if (bt == T_LONG) {
2203         cmp(dst, src1);
2204         csel(dst, dst, src1, Assembler::GT);
2205       } else {
2206         cmpw(dst, src1);
2207         cselw(dst, dst, src1, Assembler::GT);
2208       }
2209       break;
2210     }
2211     case Op_MinReductionV: {
2212       sve_sminv(tmp, size, pg, src2);
2213       if (bt == T_INT || bt == T_LONG) {
2214         umov(dst, tmp, size, 0);
2215       } else {
2216         smov(dst, tmp, size, 0);
2217       }
2218       if (bt == T_LONG) {
2219         cmp(dst, src1);
2220         csel(dst, dst, src1, Assembler::LT);
2221       } else {
2222         cmpw(dst, src1);
2223         cselw(dst, dst, src1, Assembler::LT);
2224       }
2225       break;
2226     }
2227     default:
2228       assert(false, "unsupported");
2229       ShouldNotReachHere();
2230   }
2231 
2232   if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
2233     if (bt == T_BYTE) {
2234       sxtb(dst, dst);
2235     } else if (bt == T_SHORT) {
2236       sxth(dst, dst);
2237     }
2238   }
2239 }
2240 
2241 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt),
2242 // and to false otherwise. The input "lane_cnt" must be smaller than or equal to the
2243 // max lane count supported for the basic type. Clobbers: rscratch1 and rflags.
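     // For example, lane_cnt == 16 maps to the fixed pattern VL16, lane_cnt equal
     // to the max lane count maps to ALL, and an irregular count such as 10 with
     // a max of 16 falls through to the whileltw encoding at the end.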
2244 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
2245   uint32_t max_vector_length = Matcher::max_vector_size(bt);
2246   assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
2247 
2248   // Set all elements to false if the input "lane_cnt" is zero.
2249   if (lane_cnt == 0) {
2250     sve_pfalse(dst);
2251     return;
2252   }
2253 
2254   SIMD_RegVariant size = elemType_to_regVariant(bt);
2255   assert(size != Q, "invalid size");
2256 
2257   // Set all elements to true if "lane_cnt" equals the max lane count.
2258   if (lane_cnt == max_vector_length) {
2259     sve_ptrue(dst, size, /* ALL */ 0b11111);
2260     return;
2261   }
2262 
2263   // Fixed numbers for "ptrue".
2264   switch(lane_cnt) {
2265   case 1: /* VL1 */
2266   case 2: /* VL2 */
2267   case 3: /* VL3 */
2268   case 4: /* VL4 */
2269   case 5: /* VL5 */
2270   case 6: /* VL6 */
2271   case 7: /* VL7 */
2272   case 8: /* VL8 */
2273     sve_ptrue(dst, size, lane_cnt);
2274     return;
2275   case 16:
2276     sve_ptrue(dst, size, /* VL16 */ 0b01001);
2277     return;
2278   case 32:
2279     sve_ptrue(dst, size, /* VL32 */ 0b01010);
2280     return;
2281   case 64:
2282     sve_ptrue(dst, size, /* VL64 */ 0b01011);
2283     return;
2284   case 128:
2285     sve_ptrue(dst, size, /* VL128 */ 0b01100);
2286     return;
2287   case 256:
2288     sve_ptrue(dst, size, /* VL256 */ 0b01101);
2289     return;
2290   default:
2291     break;
2292   }
2293 
2294   // Special patterns for "ptrue".
2295   if (lane_cnt == round_down_power_of_2(max_vector_length)) {
2296     sve_ptrue(dst, size, /* POW2 */ 0b00000);
2297   } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
2298     sve_ptrue(dst, size, /* MUL4 */ 0b11101);
2299   } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
2300     sve_ptrue(dst, size, /* MUL3 */ 0b11110);
2301   } else {
2302     // Encode to "whileltw" for the remaining cases.
2303     mov(rscratch1, lane_cnt);
2304     sve_whileltw(dst, size, zr, rscratch1);
2305   }
2306 }
2307 
2308 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
2309 // Any remaining elements of dst will be filled with zero.
2310 // Clobbers: rscratch1
2311 // Preserves: src, mask
2312 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
2313                                            FloatRegister vtmp1, FloatRegister vtmp2,
2314                                            PRegister pgtmp) {
2315   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2316   assert_different_registers(dst, src, vtmp1, vtmp2);
2317   assert_different_registers(mask, pgtmp);
2318 
2319   // Example input:   src   = 8888 7777 6666 5555 4444 3333 2222 1111
2320   //                  mask  = 0001 0000 0000 0001 0001 0000 0001 0001
2321   // Expected result: dst   = 0000 0000 0000 8888 5555 4444 2222 1111
2322   sve_dup(vtmp2, H, 0);
2323 
2324   // Extend lowest half to type INT.
2325   // dst = 00004444 00003333 00002222 00001111
2326   sve_uunpklo(dst, S, src);
2327   // pgtmp = 00000001 00000000 00000001 00000001
2328   sve_punpklo(pgtmp, mask);
2329   // Pack the active INT-sized elements towards the lowest-numbered lanes,
2330   // and fill the remaining lanes with zero.
2331   // dst = 00000000 00004444 00002222 00001111
2332   sve_compact(dst, S, dst, pgtmp);
2333   // Narrow the result back to type SHORT.
2334   // dst = 0000 0000 0000 0000 0000 4444 2222 1111
2335   sve_uzp1(dst, H, dst, vtmp2);
2336   // Count the active elements of the lowest half.
2337   // rscratch1 = 3
2338   sve_cntp(rscratch1, S, ptrue, pgtmp);
2339 
2340   // Repeat to the highest half.
2341   // pgtmp = 00000001 00000000 00000000 00000001
2342   sve_punpkhi(pgtmp, mask);
2343   // vtmp1 = 00008888 00007777 00006666 00005555
2344   sve_uunpkhi(vtmp1, S, src);
2345   // vtmp1 = 00000000 00000000 00008888 00005555
2346   sve_compact(vtmp1, S, vtmp1, pgtmp);
2347   // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2348   sve_uzp1(vtmp1, H, vtmp1, vtmp2);
2349 
2350   // Compressed low:   dst   = 0000 0000 0000 0000 0000 4444 2222 1111
2351   // Compressed high:  vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2352   // Shift the compressed high part left (across lanes) by TRUE_CNT lanes, where
2353   // TRUE_CNT is the number of active elements in the compressed low part.
2354   neg(rscratch1, rscratch1);
2355   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2356   sve_index(vtmp2, H, rscratch1, 1);
2357   // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
2358   sve_tbl(vtmp1, H, vtmp1, vtmp2);
2359 
2360   // Combine the compressed high part (after the shift) with the compressed low part.
2361   // dst = 0000 0000 0000 8888 5555 4444 2222 1111
2362   sve_orr(dst, dst, vtmp1);
2363 }
2364 
2365 // Clobbers: rscratch1, rscratch2
2366 // Preserves: src, mask
2367 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
2368                                           FloatRegister vtmp1, FloatRegister vtmp2,
2369                                           FloatRegister vtmp3, FloatRegister vtmp4,
2370                                           PRegister ptmp, PRegister pgtmp) {
2371   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2372   assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
2373   assert_different_registers(mask, ptmp, pgtmp);
2374   // Example input:   src   = 88 77 66 55 44 33 22 11
2375   //                  mask  = 01 00 00 01 01 00 01 01
2376   // Expected result: dst   = 00 00 00 88 55 44 22 11
2377 
2378   sve_dup(vtmp4, B, 0);
2379   // Extend lowest half to type SHORT.
2380   // vtmp1 = 0044 0033 0022 0011
2381   sve_uunpklo(vtmp1, H, src);
2382   // ptmp = 0001 0000 0001 0001
2383   sve_punpklo(ptmp, mask);
2384   // Count the active elements of the lowest half.
2385   // rscratch2 = 3
2386   sve_cntp(rscratch2, H, ptrue, ptmp);
2387   // Pack the active SHORT-sized elements towards the lowest-numbered lanes,
2388   // and fill the remaining lanes with zero.
2389   // dst = 0000 0044 0022 0011
2390   sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
2391   // Narrow the result back to type BYTE.
2392   // dst = 00 00 00 00 00 44 22 11
2393   sve_uzp1(dst, B, dst, vtmp4);
2394 
2395   // Repeat to the highest half.
2396   // ptmp = 0001 0000 0000 0001
2397   sve_punpkhi(ptmp, mask);
2398   // vtmp2 = 0088 0077 0066 0055
2399   sve_uunpkhi(vtmp2, H, src);
2400   // vtmp1 = 0000 0000 0088 0055
2401   sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);
2402 
2403   sve_dup(vtmp4, B, 0);
2404   // vtmp1 = 00 00 00 00 00 00 88 55
2405   sve_uzp1(vtmp1, B, vtmp1, vtmp4);
2406 
2407   // Compressed low:   dst   = 00 00 00 00 00 44 22 11
2408   // Compressed high:  vtmp1 = 00 00 00 00 00 00 88 55
2409   // Shift the compressed high part left (across lanes) by TRUE_CNT lanes, where
2410   // TRUE_CNT is the number of active elements in the compressed low part.
2411   neg(rscratch2, rscratch2);
2412   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2413   sve_index(vtmp2, B, rscratch2, 1);
2414   // vtmp1 = 00 00 00 88 55 00 00 00
2415   sve_tbl(vtmp1, B, vtmp1, vtmp2);
2416   // Combine the compressed high part (after the shift) with the compressed low part.
2417   // dst = 00 00 00 88 55 44 22 11
2418   sve_orr(dst, dst, vtmp1);
2419 }
2420 
2421 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2422   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2423   SIMD_Arrangement size = isQ ? T16B : T8B;
2424   if (bt == T_BYTE) {
2425     rbit(dst, size, src);
2426   } else {
2427     neon_reverse_bytes(dst, src, bt, isQ);
2428     rbit(dst, size, dst);
2429   }
2430 }
2431 
2432 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2433   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2434   SIMD_Arrangement size = isQ ? T16B : T8B;
2435   switch (bt) {
2436     case T_BYTE:
2437       if (dst != src) {
2438         orr(dst, size, src, src);
2439       }
2440       break;
2441     case T_SHORT:
2442       rev16(dst, size, src);
2443       break;
2444     case T_INT:
2445       rev32(dst, size, src);
2446       break;
2447     case T_LONG:
2448       rev64(dst, size, src);
2449       break;
2450     default:
2451       assert(false, "unsupported");
2452       ShouldNotReachHere();
2453   }
2454 }
2455 
2456 // Extract a scalar element from an SVE vector at position 'idx'.
2457 // The input elements in src are expected to be of integral type.
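     // For example, extracting S-sized lane 5 (bit offset 160 >= 128) copies src
     // to vtmp, shifts vtmp down by 5 * 4 bytes with sve_ext, and reads lane 0;
     // lanes that sit below bit 128 take a single NEON umov/smov instead.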
2458 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2459                                              int idx, FloatRegister vtmp) {
2460   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2461   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2462   if (regVariant_to_elemBits(size) * idx < 128) { // generate a lower-cost NEON instruction
2463     if (bt == T_INT || bt == T_LONG) {
2464       umov(dst, src, size, idx);
2465     } else {
2466       smov(dst, src, size, idx);
2467     }
2468   } else {
2469     sve_orr(vtmp, src, src);
2470     sve_ext(vtmp, vtmp, idx << size);
2471     if (bt == T_INT || bt == T_LONG) {
2472       umov(dst, vtmp, size, 0);
2473     } else {
2474       smov(dst, vtmp, size, 0);
2475     }
2476   }
2477 }
2478 
2479 // java.lang.Math::round intrinsics
2480 
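     // Strategy common to both routines below: fcvtas/frinta (round to nearest,
     // ties away from zero) already matches Math.round for every positive input,
     // for NaN, and for negative inputs too large to have a fractional part. The
     // remaining lanes, roughly those where |src| < 2^23 (float) or 2^52 (double)
     // compared via the bit pattern of -src, take floor(src + 0.5) instead.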
2481 // Clobbers: rscratch1, rflags
2482 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2483                                           FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
2484   assert_different_registers(tmp1, tmp2, tmp3, src, dst);
2485   switch (T) {
2486     case T2S:
2487     case T4S:
2488       fmovs(tmp1, T, 0.5f);
2489       mov(rscratch1, jint_cast(0x1.0p23f));
2490       break;
2491     case T2D:
2492       fmovd(tmp1, T, 0.5);
2493       mov(rscratch1, julong_cast(0x1.0p52));
2494       break;
2495     default:
2496       assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2497   }
2498   fadd(tmp1, T, tmp1, src);
2499   fcvtms(tmp1, T, tmp1);
2500   // tmp1 = floor(src + 0.5, ties to even)
2501 
2502   fcvtas(dst, T, src);
2503   // dst = round(src), ties to away
2504 
2505   fneg(tmp3, T, src);
2506   dup(tmp2, T, rscratch1);
2507   cm(HS, tmp3, T, tmp3, tmp2);
2508   // tmp3 is now a set of flags
2509 
2510   bif(dst, T16B, tmp1, tmp3);
2511   // result in dst
2512 }
2513 
2514 // Clobbers: rscratch1, rflags
2515 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2516                                          FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
2517   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2518   assert_different_registers(tmp1, tmp2, src, dst);
2519 
2520   switch (T) {
2521     case S:
2522       mov(rscratch1, jint_cast(0x1.0p23f));
2523       break;
2524     case D:
2525       mov(rscratch1, julong_cast(0x1.0p52));
2526       break;
2527     default:
2528       assert(T == S || T == D, "invalid register variant");
2529   }
2530 
2531   sve_frinta(dst, T, ptrue, src);
2532   // dst = round(src), ties to away
2533 
2534   Label none;
2535 
2536   sve_fneg(tmp1, T, ptrue, src);
2537   sve_dup(tmp2, T, rscratch1);
2538   sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
2539   br(EQ, none);
2540   {
2541     sve_cpy(tmp1, T, pgtmp, 0.5);
2542     sve_fadd(tmp1, T, pgtmp, src);
2543     sve_frintm(dst, T, pgtmp, tmp1);
2544     // dst = floor(src + 0.5, ties to even)
2545   }
2546   bind(none);
2547 
2548   sve_fcvtzs(dst, T, ptrue, dst, T);
2549   // result in dst
2550 }
2551 
2552 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2553                                            FloatRegister one, SIMD_Arrangement T) {
2554   assert_different_registers(dst, src, zero, one);
2555   assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2556 
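       // After facgt + ushr, dst holds 0x7FF..F in lanes where |src| > 0 and zero
       // elsewhere. bsl then takes the exponent and mantissa bits from `one` and
       // the sign bit from src, producing +/-1.0, while zero and NaN lanes pass
       // src through unchanged.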
2557   facgt(dst, T, src, zero);
2558   ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
2559   bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2560 }
2561 
2562 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2563                                           FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
2564   assert_different_registers(dst, src, zero, one, vtmp);
2565   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2566 
2567   sve_orr(vtmp, src, src);
2568   sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise
2569   switch (T) {
2570   case S:
2571     sve_and(vtmp, T, min_jint); // Extract the sign bit of the float value in every lane
2572     sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1
2573                                       // depending on the sign of the float value
2574     break;
2575   case D:
2576     sve_and(vtmp, T, min_jlong);
2577     sve_orr(vtmp, T, jlong_cast(1.0));
2578     break;
2579   default:
2580     assert(false, "unsupported");
2581     ShouldNotReachHere();
2582   }
2583   sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate
2584                                      // register pgtmp. Result in dst.
2585 }
2586 
2587 bool C2_MacroAssembler::in_scratch_emit_size() {
2588   if (ciEnv::current()->task() != nullptr) {
2589     PhaseOutput* phase_output = Compile::current()->output();
2590     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2591       return true;
2592     }
2593   }
2594   return MacroAssembler::in_scratch_emit_size();
2595 }