/*
 * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/matcher.hpp"
#include "opto/output.hpp"
#include "opto/subnode.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/globalDefinitions.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

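// Member-function pointer used to select the per-character load at code
// generation time: MacroAssembler::ldrb for Latin1 (one-byte) characters,
// MacroAssembler::ldrh for UTF-16 (two-byte) characters.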
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

void C2_MacroAssembler::entry_barrier() {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
    // Dummy labels for just measuring the code size
    Label dummy_slow_path;
    Label dummy_continuation;
    Label dummy_guard;
    Label* slow_path = &dummy_slow_path;
    Label* continuation = &dummy_continuation;
    Label* guard = &dummy_guard;
    if (!Compile::current()->output()->in_scratch_emit_size()) {
      // Use real labels from actual stub when not emitting code for the purpose of measuring its size
      C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
      Compile::current()->output()->add_stub(stub);
      slow_path = &stub->entry();
      continuation = &stub->continuation();
      guard = &stub->guard();
    }
    // In the C2 code, we move the non-hot part of nmethod entry barriers out-of-line to a stub.
    bs->nmethod_entry_barrier(this, slow_path, continuation, guard);
  }
}

void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg,
                                  Register tmp2Reg, Register tmp3Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;

  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  assert_different_registers(oop, box, tmp, disp_hdr);

  // Load markWord from object into displaced_header.
  ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmp, oop);
    ldrb(tmp, Address(tmp, Klass::misc_flags_offset()));
    tst(tmp, KlassFlags::_misc_is_value_based_class);
    br(Assembler::NE, cont);
  }

  // Check for existing monitor
  tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Set tmp to be (markWord of object | UNLOCK_VALUE).
    orr(tmp, disp_hdr, markWord::unlocked_value);

    if (EnableValhalla) {
      // Mask inline_type bit such that we go to the slow path if object is an inline type
      andr(tmp, tmp, ~((int) markWord::inline_type_bit_in_place));
    }

    // Initialize the box. (Must happen before we update the object mark!)
    str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // Compare object markWord with an unlocked value (tmp) and if
    // equal exchange the stack address of our box with object markWord.
    // On failure disp_hdr contains the possibly locked markWord.
    cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true,
            /*release*/ true, /*weak*/ false, disp_hdr);
    br(Assembler::EQ, cont);

    assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

    // If the compare-and-exchange succeeded, then we found an unlocked
    // object, have now locked it, and will continue at the label cont.

    // Check if the owner is self by comparing the value in the
    // markWord of object (disp_hdr) with the stack pointer.
    mov(rscratch1, sp);
    sub(disp_hdr, disp_hdr, rscratch1);
    mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place));
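    // disp_hdr now holds mark - sp. Under the mask above it is zero only if
    // the mark points into the current thread's stack (within one page of sp)
    // and the lock bits are clear, i.e. we already own this stack lock.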
    // If that condition holds we are done (label cont) and can store 0 as the
    // displaced header in the box, which indicates that it is a recursive lock.
    ands(tmp/*==0?*/, disp_hdr, tmp);   // Sets flags for result
    str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
    b(cont);
  }

  // Handle existing monitor.
  bind(object_has_monitor);

  // The object's monitor m is unlocked iff m->owner == nullptr,
  // otherwise m->owner may contain a thread or a stack address.
  //
  // Try to CAS m->owner from null to current thread.
  add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value));
  cmpxchg(tmp, zr, rthread, Assembler::xword, /*acquire*/ true,
          /*release*/ true, /*weak*/ false, tmp3Reg); // Sets flags for result

  // Store a non-null value into the box to avoid looking like a re-entrant
  // lock. The fast-path monitor unlock code checks for
  // markWord::monitor_value so use markWord::unused_mark which has the
  // relevant bit set, and also matches ObjectSynchronizer::enter.
  mov(tmp, (address)markWord::unused_mark().value());
  str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

  br(Assembler::EQ, cont); // CAS success means locking succeeded

  cmp(tmp3Reg, rthread);
  br(Assembler::NE, cont); // Check for recursive locking

  // Recursive lock case
  increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1);
  // flag == EQ still from the cmp above, checking if this is a reentrant lock

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  increment(Address(rthread, JavaThread::held_monitor_count_offset()));

  bind(no_count);
}

void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg,
                                    Register tmp2Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register owner_addr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;
  Label unlocked;

  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert_different_registers(oop, box, tmp, disp_hdr);

  if (LockingMode == LM_LEGACY) {
    // Find the lock address and load the displaced header from the stack.
    ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // If the displaced header is 0, we have a recursive unlock.
    cmp(disp_hdr, zr);
    br(Assembler::EQ, cont);
  }

  // Handle existing monitor.
  ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
  tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Check if it is still a lightweight lock, this is true if we
    // see the stack address of the basicLock in the markWord of the
    // object.

    cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false,
            /*release*/ true, /*weak*/ false, tmp);
    b(cont);
  }

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // Handle existing monitor.
  bind(object_has_monitor);
  STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
  add(tmp, tmp, -(int)markWord::monitor_value); // monitor

  ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));

  Label notRecursive;
  cbz(disp_hdr, notRecursive);

  // Recursive lock
  sub(disp_hdr, disp_hdr, 1u);
  str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
  cmp(disp_hdr, disp_hdr); // Sets flags for result
  b(cont);

  bind(notRecursive);

  // Compute owner address.
  lea(owner_addr, Address(tmp, ObjectMonitor::owner_offset()));

  // Set owner to null.
  // Release to satisfy the JMM
  stlr(zr, owner_addr);
  // We need a full fence after clearing owner to avoid stranding.
  // StoreLoad achieves this.
  membar(StoreLoad);

  // Check if the entry lists are empty.
  ldr(rscratch1, Address(tmp, ObjectMonitor::EntryList_offset()));
  ldr(tmpReg, Address(tmp, ObjectMonitor::cxq_offset()));
  orr(rscratch1, rscratch1, tmpReg);
  cmp(rscratch1, zr);
  br(Assembler::EQ, cont);     // If so we are done.

  // Check if there is a successor.
  ldr(rscratch1, Address(tmp, ObjectMonitor::succ_offset()));
  cmp(rscratch1, zr);
  br(Assembler::NE, unlocked); // If so we are done.

  // Save the monitor pointer in the current thread, so we can try to
  // reacquire the lock in SharedRuntime::monitor_exit_helper().
  str(tmp, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));

  cmp(zr, rthread); // Set Flag to NE => slow path
  b(cont);

  bind(unlocked);
  cmp(zr, zr); // Set Flag to EQ => fast path

  // Intentional fall-through

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  decrement(Address(rthread, JavaThread::held_monitor_count_offset()));

  bind(no_count);
}

void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register t1,
                                              Register t2, Register t3) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert_different_registers(obj, box, t1, t2, t3);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. MUST reach this label with flag == EQ.
  Label locked;
  // Finish fast lock unsuccessfully. MUST reach this label with flag == NE.
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds.
    str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(t1, obj);
    ldrb(t1, Address(t1, Klass::misc_flags_offset()));
    tst(t1, KlassFlags::_misc_is_value_based_class);
    br(Assembler::NE, slow_path);
  }

  const Register t1_mark = t1;
  const Register t3_t = t3;

  { // Lightweight locking

    // Push lock to the lock stack and finish successfully. MUST reach this label with flag == EQ.
    Label push;

    const Register t2_top = t2;

    // Check if lock-stack is full.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
    br(Assembler::GT, slow_path);

    // Check if recursive.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, push);

    // Relaxed normal load to check for monitor. Optimization for monitor case.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);

    // Not inflated
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");

    // Try to lock. Transition lock-bits 0b01 => 0b00
    orr(t1_mark, t1_mark, markWord::unlocked_value);
    eor(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
    br(Assembler::NE, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    str(obj, Address(rthread, t2_top));
    addw(t2_top, t2_top, oopSize);
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register t1_monitor = t1;

    if (!UseObjectMonitorTable) {
      assert(t1_monitor == t1_mark, "should be the same here");
    } else {
      Label monitor_found;

      // Load cache address
      lea(t3_t, Address(rthread, JavaThread::om_cache_oops_offset()));

      const int num_unrolled = 2;
      for (int i = 0; i < num_unrolled; i++) {
        ldr(t1, Address(t3_t));
        cmp(obj, t1);
        br(Assembler::EQ, monitor_found);
        increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
      }

      Label loop;

      // Search for obj in cache.
      bind(loop);

      // Check for match.
      ldr(t1, Address(t3_t));
      cmp(obj, t1);
      br(Assembler::EQ, monitor_found);

      // Search until null encountered, guaranteed _null_sentinel at end.
      increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
      cbnz(t1, loop);
      // Cache Miss, NE set from cmp above, cbnz does not set flags
      b(slow_path);

      bind(monitor_found);
      ldr(t1_monitor, Address(t3_t, OMCache::oop_to_monitor_difference()));
    }

    const Register t2_owner_addr = t2;
    const Register t3_owner = t3;
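    // Without the ObjectMonitorTable, t1_monitor still holds the tagged
    // markWord, so the monitor tag is folded into the field offsets below
    // instead of being stripped from the pointer.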
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
    const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);

    Label monitor_locked;

    // Compute owner address.
    lea(t2_owner_addr, owner_address);

    // CAS owner (null => current thread).
    cmpxchg(t2_owner_addr, zr, rthread, Assembler::xword, /*acquire*/ true,
            /*release*/ false, /*weak*/ false, t3_owner);
    br(Assembler::EQ, monitor_locked);

    // Check if recursive.
    cmp(t3_owner, rthread);
    br(Assembler::NE, slow_path);

    // Recursive.
    increment(recursions_address, 1);

    bind(monitor_locked);
    if (UseObjectMonitorTable) {
      str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
    }
  }

  bind(locked);
  increment(Address(rthread, JavaThread::held_monitor_count_offset()));

#ifdef ASSERT
  // Check that locked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Lock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Lock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}

void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register box, Register t1,
                                                Register t2, Register t3) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert_different_registers(obj, box, t1, t2, t3);

  // Handle inflated monitor.
  Label inflated, inflated_load_mark;
  // Finish fast unlock successfully. MUST reach this label with flag == EQ.
  Label unlocked;
  // Finish fast unlock unsuccessfully. MUST reach this label with flag == NE.
  Label slow_path;

  const Register t1_mark = t1;
  const Register t2_top = t2;
  const Register t3_t = t3;

  { // Lightweight unlock

    Label push_and_slow_path;

    // Check if obj is top of lock-stack.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    subw(t2_top, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    // Top of lock stack was not obj. Must be monitor.
    br(Assembler::NE, inflated_load_mark);

    // Pop lock-stack.
    DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));

    // Check if recursive.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, unlocked);

    // Not recursive.
    // Load Mark.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Check header for monitor (0b10).
    // Because we got here by popping (meaning we pushed the obj when locking)
    // there will be no monitor in the box. So we need to push the obj back
    // so that the runtime can fix any potential anonymous owner.
    tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);

    // Try to unlock. Transition lock bits 0b00 => 0b01
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
    orr(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
    br(Assembler::EQ, unlocked);

    bind(push_and_slow_path);
    // Compare and exchange failed.
    // Restore lock-stack and handle the unlock in runtime.
    DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
    addw(t2_top, t2_top, oopSize);
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(slow_path);
  }


  { // Handle inflated monitor.
    bind(inflated_load_mark);
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
#ifdef ASSERT
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

#ifdef ASSERT
    Label check_done;
    subw(t2_top, t2_top, oopSize);
    cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
    br(Assembler::LT, check_done);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    br(Assembler::NE, inflated);
    stop("Fast Unlock lock on stack");
    bind(check_done);
#endif

    const Register t1_monitor = t1;

    if (!UseObjectMonitorTable) {
      assert(t1_monitor == t1_mark, "should be the same here");

      // Untag the monitor.
      add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
    } else {
      ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
      cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
      br(Assembler::LO, slow_path);
    }

    const Register t2_recursions = t2;
    Label not_recursive;

    // Check if recursive.
    ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
    cbz(t2_recursions, not_recursive);

    // Recursive unlock.
    sub(t2_recursions, t2_recursions, 1u);
    str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
    // Set flag == EQ
    cmp(t2_recursions, t2_recursions);
    b(unlocked);

    bind(not_recursive);

    const Register t2_owner_addr = t2;

    // Compute owner address.
    lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));

    // Set owner to null.
    // Release to satisfy the JMM
    stlr(zr, t2_owner_addr);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry lists are empty.
    ldr(rscratch1, Address(t1_monitor, ObjectMonitor::EntryList_offset()));
    ldr(t3_t, Address(t1_monitor, ObjectMonitor::cxq_offset()));
    orr(rscratch1, rscratch1, t3_t);
    cmp(rscratch1, zr);
    br(Assembler::EQ, unlocked);  // If so we are done.

    // Check if there is a successor.
    ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
    cmp(rscratch1, zr);
    br(Assembler::NE, unlocked);  // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));

    cmp(zr, rthread); // Set Flag to NE => slow path
    b(slow_path);
  }

  bind(unlocked);
  decrement(Address(rthread, JavaThread::held_monitor_count_offset()));
  cmp(zr, zr); // Set Flags to EQ => fast path

#ifdef ASSERT
  // Check that unlocked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Unlock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Unlock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}

// Search for str1 in str2 and return index or -1
// Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

// The Boyer Moore algorithm is based on the description here:-
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// This describes an algorithm with 2 shift rules. The 'Bad Character' rule
// and the 'Good Suffix' rule.
//
// These rules are essentially heuristics for how far we can shift the
// pattern along the search string.
//
// The implementation here uses the 'Bad Character' rule only because of the
// complexity of initialisation for the 'Good Suffix' rule.
//
// This is also known as the Boyer-Moore-Horspool algorithm:-
//
// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
//
// This particular implementation has a few java-specific optimizations.
//
// #define ASIZE 256
//
//    int bm(unsigned char *x, int m, unsigned char *y, int n) {
//       int i, j;
//       unsigned c;
//       unsigned char bc[ASIZE];
//
//       /* Preprocessing */
//       for (i = 0; i < ASIZE; ++i)
//          bc[i] = m;
//       for (i = 0; i < m - 1; ) {
//          c = x[i];
//          ++i;
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef PATTERN_STRING_IS_LATIN1
//          bc[c] = m - i;
//          #else
//          if (c < ASIZE) bc[c] = m - i;
//          #endif
//       }
//
//       /* Searching */
//       j = 0;
//       while (j <= n - m) {
//          c = y[j+m-1];
//          if (x[m-1] == c)
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//          if (i < 0) return j;
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef SOURCE_STRING_IS_LATIN1
//          // LL case: (c< 256) always true. Remove branch
//          j += bc[y[j+m-1]];
//          #endif
//          #ifndef PATTERN_STRING_IS_UTF
//          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += 1
//          #endif
//          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
//          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += m
//          #endif
//       }
//    }

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >= 8, so we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed (8 LL or 4 UU symbols) and half a
    // register for the UL case. We'll re-read the last character in the inner
    // pre-loop code to have a single outer pre-loop load.
    const int firstStep = isL ? 7 : 3;

    const int ASIZE = 256;
    const int STORED_BYTES = 32; // amount of bytes stored per instruction
    sub(sp, sp, ASIZE);
    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
    mov(ch1, sp);
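    // Pre-fill the on-stack bad-character table with the pattern length
    // (v0 was broadcast with cnt1 above), 32 bytes per store.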
    BIND(BM_INIT_LOOP);
      stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
      subs(tmp5, tmp5, 1);
      br(GT, BM_INIT_LOOP);

      sub(cnt1tmp, cnt1, 1);
      mov(tmp5, str2);
      add(str2end, str2, result_tmp, LSL, str2_chr_shift);
      sub(ch2, cnt1, 1);
      mov(tmp3, str1);
    BIND(BCLOOP);
      (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
      if (!str1_isL) {
        subs(zr, ch1, ASIZE);
        br(HS, BCSKIP);
      }
      strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
      subs(ch2, ch2, 1);
      br(GT, BCLOOP);

      add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
      if (str1_isL == str2_isL) {
        // load last 8 bytes (8LL/4UU symbols)
        ldr(tmp6, Address(tmp6, -wordSize));
      } else {
        ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
        // convert Latin1 to UTF. We'll have to wait until load completed, but
        // it's still faster than per-character loads+checks
        lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
        ubfx(ch1, tmp6, 8, 8); // str1[N-2]
        ubfx(ch2, tmp6, 16, 8); // str1[N-3]
        andr(tmp6, tmp6, 0xFF); // str1[N-4]
        orr(ch2, ch1, ch2, LSL, 16);
        orr(tmp6, tmp6, tmp3, LSL, 48);
        orr(tmp6, tmp6, ch2, LSL, 16);
      }
    BIND(BMLOOPSTR2);
      (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
      sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
      if (str1_isL == str2_isL) {
        // re-init tmp3. It's for free because it's executed in parallel with
        // load above. Alternative is to initialize it before loop, but it'll
        // affect performance on in-order systems with 2 or more ld/st pipelines
        lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
      }
      if (!isL) { // UU/UL case
        lsl(ch2, cnt1tmp, 1); // offset in bytes
      }
      cmp(tmp3, skipch);
      br(NE, BMSKIP);
      ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
      mov(ch1, tmp6);
      if (isL) {
        b(BMLOOPSTR1_AFTER_LOAD);
      } else {
        sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
        b(BMLOOPSTR1_CMP);
      }
    BIND(BMLOOPSTR1);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    BIND(BMLOOPSTR1_AFTER_LOAD);
      subs(cnt1tmp, cnt1tmp, 1);
      br(LT, BMLOOPSTR1_LASTCMP);
    BIND(BMLOOPSTR1_CMP);
      cmp(ch1, ch2);
      br(EQ, BMLOOPSTR1);
    BIND(BMSKIP);
      if (!isL) {
        // if we've met a UTF symbol while searching the Latin1 pattern, then
        // we can skip cnt1 symbols
        if (str1_isL != str2_isL) {
          mov(result_tmp, cnt1);
        } else {
          mov(result_tmp, 1);
        }
        subs(zr, skipch, ASIZE);
        br(HS, BMADV);
      }
      ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
      sub(cnt1tmp, cnt1, 1);
      add(str2, str2, result_tmp, LSL, str2_chr_shift);
      cmp(str2, str2end);
      br(LE, BMLOOPSTR2);
      add(sp, sp, ASIZE);
      b(NOMATCH);
    BIND(BMLOOPSTR1_LASTCMP);
      cmp(ch1, ch2);
      br(NE, BMSKIP);
    BIND(BMMATCH);
      sub(result, str2, tmp5);
      if (!str2_isL) lsr(result, result, 1);
      add(sp, sp, ASIZE);
      b(DONE);

    BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
    br(LT, LINEAR_MEDIUM);
    mov(result, zr);
    RuntimeAddress stub = nullptr;
    if (isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
      assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
    } else if (str1_isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
    } else {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
      assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
    }
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);
  }

  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    Register str2tmp = tmp2;
    Register first = tmp3;

    if (icnt1 == -1)
    {
        Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

        cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
        br(LT, DOSHORT);
      BIND(LINEAR_MEDIUM);
        (this->*str1_load_1chr)(first, Address(str1));
        lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
        sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(FIRST_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmp(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        adds(cnt1tmp, cnt1_neg, str1_chr_size);
        add(cnt2tmp, cnt2_neg, str2_chr_size);
        br(GE, MATCH);

      BIND(STR1_NEXT);
        (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        adds(cnt1tmp, cnt1tmp, str1_chr_size);
        add(cnt2tmp, cnt2tmp, str2_chr_size);
        br(LT, STR1_NEXT);
        b(MATCH);

      BIND(DOSHORT);
      if (str1_isL == str2_isL) {
        cmp(cnt1, (u1)2);
        br(LT, DO1);
        br(GT, DO3);
      }
    }

    if (icnt1 == 4) {
      Label CH1_LOOP;

        (this->*load_4chr)(ch1, str1);
        sub(result_tmp, cnt2, 4);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(CH1_LOOP);
        (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
      Label CH1_LOOP;

      BIND(DO2);
        (this->*load_2chr)(ch1, str1);
        if (icnt1 == 2) {
          sub(result_tmp, cnt2, 2);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(CH1_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

      BIND(DO3);
        (this->*load_2chr)(first, str1);
        (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
        if (icnt1 == 3) {
          sub(result_tmp, cnt2, 3);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(FIRST_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmpw(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        b(MATCH);
    }

    if (icnt1 == -1 || icnt1 == 1) {
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;

      BIND(DO1);
        (this->*str1_load_1chr)(ch1, str1);
        cmp(cnt2, (u1)8);
        br(LT, DO1_SHORT);

        sub(result_tmp, cnt2, 8/str2_chr_size);
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
        mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

        if (str2_isL) {
          orr(ch1, ch1, ch1, LSL, 8);
        }
        orr(ch1, ch1, ch1, LSL, 16);
        orr(ch1, ch1, ch1, LSL, 32);
      BIND(CH1_LOOP);
        ldr(ch2, Address(str2, cnt2_neg));
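        // SWAR match detection: after the XOR a lane of ch2 is zero exactly at
        // a match, and (x - 0x01..01) & ~x & 0x80..80 (or the halfword variant)
        // is non-zero iff some lane of x is zero.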
        eor(ch2, ch1, ch2);
        sub(tmp1, ch2, tmp3);
        orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
        bics(tmp1, tmp1, tmp2);
        br(NE, HAS_ZERO);
        adds(cnt2_neg, cnt2_neg, 8);
        br(LT, CH1_LOOP);

        cmp(cnt2_neg, (u1)8);
        mov(cnt2_neg, 0);
        br(LT, CH1_LOOP);
        b(NOMATCH);

      BIND(HAS_ZERO);
        rev(tmp1, tmp1);
        clz(tmp1, tmp1);
        add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
        b(MATCH);

      BIND(DO1_SHORT);
        mov(result_tmp, cnt2);
        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
      BIND(DO1_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmpw(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);

void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
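    // Same SWAR trick as in string_indexof, on halfword lanes: a lane of ch1
    // becomes zero exactly where the character matches.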
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7fff7fff7fff7fff);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1, Address::uxtw(1)));
    sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
    ldrh(ch1, Address(str1, cnt1_neg));
    cmpw(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 2);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}

void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
                                                Register ch, Register result,
                                                FloatRegister ztmp1,
                                                FloatRegister ztmp2,
                                                PRegister tmp_pg,
                                                PRegister tmp_pdn, bool isL)
{
  // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
  assert(tmp_pg->is_governing(),
         "this register has to be a governing predicate register");

  Label LOOP, MATCH, DONE, NOMATCH;
  Register vec_len = rscratch1;
  Register idx = rscratch2;

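  // Element size of the string characters: bytes for Latin1, halfwords for UTF-16.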
  SIMD_RegVariant T = isL ? B : H;

  cbz(cnt1, NOMATCH);

  // Assign the particular char throughout the vector.
  sve_dup(ztmp2, T, ch);
  if (isL) {
    sve_cntb(vec_len);
  } else {
    sve_cnth(vec_len);
  }
  mov(idx, 0);

  // Generate a predicate to control the reading of input string.
  sve_whilelt(tmp_pg, T, idx, cnt1);

  BIND(LOOP);
    // Read a vector of 8- or 16-bit data depending on the string type. Note
    // that inactive elements indicated by the predicate register won't cause
    // a data read from memory to the destination vector.
    if (isL) {
      sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
    } else {
      sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
    }
    add(idx, idx, vec_len);

    // Perform the comparison. An element of the destination predicate is set
    // to active if the particular char is matched.
    sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);

    // Branch if the particular char is found.
    br(NE, MATCH);

    sve_whilelt(tmp_pg, T, idx, cnt1);

    // Loop back if the particular char not found.
    br(MI, LOOP);

  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);

  BIND(MATCH);
    // Undo the index increment.
    sub(idx, idx, vec_len);

    // Crop the vector to find its location.
    sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
    add(result, idx, -1);
    sve_incp(result, T, tmp_pdn);
  BIND(DONE);
}

void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)8);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 8);
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 8);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);

  mov(tmp3, 0x0101010101010101);

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
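    // Byte-lane variant of the SWAR zero-lane detection used above.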
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1));
    sub(cnt1_neg, zr, cnt1);
  BIND(DO1_LOOP);
    ldrb(ch1, Address(str1, cnt1_neg));
    cmp(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 1);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg);
  BIND(DONE);
}

// Compare strings.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; the result, however, is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      ldr(tmp1, Address(str1));
      ldrs(vtmp, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFF);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFF);
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
    RuntimeAddress stub = nullptr;
    switch(ae) {
      case StrIntrinsicNode::LL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
        break;
      case StrIntrinsicNode::UU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
        break;
      case StrIntrinsicNode::LU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
        break;
      case StrIntrinsicNode::UL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
        break;
      default:
        ShouldNotReachHere();
    }
    assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // Arrange the code so that most branches are taken while loading, and the
  // next characters are loaded while the previous ones are being compared.
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}
1482 
1483 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
1484                                      FloatRegister src2, Condition cond, bool isQ) {
1485   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1486   FloatRegister zn = src1, zm = src2;
1487   bool needs_negation = false;
1488   switch (cond) {
1489     case LT: cond = GT; zn = src2; zm = src1; break;
1490     case LE: cond = GE; zn = src2; zm = src1; break;
1491     case LO: cond = HI; zn = src2; zm = src1; break;
1492     case LS: cond = HS; zn = src2; zm = src1; break;
1493     case NE: cond = EQ; needs_negation = true; break;
1494     default:
1495       break;
1496   }
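  // For example, an LT comparison is emitted as GT with the source operands swapped, and
  // NE as EQ followed by a bitwise NOT of the result.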
1497 
1498   if (is_floating_point_type(bt)) {
1499     fcm(cond, dst, size, zn, zm);
1500   } else {
1501     cm(cond, dst, size, zn, zm);
1502   }
1503 
1504   if (needs_negation) {
1505     notr(dst, isQ ? T16B : T8B, dst);
1506   }
1507 }
1508 
1509 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1510                                           Condition cond, bool isQ) {
1511   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1512   if (bt == T_FLOAT || bt == T_DOUBLE) {
1513     if (cond == Assembler::NE) {
1514       fcm(Assembler::EQ, dst, size, src);
1515       notr(dst, isQ ? T16B : T8B, dst);
1516     } else {
1517       fcm(cond, dst, size, src);
1518     }
1519   } else {
1520     if (cond == Assembler::NE) {
1521       cm(Assembler::EQ, dst, size, src);
1522       notr(dst, isQ ? T16B : T8B, dst);
1523     } else {
1524       cm(cond, dst, size, src);
1525     }
1526   }
1527 }
1528 
1529 // Compress the least significant bit of each byte into the lowest byte of dst and
1530 // clear the higher (garbage) bits.
1531 void C2_MacroAssembler::bytemask_compress(Register dst) {
1532   // Example input, dst = 0x01 00 00 00 01 01 00 01
1533   // The "??" bytes are garbage.
1534   orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
1535   orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
1536   orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
1537   andr(dst, dst, 0xff);                   // dst = 0x8D
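  // A scalar reference model of the transform above (illustrative only, not emitted code):
  //   uint64_t ref(uint64_t x) {   // each byte of x is 0x00 or 0x01
  //     x |= x >> 7;               // bring neighbouring flag bits together in pairs
  //     x |= x >> 14;              // gather groups of four flag bits
  //     x |= x >> 28;              // gather all eight flag bits into the low byte
  //     return x & 0xff;
  //   }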
1538 }
1539 
1540 // Pack the lowest-numbered bit of each mask element in src into a long value
1541 // in dst, at most the first 64 lane elements.
1542 // Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM.
1543 void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
1544                                          FloatRegister vtmp1, FloatRegister vtmp2) {
1545   assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1546   assert_different_registers(dst, rscratch1);
1547   assert_different_registers(vtmp1, vtmp2);
1548 
1549   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1550   // Example:   src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
1551   // Expected:  dst = 0x658D
1552 
1553   // Convert the mask into a vector of 0x00/0x01 bytes.
1554   // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
1555   sve_cpy(vtmp1, size, src, 1, false);
1556   if (bt != T_BYTE) {
1557     sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
1558   }
1559 
1560   if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
1561     // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
1562     // is to compress each significant bit of the byte in a cross-lane way. Due
1563     // to the lack of a cross-lane bit-compress instruction, we use BEXT
1564     // (bit-compress in each lane) with the biggest lane size (T = D) then
1565     // concatenate the results.
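    // Put differently (informal note): with 0x01 in every byte of the second operand, BEXT
    // gathers bit 0 of each byte of a 64-bit lane into the low 8 bits of that lane.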
1566 
1567     // The second source input of BEXT, initialized with 0x01 in each byte.
1568     // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1569     sve_dup(vtmp2, B, 1);
1570 
1571     // BEXT vtmp1.D, vtmp1.D, vtmp2.D
1572     // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1573     // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1574     //         ---------------------------------------
1575     // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1576     sve_bext(vtmp1, D, vtmp1, vtmp2);
1577 
1578     // Concatenate the least significant 8 bits of each 8-byte group, and extract the
1579     // result to dst.
1580     // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1581     // dst   = 0x658D
1582     if (lane_cnt <= 8) {
1583       // No need to concatenate.
1584       umov(dst, vtmp1, B, 0);
1585     } else if (lane_cnt <= 16) {
1586       ins(vtmp1, B, vtmp1, 1, 8);
1587       umov(dst, vtmp1, H, 0);
1588     } else {
1589       // As the lane count is 64 at most, the final expected value must be in
1590       // the lowest 64 bits after narrowing vtmp1 from D to B.
1591       sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
1592       umov(dst, vtmp1, D, 0);
1593     }
1594   } else if (UseSVE > 0) {
1595     // Compress the lowest 8 bytes.
1596     fmovd(dst, vtmp1);
1597     bytemask_compress(dst);
1598     if (lane_cnt <= 8) return;
1599 
1600     // Repeat on higher bytes and join the results.
1601     // Compress 8 bytes in each iteration.
1602     for (int idx = 1; idx < (lane_cnt / 8); idx++) {
1603       sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
1604       bytemask_compress(rscratch1);
1605       orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
1606     }
1607   } else {
1608     assert(false, "unsupported");
1609     ShouldNotReachHere();
1610   }
1611 }
1612 
1613 // Unpack the mask, a long value in src, into the predicate register dst based on the
1614 // corresponding data type. Note that dst can support at most 64 lanes.
1615 // The example below gives the expected dst predicate register for different types, with
1616 // a valid src (0x658D) on a machine with a 1024-bit vector size.
1617 // BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
1618 // SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
1619 // INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
1620 // LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1621 //
1622 // The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D, which
1623 // has 24 significant bits, would be an invalid input if the dst predicate register refers
1624 // to a LONG type 1024-bit vector, which has at most 16 lanes.
1625 void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
1626                                            FloatRegister vtmp1, FloatRegister vtmp2) {
1627   assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
1628          lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
1629   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1630   // Example:   src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
1631   // Expected:  dst = 0b01100101 10001101
1632 
1633   // Put long value from general purpose register into the first lane of vector.
1634   // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1635   sve_dup(vtmp1, B, 0);
1636   mov(vtmp1, D, 0, src);
1637 
1638   // As sve_cmp generates the mask with a minimum granularity of one byte, we need to
1639   // transform the bit mask held in the first lane into a byte mask, which can be done
1640   // with SVE2's BDEP instruction.
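  // Informal note on the BDEP step below: with 0x01 in every byte of the second operand,
  // BDEP scatters the low 8 bits of each 64-bit lane of the first operand to bit 0 of the
  // lane's successive bytes, i.e. mask bit i becomes byte i (0x00 or 0x01).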
1641 
1642   // The first source input of the BDEP instruction. Deposit one mask byte into each 8-byte group.
1643   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1644   if (lane_cnt <= 8) {
1645     // Nothing to do, as only one byte exists.
1646   } else if (lane_cnt <= 16) {
1647     ins(vtmp1, B, vtmp1, 8, 1);
1648     mov(vtmp1, B, 1, zr);
1649   } else {
1650     sve_vector_extend(vtmp1, D, vtmp1, B);
1651   }
1652 
1653   // The second source input of BDEP instruction, initialized with 0x01 for each byte.
1654   // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1655   sve_dup(vtmp2, B, 1);
1656 
1657   // BDEP vtmp1.D, vtmp1.D, vtmp2.D
1658   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1659   // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1660   //         ---------------------------------------
1661   // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1662   sve_bdep(vtmp1, D, vtmp1, vtmp2);
1663 
1664   if (bt != T_BYTE) {
1665     sve_vector_extend(vtmp1, size, vtmp1, B);
1666   }
1667   // Generate mask according to the given vector, in which the elements have been
1668   // extended to expected type.
1669   // dst = 0b01100101 10001101
1670   sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
1671 }
1672 
1673 // Clobbers: rflags
1674 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1675                                     FloatRegister zn, FloatRegister zm, Condition cond) {
1676   assert(pg->is_governing(), "This register has to be a governing predicate register");
1677   FloatRegister z1 = zn, z2 = zm;
1678   switch (cond) {
1679     case LE: z1 = zm; z2 = zn; cond = GE; break;
1680     case LT: z1 = zm; z2 = zn; cond = GT; break;
1681     case LO: z1 = zm; z2 = zn; cond = HI; break;
1682     case LS: z1 = zm; z2 = zn; cond = HS; break;
1683     default:
1684       break;
1685   }
1686 
1687   SIMD_RegVariant size = elemType_to_regVariant(bt);
1688   if (is_floating_point_type(bt)) {
1689     sve_fcm(cond, pd, size, pg, z1, z2);
1690   } else {
1691     assert(is_integral_type(bt), "unsupported element type");
1692     sve_cmp(cond, pd, size, pg, z1, z2);
1693   }
1694 }
1695 
1696 // Get index of the last mask lane that is set
1697 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
1698   SIMD_RegVariant size = elemType_to_regVariant(bt);
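  // Approach (informal sketch): reverse the lane order of the mask, use BRKB to set exactly
  // the lanes preceding the first active lane of the reversed mask, and count them with
  // CNTP; that count is the number of lanes above the last active lane of src, so the
  // result is (number_of_lanes - 1) - count.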
1699   sve_rev(ptmp, size, src);
1700   sve_brkb(ptmp, ptrue, ptmp, false);
1701   sve_cntp(dst, size, ptrue, ptmp);
1702   movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
1703   subw(dst, rscratch1, dst);
1704 }
1705 
1706 // Extend integer vector src to dst with the same lane count
1707 // but larger element size, e.g. 4B -> 4I
1708 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1709                                            FloatRegister src, BasicType src_bt, bool is_unsigned) {
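  // Note: a [us]shll with a shift amount of 0 is used here purely as a widening move
  // (UXTL/SXTL are aliases of USHLL/SSHLL #0).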
1710   if (src_bt == T_BYTE) {
1711     if (dst_bt == T_SHORT) {
1712       // 4B/8B to 4S/8S
1713       _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1714     } else {
1715       // 4B to 4I
1716       assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1717       _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1718       _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1719     }
1720   } else if (src_bt == T_SHORT) {
1721     // 4S to 4I
1722     assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1723     _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1724   } else if (src_bt == T_INT) {
1725     // 2I to 2L
1726     assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1727     _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1728   } else {
1729     ShouldNotReachHere();
1730   }
1731 }
1732 
1733 // Narrow integer vector src down to dst with the same lane count
1734 // but smaller element size, e.g. 4I -> 4B
1735 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1736                                            FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1737   if (src_bt == T_SHORT) {
1738     // 4S/8S to 4B/8B
1739     assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1740     assert(dst_bt == T_BYTE, "unsupported");
1741     xtn(dst, T8B, src, T8H);
1742   } else if (src_bt == T_INT) {
1743     // 4I to 4B/4S
1744     assert(src_vlen_in_bytes == 16, "unsupported");
1745     assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1746     xtn(dst, T4H, src, T4S);
1747     if (dst_bt == T_BYTE) {
1748       xtn(dst, T8B, dst, T8H);
1749     }
1750   } else if (src_bt == T_LONG) {
1751     // 2L to 2I
1752     assert(src_vlen_in_bytes == 16, "unsupported");
1753     assert(dst_bt == T_INT, "unsupported");
1754     xtn(dst, T2S, src, T2D);
1755   } else {
1756     ShouldNotReachHere();
1757   }
1758 }
1759 
1760 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1761                                           FloatRegister src, SIMD_RegVariant src_size,
1762                                           bool is_unsigned) {
1763   assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
1764 
1765   if (src_size == B) {
1766     switch (dst_size) {
1767     case H:
1768       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1769       break;
1770     case S:
1771       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1772       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1773       break;
1774     case D:
1775       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1776       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1777       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1778       break;
1779     default:
1780       ShouldNotReachHere();
1781     }
1782   } else if (src_size == H) {
1783     if (dst_size == S) {
1784       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1785     } else { // D
1786       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1787       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1788     }
1789   } else if (src_size == S) {
1790     _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
1791   }
1792 }
1793 
1794 // Vector narrow from src to dst with the specified element sizes.
1795 // The high part of the dst vector will be filled with zero.
1796 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1797                                           FloatRegister src, SIMD_RegVariant src_size,
1798                                           FloatRegister tmp) {
1799   assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1800   assert_different_registers(src, tmp);
1801   sve_dup(tmp, src_size, 0);
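  // Informal note: UZP1 with the all-zero tmp as the second source concatenates the even
  // (low-half) elements of src with zeros, so each step halves the element size while
  // keeping the upper part of the result zero.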
1802   if (src_size == D) {
1803     switch (dst_size) {
1804     case S:
1805       sve_uzp1(dst, S, src, tmp);
1806       break;
1807     case H:
1808       assert_different_registers(dst, tmp);
1809       sve_uzp1(dst, S, src, tmp);
1810       sve_uzp1(dst, H, dst, tmp);
1811       break;
1812     case B:
1813       assert_different_registers(dst, tmp);
1814       sve_uzp1(dst, S, src, tmp);
1815       sve_uzp1(dst, H, dst, tmp);
1816       sve_uzp1(dst, B, dst, tmp);
1817       break;
1818     default:
1819       ShouldNotReachHere();
1820     }
1821   } else if (src_size == S) {
1822     if (dst_size == H) {
1823       sve_uzp1(dst, H, src, tmp);
1824     } else { // B
1825       assert_different_registers(dst, tmp);
1826       sve_uzp1(dst, H, src, tmp);
1827       sve_uzp1(dst, B, dst, tmp);
1828     }
1829   } else if (src_size == H) {
1830     sve_uzp1(dst, B, src, tmp);
1831   }
1832 }
1833 
1834 // Extend src predicate to dst predicate with the same lane count but larger
1835 // element size, e.g. 64Byte -> 512Long
1836 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1837                                              uint dst_element_length_in_bytes,
1838                                              uint src_element_length_in_bytes) {
1839   if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1840     sve_punpklo(dst, src);
1841   } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1842     sve_punpklo(dst, src);
1843     sve_punpklo(dst, dst);
1844   } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1845     sve_punpklo(dst, src);
1846     sve_punpklo(dst, dst);
1847     sve_punpklo(dst, dst);
1848   } else {
1849     assert(false, "unsupported");
1850     ShouldNotReachHere();
1851   }
1852 }
1853 
1854 // Narrow src predicate to dst predicate with the same lane count but
1855 // smaller element size, e.g. 512Long -> 64Byte
1856 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1857                                              uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
1858   // The insignificant bits in the src predicate are expected to be zero.
1859   // To ensure that the higher order bits of the resulting narrowed predicate are 0, an
1860   // all-false predicate is passed as the second argument. An example narrowing operation
1861   // with a given mask: 128Long -> 64Int on a 128-bit machine, i.e. 2L -> 2I
1862   // Mask (for 2 Longs) : TF
1863   // Predicate register for the above mask (16 bits) : 00000001 00000000
1864   // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
1865   // Which translates to a mask for 2 ints : TF (the lower half is significant, the upper half is 0)
1866   assert_different_registers(src, ptmp);
1867   assert_different_registers(dst, ptmp);
1868   sve_pfalse(ptmp);
1869   if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1870     sve_uzp1(dst, B, src, ptmp);
1871   } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1872     sve_uzp1(dst, H, src, ptmp);
1873     sve_uzp1(dst, B, dst, ptmp);
1874   } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1875     sve_uzp1(dst, S, src, ptmp);
1876     sve_uzp1(dst, H, dst, ptmp);
1877     sve_uzp1(dst, B, dst, ptmp);
1878   } else {
1879     assert(false, "unsupported");
1880     ShouldNotReachHere();
1881   }
1882 }
1883 
1884 // Vector reduction add for integral type with ASIMD instructions.
1885 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1886                                                  Register isrc, FloatRegister vsrc,
1887                                                  unsigned vector_length_in_bytes,
1888                                                  FloatRegister vtmp) {
1889   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1890   assert_different_registers(dst, isrc);
1891   bool isQ = vector_length_in_bytes == 16;
1892 
1893   BLOCK_COMMENT("neon_reduce_add_integral {");
1894     switch(bt) {
1895       case T_BYTE:
1896         addv(vtmp, isQ ? T16B : T8B, vsrc);
1897         smov(dst, vtmp, B, 0);
1898         addw(dst, dst, isrc, ext::sxtb);
1899         break;
1900       case T_SHORT:
1901         addv(vtmp, isQ ? T8H : T4H, vsrc);
1902         smov(dst, vtmp, H, 0);
1903         addw(dst, dst, isrc, ext::sxth);
1904         break;
1905       case T_INT:
1906         isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1907         umov(dst, vtmp, S, 0);
1908         addw(dst, dst, isrc);
1909         break;
1910       case T_LONG:
1911         assert(isQ, "unsupported");
1912         addpd(vtmp, vsrc);
1913         umov(dst, vtmp, D, 0);
1914         add(dst, dst, isrc);
1915         break;
1916       default:
1917         assert(false, "unsupported");
1918         ShouldNotReachHere();
1919     }
1920   BLOCK_COMMENT("} neon_reduce_add_integral");
1921 }
1922 
1923 // Vector reduction multiply for integral type with ASIMD instructions.
1924 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
1925 // Clobbers: rscratch1
1926 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
1927                                                  Register isrc, FloatRegister vsrc,
1928                                                  unsigned vector_length_in_bytes,
1929                                                  FloatRegister vtmp1, FloatRegister vtmp2) {
1930   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1931   bool isQ = vector_length_in_bytes == 16;
1932 
1933   BLOCK_COMMENT("neon_reduce_mul_integral {");
1934     switch(bt) {
1935       case T_BYTE:
1936         if (isQ) {
1937           // Multiply the lower half and higher half of vector iteratively.
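          // (Informally: the 16 byte lanes are folded 16 -> 8 -> 4 here, then 4 -> 2 after
          // the if/else below, before the last two partial products are combined with isrc
          // in scalar code.)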
1938           // vtmp1 = vsrc[8:15]
1939           ins(vtmp1, D, vsrc, 0, 1);
1940           // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
1941           mulv(vtmp1, T8B, vtmp1, vsrc);
1942           // vtmp2 = vtmp1[4:7]
1943           ins(vtmp2, S, vtmp1, 0, 1);
1944           // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
1945           mulv(vtmp1, T8B, vtmp2, vtmp1);
1946         } else {
1947           ins(vtmp1, S, vsrc, 0, 1);
1948           mulv(vtmp1, T8B, vtmp1, vsrc);
1949         }
1950         // vtmp2 = vtmp1[2:3]
1951         ins(vtmp2, H, vtmp1, 0, 1);
1952         // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
1953         mulv(vtmp2, T8B, vtmp2, vtmp1);
1954         // dst = vtmp2[0] * isrc * vtmp2[1]
1955         umov(rscratch1, vtmp2, B, 0);
1956         mulw(dst, rscratch1, isrc);
1957         sxtb(dst, dst);
1958         umov(rscratch1, vtmp2, B, 1);
1959         mulw(dst, rscratch1, dst);
1960         sxtb(dst, dst);
1961         break;
1962       case T_SHORT:
1963         if (isQ) {
1964           ins(vtmp2, D, vsrc, 0, 1);
1965           mulv(vtmp2, T4H, vtmp2, vsrc);
1966           ins(vtmp1, S, vtmp2, 0, 1);
1967           mulv(vtmp1, T4H, vtmp1, vtmp2);
1968         } else {
1969           ins(vtmp1, S, vsrc, 0, 1);
1970           mulv(vtmp1, T4H, vtmp1, vsrc);
1971         }
1972         umov(rscratch1, vtmp1, H, 0);
1973         mulw(dst, rscratch1, isrc);
1974         sxth(dst, dst);
1975         umov(rscratch1, vtmp1, H, 1);
1976         mulw(dst, rscratch1, dst);
1977         sxth(dst, dst);
1978         break;
1979       case T_INT:
1980         if (isQ) {
1981           ins(vtmp1, D, vsrc, 0, 1);
1982           mulv(vtmp1, T2S, vtmp1, vsrc);
1983         } else {
1984           vtmp1 = vsrc;
1985         }
1986         umov(rscratch1, vtmp1, S, 0);
1987         mul(dst, rscratch1, isrc);
1988         umov(rscratch1, vtmp1, S, 1);
1989         mul(dst, rscratch1, dst);
1990         break;
1991       case T_LONG:
1992         umov(rscratch1, vsrc, D, 0);
1993         mul(dst, isrc, rscratch1);
1994         umov(rscratch1, vsrc, D, 1);
1995         mul(dst, dst, rscratch1);
1996         break;
1997       default:
1998         assert(false, "unsupported");
1999         ShouldNotReachHere();
2000     }
2001   BLOCK_COMMENT("} neon_reduce_mul_integral");
2002 }
2003 
2004 // Vector reduction multiply for floating-point type with ASIMD instructions.
2005 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
2006                                            FloatRegister fsrc, FloatRegister vsrc,
2007                                            unsigned vector_length_in_bytes,
2008                                            FloatRegister vtmp) {
2009   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2010   bool isQ = vector_length_in_bytes == 16;
2011 
2012   BLOCK_COMMENT("neon_reduce_mul_fp {");
2013     switch(bt) {
2014       case T_FLOAT:
2015         fmuls(dst, fsrc, vsrc);
2016         ins(vtmp, S, vsrc, 0, 1);
2017         fmuls(dst, dst, vtmp);
2018         if (isQ) {
2019           ins(vtmp, S, vsrc, 0, 2);
2020           fmuls(dst, dst, vtmp);
2021           ins(vtmp, S, vsrc, 0, 3);
2022           fmuls(dst, dst, vtmp);
2023         }
2024         break;
2025       case T_DOUBLE:
2026         assert(isQ, "unsupported");
2027         fmuld(dst, fsrc, vsrc);
2028         ins(vtmp, D, vsrc, 0, 1);
2029         fmuld(dst, dst, vtmp);
2030         break;
2031       default:
2032         assert(false, "unsupported");
2033         ShouldNotReachHere();
2034     }
2035   BLOCK_COMMENT("} neon_reduce_mul_fp");
2036 }
2037 
2038 // Helper to select logical instruction
2039 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
2040                                                    Register Rn, Register Rm,
2041                                                    enum shift_kind kind, unsigned shift) {
2042   switch(opc) {
2043     case Op_AndReductionV:
2044       is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
2045       break;
2046     case Op_OrReductionV:
2047       is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
2048       break;
2049     case Op_XorReductionV:
2050       is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
2051       break;
2052     default:
2053       assert(false, "unsupported");
2054       ShouldNotReachHere();
2055   }
2056 }
2057 
2058 // Vector reduction logical operations And, Or, Xor
2059 // Clobbers: rscratch1
2060 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
2061                                             Register isrc, FloatRegister vsrc,
2062                                             unsigned vector_length_in_bytes) {
2063   assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
2064          "unsupported");
2065   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2066   assert_different_registers(dst, isrc);
2067   bool isQ = vector_length_in_bytes == 16;
2068 
2069   BLOCK_COMMENT("neon_reduce_logical {");
2070     umov(rscratch1, vsrc, isQ ? D : S, 0);
2071     umov(dst, vsrc, isQ ? D : S, 1);
2072     neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
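    // At this point dst holds all vector lanes folded together (64 bits for a 16-byte
    // vector, 32 bits for an 8-byte one); the cases below keep folding by half-width shifts
    // down to the element size where needed, combine with isrc, and sign-extend sub-int
    // results.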
2073     switch(bt) {
2074       case T_BYTE:
2075         if (isQ) {
2076           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2077         }
2078         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
2079         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
2080         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2081         sxtb(dst, dst);
2082         break;
2083       case T_SHORT:
2084         if (isQ) {
2085           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2086         }
2087         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
2088         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2089         sxth(dst, dst);
2090         break;
2091       case T_INT:
2092         if (isQ) {
2093           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2094         }
2095         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2096         break;
2097       case T_LONG:
2098         assert(isQ, "unsupported");
2099         neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
2100         break;
2101       default:
2102         assert(false, "unsupported");
2103         ShouldNotReachHere();
2104     }
2105   BLOCK_COMMENT("} neon_reduce_logical");
2106 }
2107 
2108 // Vector reduction min/max for integral type with ASIMD instructions.
2109 // Note: vtmp is not used and expected to be fnoreg for T_LONG case.
2110 // Clobbers: rscratch1, rflags
2111 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
2112                                                     Register isrc, FloatRegister vsrc,
2113                                                     unsigned vector_length_in_bytes,
2114                                                     FloatRegister vtmp) {
2115   assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
2116   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2117   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
2118   assert_different_registers(dst, isrc);
2119   bool isQ = vector_length_in_bytes == 16;
2120   bool is_min = opc == Op_MinReductionV;
2121 
2122   BLOCK_COMMENT("neon_reduce_minmax_integral {");
2123     if (bt == T_LONG) {
2124       assert(vtmp == fnoreg, "should be");
2125       assert(isQ, "should be");
2126       umov(rscratch1, vsrc, D, 0);
2127       cmp(isrc, rscratch1);
2128       csel(dst, isrc, rscratch1, is_min ? LT : GT);
2129       umov(rscratch1, vsrc, D, 1);
2130       cmp(dst, rscratch1);
2131       csel(dst, dst, rscratch1, is_min ? LT : GT);
2132     } else {
2133       SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
2134       if (size == T2S) {
2135         is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
2136       } else {
2137         is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
2138       }
2139       if (bt == T_INT) {
2140         umov(dst, vtmp, S, 0);
2141       } else {
2142         smov(dst, vtmp, elemType_to_regVariant(bt), 0);
2143       }
2144       cmpw(dst, isrc);
2145       cselw(dst, dst, isrc, is_min ? LT : GT);
2146     }
2147   BLOCK_COMMENT("} neon_reduce_minmax_integral");
2148 }
2149 
2150 // Vector reduction for integral type with SVE instructions.
2151 // Supported operations are Add, And, Or, Xor, Max, Min.
2152 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
2153 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
2154                                             FloatRegister src2, PRegister pg, FloatRegister tmp) {
2155   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2156   assert(pg->is_governing(), "This register has to be a governing predicate register");
2157   assert_different_registers(src1, dst);
2158   // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
2159   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2160   switch (opc) {
2161     case Op_AddReductionVI: {
2162       sve_uaddv(tmp, size, pg, src2);
2163       if (bt == T_BYTE) {
2164         smov(dst, tmp, size, 0);
2165         addw(dst, src1, dst, ext::sxtb);
2166       } else if (bt == T_SHORT) {
2167         smov(dst, tmp, size, 0);
2168         addw(dst, src1, dst, ext::sxth);
2169       } else {
2170         umov(dst, tmp, size, 0);
2171         addw(dst, dst, src1);
2172       }
2173       break;
2174     }
2175     case Op_AddReductionVL: {
2176       sve_uaddv(tmp, size, pg, src2);
2177       umov(dst, tmp, size, 0);
2178       add(dst, dst, src1);
2179       break;
2180     }
2181     case Op_AndReductionV: {
2182       sve_andv(tmp, size, pg, src2);
2183       if (bt == T_INT || bt == T_LONG) {
2184         umov(dst, tmp, size, 0);
2185       } else {
2186         smov(dst, tmp, size, 0);
2187       }
2188       if (bt == T_LONG) {
2189         andr(dst, dst, src1);
2190       } else {
2191         andw(dst, dst, src1);
2192       }
2193       break;
2194     }
2195     case Op_OrReductionV: {
2196       sve_orv(tmp, size, pg, src2);
2197       if (bt == T_INT || bt == T_LONG) {
2198         umov(dst, tmp, size, 0);
2199       } else {
2200         smov(dst, tmp, size, 0);
2201       }
2202       if (bt == T_LONG) {
2203         orr(dst, dst, src1);
2204       } else {
2205         orrw(dst, dst, src1);
2206       }
2207       break;
2208     }
2209     case Op_XorReductionV: {
2210       sve_eorv(tmp, size, pg, src2);
2211       if (bt == T_INT || bt == T_LONG) {
2212         umov(dst, tmp, size, 0);
2213       } else {
2214         smov(dst, tmp, size, 0);
2215       }
2216       if (bt == T_LONG) {
2217         eor(dst, dst, src1);
2218       } else {
2219         eorw(dst, dst, src1);
2220       }
2221       break;
2222     }
2223     case Op_MaxReductionV: {
2224       sve_smaxv(tmp, size, pg, src2);
2225       if (bt == T_INT || bt == T_LONG) {
2226         umov(dst, tmp, size, 0);
2227       } else {
2228         smov(dst, tmp, size, 0);
2229       }
2230       if (bt == T_LONG) {
2231         cmp(dst, src1);
2232         csel(dst, dst, src1, Assembler::GT);
2233       } else {
2234         cmpw(dst, src1);
2235         cselw(dst, dst, src1, Assembler::GT);
2236       }
2237       break;
2238     }
2239     case Op_MinReductionV: {
2240       sve_sminv(tmp, size, pg, src2);
2241       if (bt == T_INT || bt == T_LONG) {
2242         umov(dst, tmp, size, 0);
2243       } else {
2244         smov(dst, tmp, size, 0);
2245       }
2246       if (bt == T_LONG) {
2247         cmp(dst, src1);
2248         csel(dst, dst, src1, Assembler::LT);
2249       } else {
2250         cmpw(dst, src1);
2251         cselw(dst, dst, src1, Assembler::LT);
2252       }
2253       break;
2254     }
2255     default:
2256       assert(false, "unsupported");
2257       ShouldNotReachHere();
2258   }
2259 
2260   if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
2261     if (bt == T_BYTE) {
2262       sxtb(dst, dst);
2263     } else if (bt == T_SHORT) {
2264       sxth(dst, dst);
2265     }
2266   }
2267 }
2268 
2269 // Set the elements of the dst predicate to true for lanes in the range [0, lane_cnt), and
2270 // to false otherwise. The input "lane_cnt" must be smaller than or equal to the supported
2271 // max vector length of the basic type. Clobbers: rscratch1 and rflags.
2272 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
2273   uint32_t max_vector_length = Matcher::max_vector_size(bt);
2274   assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
2275 
2276   // Set all elements to false if the input "lane_cnt" is zero.
2277   if (lane_cnt == 0) {
2278     sve_pfalse(dst);
2279     return;
2280   }
2281 
2282   SIMD_RegVariant size = elemType_to_regVariant(bt);
2283   assert(size != Q, "invalid size");
2284 
2285   // Set all true if "lane_cnt" equals to the max lane count.
2286   if (lane_cnt == max_vector_length) {
2287     sve_ptrue(dst, size, /* ALL */ 0b11111);
2288     return;
2289   }
2290 
2291   // Fixed numbers for "ptrue".
2292   switch(lane_cnt) {
2293   case 1: /* VL1 */
2294   case 2: /* VL2 */
2295   case 3: /* VL3 */
2296   case 4: /* VL4 */
2297   case 5: /* VL5 */
2298   case 6: /* VL6 */
2299   case 7: /* VL7 */
2300   case 8: /* VL8 */
2301     sve_ptrue(dst, size, lane_cnt);
2302     return;
2303   case 16:
2304     sve_ptrue(dst, size, /* VL16 */ 0b01001);
2305     return;
2306   case 32:
2307     sve_ptrue(dst, size, /* VL32 */ 0b01010);
2308     return;
2309   case 64:
2310     sve_ptrue(dst, size, /* VL64 */ 0b01011);
2311     return;
2312   case 128:
2313     sve_ptrue(dst, size, /* VL128 */ 0b01100);
2314     return;
2315   case 256:
2316     sve_ptrue(dst, size, /* VL256 */ 0b01101);
2317     return;
2318   default:
2319     break;
2320   }
2321 
2322   // Special patterns for "ptrue".
2323   if (lane_cnt == round_down_power_of_2(max_vector_length)) {
2324     sve_ptrue(dst, size, /* POW2 */ 0b00000);
2325   } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
2326     sve_ptrue(dst, size, /* MUL4 */ 0b11101);
2327   } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
2328     sve_ptrue(dst, size, /* MUL3 */ 0b11110);
2329   } else {
2330     // Encode to "whileltw" for the remaining cases.
2331     mov(rscratch1, lane_cnt);
2332     sve_whileltw(dst, size, zr, rscratch1);
2333   }
2334 }
2335 
2336 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
2337 // Any remaining elements of dst will be filled with zero.
2338 // Clobbers: rscratch1
2339 // Preserves: src, mask
2340 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
2341                                            FloatRegister vtmp1, FloatRegister vtmp2,
2342                                            PRegister pgtmp) {
2343   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2344   assert_different_registers(dst, src, vtmp1, vtmp2);
2345   assert_different_registers(mask, pgtmp);
2346 
2347   // Example input:   src   = 8888 7777 6666 5555 4444 3333 2222 1111
2348   //                  mask  = 0001 0000 0000 0001 0001 0000 0001 0001
2349   // Expected result: dst   = 0000 0000 0000 8888 5555 4444 2222 1111
2350   sve_dup(vtmp2, H, 0);
2351 
2352   // Extend lowest half to type INT.
2353   // dst = 00004444 00003333 00002222 00001111
2354   sve_uunpklo(dst, S, src);
2355   // pgtmp = 00000001 00000000 00000001 00000001
2356   sve_punpklo(pgtmp, mask);
2357   // Pack the active elements in size of type INT to the right,
2358   // and fill the remaining elements with zero.
2359   // dst = 00000000 00004444 00002222 00001111
2360   sve_compact(dst, S, dst, pgtmp);
2361   // Narrow the result back to type SHORT.
2362   // dst = 0000 0000 0000 0000 0000 4444 2222 1111
2363   sve_uzp1(dst, H, dst, vtmp2);
2364   // Count the active elements of lowest half.
2365   // rscratch1 = 3
2366   sve_cntp(rscratch1, S, ptrue, pgtmp);
2367 
2368   // Repeat to the highest half.
2369   // pgtmp = 00000001 00000000 00000000 00000001
2370   sve_punpkhi(pgtmp, mask);
2371   // vtmp1 = 00008888 00007777 00006666 00005555
2372   sve_uunpkhi(vtmp1, S, src);
2373   // vtmp1 = 00000000 00000000 00008888 00005555
2374   sve_compact(vtmp1, S, vtmp1, pgtmp);
2375   // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2376   sve_uzp1(vtmp1, H, vtmp1, vtmp2);
2377 
2378   // Compressed low:   dst   = 0000 0000 0000 0000 0000 4444 2222 1111
2379   // Compressed high:  vtmp1 = 0000 0000 0000 0000 0000 0000 8888  5555
2380   // Shift (cross-lane) the compressed high part left by TRUE_CNT lanes, where
2381   // TRUE_CNT is the number of active elements in the compressed low part.
2382   neg(rscratch1, rscratch1);
2383   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2384   sve_index(vtmp2, H, rscratch1, 1);
2385   // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
2386   sve_tbl(vtmp1, H, vtmp1, vtmp2);
2387 
2388   // Combine the compressed high(after shifted) with the compressed low.
2389   // dst = 0000 0000 0000 8888 5555 4444 2222 1111
2390   sve_orr(dst, dst, vtmp1);
2391 }
2392 
2393 // Clobbers: rscratch1, rscratch2
2394 // Preserves: src, mask
2395 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
2396                                           FloatRegister vtmp1, FloatRegister vtmp2,
2397                                           FloatRegister vtmp3, FloatRegister vtmp4,
2398                                           PRegister ptmp, PRegister pgtmp) {
2399   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2400   assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
2401   assert_different_registers(mask, ptmp, pgtmp);
2402   // Example input:   src   = 88 77 66 55 44 33 22 11
2403   //                  mask  = 01 00 00 01 01 00 01 01
2404   // Expected result: dst   = 00 00 00 88 55 44 22 11
2405 
2406   sve_dup(vtmp4, B, 0);
2407   // Extend lowest half to type SHORT.
2408   // vtmp1 = 0044 0033 0022 0011
2409   sve_uunpklo(vtmp1, H, src);
2410   // ptmp = 0001 0000 0001 0001
2411   sve_punpklo(ptmp, mask);
2412   // Count the active elements of lowest half.
2413   // rscratch2 = 3
2414   sve_cntp(rscratch2, H, ptrue, ptmp);
2415   // Pack the active elements in size of type SHORT to the right,
2416   // and fill the remaining elements with zero.
2417   // dst = 0000 0044 0022 0011
2418   sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
2419   // Narrow the result back to type BYTE.
2420   // dst = 00 00 00 00 00 44 22 11
2421   sve_uzp1(dst, B, dst, vtmp4);
2422 
2423   // Repeat to the highest half.
2424   // ptmp = 0001 0000 0000 0001
2425   sve_punpkhi(ptmp, mask);
2426   // vtmp2 = 0088 0077 0066 0055
2427   sve_uunpkhi(vtmp2, H, src);
2428   // vtmp1 = 0000 0000 0088 0055
2429   sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);
2430 
2431   sve_dup(vtmp4, B, 0);
2432   // vtmp1 = 00 00 00 00 00 00 88 55
2433   sve_uzp1(vtmp1, B, vtmp1, vtmp4);
2434 
2435   // Compressed low:   dst   = 00 00 00 00 00 44 22 11
2436   // Compressed high:  vtmp1 = 00 00 00 00 00 00 88 55
2437   // Shift (cross-lane) the compressed high part left by TRUE_CNT lanes, where
2438   // TRUE_CNT is the number of active elements in the compressed low part.
2439   neg(rscratch2, rscratch2);
2440   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2441   sve_index(vtmp2, B, rscratch2, 1);
2442   // vtmp1 = 00 00 00 88 55 00 00 00
2443   sve_tbl(vtmp1, B, vtmp1, vtmp2);
2444   // Combine the compressed high(after shifted) with the compressed low.
2445   // dst = 00 00 00 88 55 44 22 11
2446   sve_orr(dst, dst, vtmp1);
2447 }
2448 
2449 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2450   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2451   SIMD_Arrangement size = isQ ? T16B : T8B;
2452   if (bt == T_BYTE) {
2453     rbit(dst, size, src);
2454   } else {
2455     neon_reverse_bytes(dst, src, bt, isQ);
2456     rbit(dst, size, dst);
2457   }
2458 }
2459 
2460 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2461   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2462   SIMD_Arrangement size = isQ ? T16B : T8B;
2463   switch (bt) {
2464     case T_BYTE:
2465       if (dst != src) {
2466         orr(dst, size, src, src);
2467       }
2468       break;
2469     case T_SHORT:
2470       rev16(dst, size, src);
2471       break;
2472     case T_INT:
2473       rev32(dst, size, src);
2474       break;
2475     case T_LONG:
2476       rev64(dst, size, src);
2477       break;
2478     default:
2479       assert(false, "unsupported");
2480       ShouldNotReachHere();
2481   }
2482 }
2483 
2484 // Extract a scalar element from an sve vector at position 'idx'.
2485 // The input elements in src are expected to be of integral type.
2486 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2487                                              int idx, FloatRegister vtmp) {
2488   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2489   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2490   if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
2491     if (bt == T_INT || bt == T_LONG) {
2492       umov(dst, src, size, idx);
2493     } else {
2494       smov(dst, src, size, idx);
2495     }
2496   } else {
2497     sve_orr(vtmp, src, src);
2498     sve_ext(vtmp, vtmp, idx << size);
2499     if (bt == T_INT || bt == T_LONG) {
2500       umov(dst, vtmp, size, 0);
2501     } else {
2502       smov(dst, vtmp, size, 0);
2503     }
2504   }
2505 }
2506 
2507 // java.lang.Math::round intrinsics
2508 
2509 // Clobbers: rscratch1, rflags
2510 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2511                                           FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
2512   assert_different_registers(tmp1, tmp2, tmp3, src, dst);
2513   switch (T) {
2514     case T2S:
2515     case T4S:
2516       fmovs(tmp1, T, 0.5f);
2517       mov(rscratch1, jint_cast(0x1.0p23f));
2518       break;
2519     case T2D:
2520       fmovd(tmp1, T, 0.5);
2521       mov(rscratch1, julong_cast(0x1.0p52));
2522       break;
2523     default:
2524       assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2525   }
2526   fadd(tmp1, T, tmp1, src);
2527   fcvtms(tmp1, T, tmp1);
2528   // tmp1 = floor(src + 0.5, ties to even)
2529 
2530   fcvtas(dst, T, src);
2531   // dst = round(src), ties to away
2532 
2533   fneg(tmp3, T, src);
2534   dup(tmp2, T, rscratch1);
2535   cm(HS, tmp3, T, tmp3, tmp2);
2536   // tmp3 is now a set of flags
2537 
2538   bif(dst, T16B, tmp1, tmp3);
2539   // result in dst
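  // Informal summary of the selection: tmp3 is all-ones in lanes where the bit pattern of
  // -src compares unsigned-higher-or-same against that of 2^23 (2^52 for T2D), i.e. where
  // src is >= +0.0, NaN, or <= -2^23; those lanes keep the ties-to-away conversion already
  // in dst, while the remaining (small negative) lanes take floor(src + 0.5) from tmp1.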
2540 }
2541 
2542 // Clobbers: rscratch1, rflags
2543 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2544                                          FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
2545   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2546   assert_different_registers(tmp1, tmp2, src, dst);
2547 
2548   switch (T) {
2549     case S:
2550       mov(rscratch1, jint_cast(0x1.0p23f));
2551       break;
2552     case D:
2553       mov(rscratch1, julong_cast(0x1.0p52));
2554       break;
2555     default:
2556       assert(T == S || T == D, "invalid register variant");
2557   }
2558 
2559   sve_frinta(dst, T, ptrue, src);
2560   // dst = round(src), ties to away
2561 
2562   Label none;
2563 
2564   sve_fneg(tmp1, T, ptrue, src);
2565   sve_dup(tmp2, T, rscratch1);
2566   sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
2567   br(EQ, none);
2568   {
2569     sve_cpy(tmp1, T, pgtmp, 0.5);
2570     sve_fadd(tmp1, T, pgtmp, src);
2571     sve_frintm(dst, T, pgtmp, tmp1);
2572     // dst = floor(src + 0.5, ties to even)
2573   }
2574   bind(none);
2575 
2576   sve_fcvtzs(dst, T, ptrue, dst, T);
2577   // result in dst
2578 }
2579 
2580 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2581                                            FloatRegister one, SIMD_Arrangement T) {
2582   assert_different_registers(dst, src, zero, one);
2583   assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2584 
2585   facgt(dst, T, src, zero);
2586   ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
2587   bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
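  // Informal note: after the shift, dst is 0x7ff..f in lanes where |src| > 0 and 0 elsewhere,
  // so the bsl yields (one & 0x7ff..f) | (src & sign bit), i.e. +-1.0 carrying src's sign, in
  // the non-zero lanes, and passes src through unchanged (preserving +-0.0 and NaN) otherwise.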
2588 }
2589 
2590 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2591                                           FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
2592     assert_different_registers(dst, src, zero, one, vtmp);
2593     assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2594 
2595     sve_orr(vtmp, src, src);
2596     sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN, 1 otherwise
2597     switch (T) {
2598     case S:
2599       sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
2600       sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
2601                                         // on the sign of the float value
2602       break;
2603     case D:
2604       sve_and(vtmp, T, min_jlong);
2605       sve_orr(vtmp, T, jlong_cast(1.0));
2606       break;
2607     default:
2608       assert(false, "unsupported");
2609       ShouldNotReachHere();
2610     }
2611     sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
2612                                        // Result in dst
2613 }
2614 
2615 bool C2_MacroAssembler::in_scratch_emit_size() {
2616   if (ciEnv::current()->task() != nullptr) {
2617     PhaseOutput* phase_output = Compile::current()->output();
2618     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2619       return true;
2620     }
2621   }
2622   return MacroAssembler::in_scratch_emit_size();
2623 }