1 /*
   2  * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "opto/c2_MacroAssembler.hpp"
  29 #include "opto/compile.hpp"
  30 #include "opto/intrinsicnode.hpp"
  31 #include "opto/matcher.hpp"
  32 #include "opto/output.hpp"
  33 #include "opto/subnode.hpp"
  34 #include "runtime/stubRoutines.hpp"
  35 #include "utilities/globalDefinitions.hpp"
  36 
  37 #ifdef PRODUCT
  38 #define BLOCK_COMMENT(str) /* nothing */
  39 #define STOP(error) stop(error)
  40 #else
  41 #define BLOCK_COMMENT(str) block_comment(str)
  42 #define STOP(error) block_comment(error); stop(error)
  43 #endif
  44 
  45 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  46 
  47 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
  48 
  49 void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg,
  50                                   Register tmp2Reg, Register tmp3Reg) {
  51   Register oop = objectReg;
  52   Register box = boxReg;
  53   Register disp_hdr = tmpReg;
  54   Register tmp = tmp2Reg;
  55   Label cont;
  56   Label object_has_monitor;
  57   Label count, no_count;
  58 
  59   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  60   assert_different_registers(oop, box, tmp, disp_hdr, rscratch2);
  61 
  62   // Load markWord from object into displaced_header.
  63   ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));
  64 
  65   if (DiagnoseSyncOnValueBasedClasses != 0) {
  66     load_klass(tmp, oop);
  67     ldrb(tmp, Address(tmp, Klass::misc_flags_offset()));
  68     tst(tmp, KlassFlags::_misc_is_value_based_class);
  69     br(Assembler::NE, cont);
  70   }
  71 
  72   // Check for existing monitor
  73   tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor);
  74 
  75   if (LockingMode == LM_MONITOR) {
  76     tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
  77     b(cont);
  78   } else {
  79     assert(LockingMode == LM_LEGACY, "must be");
  80     // Set tmp to be (markWord of object | UNLOCK_VALUE).
  81     orr(tmp, disp_hdr, markWord::unlocked_value);
  82 
  83     // Initialize the box. (Must happen before we update the object mark!)
  84     str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
  85 
  86     // Compare object markWord with an unlocked value (tmp) and if
  87     // equal exchange the stack address of our box with object markWord.
  88     // On failure disp_hdr contains the possibly locked markWord.
  89     cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true,
  90             /*release*/ true, /*weak*/ false, disp_hdr);
  91     br(Assembler::EQ, cont);
  92 
  93     assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
  94 
  95     // If the compare-and-exchange succeeded, then we found an unlocked
  96     // object, will have now locked it will continue at label cont
  97 
  98     // Check if the owner is self by comparing the value in the
  99     // markWord of object (disp_hdr) with the stack pointer.
 100     mov(rscratch1, sp);
 101     sub(disp_hdr, disp_hdr, rscratch1);
 102     mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place));
 103     // If condition is true we are cont and hence we can store 0 as the
 104     // displaced header in the box, which indicates that it is a recursive lock.
 105     ands(tmp/*==0?*/, disp_hdr, tmp);   // Sets flags for result
 106     str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 107     b(cont);
 108   }
 109 
 110   // Handle existing monitor.
 111   bind(object_has_monitor);
 112 
 113   // The object's monitor m is unlocked iff m->owner == nullptr,
 114   // otherwise m->owner may contain a thread id, a stack address for LM_LEGACY,
 115   // or the ANONYMOUS_OWNER constant for LM_LIGHTWEIGHT.
 116   //
 117   // Try to CAS m->owner from null to current thread.
 118   ldr(rscratch2, Address(rthread, JavaThread::lock_id_offset()));
 119   add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value));
 120   cmpxchg(tmp, zr, rscratch2, Assembler::xword, /*acquire*/ true,
 121           /*release*/ true, /*weak*/ false, tmp3Reg); // Sets flags for result
 122 
 123   // Store a non-null value into the box to avoid looking like a re-entrant
 124   // lock. The fast-path monitor unlock code checks for
 125   // markWord::monitor_value so use markWord::unused_mark which has the
 126   // relevant bit set, and also matches ObjectSynchronizer::enter.
 127   mov(tmp, (address)markWord::unused_mark().value());
 128   str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 129 
 130   br(Assembler::EQ, cont); // CAS success means locking succeeded
 131 
 132   cmp(tmp3Reg, rscratch2);
 133   br(Assembler::NE, cont); // Check for recursive locking
 134 
 135   // Recursive lock case
 136   increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1);
 137   // flag == EQ still from the cmp above, checking if this is a reentrant lock
 138 
 139   bind(cont);
 140   // flag == EQ indicates success
 141   // flag == NE indicates failure
 142   br(Assembler::NE, no_count);
 143 
 144   bind(count);
 145   if (LockingMode == LM_LEGACY) {
 146     inc_held_monitor_count();
 147   }
 148 
 149   bind(no_count);
 150 }
 151 
 152 void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg,
 153                                     Register tmp2Reg) {
 154   Register oop = objectReg;
 155   Register box = boxReg;
 156   Register disp_hdr = tmpReg;
 157   Register owner_addr = tmpReg;
 158   Register tmp = tmp2Reg;
 159   Label cont;
 160   Label object_has_monitor;
 161   Label count, no_count;
 162   Label unlocked;
 163 
 164   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 165   assert_different_registers(oop, box, tmp, disp_hdr);
 166 
 167   if (LockingMode == LM_LEGACY) {
 168     // Find the lock address and load the displaced header from the stack.
 169     ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 170 
 171     // If the displaced header is 0, we have a recursive unlock.
 172     cmp(disp_hdr, zr);
 173     br(Assembler::EQ, cont);
 174   }
 175 
 176   // Handle existing monitor.
 177   ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
 178   tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor);
 179 
 180   if (LockingMode == LM_MONITOR) {
 181     tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
 182     b(cont);
 183   } else {
 184     assert(LockingMode == LM_LEGACY, "must be");
 185     // Check if it is still a light weight lock, this is is true if we
 186     // see the stack address of the basicLock in the markWord of the
 187     // object.
 188 
 189     cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false,
 190             /*release*/ true, /*weak*/ false, tmp);
 191     b(cont);
 192   }
 193 
 194   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
 195 
 196   // Handle existing monitor.
 197   bind(object_has_monitor);
 198   STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
 199   add(tmp, tmp, -(int)markWord::monitor_value); // monitor
 200 
 201   ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
 202 
 203   Label notRecursive;
 204   cbz(disp_hdr, notRecursive);
 205 
 206   // Recursive lock
 207   sub(disp_hdr, disp_hdr, 1u);
 208   str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
 209   cmp(disp_hdr, disp_hdr); // Sets flags for result
 210   b(cont);
 211 
 212   bind(notRecursive);
 213 
 214   // Compute owner address.
 215   lea(owner_addr, Address(tmp, ObjectMonitor::owner_offset()));
 216 
 217   // Set owner to null.
 218   // Release to satisfy the JMM
 219   stlr(zr, owner_addr);
 220   // We need a full fence after clearing owner to avoid stranding.
 221   // StoreLoad achieves this.
 222   membar(StoreLoad);
 223 
 224   // Check if the entry lists are empty.
 225   ldr(rscratch1, Address(tmp, ObjectMonitor::EntryList_offset()));
 226   ldr(tmpReg, Address(tmp, ObjectMonitor::cxq_offset()));
 227   orr(rscratch1, rscratch1, tmpReg);
 228   cmp(rscratch1, zr);
 229   br(Assembler::EQ, cont);     // If so we are done.
 230 
 231   // Check if there is a successor.
 232   ldr(rscratch1, Address(tmp, ObjectMonitor::succ_offset()));
 233   cmp(rscratch1, zr);
 234   br(Assembler::NE, unlocked); // If so we are done.
 235 
 236   // Save the monitor pointer in the current thread, so we can try to
 237   // reacquire the lock in SharedRuntime::monitor_exit_helper().
 238   str(tmp, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));
 239 
 240   cmp(zr, rthread); // Set Flag to NE => slow path
 241   b(cont);
 242 
 243   bind(unlocked);
 244   cmp(zr, zr); // Set Flag to EQ => fast path
 245 
 246   // Intentional fall-through
 247 
 248   bind(cont);
 249   // flag == EQ indicates success
 250   // flag == NE indicates failure
 251   br(Assembler::NE, no_count);
 252 
 253   bind(count);
 254   if (LockingMode == LM_LEGACY) {
 255     dec_held_monitor_count();
 256   }
 257 
 258   bind(no_count);
 259 }
 260 
 261 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register t1,
 262                                               Register t2, Register t3) {
 263   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 264   assert_different_registers(obj, box, t1, t2, t3, rscratch2);
 265 
 266   // Handle inflated monitor.
 267   Label inflated;
 268   // Finish fast lock successfully. MUST branch to with flag == EQ
 269   Label locked;
 270   // Finish fast lock unsuccessfully. MUST branch to with flag == NE
 271   Label slow_path;
 272 
 273   if (UseObjectMonitorTable) {
 274     // Clear cache in case fast locking succeeds.
 275     str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 276   }
 277 
 278   if (DiagnoseSyncOnValueBasedClasses != 0) {
 279     load_klass(t1, obj);
 280     ldrb(t1, Address(t1, Klass::misc_flags_offset()));
 281     tst(t1, KlassFlags::_misc_is_value_based_class);
 282     br(Assembler::NE, slow_path);
 283   }
 284 
 285   const Register t1_mark = t1;
 286   const Register t3_t = t3;
 287 
 288   { // Lightweight locking
 289 
 290     // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ
 291     Label push;
 292 
 293     const Register t2_top = t2;
 294 
 295     // Check if lock-stack is full.
 296     ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 297     cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
 298     br(Assembler::GT, slow_path);
 299 
 300     // Check if recursive.
 301     subw(t3_t, t2_top, oopSize);
 302     ldr(t3_t, Address(rthread, t3_t));
 303     cmp(obj, t3_t);
 304     br(Assembler::EQ, push);
 305 
 306     // Relaxed normal load to check for monitor. Optimization for monitor case.
 307     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 308     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 309 
 310     // Not inflated
 311     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");
 312 
 313     // Try to lock. Transition lock-bits 0b01 => 0b00
 314     orr(t1_mark, t1_mark, markWord::unlocked_value);
 315     eor(t3_t, t1_mark, markWord::unlocked_value);
 316     cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
 317             /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
 318     br(Assembler::NE, slow_path);
 319 
 320     bind(push);
 321     // After successful lock, push object on lock-stack.
 322     str(obj, Address(rthread, t2_top));
 323     addw(t2_top, t2_top, oopSize);
 324     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 325     b(locked);
 326   }
 327 
 328   { // Handle inflated monitor.
 329     bind(inflated);
 330 
 331     const Register t1_monitor = t1;
 332 
 333     if (!UseObjectMonitorTable) {
 334       assert(t1_monitor == t1_mark, "should be the same here");
 335     } else {
 336       Label monitor_found;
 337 
 338       // Load cache address
 339       lea(t3_t, Address(rthread, JavaThread::om_cache_oops_offset()));
 340 
 341       const int num_unrolled = 2;
 342       for (int i = 0; i < num_unrolled; i++) {
 343         ldr(t1, Address(t3_t));
 344         cmp(obj, t1);
 345         br(Assembler::EQ, monitor_found);
 346         increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
 347       }
 348 
 349       Label loop;
 350 
 351       // Search for obj in cache.
 352       bind(loop);
 353 
 354       // Check for match.
 355       ldr(t1, Address(t3_t));
 356       cmp(obj, t1);
 357       br(Assembler::EQ, monitor_found);
 358 
 359       // Search until null encountered, guaranteed _null_sentinel at end.
 360       increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
 361       cbnz(t1, loop);
 362       // Cache Miss, NE set from cmp above, cbnz does not set flags
 363       b(slow_path);
 364 
 365       bind(monitor_found);
 366       ldr(t1_monitor, Address(t3_t, OMCache::oop_to_monitor_difference()));
 367     }
 368 
 369     const Register t2_owner_addr = t2;
 370     const Register t3_owner = t3;
 371     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 372     const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
 373     const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);
 374 
 375     Label monitor_locked;
 376 
 377     // Compute owner address.
 378     lea(t2_owner_addr, owner_address);
 379 
 380     // CAS owner (null => current thread id).
 381     ldr(rscratch2, Address(rthread, JavaThread::lock_id_offset()));
 382     cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, /*acquire*/ true,
 383             /*release*/ false, /*weak*/ false, t3_owner);
 384     br(Assembler::EQ, monitor_locked);
 385 
 386     // Check if recursive.
 387     cmp(t3_owner, rscratch2);
 388     br(Assembler::NE, slow_path);
 389 
 390     // Recursive.
 391     increment(recursions_address, 1);
 392 
 393     bind(monitor_locked);
 394     if (UseObjectMonitorTable) {
 395       str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 396     }
 397   }
 398 
 399   bind(locked);
 400 
 401 #ifdef ASSERT
 402   // Check that locked label is reached with Flags == EQ.
 403   Label flag_correct;
 404   br(Assembler::EQ, flag_correct);
 405   stop("Fast Lock Flag != EQ");
 406 #endif
 407 
 408   bind(slow_path);
 409 #ifdef ASSERT
 410   // Check that slow_path label is reached with Flags == NE.
 411   br(Assembler::NE, flag_correct);
 412   stop("Fast Lock Flag != NE");
 413   bind(flag_correct);
 414 #endif
 415   // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
 416 }
 417 
 418 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register box, Register t1,
 419                                                 Register t2, Register t3) {
 420   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 421   assert_different_registers(obj, box, t1, t2, t3);
 422 
 423   // Handle inflated monitor.
 424   Label inflated, inflated_load_mark;
 425   // Finish fast unlock successfully. MUST branch to with flag == EQ
 426   Label unlocked;
 427   // Finish fast unlock unsuccessfully. MUST branch to with flag == NE
 428   Label slow_path;
 429 
 430   const Register t1_mark = t1;
 431   const Register t2_top = t2;
 432   const Register t3_t = t3;
 433 
 434   { // Lightweight unlock
 435 
 436     Label push_and_slow_path;
 437 
 438     // Check if obj is top of lock-stack.
 439     ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 440     subw(t2_top, t2_top, oopSize);
 441     ldr(t3_t, Address(rthread, t2_top));
 442     cmp(obj, t3_t);
 443     // Top of lock stack was not obj. Must be monitor.
 444     br(Assembler::NE, inflated_load_mark);
 445 
 446     // Pop lock-stack.
 447     DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
 448     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 449 
 450     // Check if recursive.
 451     subw(t3_t, t2_top, oopSize);
 452     ldr(t3_t, Address(rthread, t3_t));
 453     cmp(obj, t3_t);
 454     br(Assembler::EQ, unlocked);
 455 
 456     // Not recursive.
 457     // Load Mark.
 458     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 459 
 460     // Check header for monitor (0b10).
 461     // Because we got here by popping (meaning we pushed in locked)
 462     // there will be no monitor in the box. So we need to push back the obj
 463     // so that the runtime can fix any potential anonymous owner.
 464     tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);
 465 
 466     // Try to unlock. Transition lock bits 0b00 => 0b01
 467     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
 468     orr(t3_t, t1_mark, markWord::unlocked_value);
 469     cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
 470             /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
 471     br(Assembler::EQ, unlocked);
 472 
 473     bind(push_and_slow_path);
 474     // Compare and exchange failed.
 475     // Restore lock-stack and handle the unlock in runtime.
 476     DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
 477     addw(t2_top, t2_top, oopSize);
 478     str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 479     b(slow_path);
 480   }
 481 
 482 
 483   { // Handle inflated monitor.
 484     bind(inflated_load_mark);
 485     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 486 #ifdef ASSERT
 487     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 488     stop("Fast Unlock not monitor");
 489 #endif
 490 
 491     bind(inflated);
 492 
 493 #ifdef ASSERT
 494     Label check_done;
 495     subw(t2_top, t2_top, oopSize);
 496     cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
 497     br(Assembler::LT, check_done);
 498     ldr(t3_t, Address(rthread, t2_top));
 499     cmp(obj, t3_t);
 500     br(Assembler::NE, inflated);
 501     stop("Fast Unlock lock on stack");
 502     bind(check_done);
 503 #endif
 504 
 505     const Register t1_monitor = t1;
 506 
 507     if (!UseObjectMonitorTable) {
 508       assert(t1_monitor == t1_mark, "should be the same here");
 509 
 510       // Untag the monitor.
 511       add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
 512     } else {
 513       ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 514       // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
 515       cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
 516       br(Assembler::LO, slow_path);
 517     }
 518 
 519     const Register t2_recursions = t2;
 520     Label not_recursive;
 521 
 522     // Check if recursive.
 523     ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
 524     cbz(t2_recursions, not_recursive);
 525 
 526     // Recursive unlock.
 527     sub(t2_recursions, t2_recursions, 1u);
 528     str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
 529     // Set flag == EQ
 530     cmp(t2_recursions, t2_recursions);
 531     b(unlocked);
 532 
 533     bind(not_recursive);
 534 
 535     const Register t2_owner_addr = t2;
 536 
 537     // Compute owner address.
 538     lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));
 539 
 540     // Set owner to null.
 541     // Release to satisfy the JMM
 542     stlr(zr, t2_owner_addr);
 543     // We need a full fence after clearing owner to avoid stranding.
 544     // StoreLoad achieves this.
 545     membar(StoreLoad);
 546 
 547     // Check if the entry lists are empty.
 548     ldr(rscratch1, Address(t1_monitor, ObjectMonitor::EntryList_offset()));
 549     ldr(t3_t, Address(t1_monitor, ObjectMonitor::cxq_offset()));
 550     orr(rscratch1, rscratch1, t3_t);
 551     cmp(rscratch1, zr);
 552     br(Assembler::EQ, unlocked);  // If so we are done.
 553 
 554     // Check if there is a successor.
 555     ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
 556     cmp(rscratch1, zr);
 557     br(Assembler::NE, unlocked);  // If so we are done.
 558 
 559     // Save the monitor pointer in the current thread, so we can try to
 560     // reacquire the lock in SharedRuntime::monitor_exit_helper().
 561     str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));
 562 
 563     cmp(zr, rthread); // Set Flag to NE => slow path
 564     b(slow_path);
 565   }
 566 
 567   bind(unlocked);
 568   cmp(zr, zr); // Set Flags to EQ => fast path
 569 
 570 #ifdef ASSERT
 571   // Check that unlocked label is reached with Flags == EQ.
 572   Label flag_correct;
 573   br(Assembler::EQ, flag_correct);
 574   stop("Fast Unlock Flag != EQ");
 575 #endif
 576 
 577   bind(slow_path);
 578 #ifdef ASSERT
 579   // Check that slow_path label is reached with Flags == NE.
 580   br(Assembler::NE, flag_correct);
 581   stop("Fast Unlock Flag != NE");
 582   bind(flag_correct);
 583 #endif
 584   // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
 585 }
 586 
 587 // Search for str1 in str2 and return index or -1
 588 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
 589 void C2_MacroAssembler::string_indexof(Register str2, Register str1,
 590                                        Register cnt2, Register cnt1,
 591                                        Register tmp1, Register tmp2,
 592                                        Register tmp3, Register tmp4,
 593                                        Register tmp5, Register tmp6,
 594                                        int icnt1, Register result, int ae) {
 595   // NOTE: tmp5, tmp6 can be zr depending on specific method version
 596   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
 597 
 598   Register ch1 = rscratch1;
 599   Register ch2 = rscratch2;
 600   Register cnt1tmp = tmp1;
 601   Register cnt2tmp = tmp2;
 602   Register cnt1_neg = cnt1;
 603   Register cnt2_neg = cnt2;
 604   Register result_tmp = tmp4;
 605 
 606   bool isL = ae == StrIntrinsicNode::LL;
 607 
 608   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
 609   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
 610   int str1_chr_shift = str1_isL ? 0:1;
 611   int str2_chr_shift = str2_isL ? 0:1;
 612   int str1_chr_size = str1_isL ? 1:2;
 613   int str2_chr_size = str2_isL ? 1:2;
 614   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
 615                                       (chr_insn)&MacroAssembler::ldrh;
 616   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
 617                                       (chr_insn)&MacroAssembler::ldrh;
 618   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
 619   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
 620 
 621   // Note, inline_string_indexOf() generates checks:
 622   // if (substr.count > string.count) return -1;
 623   // if (substr.count == 0) return 0;
 624 
 625   // We have two strings, a source string in str2, cnt2 and a pattern string
 626   // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
 627 
 628   // For larger pattern and source we use a simplified Boyer Moore algorithm.
 629   // With a small pattern and source we use linear scan.
 630 
 631   if (icnt1 == -1) {
 632     sub(result_tmp, cnt2, cnt1);
 633     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
 634     br(LT, LINEARSEARCH);
 635     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
 636     subs(zr, cnt1, 256);
 637     lsr(tmp1, cnt2, 2);
 638     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
 639     br(GE, LINEARSTUB);
 640   }
 641 
 642 // The Boyer Moore alogorithm is based on the description here:-
 643 //
 644 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
 645 //
 646 // This describes and algorithm with 2 shift rules. The 'Bad Character' rule
 647 // and the 'Good Suffix' rule.
 648 //
 649 // These rules are essentially heuristics for how far we can shift the
 650 // pattern along the search string.
 651 //
 652 // The implementation here uses the 'Bad Character' rule only because of the
 653 // complexity of initialisation for the 'Good Suffix' rule.
 654 //
 655 // This is also known as the Boyer-Moore-Horspool algorithm:-
 656 //
 657 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
 658 //
 659 // This particular implementation has few java-specific optimizations.
 660 //
 661 // #define ASIZE 256
 662 //
 663 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
 664 //       int i, j;
 665 //       unsigned c;
 666 //       unsigned char bc[ASIZE];
 667 //
 668 //       /* Preprocessing */
 669 //       for (i = 0; i < ASIZE; ++i)
 670 //          bc[i] = m;
 671 //       for (i = 0; i < m - 1; ) {
 672 //          c = x[i];
 673 //          ++i;
 674 //          // c < 256 for Latin1 string, so, no need for branch
 675 //          #ifdef PATTERN_STRING_IS_LATIN1
 676 //          bc[c] = m - i;
 677 //          #else
 678 //          if (c < ASIZE) bc[c] = m - i;
 679 //          #endif
 680 //       }
 681 //
 682 //       /* Searching */
 683 //       j = 0;
 684 //       while (j <= n - m) {
 685 //          c = y[i+j];
 686 //          if (x[m-1] == c)
 687 //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
 688 //          if (i < 0) return j;
 689 //          // c < 256 for Latin1 string, so, no need for branch
 690 //          #ifdef SOURCE_STRING_IS_LATIN1
 691 //          // LL case: (c< 256) always true. Remove branch
 692 //          j += bc[y[j+m-1]];
 693 //          #endif
 694 //          #ifndef PATTERN_STRING_IS_UTF
 695 //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
 696 //          if (c < ASIZE)
 697 //            j += bc[y[j+m-1]];
 698 //          else
 699 //            j += 1
 700 //          #endif
 701 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
 702 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
 703 //          if (c < ASIZE)
 704 //            j += bc[y[j+m-1]];
 705 //          else
 706 //            j += m
 707 //          #endif
 708 //       }
 709 //    }
 710 
 711   if (icnt1 == -1) {
 712     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
 713         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
 714     Register cnt1end = tmp2;
 715     Register str2end = cnt2;
 716     Register skipch = tmp2;
 717 
 718     // str1 length is >=8, so, we can read at least 1 register for cases when
 719     // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
 720     // UL case. We'll re-read last character in inner pre-loop code to have
 721     // single outer pre-loop load
 722     const int firstStep = isL ? 7 : 3;
 723 
 724     const int ASIZE = 256;
 725     const int STORED_BYTES = 32; // amount of bytes stored per instruction
 726     sub(sp, sp, ASIZE);
 727     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
 728     mov(ch1, sp);
 729     BIND(BM_INIT_LOOP);
 730       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
 731       subs(tmp5, tmp5, 1);
 732       br(GT, BM_INIT_LOOP);
 733 
 734       sub(cnt1tmp, cnt1, 1);
 735       mov(tmp5, str2);
 736       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
 737       sub(ch2, cnt1, 1);
 738       mov(tmp3, str1);
 739     BIND(BCLOOP);
 740       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
 741       if (!str1_isL) {
 742         subs(zr, ch1, ASIZE);
 743         br(HS, BCSKIP);
 744       }
 745       strb(ch2, Address(sp, ch1));
 746     BIND(BCSKIP);
 747       subs(ch2, ch2, 1);
 748       br(GT, BCLOOP);
 749 
 750       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
 751       if (str1_isL == str2_isL) {
 752         // load last 8 bytes (8LL/4UU symbols)
 753         ldr(tmp6, Address(tmp6, -wordSize));
 754       } else {
 755         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
 756         // convert Latin1 to UTF. We'll have to wait until load completed, but
 757         // it's still faster than per-character loads+checks
 758         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
 759         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
 760         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
 761         andr(tmp6, tmp6, 0xFF); // str1[N-4]
 762         orr(ch2, ch1, ch2, LSL, 16);
 763         orr(tmp6, tmp6, tmp3, LSL, 48);
 764         orr(tmp6, tmp6, ch2, LSL, 16);
 765       }
 766     BIND(BMLOOPSTR2);
 767       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 768       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
 769       if (str1_isL == str2_isL) {
 770         // re-init tmp3. It's for free because it's executed in parallel with
 771         // load above. Alternative is to initialize it before loop, but it'll
 772         // affect performance on in-order systems with 2 or more ld/st pipelines
 773         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
 774       }
 775       if (!isL) { // UU/UL case
 776         lsl(ch2, cnt1tmp, 1); // offset in bytes
 777       }
 778       cmp(tmp3, skipch);
 779       br(NE, BMSKIP);
 780       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
 781       mov(ch1, tmp6);
 782       if (isL) {
 783         b(BMLOOPSTR1_AFTER_LOAD);
 784       } else {
 785         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
 786         b(BMLOOPSTR1_CMP);
 787       }
 788     BIND(BMLOOPSTR1);
 789       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
 790       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 791     BIND(BMLOOPSTR1_AFTER_LOAD);
 792       subs(cnt1tmp, cnt1tmp, 1);
 793       br(LT, BMLOOPSTR1_LASTCMP);
 794     BIND(BMLOOPSTR1_CMP);
 795       cmp(ch1, ch2);
 796       br(EQ, BMLOOPSTR1);
 797     BIND(BMSKIP);
 798       if (!isL) {
 799         // if we've met UTF symbol while searching Latin1 pattern, then we can
 800         // skip cnt1 symbols
 801         if (str1_isL != str2_isL) {
 802           mov(result_tmp, cnt1);
 803         } else {
 804           mov(result_tmp, 1);
 805         }
 806         subs(zr, skipch, ASIZE);
 807         br(HS, BMADV);
 808       }
 809       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
 810     BIND(BMADV);
 811       sub(cnt1tmp, cnt1, 1);
 812       add(str2, str2, result_tmp, LSL, str2_chr_shift);
 813       cmp(str2, str2end);
 814       br(LE, BMLOOPSTR2);
 815       add(sp, sp, ASIZE);
 816       b(NOMATCH);
 817     BIND(BMLOOPSTR1_LASTCMP);
 818       cmp(ch1, ch2);
 819       br(NE, BMSKIP);
 820     BIND(BMMATCH);
 821       sub(result, str2, tmp5);
 822       if (!str2_isL) lsr(result, result, 1);
 823       add(sp, sp, ASIZE);
 824       b(DONE);
 825 
 826     BIND(LINEARSTUB);
 827     cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
 828     br(LT, LINEAR_MEDIUM);
 829     mov(result, zr);
 830     RuntimeAddress stub = nullptr;
 831     if (isL) {
 832       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
 833       assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
 834     } else if (str1_isL) {
 835       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
 836        assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
 837     } else {
 838       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
 839       assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
 840     }
 841     address call = trampoline_call(stub);
 842     if (call == nullptr) {
 843       DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
 844       ciEnv::current()->record_failure("CodeCache is full");
 845       return;
 846     }
 847     b(DONE);
 848   }
 849 
 850   BIND(LINEARSEARCH);
 851   {
 852     Label DO1, DO2, DO3;
 853 
 854     Register str2tmp = tmp2;
 855     Register first = tmp3;
 856 
 857     if (icnt1 == -1)
 858     {
 859         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
 860 
 861         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
 862         br(LT, DOSHORT);
 863       BIND(LINEAR_MEDIUM);
 864         (this->*str1_load_1chr)(first, Address(str1));
 865         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
 866         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
 867         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 868         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 869 
 870       BIND(FIRST_LOOP);
 871         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 872         cmp(first, ch2);
 873         br(EQ, STR1_LOOP);
 874       BIND(STR2_NEXT);
 875         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 876         br(LE, FIRST_LOOP);
 877         b(NOMATCH);
 878 
 879       BIND(STR1_LOOP);
 880         adds(cnt1tmp, cnt1_neg, str1_chr_size);
 881         add(cnt2tmp, cnt2_neg, str2_chr_size);
 882         br(GE, MATCH);
 883 
 884       BIND(STR1_NEXT);
 885         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
 886         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 887         cmp(ch1, ch2);
 888         br(NE, STR2_NEXT);
 889         adds(cnt1tmp, cnt1tmp, str1_chr_size);
 890         add(cnt2tmp, cnt2tmp, str2_chr_size);
 891         br(LT, STR1_NEXT);
 892         b(MATCH);
 893 
 894       BIND(DOSHORT);
 895       if (str1_isL == str2_isL) {
 896         cmp(cnt1, (u1)2);
 897         br(LT, DO1);
 898         br(GT, DO3);
 899       }
 900     }
 901 
 902     if (icnt1 == 4) {
 903       Label CH1_LOOP;
 904 
 905         (this->*load_4chr)(ch1, str1);
 906         sub(result_tmp, cnt2, 4);
 907         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 908         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 909 
 910       BIND(CH1_LOOP);
 911         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
 912         cmp(ch1, ch2);
 913         br(EQ, MATCH);
 914         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 915         br(LE, CH1_LOOP);
 916         b(NOMATCH);
 917       }
 918 
 919     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
 920       Label CH1_LOOP;
 921 
 922       BIND(DO2);
 923         (this->*load_2chr)(ch1, str1);
 924         if (icnt1 == 2) {
 925           sub(result_tmp, cnt2, 2);
 926         }
 927         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 928         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 929       BIND(CH1_LOOP);
 930         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 931         cmp(ch1, ch2);
 932         br(EQ, MATCH);
 933         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 934         br(LE, CH1_LOOP);
 935         b(NOMATCH);
 936     }
 937 
 938     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
 939       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
 940 
 941       BIND(DO3);
 942         (this->*load_2chr)(first, str1);
 943         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
 944         if (icnt1 == 3) {
 945           sub(result_tmp, cnt2, 3);
 946         }
 947         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 948         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 949       BIND(FIRST_LOOP);
 950         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 951         cmpw(first, ch2);
 952         br(EQ, STR1_LOOP);
 953       BIND(STR2_NEXT);
 954         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 955         br(LE, FIRST_LOOP);
 956         b(NOMATCH);
 957 
 958       BIND(STR1_LOOP);
 959         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
 960         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 961         cmp(ch1, ch2);
 962         br(NE, STR2_NEXT);
 963         b(MATCH);
 964     }
 965 
 966     if (icnt1 == -1 || icnt1 == 1) {
 967       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
 968 
 969       BIND(DO1);
 970         (this->*str1_load_1chr)(ch1, str1);
 971         cmp(cnt2, (u1)8);
 972         br(LT, DO1_SHORT);
 973 
 974         sub(result_tmp, cnt2, 8/str2_chr_size);
 975         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 976         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 977         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 978 
 979         if (str2_isL) {
 980           orr(ch1, ch1, ch1, LSL, 8);
 981         }
 982         orr(ch1, ch1, ch1, LSL, 16);
 983         orr(ch1, ch1, ch1, LSL, 32);
 984       BIND(CH1_LOOP);
 985         ldr(ch2, Address(str2, cnt2_neg));
 986         eor(ch2, ch1, ch2);
 987         sub(tmp1, ch2, tmp3);
 988         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 989         bics(tmp1, tmp1, tmp2);
 990         br(NE, HAS_ZERO);
 991         adds(cnt2_neg, cnt2_neg, 8);
 992         br(LT, CH1_LOOP);
 993 
 994         cmp(cnt2_neg, (u1)8);
 995         mov(cnt2_neg, 0);
 996         br(LT, CH1_LOOP);
 997         b(NOMATCH);
 998 
 999       BIND(HAS_ZERO);
1000         rev(tmp1, tmp1);
1001         clz(tmp1, tmp1);
1002         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
1003         b(MATCH);
1004 
1005       BIND(DO1_SHORT);
1006         mov(result_tmp, cnt2);
1007         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
1008         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
1009       BIND(DO1_LOOP);
1010         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
1011         cmpw(ch1, ch2);
1012         br(EQ, MATCH);
1013         adds(cnt2_neg, cnt2_neg, str2_chr_size);
1014         br(LT, DO1_LOOP);
1015     }
1016   }
1017   BIND(NOMATCH);
1018     mov(result, -1);
1019     b(DONE);
1020   BIND(MATCH);
1021     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
1022   BIND(DONE);
1023 }
1024 
1025 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
1026 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
1027 
1028 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
1029                                             Register ch, Register result,
1030                                             Register tmp1, Register tmp2, Register tmp3)
1031 {
1032   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
1033   Register cnt1_neg = cnt1;
1034   Register ch1 = rscratch1;
1035   Register result_tmp = rscratch2;
1036 
1037   cbz(cnt1, NOMATCH);
1038 
1039   cmp(cnt1, (u1)4);
1040   br(LT, DO1_SHORT);
1041 
1042   orr(ch, ch, ch, LSL, 16);
1043   orr(ch, ch, ch, LSL, 32);
1044 
1045   sub(cnt1, cnt1, 4);
1046   mov(result_tmp, cnt1);
1047   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
1048   sub(cnt1_neg, zr, cnt1, LSL, 1);
1049 
1050   mov(tmp3, 0x0001000100010001);
1051 
1052   BIND(CH1_LOOP);
1053     ldr(ch1, Address(str1, cnt1_neg));
1054     eor(ch1, ch, ch1);
1055     sub(tmp1, ch1, tmp3);
1056     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
1057     bics(tmp1, tmp1, tmp2);
1058     br(NE, HAS_ZERO);
1059     adds(cnt1_neg, cnt1_neg, 8);
1060     br(LT, CH1_LOOP);
1061 
1062     cmp(cnt1_neg, (u1)8);
1063     mov(cnt1_neg, 0);
1064     br(LT, CH1_LOOP);
1065     b(NOMATCH);
1066 
1067   BIND(HAS_ZERO);
1068     rev(tmp1, tmp1);
1069     clz(tmp1, tmp1);
1070     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
1071     b(MATCH);
1072 
1073   BIND(DO1_SHORT);
1074     mov(result_tmp, cnt1);
1075     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
1076     sub(cnt1_neg, zr, cnt1, LSL, 1);
1077   BIND(DO1_LOOP);
1078     ldrh(ch1, Address(str1, cnt1_neg));
1079     cmpw(ch, ch1);
1080     br(EQ, MATCH);
1081     adds(cnt1_neg, cnt1_neg, 2);
1082     br(LT, DO1_LOOP);
1083   BIND(NOMATCH);
1084     mov(result, -1);
1085     b(DONE);
1086   BIND(MATCH);
1087     add(result, result_tmp, cnt1_neg, ASR, 1);
1088   BIND(DONE);
1089 }
1090 
1091 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
1092                                                 Register ch, Register result,
1093                                                 FloatRegister ztmp1,
1094                                                 FloatRegister ztmp2,
1095                                                 PRegister tmp_pg,
1096                                                 PRegister tmp_pdn, bool isL)
1097 {
1098   // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
1099   assert(tmp_pg->is_governing(),
1100          "this register has to be a governing predicate register");
1101 
1102   Label LOOP, MATCH, DONE, NOMATCH;
1103   Register vec_len = rscratch1;
1104   Register idx = rscratch2;
1105 
1106   SIMD_RegVariant T = (isL == true) ? B : H;
1107 
1108   cbz(cnt1, NOMATCH);
1109 
1110   // Assign the particular char throughout the vector.
1111   sve_dup(ztmp2, T, ch);
1112   if (isL) {
1113     sve_cntb(vec_len);
1114   } else {
1115     sve_cnth(vec_len);
1116   }
1117   mov(idx, 0);
1118 
1119   // Generate a predicate to control the reading of input string.
1120   sve_whilelt(tmp_pg, T, idx, cnt1);
1121 
1122   BIND(LOOP);
1123     // Read a vector of 8- or 16-bit data depending on the string type. Note
1124     // that inactive elements indicated by the predicate register won't cause
1125     // a data read from memory to the destination vector.
1126     if (isL) {
1127       sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
1128     } else {
1129       sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
1130     }
1131     add(idx, idx, vec_len);
1132 
1133     // Perform the comparison. An element of the destination predicate is set
1134     // to active if the particular char is matched.
1135     sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);
1136 
1137     // Branch if the particular char is found.
1138     br(NE, MATCH);
1139 
1140     sve_whilelt(tmp_pg, T, idx, cnt1);
1141 
1142     // Loop back if the particular char not found.
1143     br(MI, LOOP);
1144 
1145   BIND(NOMATCH);
1146     mov(result, -1);
1147     b(DONE);
1148 
1149   BIND(MATCH);
1150     // Undo the index increment.
1151     sub(idx, idx, vec_len);
1152 
1153     // Crop the vector to find its location.
1154     sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
1155     add(result, idx, -1);
1156     sve_incp(result, T, tmp_pdn);
1157   BIND(DONE);
1158 }
1159 
1160 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
1161                                             Register ch, Register result,
1162                                             Register tmp1, Register tmp2, Register tmp3)
1163 {
1164   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
1165   Register cnt1_neg = cnt1;
1166   Register ch1 = rscratch1;
1167   Register result_tmp = rscratch2;
1168 
1169   cbz(cnt1, NOMATCH);
1170 
1171   cmp(cnt1, (u1)8);
1172   br(LT, DO1_SHORT);
1173 
1174   orr(ch, ch, ch, LSL, 8);
1175   orr(ch, ch, ch, LSL, 16);
1176   orr(ch, ch, ch, LSL, 32);
1177 
1178   sub(cnt1, cnt1, 8);
1179   mov(result_tmp, cnt1);
1180   lea(str1, Address(str1, cnt1));
1181   sub(cnt1_neg, zr, cnt1);
1182 
1183   mov(tmp3, 0x0101010101010101);
1184 
1185   BIND(CH1_LOOP);
1186     ldr(ch1, Address(str1, cnt1_neg));
1187     eor(ch1, ch, ch1);
1188     sub(tmp1, ch1, tmp3);
1189     orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
1190     bics(tmp1, tmp1, tmp2);
1191     br(NE, HAS_ZERO);
1192     adds(cnt1_neg, cnt1_neg, 8);
1193     br(LT, CH1_LOOP);
1194 
1195     cmp(cnt1_neg, (u1)8);
1196     mov(cnt1_neg, 0);
1197     br(LT, CH1_LOOP);
1198     b(NOMATCH);
1199 
1200   BIND(HAS_ZERO);
1201     rev(tmp1, tmp1);
1202     clz(tmp1, tmp1);
1203     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
1204     b(MATCH);
1205 
1206   BIND(DO1_SHORT);
1207     mov(result_tmp, cnt1);
1208     lea(str1, Address(str1, cnt1));
1209     sub(cnt1_neg, zr, cnt1);
1210   BIND(DO1_LOOP);
1211     ldrb(ch1, Address(str1, cnt1_neg));
1212     cmp(ch, ch1);
1213     br(EQ, MATCH);
1214     adds(cnt1_neg, cnt1_neg, 1);
1215     br(LT, DO1_LOOP);
1216   BIND(NOMATCH);
1217     mov(result, -1);
1218     b(DONE);
1219   BIND(MATCH);
1220     add(result, result_tmp, cnt1_neg);
1221   BIND(DONE);
1222 }
1223 
1224 // Compare strings.
1225 void C2_MacroAssembler::string_compare(Register str1, Register str2,
1226     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
1227     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
1228     PRegister pgtmp1, PRegister pgtmp2, int ae) {
1229   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
1230       DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
1231       SHORT_LOOP_START, TAIL_CHECK;
1232 
1233   bool isLL = ae == StrIntrinsicNode::LL;
1234   bool isLU = ae == StrIntrinsicNode::LU;
1235   bool isUL = ae == StrIntrinsicNode::UL;
1236 
1237   // The stub threshold for LL strings is: 72 (64 + 8) chars
1238   // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
1239   // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
1240   const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);
1241 
1242   bool str1_isL = isLL || isLU;
1243   bool str2_isL = isLL || isUL;
1244 
1245   int str1_chr_shift = str1_isL ? 0 : 1;
1246   int str2_chr_shift = str2_isL ? 0 : 1;
1247   int str1_chr_size = str1_isL ? 1 : 2;
1248   int str2_chr_size = str2_isL ? 1 : 2;
1249   int minCharsInWord = isLL ? wordSize : wordSize/2;
1250 
1251   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
1252   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
1253                                       (chr_insn)&MacroAssembler::ldrh;
1254   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
1255                                       (chr_insn)&MacroAssembler::ldrh;
1256   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
1257                             (uxt_insn)&MacroAssembler::uxthw;
1258 
1259   BLOCK_COMMENT("string_compare {");
1260 
1261   // Bizarrely, the counts are passed in bytes, regardless of whether they
1262   // are L or U strings, however the result is always in characters.
1263   if (!str1_isL) asrw(cnt1, cnt1, 1);
1264   if (!str2_isL) asrw(cnt2, cnt2, 1);
1265 
1266   // Compute the minimum of the string lengths and save the difference.
1267   subsw(result, cnt1, cnt2);
1268   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
1269 
1270   // A very short string
1271   cmpw(cnt2, minCharsInWord);
1272   br(Assembler::LE, SHORT_STRING);
1273 
1274   // Compare longwords
1275   // load first parts of strings and finish initialization while loading
1276   {
1277     if (str1_isL == str2_isL) { // LL or UU
1278       ldr(tmp1, Address(str1));
1279       cmp(str1, str2);
1280       br(Assembler::EQ, DONE);
1281       ldr(tmp2, Address(str2));
1282       cmp(cnt2, stub_threshold);
1283       br(GE, STUB);
1284       subsw(cnt2, cnt2, minCharsInWord);
1285       br(EQ, TAIL_CHECK);
1286       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1287       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1288       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1289     } else if (isLU) {
1290       ldrs(vtmp, Address(str1));
1291       ldr(tmp2, Address(str2));
1292       cmp(cnt2, stub_threshold);
1293       br(GE, STUB);
1294       subw(cnt2, cnt2, 4);
1295       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1296       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1297       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1298       zip1(vtmp, T8B, vtmp, vtmpZ);
1299       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1300       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1301       add(cnt1, cnt1, 4);
1302       fmovd(tmp1, vtmp);
1303     } else { // UL case
1304       ldr(tmp1, Address(str1));
1305       ldrs(vtmp, Address(str2));
1306       cmp(cnt2, stub_threshold);
1307       br(GE, STUB);
1308       subw(cnt2, cnt2, 4);
1309       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1310       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1311       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1312       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1313       zip1(vtmp, T8B, vtmp, vtmpZ);
1314       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1315       add(cnt1, cnt1, 8);
1316       fmovd(tmp2, vtmp);
1317     }
1318     adds(cnt2, cnt2, isUL ? 4 : 8);
1319     br(GE, TAIL);
1320     eor(rscratch2, tmp1, tmp2);
1321     cbnz(rscratch2, DIFF);
1322     // main loop
1323     bind(NEXT_WORD);
1324     if (str1_isL == str2_isL) {
1325       ldr(tmp1, Address(str1, cnt2));
1326       ldr(tmp2, Address(str2, cnt2));
1327       adds(cnt2, cnt2, 8);
1328     } else if (isLU) {
1329       ldrs(vtmp, Address(str1, cnt1));
1330       ldr(tmp2, Address(str2, cnt2));
1331       add(cnt1, cnt1, 4);
1332       zip1(vtmp, T8B, vtmp, vtmpZ);
1333       fmovd(tmp1, vtmp);
1334       adds(cnt2, cnt2, 8);
1335     } else { // UL
1336       ldrs(vtmp, Address(str2, cnt2));
1337       ldr(tmp1, Address(str1, cnt1));
1338       zip1(vtmp, T8B, vtmp, vtmpZ);
1339       add(cnt1, cnt1, 8);
1340       fmovd(tmp2, vtmp);
1341       adds(cnt2, cnt2, 4);
1342     }
1343     br(GE, TAIL);
1344 
1345     eor(rscratch2, tmp1, tmp2);
1346     cbz(rscratch2, NEXT_WORD);
1347     b(DIFF);
1348     bind(TAIL);
1349     eor(rscratch2, tmp1, tmp2);
1350     cbnz(rscratch2, DIFF);
1351     // Last longword.  In the case where length == 4 we compare the
1352     // same longword twice, but that's still faster than another
1353     // conditional branch.
1354     if (str1_isL == str2_isL) {
1355       ldr(tmp1, Address(str1));
1356       ldr(tmp2, Address(str2));
1357     } else if (isLU) {
1358       ldrs(vtmp, Address(str1));
1359       ldr(tmp2, Address(str2));
1360       zip1(vtmp, T8B, vtmp, vtmpZ);
1361       fmovd(tmp1, vtmp);
1362     } else { // UL
1363       ldrs(vtmp, Address(str2));
1364       ldr(tmp1, Address(str1));
1365       zip1(vtmp, T8B, vtmp, vtmpZ);
1366       fmovd(tmp2, vtmp);
1367     }
1368     bind(TAIL_CHECK);
1369     eor(rscratch2, tmp1, tmp2);
1370     cbz(rscratch2, DONE);
1371 
1372     // Find the first different characters in the longwords and
1373     // compute their difference.
1374     bind(DIFF);
1375     rev(rscratch2, rscratch2);
1376     clz(rscratch2, rscratch2);
1377     andr(rscratch2, rscratch2, isLL ? -8 : -16);
1378     lsrv(tmp1, tmp1, rscratch2);
1379     (this->*ext_chr)(tmp1, tmp1);
1380     lsrv(tmp2, tmp2, rscratch2);
1381     (this->*ext_chr)(tmp2, tmp2);
1382     subw(result, tmp1, tmp2);
1383     b(DONE);
1384   }
1385 
1386   bind(STUB);
1387     RuntimeAddress stub = nullptr;
1388     switch(ae) {
1389       case StrIntrinsicNode::LL:
1390         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
1391         break;
1392       case StrIntrinsicNode::UU:
1393         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
1394         break;
1395       case StrIntrinsicNode::LU:
1396         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
1397         break;
1398       case StrIntrinsicNode::UL:
1399         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
1400         break;
1401       default:
1402         ShouldNotReachHere();
1403      }
1404     assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
1405     address call = trampoline_call(stub);
1406     if (call == nullptr) {
1407       DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
1408       ciEnv::current()->record_failure("CodeCache is full");
1409       return;
1410     }
1411     b(DONE);
1412 
1413   bind(SHORT_STRING);
1414   // Is the minimum length zero?
1415   cbz(cnt2, DONE);
1416   // arrange code to do most branches while loading and loading next characters
1417   // while comparing previous
1418   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1419   subs(cnt2, cnt2, 1);
1420   br(EQ, SHORT_LAST_INIT);
1421   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1422   b(SHORT_LOOP_START);
1423   bind(SHORT_LOOP);
1424   subs(cnt2, cnt2, 1);
1425   br(EQ, SHORT_LAST);
1426   bind(SHORT_LOOP_START);
1427   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
1428   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
1429   cmp(tmp1, cnt1);
1430   br(NE, SHORT_LOOP_TAIL);
1431   subs(cnt2, cnt2, 1);
1432   br(EQ, SHORT_LAST2);
1433   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1434   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1435   cmp(tmp2, rscratch1);
1436   br(EQ, SHORT_LOOP);
1437   sub(result, tmp2, rscratch1);
1438   b(DONE);
1439   bind(SHORT_LOOP_TAIL);
1440   sub(result, tmp1, cnt1);
1441   b(DONE);
1442   bind(SHORT_LAST2);
1443   cmp(tmp2, rscratch1);
1444   br(EQ, DONE);
1445   sub(result, tmp2, rscratch1);
1446 
1447   b(DONE);
1448   bind(SHORT_LAST_INIT);
1449   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1450   bind(SHORT_LAST);
1451   cmp(tmp1, cnt1);
1452   br(EQ, DONE);
1453   sub(result, tmp1, cnt1);
1454 
1455   bind(DONE);
1456 
1457   BLOCK_COMMENT("} string_compare");
1458 }
1459 
1460 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
1461                                      FloatRegister src2, Condition cond, bool isQ) {
1462   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1463   FloatRegister zn = src1, zm = src2;
1464   bool needs_negation = false;
1465   switch (cond) {
1466     case LT: cond = GT; zn = src2; zm = src1; break;
1467     case LE: cond = GE; zn = src2; zm = src1; break;
1468     case LO: cond = HI; zn = src2; zm = src1; break;
1469     case LS: cond = HS; zn = src2; zm = src1; break;
1470     case NE: cond = EQ; needs_negation = true; break;
1471     default:
1472       break;
1473   }
1474 
1475   if (is_floating_point_type(bt)) {
1476     fcm(cond, dst, size, zn, zm);
1477   } else {
1478     cm(cond, dst, size, zn, zm);
1479   }
1480 
1481   if (needs_negation) {
1482     notr(dst, isQ ? T16B : T8B, dst);
1483   }
1484 }
1485 
1486 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1487                                           Condition cond, bool isQ) {
1488   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1489   if (bt == T_FLOAT || bt == T_DOUBLE) {
1490     if (cond == Assembler::NE) {
1491       fcm(Assembler::EQ, dst, size, src);
1492       notr(dst, isQ ? T16B : T8B, dst);
1493     } else {
1494       fcm(cond, dst, size, src);
1495     }
1496   } else {
1497     if (cond == Assembler::NE) {
1498       cm(Assembler::EQ, dst, size, src);
1499       notr(dst, isQ ? T16B : T8B, dst);
1500     } else {
1501       cm(cond, dst, size, src);
1502     }
1503   }
1504 }
1505 
1506 // Compress the least significant bit of each byte to the rightmost and clear
1507 // the higher garbage bits.
1508 void C2_MacroAssembler::bytemask_compress(Register dst) {
1509   // Example input, dst = 0x01 00 00 00 01 01 00 01
1510   // The "??" bytes are garbage.
1511   orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
1512   orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
1513   orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
1514   andr(dst, dst, 0xff);                   // dst = 0x8D
1515 }
1516 
1517 // Pack the lowest-numbered bit of each mask element in src into a long value
1518 // in dst, at most the first 64 lane elements.
1519 // Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM.
1520 void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
1521                                          FloatRegister vtmp1, FloatRegister vtmp2) {
1522   assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1523   assert_different_registers(dst, rscratch1);
1524   assert_different_registers(vtmp1, vtmp2);
1525 
1526   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1527   // Example:   src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
1528   // Expected:  dst = 0x658D
1529 
1530   // Convert the mask into vector with sequential bytes.
1531   // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
1532   sve_cpy(vtmp1, size, src, 1, false);
1533   if (bt != T_BYTE) {
1534     sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
1535   }
1536 
1537   if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
1538     // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
1539     // is to compress each significant bit of the byte in a cross-lane way. Due
1540     // to the lack of a cross-lane bit-compress instruction, we use BEXT
1541     // (bit-compress in each lane) with the biggest lane size (T = D) then
1542     // concatenate the results.
1543 
1544     // The second source input of BEXT, initialized with 0x01 in each byte.
1545     // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1546     sve_dup(vtmp2, B, 1);
1547 
1548     // BEXT vtmp1.D, vtmp1.D, vtmp2.D
1549     // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1550     // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1551     //         ---------------------------------------
1552     // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1553     sve_bext(vtmp1, D, vtmp1, vtmp2);
1554 
1555     // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the
1556     // result to dst.
1557     // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1558     // dst   = 0x658D
1559     if (lane_cnt <= 8) {
1560       // No need to concatenate.
1561       umov(dst, vtmp1, B, 0);
1562     } else if (lane_cnt <= 16) {
1563       ins(vtmp1, B, vtmp1, 1, 8);
1564       umov(dst, vtmp1, H, 0);
1565     } else {
1566       // As the lane count is 64 at most, the final expected value must be in
1567       // the lowest 64 bits after narrowing vtmp1 from D to B.
1568       sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
1569       umov(dst, vtmp1, D, 0);
1570     }
1571   } else if (UseSVE > 0) {
1572     // Compress the lowest 8 bytes.
1573     fmovd(dst, vtmp1);
1574     bytemask_compress(dst);
1575     if (lane_cnt <= 8) return;
1576 
1577     // Repeat on higher bytes and join the results.
1578     // Compress 8 bytes in each iteration.
1579     for (int idx = 1; idx < (lane_cnt / 8); idx++) {
1580       sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
1581       bytemask_compress(rscratch1);
1582       orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
1583     }
1584   } else {
1585     assert(false, "unsupported");
1586     ShouldNotReachHere();
1587   }
1588 }
1589 
1590 // Unpack the mask, a long value in src, into predicate register dst based on the
1591 // corresponding data type. Note that dst can support at most 64 lanes.
1592 // Below example gives the expected dst predicate register in different types, with
1593 // a valid src(0x658D) on a 1024-bit vector size machine.
1594 // BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
1595 // SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
1596 // INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
1597 // LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1598 //
1599 // The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D which
1600 // has 24 significant bits would be an invalid input if dst predicate register refers to
1601 // a LONG type 1024-bit vector, which has at most 16 lanes.
1602 void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
1603                                            FloatRegister vtmp1, FloatRegister vtmp2) {
1604   assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
1605          lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
1606   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1607   // Example:   src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
1608   // Expected:  dst = 0b01101001 10001101
1609 
1610   // Put long value from general purpose register into the first lane of vector.
1611   // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1612   sve_dup(vtmp1, B, 0);
1613   mov(vtmp1, D, 0, src);
1614 
1615   // As sve_cmp generates mask value with the minimum unit in byte, we should
1616   // transform the value in the first lane which is mask in bit now to the
1617   // mask in byte, which can be done by SVE2's BDEP instruction.
1618 
1619   // The first source input of BDEP instruction. Deposite each byte in every 8 bytes.
1620   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1621   if (lane_cnt <= 8) {
1622     // Nothing. As only one byte exsits.
1623   } else if (lane_cnt <= 16) {
1624     ins(vtmp1, B, vtmp1, 8, 1);
1625     mov(vtmp1, B, 1, zr);
1626   } else {
1627     sve_vector_extend(vtmp1, D, vtmp1, B);
1628   }
1629 
1630   // The second source input of BDEP instruction, initialized with 0x01 for each byte.
1631   // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1632   sve_dup(vtmp2, B, 1);
1633 
1634   // BDEP vtmp1.D, vtmp1.D, vtmp2.D
1635   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1636   // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1637   //         ---------------------------------------
1638   // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1639   sve_bdep(vtmp1, D, vtmp1, vtmp2);
1640 
1641   if (bt != T_BYTE) {
1642     sve_vector_extend(vtmp1, size, vtmp1, B);
1643   }
1644   // Generate mask according to the given vector, in which the elements have been
1645   // extended to expected type.
1646   // dst = 0b01101001 10001101
1647   sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
1648 }
1649 
1650 // Clobbers: rflags
1651 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1652                                     FloatRegister zn, FloatRegister zm, Condition cond) {
1653   assert(pg->is_governing(), "This register has to be a governing predicate register");
1654   FloatRegister z1 = zn, z2 = zm;
1655   switch (cond) {
1656     case LE: z1 = zm; z2 = zn; cond = GE; break;
1657     case LT: z1 = zm; z2 = zn; cond = GT; break;
1658     case LO: z1 = zm; z2 = zn; cond = HI; break;
1659     case LS: z1 = zm; z2 = zn; cond = HS; break;
1660     default:
1661       break;
1662   }
1663 
1664   SIMD_RegVariant size = elemType_to_regVariant(bt);
1665   if (is_floating_point_type(bt)) {
1666     sve_fcm(cond, pd, size, pg, z1, z2);
1667   } else {
1668     assert(is_integral_type(bt), "unsupported element type");
1669     sve_cmp(cond, pd, size, pg, z1, z2);
1670   }
1671 }
1672 
1673 // Get index of the last mask lane that is set
1674 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
1675   SIMD_RegVariant size = elemType_to_regVariant(bt);
1676   sve_rev(ptmp, size, src);
1677   sve_brkb(ptmp, ptrue, ptmp, false);
1678   sve_cntp(dst, size, ptrue, ptmp);
1679   movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
1680   subw(dst, rscratch1, dst);
1681 }
1682 
1683 // Extend integer vector src to dst with the same lane count
1684 // but larger element size, e.g. 4B -> 4I
1685 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1686                                            FloatRegister src, BasicType src_bt, bool is_unsigned) {
1687   if (src_bt == T_BYTE) {
1688     if (dst_bt == T_SHORT) {
1689       // 4B/8B to 4S/8S
1690       _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1691     } else {
1692       // 4B to 4I
1693       assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1694       _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1695       _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1696     }
1697   } else if (src_bt == T_SHORT) {
1698     // 4S to 4I
1699     assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1700     _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1701   } else if (src_bt == T_INT) {
1702     // 2I to 2L
1703     assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1704     _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1705   } else {
1706     ShouldNotReachHere();
1707   }
1708 }
1709 
1710 // Narrow integer vector src down to dst with the same lane count
1711 // but smaller element size, e.g. 4I -> 4B
1712 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1713                                            FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1714   if (src_bt == T_SHORT) {
1715     // 4S/8S to 4B/8B
1716     assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1717     assert(dst_bt == T_BYTE, "unsupported");
1718     xtn(dst, T8B, src, T8H);
1719   } else if (src_bt == T_INT) {
1720     // 4I to 4B/4S
1721     assert(src_vlen_in_bytes == 16, "unsupported");
1722     assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1723     xtn(dst, T4H, src, T4S);
1724     if (dst_bt == T_BYTE) {
1725       xtn(dst, T8B, dst, T8H);
1726     }
1727   } else if (src_bt == T_LONG) {
1728     // 2L to 2I
1729     assert(src_vlen_in_bytes == 16, "unsupported");
1730     assert(dst_bt == T_INT, "unsupported");
1731     xtn(dst, T2S, src, T2D);
1732   } else {
1733     ShouldNotReachHere();
1734   }
1735 }
1736 
1737 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1738                                           FloatRegister src, SIMD_RegVariant src_size,
1739                                           bool is_unsigned) {
1740   assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
1741 
1742   if (src_size == B) {
1743     switch (dst_size) {
1744     case H:
1745       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1746       break;
1747     case S:
1748       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1749       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1750       break;
1751     case D:
1752       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1753       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1754       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1755       break;
1756     default:
1757       ShouldNotReachHere();
1758     }
1759   } else if (src_size == H) {
1760     if (dst_size == S) {
1761       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1762     } else { // D
1763       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1764       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1765     }
1766   } else if (src_size == S) {
1767     _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
1768   }
1769 }
1770 
1771 // Vector narrow from src to dst with specified element sizes.
1772 // High part of dst vector will be filled with zero.
1773 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1774                                           FloatRegister src, SIMD_RegVariant src_size,
1775                                           FloatRegister tmp) {
1776   assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1777   assert_different_registers(src, tmp);
1778   sve_dup(tmp, src_size, 0);
1779   if (src_size == D) {
1780     switch (dst_size) {
1781     case S:
1782       sve_uzp1(dst, S, src, tmp);
1783       break;
1784     case H:
1785       assert_different_registers(dst, tmp);
1786       sve_uzp1(dst, S, src, tmp);
1787       sve_uzp1(dst, H, dst, tmp);
1788       break;
1789     case B:
1790       assert_different_registers(dst, tmp);
1791       sve_uzp1(dst, S, src, tmp);
1792       sve_uzp1(dst, H, dst, tmp);
1793       sve_uzp1(dst, B, dst, tmp);
1794       break;
1795     default:
1796       ShouldNotReachHere();
1797     }
1798   } else if (src_size == S) {
1799     if (dst_size == H) {
1800       sve_uzp1(dst, H, src, tmp);
1801     } else { // B
1802       assert_different_registers(dst, tmp);
1803       sve_uzp1(dst, H, src, tmp);
1804       sve_uzp1(dst, B, dst, tmp);
1805     }
1806   } else if (src_size == H) {
1807     sve_uzp1(dst, B, src, tmp);
1808   }
1809 }
1810 
1811 // Extend src predicate to dst predicate with the same lane count but larger
1812 // element size, e.g. 64Byte -> 512Long
1813 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1814                                              uint dst_element_length_in_bytes,
1815                                              uint src_element_length_in_bytes) {
1816   if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1817     sve_punpklo(dst, src);
1818   } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1819     sve_punpklo(dst, src);
1820     sve_punpklo(dst, dst);
1821   } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1822     sve_punpklo(dst, src);
1823     sve_punpklo(dst, dst);
1824     sve_punpklo(dst, dst);
1825   } else {
1826     assert(false, "unsupported");
1827     ShouldNotReachHere();
1828   }
1829 }
1830 
1831 // Narrow src predicate to dst predicate with the same lane count but
1832 // smaller element size, e.g. 512Long -> 64Byte
1833 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1834                                              uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
1835   // The insignificant bits in src predicate are expected to be zero.
1836   // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
1837   // passed as the second argument. An example narrowing operation with a given mask would be -
1838   // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I
1839   // Mask (for 2 Longs) : TF
1840   // Predicate register for the above mask (16 bits) : 00000001 00000000
1841   // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
1842   // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0)
1843   assert_different_registers(src, ptmp);
1844   assert_different_registers(dst, ptmp);
1845   sve_pfalse(ptmp);
1846   if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1847     sve_uzp1(dst, B, src, ptmp);
1848   } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1849     sve_uzp1(dst, H, src, ptmp);
1850     sve_uzp1(dst, B, dst, ptmp);
1851   } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1852     sve_uzp1(dst, S, src, ptmp);
1853     sve_uzp1(dst, H, dst, ptmp);
1854     sve_uzp1(dst, B, dst, ptmp);
1855   } else {
1856     assert(false, "unsupported");
1857     ShouldNotReachHere();
1858   }
1859 }
1860 
1861 // Vector reduction add for integral type with ASIMD instructions.
1862 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1863                                                  Register isrc, FloatRegister vsrc,
1864                                                  unsigned vector_length_in_bytes,
1865                                                  FloatRegister vtmp) {
1866   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1867   assert_different_registers(dst, isrc);
1868   bool isQ = vector_length_in_bytes == 16;
1869 
1870   BLOCK_COMMENT("neon_reduce_add_integral {");
1871     switch(bt) {
1872       case T_BYTE:
1873         addv(vtmp, isQ ? T16B : T8B, vsrc);
1874         smov(dst, vtmp, B, 0);
1875         addw(dst, dst, isrc, ext::sxtb);
1876         break;
1877       case T_SHORT:
1878         addv(vtmp, isQ ? T8H : T4H, vsrc);
1879         smov(dst, vtmp, H, 0);
1880         addw(dst, dst, isrc, ext::sxth);
1881         break;
1882       case T_INT:
1883         isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1884         umov(dst, vtmp, S, 0);
1885         addw(dst, dst, isrc);
1886         break;
1887       case T_LONG:
1888         assert(isQ, "unsupported");
1889         addpd(vtmp, vsrc);
1890         umov(dst, vtmp, D, 0);
1891         add(dst, dst, isrc);
1892         break;
1893       default:
1894         assert(false, "unsupported");
1895         ShouldNotReachHere();
1896     }
1897   BLOCK_COMMENT("} neon_reduce_add_integral");
1898 }
1899 
1900 // Vector reduction multiply for integral type with ASIMD instructions.
1901 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
1902 // Clobbers: rscratch1
1903 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
1904                                                  Register isrc, FloatRegister vsrc,
1905                                                  unsigned vector_length_in_bytes,
1906                                                  FloatRegister vtmp1, FloatRegister vtmp2) {
1907   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1908   bool isQ = vector_length_in_bytes == 16;
1909 
1910   BLOCK_COMMENT("neon_reduce_mul_integral {");
1911     switch(bt) {
1912       case T_BYTE:
1913         if (isQ) {
1914           // Multiply the lower half and higher half of vector iteratively.
1915           // vtmp1 = vsrc[8:15]
1916           ins(vtmp1, D, vsrc, 0, 1);
1917           // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
1918           mulv(vtmp1, T8B, vtmp1, vsrc);
1919           // vtmp2 = vtmp1[4:7]
1920           ins(vtmp2, S, vtmp1, 0, 1);
1921           // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
1922           mulv(vtmp1, T8B, vtmp2, vtmp1);
1923         } else {
1924           ins(vtmp1, S, vsrc, 0, 1);
1925           mulv(vtmp1, T8B, vtmp1, vsrc);
1926         }
1927         // vtmp2 = vtmp1[2:3]
1928         ins(vtmp2, H, vtmp1, 0, 1);
1929         // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
1930         mulv(vtmp2, T8B, vtmp2, vtmp1);
1931         // dst = vtmp2[0] * isrc * vtmp2[1]
1932         umov(rscratch1, vtmp2, B, 0);
1933         mulw(dst, rscratch1, isrc);
1934         sxtb(dst, dst);
1935         umov(rscratch1, vtmp2, B, 1);
1936         mulw(dst, rscratch1, dst);
1937         sxtb(dst, dst);
1938         break;
1939       case T_SHORT:
1940         if (isQ) {
1941           ins(vtmp2, D, vsrc, 0, 1);
1942           mulv(vtmp2, T4H, vtmp2, vsrc);
1943           ins(vtmp1, S, vtmp2, 0, 1);
1944           mulv(vtmp1, T4H, vtmp1, vtmp2);
1945         } else {
1946           ins(vtmp1, S, vsrc, 0, 1);
1947           mulv(vtmp1, T4H, vtmp1, vsrc);
1948         }
1949         umov(rscratch1, vtmp1, H, 0);
1950         mulw(dst, rscratch1, isrc);
1951         sxth(dst, dst);
1952         umov(rscratch1, vtmp1, H, 1);
1953         mulw(dst, rscratch1, dst);
1954         sxth(dst, dst);
1955         break;
1956       case T_INT:
1957         if (isQ) {
1958           ins(vtmp1, D, vsrc, 0, 1);
1959           mulv(vtmp1, T2S, vtmp1, vsrc);
1960         } else {
1961           vtmp1 = vsrc;
1962         }
1963         umov(rscratch1, vtmp1, S, 0);
1964         mul(dst, rscratch1, isrc);
1965         umov(rscratch1, vtmp1, S, 1);
1966         mul(dst, rscratch1, dst);
1967         break;
1968       case T_LONG:
1969         umov(rscratch1, vsrc, D, 0);
1970         mul(dst, isrc, rscratch1);
1971         umov(rscratch1, vsrc, D, 1);
1972         mul(dst, dst, rscratch1);
1973         break;
1974       default:
1975         assert(false, "unsupported");
1976         ShouldNotReachHere();
1977     }
1978   BLOCK_COMMENT("} neon_reduce_mul_integral");
1979 }
1980 
1981 // Vector reduction multiply for floating-point type with ASIMD instructions.
1982 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
1983                                            FloatRegister fsrc, FloatRegister vsrc,
1984                                            unsigned vector_length_in_bytes,
1985                                            FloatRegister vtmp) {
1986   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1987   bool isQ = vector_length_in_bytes == 16;
1988 
1989   BLOCK_COMMENT("neon_reduce_mul_fp {");
1990     switch(bt) {
1991       case T_FLOAT:
1992         fmuls(dst, fsrc, vsrc);
1993         ins(vtmp, S, vsrc, 0, 1);
1994         fmuls(dst, dst, vtmp);
1995         if (isQ) {
1996           ins(vtmp, S, vsrc, 0, 2);
1997           fmuls(dst, dst, vtmp);
1998           ins(vtmp, S, vsrc, 0, 3);
1999           fmuls(dst, dst, vtmp);
2000          }
2001         break;
2002       case T_DOUBLE:
2003         assert(isQ, "unsupported");
2004         fmuld(dst, fsrc, vsrc);
2005         ins(vtmp, D, vsrc, 0, 1);
2006         fmuld(dst, dst, vtmp);
2007         break;
2008       default:
2009         assert(false, "unsupported");
2010         ShouldNotReachHere();
2011     }
2012   BLOCK_COMMENT("} neon_reduce_mul_fp");
2013 }
2014 
2015 // Helper to select logical instruction
2016 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
2017                                                    Register Rn, Register Rm,
2018                                                    enum shift_kind kind, unsigned shift) {
2019   switch(opc) {
2020     case Op_AndReductionV:
2021       is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
2022       break;
2023     case Op_OrReductionV:
2024       is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
2025       break;
2026     case Op_XorReductionV:
2027       is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
2028       break;
2029     default:
2030       assert(false, "unsupported");
2031       ShouldNotReachHere();
2032   }
2033 }
2034 
2035 // Vector reduction logical operations And, Or, Xor
2036 // Clobbers: rscratch1
2037 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
2038                                             Register isrc, FloatRegister vsrc,
2039                                             unsigned vector_length_in_bytes) {
2040   assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
2041          "unsupported");
2042   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2043   assert_different_registers(dst, isrc);
2044   bool isQ = vector_length_in_bytes == 16;
2045 
2046   BLOCK_COMMENT("neon_reduce_logical {");
2047     umov(rscratch1, vsrc, isQ ? D : S, 0);
2048     umov(dst, vsrc, isQ ? D : S, 1);
2049     neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
2050     switch(bt) {
2051       case T_BYTE:
2052         if (isQ) {
2053           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2054         }
2055         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
2056         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
2057         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2058         sxtb(dst, dst);
2059         break;
2060       case T_SHORT:
2061         if (isQ) {
2062           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2063         }
2064         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
2065         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2066         sxth(dst, dst);
2067         break;
2068       case T_INT:
2069         if (isQ) {
2070           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2071         }
2072         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2073         break;
2074       case T_LONG:
2075         assert(isQ, "unsupported");
2076         neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
2077         break;
2078       default:
2079         assert(false, "unsupported");
2080         ShouldNotReachHere();
2081     }
2082   BLOCK_COMMENT("} neon_reduce_logical");
2083 }
2084 
2085 // Vector reduction min/max for integral type with ASIMD instructions.
2086 // Note: vtmp is not used and expected to be fnoreg for T_LONG case.
2087 // Clobbers: rscratch1, rflags
2088 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
2089                                                     Register isrc, FloatRegister vsrc,
2090                                                     unsigned vector_length_in_bytes,
2091                                                     FloatRegister vtmp) {
2092   assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
2093   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2094   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
2095   assert_different_registers(dst, isrc);
2096   bool isQ = vector_length_in_bytes == 16;
2097   bool is_min = opc == Op_MinReductionV;
2098 
2099   BLOCK_COMMENT("neon_reduce_minmax_integral {");
2100     if (bt == T_LONG) {
2101       assert(vtmp == fnoreg, "should be");
2102       assert(isQ, "should be");
2103       umov(rscratch1, vsrc, D, 0);
2104       cmp(isrc, rscratch1);
2105       csel(dst, isrc, rscratch1, is_min ? LT : GT);
2106       umov(rscratch1, vsrc, D, 1);
2107       cmp(dst, rscratch1);
2108       csel(dst, dst, rscratch1, is_min ? LT : GT);
2109     } else {
2110       SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
2111       if (size == T2S) {
2112         is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
2113       } else {
2114         is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
2115       }
2116       if (bt == T_INT) {
2117         umov(dst, vtmp, S, 0);
2118       } else {
2119         smov(dst, vtmp, elemType_to_regVariant(bt), 0);
2120       }
2121       cmpw(dst, isrc);
2122       cselw(dst, dst, isrc, is_min ? LT : GT);
2123     }
2124   BLOCK_COMMENT("} neon_reduce_minmax_integral");
2125 }
2126 
2127 // Vector reduction for integral type with SVE instruction.
2128 // Supported operations are Add, And, Or, Xor, Max, Min.
2129 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
2130 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
2131                                             FloatRegister src2, PRegister pg, FloatRegister tmp) {
2132   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2133   assert(pg->is_governing(), "This register has to be a governing predicate register");
2134   assert_different_registers(src1, dst);
2135   // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
2136   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2137   switch (opc) {
2138     case Op_AddReductionVI: {
2139       sve_uaddv(tmp, size, pg, src2);
2140       if (bt == T_BYTE) {
2141         smov(dst, tmp, size, 0);
2142         addw(dst, src1, dst, ext::sxtb);
2143       } else if (bt == T_SHORT) {
2144         smov(dst, tmp, size, 0);
2145         addw(dst, src1, dst, ext::sxth);
2146       } else {
2147         umov(dst, tmp, size, 0);
2148         addw(dst, dst, src1);
2149       }
2150       break;
2151     }
2152     case Op_AddReductionVL: {
2153       sve_uaddv(tmp, size, pg, src2);
2154       umov(dst, tmp, size, 0);
2155       add(dst, dst, src1);
2156       break;
2157     }
2158     case Op_AndReductionV: {
2159       sve_andv(tmp, size, pg, src2);
2160       if (bt == T_INT || bt == T_LONG) {
2161         umov(dst, tmp, size, 0);
2162       } else {
2163         smov(dst, tmp, size, 0);
2164       }
2165       if (bt == T_LONG) {
2166         andr(dst, dst, src1);
2167       } else {
2168         andw(dst, dst, src1);
2169       }
2170       break;
2171     }
2172     case Op_OrReductionV: {
2173       sve_orv(tmp, size, pg, src2);
2174       if (bt == T_INT || bt == T_LONG) {
2175         umov(dst, tmp, size, 0);
2176       } else {
2177         smov(dst, tmp, size, 0);
2178       }
2179       if (bt == T_LONG) {
2180         orr(dst, dst, src1);
2181       } else {
2182         orrw(dst, dst, src1);
2183       }
2184       break;
2185     }
2186     case Op_XorReductionV: {
2187       sve_eorv(tmp, size, pg, src2);
2188       if (bt == T_INT || bt == T_LONG) {
2189         umov(dst, tmp, size, 0);
2190       } else {
2191         smov(dst, tmp, size, 0);
2192       }
2193       if (bt == T_LONG) {
2194         eor(dst, dst, src1);
2195       } else {
2196         eorw(dst, dst, src1);
2197       }
2198       break;
2199     }
2200     case Op_MaxReductionV: {
2201       sve_smaxv(tmp, size, pg, src2);
2202       if (bt == T_INT || bt == T_LONG) {
2203         umov(dst, tmp, size, 0);
2204       } else {
2205         smov(dst, tmp, size, 0);
2206       }
2207       if (bt == T_LONG) {
2208         cmp(dst, src1);
2209         csel(dst, dst, src1, Assembler::GT);
2210       } else {
2211         cmpw(dst, src1);
2212         cselw(dst, dst, src1, Assembler::GT);
2213       }
2214       break;
2215     }
2216     case Op_MinReductionV: {
2217       sve_sminv(tmp, size, pg, src2);
2218       if (bt == T_INT || bt == T_LONG) {
2219         umov(dst, tmp, size, 0);
2220       } else {
2221         smov(dst, tmp, size, 0);
2222       }
2223       if (bt == T_LONG) {
2224         cmp(dst, src1);
2225         csel(dst, dst, src1, Assembler::LT);
2226       } else {
2227         cmpw(dst, src1);
2228         cselw(dst, dst, src1, Assembler::LT);
2229       }
2230       break;
2231     }
2232     default:
2233       assert(false, "unsupported");
2234       ShouldNotReachHere();
2235   }
2236 
2237   if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
2238     if (bt == T_BYTE) {
2239       sxtb(dst, dst);
2240     } else if (bt == T_SHORT) {
2241       sxth(dst, dst);
2242     }
2243   }
2244 }
2245 
2246 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
2247 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
2248 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
2249 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
2250   uint32_t max_vector_length = Matcher::max_vector_size(bt);
2251   assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
2252 
2253   // Set all elements to false if the input "lane_cnt" is zero.
2254   if (lane_cnt == 0) {
2255     sve_pfalse(dst);
2256     return;
2257   }
2258 
2259   SIMD_RegVariant size = elemType_to_regVariant(bt);
2260   assert(size != Q, "invalid size");
2261 
2262   // Set all true if "lane_cnt" equals to the max lane count.
2263   if (lane_cnt == max_vector_length) {
2264     sve_ptrue(dst, size, /* ALL */ 0b11111);
2265     return;
2266   }
2267 
2268   // Fixed numbers for "ptrue".
2269   switch(lane_cnt) {
2270   case 1: /* VL1 */
2271   case 2: /* VL2 */
2272   case 3: /* VL3 */
2273   case 4: /* VL4 */
2274   case 5: /* VL5 */
2275   case 6: /* VL6 */
2276   case 7: /* VL7 */
2277   case 8: /* VL8 */
2278     sve_ptrue(dst, size, lane_cnt);
2279     return;
2280   case 16:
2281     sve_ptrue(dst, size, /* VL16 */ 0b01001);
2282     return;
2283   case 32:
2284     sve_ptrue(dst, size, /* VL32 */ 0b01010);
2285     return;
2286   case 64:
2287     sve_ptrue(dst, size, /* VL64 */ 0b01011);
2288     return;
2289   case 128:
2290     sve_ptrue(dst, size, /* VL128 */ 0b01100);
2291     return;
2292   case 256:
2293     sve_ptrue(dst, size, /* VL256 */ 0b01101);
2294     return;
2295   default:
2296     break;
2297   }
2298 
2299   // Special patterns for "ptrue".
2300   if (lane_cnt == round_down_power_of_2(max_vector_length)) {
2301     sve_ptrue(dst, size, /* POW2 */ 0b00000);
2302   } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
2303     sve_ptrue(dst, size, /* MUL4 */ 0b11101);
2304   } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
2305     sve_ptrue(dst, size, /* MUL3 */ 0b11110);
2306   } else {
2307     // Encode to "whileltw" for the remaining cases.
2308     mov(rscratch1, lane_cnt);
2309     sve_whileltw(dst, size, zr, rscratch1);
2310   }
2311 }
2312 
2313 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
2314 // Any remaining elements of dst will be filled with zero.
2315 // Clobbers: rscratch1
2316 // Preserves: src, mask
2317 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
2318                                            FloatRegister vtmp1, FloatRegister vtmp2,
2319                                            PRegister pgtmp) {
2320   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2321   assert_different_registers(dst, src, vtmp1, vtmp2);
2322   assert_different_registers(mask, pgtmp);
2323 
2324   // Example input:   src   = 8888 7777 6666 5555 4444 3333 2222 1111
2325   //                  mask  = 0001 0000 0000 0001 0001 0000 0001 0001
2326   // Expected result: dst   = 0000 0000 0000 8888 5555 4444 2222 1111
2327   sve_dup(vtmp2, H, 0);
2328 
2329   // Extend lowest half to type INT.
2330   // dst = 00004444 00003333 00002222 00001111
2331   sve_uunpklo(dst, S, src);
2332   // pgtmp = 00000001 00000000 00000001 00000001
2333   sve_punpklo(pgtmp, mask);
2334   // Pack the active elements in size of type INT to the right,
2335   // and fill the remainings with zero.
2336   // dst = 00000000 00004444 00002222 00001111
2337   sve_compact(dst, S, dst, pgtmp);
2338   // Narrow the result back to type SHORT.
2339   // dst = 0000 0000 0000 0000 0000 4444 2222 1111
2340   sve_uzp1(dst, H, dst, vtmp2);
2341   // Count the active elements of lowest half.
2342   // rscratch1 = 3
2343   sve_cntp(rscratch1, S, ptrue, pgtmp);
2344 
2345   // Repeat to the highest half.
2346   // pgtmp = 00000001 00000000 00000000 00000001
2347   sve_punpkhi(pgtmp, mask);
2348   // vtmp1 = 00008888 00007777 00006666 00005555
2349   sve_uunpkhi(vtmp1, S, src);
2350   // vtmp1 = 00000000 00000000 00008888 00005555
2351   sve_compact(vtmp1, S, vtmp1, pgtmp);
2352   // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2353   sve_uzp1(vtmp1, H, vtmp1, vtmp2);
2354 
2355   // Compressed low:   dst   = 0000 0000 0000 0000 0000 4444 2222 1111
2356   // Compressed high:  vtmp1 = 0000 0000 0000 0000 0000 0000 8888  5555
2357   // Left shift(cross lane) compressed high with TRUE_CNT lanes,
2358   // TRUE_CNT is the number of active elements in the compressed low.
2359   neg(rscratch1, rscratch1);
2360   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2361   sve_index(vtmp2, H, rscratch1, 1);
2362   // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
2363   sve_tbl(vtmp1, H, vtmp1, vtmp2);
2364 
2365   // Combine the compressed high(after shifted) with the compressed low.
2366   // dst = 0000 0000 0000 8888 5555 4444 2222 1111
2367   sve_orr(dst, dst, vtmp1);
2368 }
2369 
2370 // Clobbers: rscratch1, rscratch2
2371 // Preserves: src, mask
2372 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
2373                                           FloatRegister vtmp1, FloatRegister vtmp2,
2374                                           FloatRegister vtmp3, FloatRegister vtmp4,
2375                                           PRegister ptmp, PRegister pgtmp) {
2376   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2377   assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
2378   assert_different_registers(mask, ptmp, pgtmp);
2379   // Example input:   src   = 88 77 66 55 44 33 22 11
2380   //                  mask  = 01 00 00 01 01 00 01 01
2381   // Expected result: dst   = 00 00 00 88 55 44 22 11
2382 
2383   sve_dup(vtmp4, B, 0);
2384   // Extend lowest half to type SHORT.
2385   // vtmp1 = 0044 0033 0022 0011
2386   sve_uunpklo(vtmp1, H, src);
2387   // ptmp = 0001 0000 0001 0001
2388   sve_punpklo(ptmp, mask);
2389   // Count the active elements of lowest half.
2390   // rscratch2 = 3
2391   sve_cntp(rscratch2, H, ptrue, ptmp);
2392   // Pack the active elements in size of type SHORT to the right,
2393   // and fill the remainings with zero.
2394   // dst = 0000 0044 0022 0011
2395   sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
2396   // Narrow the result back to type BYTE.
2397   // dst = 00 00 00 00 00 44 22 11
2398   sve_uzp1(dst, B, dst, vtmp4);
2399 
2400   // Repeat to the highest half.
2401   // ptmp = 0001 0000 0000 0001
2402   sve_punpkhi(ptmp, mask);
2403   // vtmp1 = 0088 0077 0066 0055
2404   sve_uunpkhi(vtmp2, H, src);
2405   // vtmp1 = 0000 0000 0088 0055
2406   sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);
2407 
2408   sve_dup(vtmp4, B, 0);
2409   // vtmp1 = 00 00 00 00 00 00 88 55
2410   sve_uzp1(vtmp1, B, vtmp1, vtmp4);
2411 
2412   // Compressed low:   dst   = 00 00 00 00 00 44 22 11
2413   // Compressed high:  vtmp1 = 00 00 00 00 00 00 88 55
2414   // Left shift(cross lane) compressed high with TRUE_CNT lanes,
2415   // TRUE_CNT is the number of active elements in the compressed low.
2416   neg(rscratch2, rscratch2);
2417   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2418   sve_index(vtmp2, B, rscratch2, 1);
2419   // vtmp1 = 00 00 00 88 55 00 00 00
2420   sve_tbl(vtmp1, B, vtmp1, vtmp2);
2421   // Combine the compressed high(after shifted) with the compressed low.
2422   // dst = 00 00 00 88 55 44 22 11
2423   sve_orr(dst, dst, vtmp1);
2424 }
2425 
2426 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2427   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2428   SIMD_Arrangement size = isQ ? T16B : T8B;
2429   if (bt == T_BYTE) {
2430     rbit(dst, size, src);
2431   } else {
2432     neon_reverse_bytes(dst, src, bt, isQ);
2433     rbit(dst, size, dst);
2434   }
2435 }
2436 
2437 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2438   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2439   SIMD_Arrangement size = isQ ? T16B : T8B;
2440   switch (bt) {
2441     case T_BYTE:
2442       if (dst != src) {
2443         orr(dst, size, src, src);
2444       }
2445       break;
2446     case T_SHORT:
2447       rev16(dst, size, src);
2448       break;
2449     case T_INT:
2450       rev32(dst, size, src);
2451       break;
2452     case T_LONG:
2453       rev64(dst, size, src);
2454       break;
2455     default:
2456       assert(false, "unsupported");
2457       ShouldNotReachHere();
2458   }
2459 }
2460 
2461 // Extract a scalar element from an sve vector at position 'idx'.
2462 // The input elements in src are expected to be of integral type.
2463 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2464                                              int idx, FloatRegister vtmp) {
2465   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2466   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2467   if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
2468     if (bt == T_INT || bt == T_LONG) {
2469       umov(dst, src, size, idx);
2470     } else {
2471       smov(dst, src, size, idx);
2472     }
2473   } else {
2474     sve_orr(vtmp, src, src);
2475     sve_ext(vtmp, vtmp, idx << size);
2476     if (bt == T_INT || bt == T_LONG) {
2477       umov(dst, vtmp, size, 0);
2478     } else {
2479       smov(dst, vtmp, size, 0);
2480     }
2481   }
2482 }
2483 
2484 // java.lang.Math::round intrinsics
2485 
2486 // Clobbers: rscratch1, rflags
2487 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2488                                           FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
2489   assert_different_registers(tmp1, tmp2, tmp3, src, dst);
2490   switch (T) {
2491     case T2S:
2492     case T4S:
2493       fmovs(tmp1, T, 0.5f);
2494       mov(rscratch1, jint_cast(0x1.0p23f));
2495       break;
2496     case T2D:
2497       fmovd(tmp1, T, 0.5);
2498       mov(rscratch1, julong_cast(0x1.0p52));
2499       break;
2500     default:
2501       assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2502   }
2503   fadd(tmp1, T, tmp1, src);
2504   fcvtms(tmp1, T, tmp1);
2505   // tmp1 = floor(src + 0.5, ties to even)
2506 
2507   fcvtas(dst, T, src);
2508   // dst = round(src), ties to away
2509 
2510   fneg(tmp3, T, src);
2511   dup(tmp2, T, rscratch1);
2512   cm(HS, tmp3, T, tmp3, tmp2);
2513   // tmp3 is now a set of flags
2514 
2515   bif(dst, T16B, tmp1, tmp3);
2516   // result in dst
2517 }
2518 
2519 // Clobbers: rscratch1, rflags
2520 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2521                                          FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
2522   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2523   assert_different_registers(tmp1, tmp2, src, dst);
2524 
2525   switch (T) {
2526     case S:
2527       mov(rscratch1, jint_cast(0x1.0p23f));
2528       break;
2529     case D:
2530       mov(rscratch1, julong_cast(0x1.0p52));
2531       break;
2532     default:
2533       assert(T == S || T == D, "invalid register variant");
2534   }
2535 
2536   sve_frinta(dst, T, ptrue, src);
2537   // dst = round(src), ties to away
2538 
2539   Label none;
2540 
2541   sve_fneg(tmp1, T, ptrue, src);
2542   sve_dup(tmp2, T, rscratch1);
2543   sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
2544   br(EQ, none);
2545   {
2546     sve_cpy(tmp1, T, pgtmp, 0.5);
2547     sve_fadd(tmp1, T, pgtmp, src);
2548     sve_frintm(dst, T, pgtmp, tmp1);
2549     // dst = floor(src + 0.5, ties to even)
2550   }
2551   bind(none);
2552 
2553   sve_fcvtzs(dst, T, ptrue, dst, T);
2554   // result in dst
2555 }
2556 
2557 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2558                                            FloatRegister one, SIMD_Arrangement T) {
2559   assert_different_registers(dst, src, zero, one);
2560   assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2561 
2562   facgt(dst, T, src, zero);
2563   ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
2564   bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2565 }
2566 
2567 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2568                                           FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
2569     assert_different_registers(dst, src, zero, one, vtmp);
2570     assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2571 
2572     sve_orr(vtmp, src, src);
2573     sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pmtp=0 for +-0.0 and NaN. 0x1 otherwise
2574     switch (T) {
2575     case S:
2576       sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
2577       sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
2578                                         // on the sign of the float value
2579       break;
2580     case D:
2581       sve_and(vtmp, T, min_jlong);
2582       sve_orr(vtmp, T, jlong_cast(1.0));
2583       break;
2584     default:
2585       assert(false, "unsupported");
2586       ShouldNotReachHere();
2587     }
2588     sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
2589                                        // Result in dst
2590 }
2591 
2592 bool C2_MacroAssembler::in_scratch_emit_size() {
2593   if (ciEnv::current()->task() != nullptr) {
2594     PhaseOutput* phase_output = Compile::current()->output();
2595     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2596       return true;
2597     }
2598   }
2599   return MacroAssembler::in_scratch_emit_size();
2600 }