New src/hotspot/cpu/aarch64/c2_MacroAssembler

   1 /*
   2  * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "opto/c2_MacroAssembler.hpp"
  29 #include "opto/compile.hpp"
  30 #include "opto/intrinsicnode.hpp"
  31 #include "opto/matcher.hpp"
  32 #include "opto/output.hpp"
  33 #include "opto/subnode.hpp"
  34 #include "runtime/stubRoutines.hpp"
  35 #include "utilities/globalDefinitions.hpp"
  36 
  37 #ifdef PRODUCT
     // PRODUCT builds: BLOCK_COMMENT expands to nothing and STOP only calls stop().
  38 #define BLOCK_COMMENT(str) /* nothing */
  39 #define STOP(error) stop(error)
  40 #else
     // Non-PRODUCT builds: BLOCK_COMMENT forwards to block_comment(), and STOP
     // emits the message as a block comment before calling stop().
     // Note: this STOP expands to two statements, so it must not be used as the
     // sole body of an unbraced if/else.
  41 #define BLOCK_COMMENT(str) block_comment(str)
  42 #define STOP(error) block_comment(error); stop(error)
  43 #endif
  44
     // Binds the label and, via BLOCK_COMMENT, annotates it with the label's
     // name followed by ':'. Also expands to two statements (see STOP above).
  45 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  46
     // Pointer to a MacroAssembler member function taking (Register, const Address&).
     // string_indexof below uses it to select ldrb/ldrh/ldrw/ldr by string encoding.
  47 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
  48 
  49 void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg,
  50                                   Register tmp2Reg, Register tmp3Reg) {
       // Emits the C2 fast-path monitor-enter sequence for the LM_LEGACY and
       // LM_MONITOR locking modes (LM_LIGHTWEIGHT is rejected by the assert below).
       //
       //   objectReg - the object being locked
       //   boxReg    - address of the BasicLock box; its displaced-header slot is written
       //   tmpReg    - scratch, holds the object's markWord (disp_hdr)
       //   tmp2Reg   - scratch (tmp)
       //   tmp3Reg   - scratch, receives the result register of the monitor owner CAS
       //
       // Result is reported in the condition flags at label cont:
       //   EQ - success; the thread's held-monitor count is incremented
       //   NE - failure
       // The LM_LEGACY stack-lock path additionally writes rscratch1.
       // NOTE(review): tmp3Reg is not covered by assert_different_registers below.
  51   Register oop = objectReg;
  52   Register box = boxReg;
  53   Register disp_hdr = tmpReg;
  54   Register tmp = tmp2Reg;
  55   Label cont;
  56   Label object_has_monitor;
  57   Label count, no_count;
  58
  59   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  60   assert_different_registers(oop, box, tmp, disp_hdr);
  61
  62   // Load markWord from object into displaced_header.
  63   ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));
  64
  65   if (DiagnoseSyncOnValueBasedClasses != 0) {
         // tstw leaves NE when the value-based-class flag is set, so the branch
         // to cont reports failure for such classes.
  66     load_klass(tmp, oop);
  67     ldrw(tmp, Address(tmp, Klass::access_flags_offset()));
  68     tstw(tmp, JVM_ACC_IS_VALUE_BASED_CLASS);
  69     br(Assembler::NE, cont);
  70   }
  71
  72   // Check for existing monitor
  73   tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor);
  74
  75   if (LockingMode == LM_MONITOR) {
  76     tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
  77     b(cont);
  78   } else {
  79     assert(LockingMode == LM_LEGACY, "must be");
  80     // Set tmp to be (markWord of object | UNLOCK_VALUE).
  81     orr(tmp, disp_hdr, markWord::unlocked_value);
  82
  83     // Initialize the box. (Must happen before we update the object mark!)
  84     str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
  85
  86     // Compare object markWord with an unlocked value (tmp) and if
  87     // equal exchange the stack address of our box with object markWord.
  88     // On failure disp_hdr contains the possibly locked markWord.
  89     cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true,
  90             /*release*/ true, /*weak*/ false, disp_hdr);
  91     br(Assembler::EQ, cont);
  92
  93     assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
  94
  95     // If the compare-and-exchange succeeded, then we found an unlocked
  96     // object, will have now locked it will continue at label cont
  97
  98     // Check if the owner is self by comparing the value in the
  99     // markWord of object (disp_hdr) with the stack pointer.
 100     mov(rscratch1, sp);
 101     sub(disp_hdr, disp_hdr, rscratch1);
 102     mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place));
 103     // If condition is true we are cont and hence we can store 0 as the
 104     // displaced header in the box, which indicates that it is a recursive lock.
 105     ands(tmp/*==0?*/, disp_hdr, tmp);   // Sets flags for result
 106     str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 107     b(cont);
 108   }
 109
 110   // Handle existing monitor.
 111   bind(object_has_monitor);
 112
 113   // The object's monitor m is unlocked iff m->owner == nullptr,
 114   // otherwise m->owner may contain a thread or a stack address.
 115   //
 116   // Try to CAS m->owner from null to current thread.
       // disp_hdr still carries the monitor_value bit (tbnz above); folding
       // -monitor_value into the field offset compensates for it.
 117   add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value));
 118   cmpxchg(tmp, zr, rthread, Assembler::xword, /*acquire*/ true,
 119           /*release*/ true, /*weak*/ false, tmp3Reg); // Sets flags for result
 120
 121   // Store a non-null value into the box to avoid looking like a re-entrant
 122   // lock. The fast-path monitor unlock code checks for
 123   // markWord::monitor_value so use markWord::unused_mark which has the
 124   // relevant bit set, and also matches ObjectSynchronizer::enter.
       // This mov/str pair sits between the CAS and the branch that tests its
       // result, so the code relies on them leaving the condition flags intact.
 125   mov(tmp, (address)markWord::unused_mark().value());
 126   str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 127
 128   br(Assembler::EQ, cont); // CAS success means locking succeeded
 129
       // CAS failed: tmp3Reg is compared with the current thread to detect a
       // recursive enter.
 130   cmp(tmp3Reg, rthread);
 131   br(Assembler::NE, cont); // Check for recursive locking
 132
 133   // Recursive lock case
 134   increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1);
 135   // flag == EQ still from the cmp above, checking if this is a reentrant lock
 136
 137   bind(cont);
 138   // flag == EQ indicates success
 139   // flag == NE indicates failure
 140   br(Assembler::NE, no_count);
 141
       // Nothing in this function branches to count; the success path falls
       // through into it.
 142   bind(count);
 143   increment(Address(rthread, JavaThread::held_monitor_count_offset()));
 144
 145   bind(no_count);
 146 }
 147 
 148 void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg,
 149                                     Register tmp2Reg) {
       // Emits the C2 fast-path monitor-exit sequence for the LM_LEGACY and
       // LM_MONITOR locking modes (LM_LIGHTWEIGHT is rejected by the assert below).
       //
       //   objectReg - the object being unlocked
       //   boxReg    - address of the BasicLock box (read only in LM_LEGACY)
       //   tmpReg    - scratch (disp_hdr)
       //   tmp2Reg   - scratch (tmp)
       //
       // Result is reported in the condition flags at label cont:
       //   EQ - success; the thread's held-monitor count is decremented
       //   NE - failure
       // The inflated-monitor path additionally writes rscratch1.
 150   Register oop = objectReg;
 151   Register box = boxReg;
 152   Register disp_hdr = tmpReg;
 153   Register tmp = tmp2Reg;
 154   Label cont;
 155   Label object_has_monitor;
 156   Label count, no_count;
 157
 158   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 159   assert_different_registers(oop, box, tmp, disp_hdr);
 160
 161   if (LockingMode == LM_LEGACY) {
 162     // Find the lock address and load the displaced header from the stack.
 163     ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 164
 165     // If the displaced header is 0, we have a recursive unlock.
         // The cmp leaves EQ on that branch, so cont sees success.
 166     cmp(disp_hdr, zr);
 167     br(Assembler::EQ, cont);
 168   }
 169
 170   // Handle existing monitor.
 171   ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
 172   tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor);
 173
 174   if (LockingMode == LM_MONITOR) {
 175     tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
 176     b(cont);
 177   } else {
 178     assert(LockingMode == LM_LEGACY, "must be");
 179     // Check if it is still a light weight lock, this is true if we
 180     // see the stack address of the basicLock in the markWord of the
 181     // object.
 182
         // Expected value is box, new value is the displaced header; the flags
         // produced by the CAS are the result tested at cont.
 183     cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false,
 184             /*release*/ true, /*weak*/ false, tmp);
 185     b(cont);
 186   }
 187
 188   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
 189
 190   // Handle existing monitor.
 191   bind(object_has_monitor);
 192   STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
 193   add(tmp, tmp, -(int)markWord::monitor_value); // monitor
 194
 195   ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
 196
 197   Label notRecursive;
 198   cbz(disp_hdr, notRecursive);
 199
 200   // Recursive lock
 201   sub(disp_hdr, disp_hdr, 1u);
 202   str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
       // Comparing a register with itself always yields EQ (success).
 203   cmp(disp_hdr, disp_hdr); // Sets flags for result
 204   b(cont);
 205
 206   bind(notRecursive);
 207   ldr(rscratch1, Address(tmp, ObjectMonitor::EntryList_offset()));
 208   ldr(disp_hdr, Address(tmp, ObjectMonitor::cxq_offset()));
 209   orr(rscratch1, rscratch1, disp_hdr); // Will be 0 if both are 0.
       // The cmp leaves NE when either field is non-zero, so the cbnz exit to
       // cont reports failure; otherwise EQ stays set for the fall-through
       // into cont after the release store below.
 210   cmp(rscratch1, zr); // Sets flags for result
 211   cbnz(rscratch1, cont);
 212   // need a release store here
 213   lea(tmp, Address(tmp, ObjectMonitor::owner_offset()));
 214   stlr(zr, tmp); // set unowned
 215
 216   bind(cont);
 217   // flag == EQ indicates success
 218   // flag == NE indicates failure
 219   br(Assembler::NE, no_count);
 220
       // Nothing in this function branches to count; the success path falls
       // through into it.
 221   bind(count);
 222   decrement(Address(rthread, JavaThread::held_monitor_count_offset()));
 223
 224   bind(no_count);
 225 }
 226 
 227 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register t1,
 228                                               Register t2, Register t3) {
       // Emits the C2 fast-path monitor-enter sequence for LM_LIGHTWEIGHT.
       //
       //   obj        - the object being locked
       //   box        - BasicLock box; only its object-monitor cache slot is
       //                written, and only when UseObjectMonitorTable is set
       //   t1, t2, t3 - scratch registers; must differ from obj, box and each other
       //
       // Result is reported in the condition flags on exit:
       //   EQ - locked; the thread's held-monitor count has been incremented
       //   NE - not locked (label slow_path)
 229   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 230   assert_different_registers(obj, box, t1, t2, t3);
 231
 232   // Handle inflated monitor.
 233   Label inflated;
 234   // Finish fast lock successfully. MUST branch to with flag == EQ
 235   Label locked;
 236   // Finish fast lock unsuccessfully. MUST branch to with flag == NE
 237   Label slow_path;
 238
 239   if (UseObjectMonitorTable) {
 240     // Clear cache in case fast locking succeeds.
 241     str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 242   }
 243
 244   if (DiagnoseSyncOnValueBasedClasses != 0) {
         // tstw leaves NE when the value-based-class flag is set, which is the
         // flag state slow_path requires.
 245     load_klass(t1, obj);
 246     ldrw(t1, Address(t1, Klass::access_flags_offset()));
 247     tstw(t1, JVM_ACC_IS_VALUE_BASED_CLASS);
 248     br(Assembler::NE, slow_path);
 249   }
 250
 251   const Register t1_mark = t1;
 252   const Register t3_t = t3;
 253
 254   { // Lightweight locking
 255
 256     // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ
 257     Label push;
 258
 259     const Register t2_top = t2;
 260
 261     // Check if lock-stack is full.
         // GT implies NE, as slow_path requires.
 262     ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 263     cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
 264     br(Assembler::GT, slow_path);
 265
 266     // Check if recursive.
         // Compares obj with the entry just below top; a match goes straight
         // to push with EQ set.
 267     subw(t3_t, t2_top, oopSize);
 268     ldr(t3_t, Address(rthread, t3_t));
 269     cmp(obj, t3_t);
 270     br(Assembler::EQ, push);
 271
 272     // Relaxed normal load to check for monitor. Optimization for monitor case.
 273     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 274     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 275
 276     // Not inflated
 277     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");
 278
 279     // Try to lock. Transition lock-bits 0b01 => 0b00
         // t1_mark becomes the expected value (mark with the unlocked bit forced
         // on); t3_t is the new value (same mark with that bit cleared).
 280     orr(t1_mark, t1_mark, markWord::unlocked_value);
 281     eor(t3_t, t1_mark, markWord::unlocked_value);
 282     cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
 283             /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
 284     br(Assembler::NE, slow_path);
 285
 286     bind(push);
 287     // After successful lock, push object on lock-stack.
         // Reached with EQ set on both paths; the stores and addw below are
         // relied upon to leave the flags untouched for label locked.
 288     str(obj, Address(rthread, t2_top));
 289     addw(t2_top, t2_top, oopSize);
 290     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 291     b(locked);
 292   }
 293
 294   { // Handle inflated monitor.
 295     bind(inflated);
 296
 297     const Register t1_monitor = t1;
 298
 299     if (!UseObjectMonitorTable) {
           // t1 still holds the markWord loaded above; its monitor_value tag is
           // compensated for via monitor_tag in the field offsets below.
 300       assert(t1_monitor == t1_mark, "should be the same here");
 301     } else {
 302       Label monitor_found;
 303
 304       // Load cache address
 305       lea(t3_t, Address(rthread, JavaThread::om_cache_oops_offset()));
 306
           // The first num_unrolled cache probes are emitted inline, without
           // the null check that terminates the loop below.
 307       const int num_unrolled = 2;
 308       for (int i = 0; i < num_unrolled; i++) {
 309         ldr(t1, Address(t3_t));
 310         cmp(obj, t1);
 311         br(Assembler::EQ, monitor_found);
 312         if (i + 1 != num_unrolled) {
 313           increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
 314         }
 315       }
 316
 317       // Loop after unrolling, advance iterator.
 318       increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
 319
 320       Label loop;
 321
 322       // Search for obj in cache.
 323       bind(loop);
 324
 325       // Check for match.
 326       ldr(t1, Address(t3_t));
 327       cmp(obj, t1);
 328       br(Assembler::EQ, monitor_found);
 329
 330       // Search until null encountered, guaranteed _null_sentinel at end.
 331       increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
 332       cbnz(t1, loop);
 333       // Cache Miss, NE set from cmp above, cbnz does not set flags
 334       b(slow_path);
 335
 336       bind(monitor_found);
           // t3_t points at the matching oop slot; the monitor is loaded from
           // oop_to_monitor_difference() bytes away from it.
 337       ldr(t1_monitor, Address(t3_t, OMCache::oop_to_monitor_difference()));
 338     }
 339
 340     const Register t2_owner_addr = t2;
 341     const Register t3_owner = t3;
         // monitor_tag is monitor_value when t1_monitor is the tagged markWord,
         // and 0 when it was loaded from the cache.
 342     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 343     const Address owner_address{t1_monitor, ObjectMonitor::owner_offset() - monitor_tag};
 344     const Address recursions_address{t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag};
 345
 346     Label monitor_locked;
 347
 348     // Compute owner address.
 349     lea(t2_owner_addr, owner_address);
 350
 351     // CAS owner (null => current thread).
 352     cmpxchg(t2_owner_addr, zr, rthread, Assembler::xword, /*acquire*/ true,
 353             /*release*/ false, /*weak*/ false, t3_owner);
 354     br(Assembler::EQ, monitor_locked);
 355
 356     // Check if recursive.
 357     cmp(t3_owner, rthread);
 358     br(Assembler::NE, slow_path);
 359
 360     // Recursive.
         // Flags are EQ here from the cmp above (NE branched away).
 361     increment(recursions_address, 1);
 362
 363     bind(monitor_locked);
 364     if (UseObjectMonitorTable) {
 365       str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 366     }
 367   }
 368
 369   bind(locked);
 370   increment(Address(rthread, JavaThread::held_monitor_count_offset()));
 371
 372 #ifdef ASSERT
 373   // Check that locked label is reached with Flags == EQ.
       // flag_correct is bound after the slow_path check, so a correct success
       // path skips that check.
 374   Label flag_correct;
 375   br(Assembler::EQ, flag_correct);
 376   stop("Fast Lock Flag != EQ");
 377 #endif
 378
       // Without ASSERT the success path simply falls through this label.
 379   bind(slow_path);
 380 #ifdef ASSERT
 381   // Check that slow_path label is reached with Flags == NE.
 382   br(Assembler::NE, flag_correct);
 383   stop("Fast Lock Flag != NE");
 384   bind(flag_correct);
 385 #endif
 386   // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
 387 }
 388 
 389 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register box, Register t1,
 390                                                 Register t2, Register t3) {
       // Emits the C2 fast-path monitor-exit sequence for LM_LIGHTWEIGHT.
       //
       //   obj        - the object being unlocked
       //   box        - BasicLock box; only its object-monitor cache slot is
       //                read, and only when UseObjectMonitorTable is set
       //   t1, t2, t3 - scratch registers; must differ from obj, box and each other
       //
       // Result is reported in the condition flags on exit:
       //   EQ - unlocked; the thread's held-monitor count has been decremented
       //   NE - not unlocked (label slow_path)
       // The inflated-monitor path additionally writes rscratch1.
 391   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 392   assert_different_registers(obj, box, t1, t2, t3);
 393
 394   // Handle inflated monitor.
 395   Label inflated, inflated_load_mark;
 396   // Finish fast unlock successfully. MUST branch to with flag == EQ
 397   Label unlocked;
 398   // Finish fast unlock unsuccessfully. MUST branch to with flag == NE
 399   Label slow_path;
 400
 401   const Register t1_mark = t1;
 402   const Register t2_top = t2;
 403   const Register t3_t = t3;
 404
 405   { // Lightweight unlock
 406
 407     Label push_and_slow_path;
 408
 409     // Check if obj is top of lock-stack.
 410     ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 411     subw(t2_top, t2_top, oopSize);
 412     ldr(t3_t, Address(rthread, t2_top));
 413     cmp(obj, t3_t);
 414     // Top of lock stack was not obj. Must be monitor.
 415     br(Assembler::NE, inflated_load_mark);
 416
 417     // Pop lock-stack.
         // Debug builds also zero the vacated slot; other builds leave obj in it.
 418     DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
 419     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 420
 421     // Check if recursive.
         // A match with the new top entry means obj is still held; EQ is set
         // for label unlocked.
 422     subw(t3_t, t2_top, oopSize);
 423     ldr(t3_t, Address(rthread, t3_t));
 424     cmp(obj, t3_t);
 425     br(Assembler::EQ, unlocked);
 426
 427     // Not recursive.
 428     // Load Mark.
 429     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 430
 431     // Check header for monitor (0b10).
 432     // Because we got here by popping (meaning we pushed in locked)
 433     // there will be no monitor in the box. So we need to push back the obj
 434     // so that the runtime can fix any potential anonymous owner.
         // Flags are still NE from the recursion cmp above, as slow_path requires
         // when push_and_slow_path is taken from here.
 435     tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);
 436
 437     // Try to unlock. Transition lock bits 0b00 => 0b01
 438     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
 439     orr(t3_t, t1_mark, markWord::unlocked_value);
 440     cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
 441             /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
 442     br(Assembler::EQ, unlocked);
 443
 444     bind(push_and_slow_path);
 445     // Compare and exchange failed.
 446     // Restore lock-stack and handle the unlock in runtime.
 447     DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
 448     addw(t2_top, t2_top, oopSize);
         // 32-bit store: the lock-stack top is read with ldrw and written with
         // strw everywhere else, so a 64-bit str here would write past the field.
 449     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 450     b(slow_path);
 451   }
 452
 453
 454   { // Handle inflated monitor.
 455     bind(inflated_load_mark);
 456     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 457 #ifdef ASSERT
 458     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 459     stop("Fast Unlock not monitor");
 460 #endif
 461
 462     bind(inflated);
 463
 464 #ifdef ASSERT
         // Debug-only check: step t2_top down one entry at a time (looping back
         // to inflated) until it drops below the lock-stack base offset, and
         // stop() if any of those entries is obj.
 465     Label check_done;
 466     subw(t2_top, t2_top, oopSize);
 467     cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
 468     br(Assembler::LT, check_done);
 469     ldr(t3_t, Address(rthread, t2_top));
 470     cmp(obj, t3_t);
 471     br(Assembler::NE, inflated);
 472     stop("Fast Unlock lock on stack");
 473     bind(check_done);
 474 #endif
 475
 476     const Register t1_monitor = t1;
 477
 478     if (!UseObjectMonitorTable) {
 479       assert(t1_monitor == t1_mark, "should be the same here");
 480
 481       // Untag the monitor.
 482       add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
 483     } else {
 484       ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 485       // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
 486       cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
 487       br(Assembler::LO, slow_path);
 488     }
 489
 490     const Register t2_recursions = t2;
 491     Label not_recursive;
 492
 493     // Check if recursive.
 494     ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
 495     cbz(t2_recursions, not_recursive);
 496
 497     // Recursive unlock.
 498     sub(t2_recursions, t2_recursions, 1u);
 499     str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
 500     // Set flag == EQ
 501     cmp(t2_recursions, t2_recursions);
 502     b(unlocked);
 503
 504     bind(not_recursive);
 505
 506     Label release;
 507     const Register t2_owner_addr = t2;
 508
 509     // Compute owner address.
 510     lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));
 511
 512     // Check if the entry lists are empty.
         // EQ (both fields zero) goes to release and is still set when falling
         // into unlocked; otherwise NE is left set for the slow_path exit below.
 513     ldr(rscratch1, Address(t1_monitor, ObjectMonitor::EntryList_offset()));
 514     ldr(t3_t, Address(t1_monitor, ObjectMonitor::cxq_offset()));
 515     orr(rscratch1, rscratch1, t3_t);
 516     cmp(rscratch1, zr);
 517     br(Assembler::EQ, release);
 518
 519     // The owner may be anonymous and we removed the last obj entry in
 520     // the lock-stack. This loses the information about the owner.
 521     // Write the thread to the owner field so the runtime knows the owner.
 522     str(rthread, Address(t2_owner_addr));
 523     b(slow_path);
 524
 525     bind(release);
 526     // Set owner to null.
 527     // Release to satisfy the JMM
 528     stlr(zr, t2_owner_addr);
 529   }
 530
 531   bind(unlocked);
 532   decrement(Address(rthread, JavaThread::held_monitor_count_offset()));
 533
 534 #ifdef ASSERT
 535   // Check that unlocked label is reached with Flags == EQ.
       // flag_correct is bound after the slow_path check, so a correct success
       // path skips that check.
 536   Label flag_correct;
 537   br(Assembler::EQ, flag_correct);
 538   stop("Fast Unlock Flag != EQ");
 539 #endif
 540
       // Without ASSERT the success path simply falls through this label.
 541   bind(slow_path);
 542 #ifdef ASSERT
 543   // Check that slow_path label is reached with Flags == NE.
 544   br(Assembler::NE, flag_correct);
 545   stop("Fast Unlock Flag != NE");
 546   bind(flag_correct);
 547 #endif
 548   // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
 549 }
 550 
 551 // Search for str1 in str2 and return index or -1
 552 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
 553 void C2_MacroAssembler::string_indexof(Register str2, Register str1,
 554                                        Register cnt2, Register cnt1,
 555                                        Register tmp1, Register tmp2,
 556                                        Register tmp3, Register tmp4,
 557                                        Register tmp5, Register tmp6,
 558                                        int icnt1, Register result, int ae) {
 559   // NOTE: tmp5, tmp6 can be zr depending on specific method version
 560   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
 561 
 562   Register ch1 = rscratch1;
 563   Register ch2 = rscratch2;
 564   Register cnt1tmp = tmp1;
 565   Register cnt2tmp = tmp2;
 566   Register cnt1_neg = cnt1;
 567   Register cnt2_neg = cnt2;
 568   Register result_tmp = tmp4;
 569 
 570   bool isL = ae == StrIntrinsicNode::LL;
 571 
 572   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
 573   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
 574   int str1_chr_shift = str1_isL ? 0:1;
 575   int str2_chr_shift = str2_isL ? 0:1;
 576   int str1_chr_size = str1_isL ? 1:2;
 577   int str2_chr_size = str2_isL ? 1:2;
 578   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
 579                                       (chr_insn)&MacroAssembler::ldrh;
 580   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
 581                                       (chr_insn)&MacroAssembler::ldrh;
 582   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
 583   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
 584 
 585   // Note, inline_string_indexOf() generates checks:
 586   // if (substr.count > string.count) return -1;
 587   // if (substr.count == 0) return 0;
 588 
 589   // We have two strings, a source string in str2, cnt2 and a pattern string
 590   // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
 591 
 592   // For larger pattern and source we use a simplified Boyer Moore algorithm.
 593   // With a small pattern and source we use linear scan.
 594 
 595   if (icnt1 == -1) {
 596     sub(result_tmp, cnt2, cnt1);
 597     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
 598     br(LT, LINEARSEARCH);
 599     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
 600     subs(zr, cnt1, 256);
 601     lsr(tmp1, cnt2, 2);
 602     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
 603     br(GE, LINEARSTUB);
 604   }
 605 
 606 // The Boyer Moore algorithm is based on the description here:-
 607 //
 608 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
 609 //
 610 // This describes an algorithm with 2 shift rules. The 'Bad Character' rule
 611 // and the 'Good Suffix' rule.
 612 //
 613 // These rules are essentially heuristics for how far we can shift the
 614 // pattern along the search string.
 615 //
 616 // The implementation here uses the 'Bad Character' rule only because of the
 617 // complexity of initialisation for the 'Good Suffix' rule.
 618 //
 619 // This is also known as the Boyer-Moore-Horspool algorithm:-
 620 //
 621 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
 622 //
 623 // This particular implementation has a few Java-specific optimizations.
 624 //
 625 // #define ASIZE 256
 626 //
 627 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
 628 //       int i, j;
 629 //       unsigned c;
 630 //       unsigned char bc[ASIZE];
 631 //
 632 //       /* Preprocessing */
 633 //       for (i = 0; i < ASIZE; ++i)
 634 //          bc[i] = m;
 635 //       for (i = 0; i < m - 1; ) {
 636 //          c = x[i];
 637 //          ++i;
 638 //          // c < 256 for Latin1 string, so, no need for branch
 639 //          #ifdef PATTERN_STRING_IS_LATIN1
 640 //          bc[c] = m - i;
 641 //          #else
 642 //          if (c < ASIZE) bc[c] = m - i;
 643 //          #endif
 644 //       }
 645 //
 646 //       /* Searching */
 647 //       j = 0;
 648 //       while (j <= n - m) {
 649 //          c = y[i+j];
 650 //          if (x[m-1] == c)
 651 //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
 652 //          if (i < 0) return j;
 653 //          // c < 256 for Latin1 string, so, no need for branch
 654 //          #ifdef SOURCE_STRING_IS_LATIN1
 655 //          // LL case: (c< 256) always true. Remove branch
 656 //          j += bc[y[j+m-1]];
 657 //          #endif
 658 //          #ifndef PATTERN_STRING_IS_UTF
 659 //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
 660 //          if (c < ASIZE)
 661 //            j += bc[y[j+m-1]];
 662 //          else
 663 //            j += 1
 664 //          #endif
 665 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
 666 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
 667 //          if (c < ASIZE)
 668 //            j += bc[y[j+m-1]];
 669 //          else
 670 //            j += m
 671 //          #endif
 672 //       }
 673 //    }
 674 
 675   if (icnt1 == -1) {
 676     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
 677         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
 678     Register cnt1end = tmp2;
 679     Register str2end = cnt2;
 680     Register skipch = tmp2;
 681 
 682     // str1 length is >=8, so, we can read at least 1 register for cases when
 683     // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
 684     // UL case. We'll re-read last character in inner pre-loop code to have
 685     // single outer pre-loop load
 686     const int firstStep = isL ? 7 : 3;
 687 
 688     const int ASIZE = 256;
 689     const int STORED_BYTES = 32; // amount of bytes stored per instruction
 690     sub(sp, sp, ASIZE);
 691     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
 692     mov(ch1, sp);
 693     BIND(BM_INIT_LOOP);
 694       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
 695       subs(tmp5, tmp5, 1);
 696       br(GT, BM_INIT_LOOP);
 697 
 698       sub(cnt1tmp, cnt1, 1);
 699       mov(tmp5, str2);
 700       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
 701       sub(ch2, cnt1, 1);
 702       mov(tmp3, str1);
 703     BIND(BCLOOP);
 704       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
 705       if (!str1_isL) {
 706         subs(zr, ch1, ASIZE);
 707         br(HS, BCSKIP);
 708       }
 709       strb(ch2, Address(sp, ch1));
 710     BIND(BCSKIP);
 711       subs(ch2, ch2, 1);
 712       br(GT, BCLOOP);
 713 
 714       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
 715       if (str1_isL == str2_isL) {
 716         // load last 8 bytes (8LL/4UU symbols)
 717         ldr(tmp6, Address(tmp6, -wordSize));
 718       } else {
 719         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
 720         // convert Latin1 to UTF. We'll have to wait until load completed, but
 721         // it's still faster than per-character loads+checks
 722         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
 723         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
 724         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
 725         andr(tmp6, tmp6, 0xFF); // str1[N-4]
 726         orr(ch2, ch1, ch2, LSL, 16);
 727         orr(tmp6, tmp6, tmp3, LSL, 48);
 728         orr(tmp6, tmp6, ch2, LSL, 16);
 729       }
 730     BIND(BMLOOPSTR2);
 731       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 732       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
 733       if (str1_isL == str2_isL) {
 734         // re-init tmp3. It's for free because it's executed in parallel with
 735         // load above. Alternative is to initialize it before loop, but it'll
 736         // affect performance on in-order systems with 2 or more ld/st pipelines
 737         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
 738       }
 739       if (!isL) { // UU/UL case
 740         lsl(ch2, cnt1tmp, 1); // offset in bytes
 741       }
 742       cmp(tmp3, skipch);
 743       br(NE, BMSKIP);
 744       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
 745       mov(ch1, tmp6);
 746       if (isL) {
 747         b(BMLOOPSTR1_AFTER_LOAD);
 748       } else {
 749         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
 750         b(BMLOOPSTR1_CMP);
 751       }
 752     BIND(BMLOOPSTR1);
 753       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
 754       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 755     BIND(BMLOOPSTR1_AFTER_LOAD);
 756       subs(cnt1tmp, cnt1tmp, 1);
 757       br(LT, BMLOOPSTR1_LASTCMP);
 758     BIND(BMLOOPSTR1_CMP);
 759       cmp(ch1, ch2);
 760       br(EQ, BMLOOPSTR1);
 761     BIND(BMSKIP);
 762       if (!isL) {
 763         // if we've met UTF symbol while searching Latin1 pattern, then we can
 764         // skip cnt1 symbols
 765         if (str1_isL != str2_isL) {
 766           mov(result_tmp, cnt1);
 767         } else {
 768           mov(result_tmp, 1);
 769         }
 770         subs(zr, skipch, ASIZE);
 771         br(HS, BMADV);
 772       }
 773       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
 774     BIND(BMADV);
 775       sub(cnt1tmp, cnt1, 1);
 776       add(str2, str2, result_tmp, LSL, str2_chr_shift);
 777       cmp(str2, str2end);
 778       br(LE, BMLOOPSTR2);
 779       add(sp, sp, ASIZE);
 780       b(NOMATCH);
 781     BIND(BMLOOPSTR1_LASTCMP);
 782       cmp(ch1, ch2);
 783       br(NE, BMSKIP);
 784     BIND(BMMATCH);
 785       sub(result, str2, tmp5);
 786       if (!str2_isL) lsr(result, result, 1);
 787       add(sp, sp, ASIZE);
 788       b(DONE);
 789 
 790     BIND(LINEARSTUB);
 791     cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
 792     br(LT, LINEAR_MEDIUM);
 793     mov(result, zr);
 794     RuntimeAddress stub = nullptr;
 795     if (isL) {
 796       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
 797       assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
 798     } else if (str1_isL) {
 799       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
 800        assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
 801     } else {
 802       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
 803       assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
 804     }
 805     address call = trampoline_call(stub);
 806     if (call == nullptr) {
 807       DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
 808       ciEnv::current()->record_failure("CodeCache is full");
 809       return;
 810     }
 811     b(DONE);
 812   }
 813 
 814   BIND(LINEARSEARCH);
 815   {
 816     Label DO1, DO2, DO3;
 817 
 818     Register str2tmp = tmp2;
 819     Register first = tmp3;
 820 
 821     if (icnt1 == -1)
 822     {
 823         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
 824 
 825         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
 826         br(LT, DOSHORT);
 827       BIND(LINEAR_MEDIUM);
 828         (this->*str1_load_1chr)(first, Address(str1));
 829         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
 830         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
 831         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 832         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 833 
 834       BIND(FIRST_LOOP);
 835         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 836         cmp(first, ch2);
 837         br(EQ, STR1_LOOP);
 838       BIND(STR2_NEXT);
 839         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 840         br(LE, FIRST_LOOP);
 841         b(NOMATCH);
 842 
 843       BIND(STR1_LOOP);
 844         adds(cnt1tmp, cnt1_neg, str1_chr_size);
 845         add(cnt2tmp, cnt2_neg, str2_chr_size);
 846         br(GE, MATCH);
 847 
 848       BIND(STR1_NEXT);
 849         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
 850         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 851         cmp(ch1, ch2);
 852         br(NE, STR2_NEXT);
 853         adds(cnt1tmp, cnt1tmp, str1_chr_size);
 854         add(cnt2tmp, cnt2tmp, str2_chr_size);
 855         br(LT, STR1_NEXT);
 856         b(MATCH);
 857 
 858       BIND(DOSHORT);
 859       if (str1_isL == str2_isL) {
 860         cmp(cnt1, (u1)2);
 861         br(LT, DO1);
 862         br(GT, DO3);
 863       }
 864     }
 865 
 866     if (icnt1 == 4) {
 867       Label CH1_LOOP;
 868 
 869         (this->*load_4chr)(ch1, str1);
 870         sub(result_tmp, cnt2, 4);
 871         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 872         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 873 
 874       BIND(CH1_LOOP);
 875         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
 876         cmp(ch1, ch2);
 877         br(EQ, MATCH);
 878         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 879         br(LE, CH1_LOOP);
 880         b(NOMATCH);
 881       }
 882 
 883     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
 884       Label CH1_LOOP;
 885 
 886       BIND(DO2);
 887         (this->*load_2chr)(ch1, str1);
 888         if (icnt1 == 2) {
 889           sub(result_tmp, cnt2, 2);
 890         }
 891         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 892         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 893       BIND(CH1_LOOP);
 894         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 895         cmp(ch1, ch2);
 896         br(EQ, MATCH);
 897         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 898         br(LE, CH1_LOOP);
 899         b(NOMATCH);
 900     }
 901 
 902     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
 903       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
 904 
 905       BIND(DO3);
 906         (this->*load_2chr)(first, str1);
 907         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
 908         if (icnt1 == 3) {
 909           sub(result_tmp, cnt2, 3);
 910         }
 911         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 912         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 913       BIND(FIRST_LOOP);
 914         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 915         cmpw(first, ch2);
 916         br(EQ, STR1_LOOP);
 917       BIND(STR2_NEXT);
 918         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 919         br(LE, FIRST_LOOP);
 920         b(NOMATCH);
 921 
 922       BIND(STR1_LOOP);
 923         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
 924         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 925         cmp(ch1, ch2);
 926         br(NE, STR2_NEXT);
 927         b(MATCH);
 928     }
 929 
 930     if (icnt1 == -1 || icnt1 == 1) {
 931       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
 932 
 933       BIND(DO1);
 934         (this->*str1_load_1chr)(ch1, str1);
 935         cmp(cnt2, (u1)8);
 936         br(LT, DO1_SHORT);
 937 
 938         sub(result_tmp, cnt2, 8/str2_chr_size);
 939         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 940         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 941         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 942 
 943         if (str2_isL) {
 944           orr(ch1, ch1, ch1, LSL, 8);
 945         }
 946         orr(ch1, ch1, ch1, LSL, 16);
 947         orr(ch1, ch1, ch1, LSL, 32);
 948       BIND(CH1_LOOP);
 949         ldr(ch2, Address(str2, cnt2_neg));
 950         eor(ch2, ch1, ch2);
 951         sub(tmp1, ch2, tmp3);
 952         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 953         bics(tmp1, tmp1, tmp2);
 954         br(NE, HAS_ZERO);
 955         adds(cnt2_neg, cnt2_neg, 8);
 956         br(LT, CH1_LOOP);
 957 
 958         cmp(cnt2_neg, (u1)8);
 959         mov(cnt2_neg, 0);
 960         br(LT, CH1_LOOP);
 961         b(NOMATCH);
 962 
 963       BIND(HAS_ZERO);
 964         rev(tmp1, tmp1);
 965         clz(tmp1, tmp1);
 966         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
 967         b(MATCH);
 968 
 969       BIND(DO1_SHORT);
 970         mov(result_tmp, cnt2);
 971         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
 972         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
 973       BIND(DO1_LOOP);
 974         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 975         cmpw(ch1, ch2);
 976         br(EQ, MATCH);
 977         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 978         br(LT, DO1_LOOP);
 979     }
 980   }
 981   BIND(NOMATCH);
 982     mov(result, -1);
 983     b(DONE);
 984   BIND(MATCH);
 985     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
 986   BIND(DONE);
 987 }
 988 
 989 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
     // chr_insn (above): pointer to a MacroAssembler member taking a register and an
     // address; string_compare in this file binds it to ldrb or ldrh.
     // uxt_insn (below): pointer to a MacroAssembler member taking a destination and a
     // source register; string_compare in this file binds it to uxtbw or uxthw.
 990 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
 991 
 992 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
 993                                             Register ch, Register result,
 994                                             Register tmp1, Register tmp2, Register tmp3)
 995 {
       // Emits code that searches the cnt1 16-bit characters starting at str1 for
       // the character held in ch. On exit, result holds the character index of
       // the match, or -1 when nothing matched (including cnt1 == 0).
       // Registers written by the emitted code: str1, cnt1, tmp1, tmp2, tmp3,
       // rscratch1, rscratch2, and ch on the word-at-a-time path.
 996   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
       // cnt1 is reused as the (negative) running byte offset once set up.
 997   Register cnt1_neg = cnt1;
 998   Register ch1 = rscratch1;
 999   Register result_tmp = rscratch2;
1000 
1001   cbz(cnt1, NOMATCH);
1002 
       // Fewer than 4 characters: not enough for one 8-byte word, use the
       // character-at-a-time loop.
1003   cmp(cnt1, (u1)4);
1004   br(LT, DO1_SHORT);
1005 
       // Replicate ch into all four 16-bit lanes of the 64-bit register.
       // NOTE(review): this presumes the bits of ch above bit 15 are zero on entry.
1006   orr(ch, ch, ch, LSL, 16);
1007   orr(ch, ch, ch, LSL, 32);
1008 
       // Point str1 at the last full 8-byte word (str1 + (cnt1 - 4) * 2) and let
       // cnt1_neg run from -(cnt1 - 4) * 2 bytes up towards zero.
1009   sub(cnt1, cnt1, 4);
1010   mov(result_tmp, cnt1);
1011   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
1012   sub(cnt1_neg, zr, cnt1, LSL, 1);
1013 
1014   mov(tmp3, 0x0001000100010001);
1015 
1016   BIND(CH1_LOOP);
         // After the eor a lane is zero exactly where the string character equals
         // ch. (x - 0x0001...) & ~(x | 0x7fff...) then leaves a non-zero value iff
         // some 16-bit lane of x is zero.
1017     ldr(ch1, Address(str1, cnt1_neg));
1018     eor(ch1, ch, ch1);
1019     sub(tmp1, ch1, tmp3);
1020     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
1021     bics(tmp1, tmp1, tmp2);
1022     br(NE, HAS_ZERO);
1023     adds(cnt1_neg, cnt1_neg, 8);
1024     br(LT, CH1_LOOP);
1025 
         // The offset is now >= 0. If it is still below 8 the final word (offset 0)
         // has not been examined yet: reset the offset to 0 and run the loop body
         // once more (this may re-read characters already checked). The branch
         // uses the flags of the cmp; the mov in between must not change them.
1026     cmp(cnt1_neg, (u1)8);
1027     mov(cnt1_neg, 0);
1028     br(LT, CH1_LOOP);
1029     b(NOMATCH);
1030 
1031   BIND(HAS_ZERO);
         // Byte-reverse the flag word and count leading zeros to locate the first
         // flagged byte in memory order; LSR 3 converts the bit count into bytes,
         // which is added to the running byte offset.
1032     rev(tmp1, tmp1);
1033     clz(tmp1, tmp1);
1034     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
1035     b(MATCH);
1036 
1037   BIND(DO1_SHORT);
         // Short input: str1 is moved to the end of the string and cnt1_neg runs
         // from -cnt1 * 2 bytes up to zero.
1038     mov(result_tmp, cnt1);
1039     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
1040     sub(cnt1_neg, zr, cnt1, LSL, 1);
1041   BIND(DO1_LOOP);
1042     ldrh(ch1, Address(str1, cnt1_neg));
1043     cmpw(ch, ch1);
1044     br(EQ, MATCH);
1045     adds(cnt1_neg, cnt1_neg, 2);
1046     br(LT, DO1_LOOP);
1047   BIND(NOMATCH);
1048     mov(result, -1);
1049     b(DONE);
1050   BIND(MATCH);
         // result_tmp is the character index that byte offset 0 corresponds to;
         // ASR 1 converts the (negative) byte offset into characters.
1051     add(result, result_tmp, cnt1_neg, ASR, 1);
1052   BIND(DONE);
1053 }
1054 
1055 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
1056                                                 Register ch, Register result,
1057                                                 FloatRegister ztmp1,
1058                                                 FloatRegister ztmp2,
1059                                                 PRegister tmp_pg,
1060                                                 PRegister tmp_pdn, bool isL)
1061 {
       // Emits an SVE loop that searches the cnt1 elements starting at str1 for the
       // value in ch. Elements are bytes when isL is true and halfwords otherwise.
       // On exit, result holds the element index of the match, or -1 when nothing
       // matched (including cnt1 == 0).
       // Registers written by the emitted code: result, rscratch1, rscratch2,
       // ztmp1, ztmp2, tmp_pg and tmp_pdn.
1062   // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
1063   assert(tmp_pg->is_governing(),
1064          "this register has to be a governing predicate register");
1065 
1066   Label LOOP, MATCH, DONE, NOMATCH;
       // vec_len: number of elements per vector; idx: index of the next element to read.
1067   Register vec_len = rscratch1;
1068   Register idx = rscratch2;
1069 
1070   SIMD_RegVariant T = (isL == true) ? B : H;
1071 
1072   cbz(cnt1, NOMATCH);
1073 
1074   // Assign the particular char throughout the vector.
1075   sve_dup(ztmp2, T, ch);
1076   if (isL) {
1077     sve_cntb(vec_len);
1078   } else {
1079     sve_cnth(vec_len);
1080   }
1081   mov(idx, 0);
1082 
1083   // Generate a predicate to control the reading of input string.
1084   sve_whilelt(tmp_pg, T, idx, cnt1);
1085 
1086   BIND(LOOP);
1087     // Read a vector of 8- or 16-bit data depending on the string type. Note
1088     // that inactive elements indicated by the predicate register won't cause
1089     // a data read from memory to the destination vector.
1090     if (isL) {
1091       sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
1092     } else {
1093       sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
1094     }
1095     add(idx, idx, vec_len);
1096 
1097     // Perform the comparison. An element of the destination predicate is set
1098     // to active if the particular char is matched.
1099     sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);
1100 
1101     // Branch if the particular char is found.
1102     br(NE, MATCH);
1103 
         // Recompute the governing predicate for the next chunk; the branch below
         // relies on the flags this instruction produces.
1104     sve_whilelt(tmp_pg, T, idx, cnt1);
1105 
1106     // Loop back if the particular char not found.
1107     br(MI, LOOP);
1108 
1109   BIND(NOMATCH);
1110     mov(result, -1);
1111     b(DONE);
1112 
1113   BIND(MATCH);
1114     // Undo the index increment.
1115     sub(idx, idx, vec_len);
1116 
1117     // Crop the vector to find its location.
         // result starts at idx - 1 and is then incremented by the number of
         // active lanes left in tmp_pdn after the break operation.
1118     sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
1119     add(result, idx, -1);
1120     sve_incp(result, T, tmp_pdn);
1121   BIND(DONE);
1122 }
1123 
1124 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
1125                                             Register ch, Register result,
1126                                             Register tmp1, Register tmp2, Register tmp3)
1127 {
       // Byte-element counterpart of string_indexof_char: emits code that searches
       // the cnt1 bytes starting at str1 for the value in ch. On exit, result holds
       // the index of the match, or -1 when nothing matched (including cnt1 == 0).
       // Registers written by the emitted code: str1, cnt1, tmp1, tmp2, tmp3,
       // rscratch1, rscratch2, and ch on the word-at-a-time path.
1128   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
       // cnt1 is reused as the (negative) running byte offset once set up.
1129   Register cnt1_neg = cnt1;
1130   Register ch1 = rscratch1;
1131   Register result_tmp = rscratch2;
1132 
1133   cbz(cnt1, NOMATCH);
1134 
       // Fewer than 8 bytes: not enough for one 8-byte word, use the
       // byte-at-a-time loop.
1135   cmp(cnt1, (u1)8);
1136   br(LT, DO1_SHORT);
1137 
       // Replicate ch into all eight bytes of the 64-bit register.
       // NOTE(review): this presumes the bits of ch above bit 7 are zero on entry.
1138   orr(ch, ch, ch, LSL, 8);
1139   orr(ch, ch, ch, LSL, 16);
1140   orr(ch, ch, ch, LSL, 32);
1141 
       // Point str1 at the last full 8-byte word (str1 + cnt1 - 8) and let
       // cnt1_neg run from -(cnt1 - 8) up towards zero.
1142   sub(cnt1, cnt1, 8);
1143   mov(result_tmp, cnt1);
1144   lea(str1, Address(str1, cnt1));
1145   sub(cnt1_neg, zr, cnt1);
1146 
1147   mov(tmp3, 0x0101010101010101);
1148 
1149   BIND(CH1_LOOP);
         // After the eor a byte is zero exactly where the string byte equals ch.
         // (x - 0x01...) & ~(x | 0x7f...) then leaves a non-zero value iff some
         // byte of x is zero.
1150     ldr(ch1, Address(str1, cnt1_neg));
1151     eor(ch1, ch, ch1);
1152     sub(tmp1, ch1, tmp3);
1153     orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
1154     bics(tmp1, tmp1, tmp2);
1155     br(NE, HAS_ZERO);
1156     adds(cnt1_neg, cnt1_neg, 8);
1157     br(LT, CH1_LOOP);
1158 
         // The offset is now >= 0. If it is still below 8 the final word (offset 0)
         // has not been examined yet: reset the offset to 0 and run the loop body
         // once more (this may re-read bytes already checked). The branch uses the
         // flags of the cmp; the mov in between must not change them.
1159     cmp(cnt1_neg, (u1)8);
1160     mov(cnt1_neg, 0);
1161     br(LT, CH1_LOOP);
1162     b(NOMATCH);
1163 
1164   BIND(HAS_ZERO);
         // Byte-reverse the flag word and count leading zeros to locate the first
         // flagged byte in memory order; LSR 3 converts the bit count into bytes.
1165     rev(tmp1, tmp1);
1166     clz(tmp1, tmp1);
1167     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
1168     b(MATCH);
1169 
1170   BIND(DO1_SHORT);
         // Short input: str1 is moved to the end of the string and cnt1_neg runs
         // from -cnt1 up to zero.
1171     mov(result_tmp, cnt1);
1172     lea(str1, Address(str1, cnt1));
1173     sub(cnt1_neg, zr, cnt1);
1174   BIND(DO1_LOOP);
1175     ldrb(ch1, Address(str1, cnt1_neg));
1176     cmp(ch, ch1);
1177     br(EQ, MATCH);
1178     adds(cnt1_neg, cnt1_neg, 1);
1179     br(LT, DO1_LOOP);
1180   BIND(NOMATCH);
1181     mov(result, -1);
1182     b(DONE);
1183   BIND(MATCH);
         // result_tmp is the index that offset 0 corresponds to; cnt1_neg is the
         // (negative or zero) distance of the match from there.
1184     add(result, result_tmp, cnt1_neg);
1185   BIND(DONE);
1186 }
1187 
1188 // Compare strings.
     //
     // ae selects the encodings (StrIntrinsicNode::LL, UU, LU or UL): the first
     // letter describes str1 and the second str2, with L meaning one byte per
     // character and U two bytes per character.
     // cnt1 and cnt2 arrive as byte counts and are converted to character counts
     // below. result is first set to cnt1 - cnt2 (in characters) and is replaced
     // by the difference of the first pair of differing characters when one is
     // found within the shorter length. Inputs at or above stub_threshold are
     // handed to a compare_long_string_* stub; what that stub leaves in result is
     // not visible here.
     // vtmp3, pgtmp1 and pgtmp2 are not referenced directly in this body.
     // The emitted code writes str1, str2, cnt1, cnt2, tmp1, tmp2, rscratch1,
     // rscratch2, and vtmp1/vtmp2 in the mixed-encoding cases.
1189 void C2_MacroAssembler::string_compare(Register str1, Register str2,
1190     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
1191     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
1192     PRegister pgtmp1, PRegister pgtmp2, int ae) {
1193   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
1194       DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
1195       SHORT_LOOP_START, TAIL_CHECK;
1196 
1197   bool isLL = ae == StrIntrinsicNode::LL;
1198   bool isLU = ae == StrIntrinsicNode::LU;
1199   bool isUL = ae == StrIntrinsicNode::UL;
1200 
1201   // The stub threshold for LL strings is: 72 (64 + 8) chars
1202   // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
1203   // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
1204   const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);
1205 
1206   bool str1_isL = isLL || isLU;
1207   bool str2_isL = isLL || isUL;
1208 
1209   int str1_chr_shift = str1_isL ? 0 : 1;
1210   int str2_chr_shift = str2_isL ? 0 : 1;
1211   int str1_chr_size = str1_isL ? 1 : 2;
1212   int str2_chr_size = str2_isL ? 1 : 2;
       // Number of characters compared per 8-byte word: 8 for LL, otherwise 4.
1213   int minCharsInWord = isLL ? wordSize : wordSize/2;
1214 
       // vtmpZ is zeroed and used as the interleave partner that widens four
       // one-byte characters to four two-byte characters (see zip1 below).
1215   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
1216   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
1217                                       (chr_insn)&MacroAssembler::ldrh;
1218   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
1219                                       (chr_insn)&MacroAssembler::ldrh;
1220   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
1221                             (uxt_insn)&MacroAssembler::uxthw;
1222 
1223   BLOCK_COMMENT("string_compare {");
1224 
1225   // Bizarrely, the counts are passed in bytes, regardless of whether they
1226   // are L or U strings, however the result is always in characters.
1227   if (!str1_isL) asrw(cnt1, cnt1, 1);
1228   if (!str2_isL) asrw(cnt2, cnt2, 1);
1229 
1230   // Compute the minimum of the string lengths and save the difference.
1231   subsw(result, cnt1, cnt2);
1232   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
1233 
1234   // A very short string
1235   cmpw(cnt2, minCharsInWord);
1236   br(Assembler::LE, SHORT_STRING);
1237 
1238   // Compare longwords
1239   // load first parts of strings and finish initialization while loading
1240   {
1241     if (str1_isL == str2_isL) { // LL or UU
1242       ldr(tmp1, Address(str1));
           // Same address: nothing can differ, result keeps the length difference.
1243       cmp(str1, str2);
1244       br(Assembler::EQ, DONE);
1245       ldr(tmp2, Address(str2));
1246       cmp(cnt2, stub_threshold);
1247       br(GE, STUB);
           // Exactly one word to compare: it is already loaded in tmp1/tmp2.
1248       subsw(cnt2, cnt2, minCharsInWord);
1249       br(EQ, TAIL_CHECK);
           // Advance both pointers to the last full word and turn cnt2 into a
           // negative byte offset relative to it.
1250       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1251       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1252       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1253     } else if (isLU) {
           // str1 has one-byte characters: load 4 of them into vtmp, widen them
           // to 16 bits by interleaving with the zero vector, and move the
           // resulting 8 bytes to tmp1. cnt1 becomes the byte offset into str1,
           // cnt2 the byte offset into str2.
1254       ldrs(vtmp, Address(str1));
1255       ldr(tmp2, Address(str2));
1256       cmp(cnt2, stub_threshold);
1257       br(GE, STUB);
1258       subw(cnt2, cnt2, 4);
1259       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1260       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1261       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1262       zip1(vtmp, T8B, vtmp, vtmpZ);
1263       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1264       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1265       add(cnt1, cnt1, 4);
1266       fmovd(tmp1, vtmp);
1267     } else { // UL case
           // Mirror image of the LU case: str2 has one-byte characters and is
           // widened through vtmp into tmp2. cnt1 is the byte offset into str1,
           // cnt2 the byte offset into str2.
1268       ldr(tmp1, Address(str1));
1269       ldrs(vtmp, Address(str2));
1270       cmp(cnt2, stub_threshold);
1271       br(GE, STUB);
1272       subw(cnt2, cnt2, 4);
1273       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1274       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1275       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1276       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1277       zip1(vtmp, T8B, vtmp, vtmpZ);
1278       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1279       add(cnt1, cnt1, 8);
1280       fmovd(tmp2, vtmp);
1281     }
         // Step the offset past the word that is already loaded; once it reaches
         // zero or above only the final word remains.
1282     adds(cnt2, cnt2, isUL ? 4 : 8);
1283     br(GE, TAIL);
1284     eor(rscratch2, tmp1, tmp2);
1285     cbnz(rscratch2, DIFF);
1286     // main loop
1287     bind(NEXT_WORD);
1288     if (str1_isL == str2_isL) {
1289       ldr(tmp1, Address(str1, cnt2));
1290       ldr(tmp2, Address(str2, cnt2));
1291       adds(cnt2, cnt2, 8);
1292     } else if (isLU) {
1293       ldrs(vtmp, Address(str1, cnt1));
1294       ldr(tmp2, Address(str2, cnt2));
1295       add(cnt1, cnt1, 4);
1296       zip1(vtmp, T8B, vtmp, vtmpZ);
1297       fmovd(tmp1, vtmp);
1298       adds(cnt2, cnt2, 8);
1299     } else { // UL
1300       ldrs(vtmp, Address(str2, cnt2));
1301       ldr(tmp1, Address(str1, cnt1));
1302       zip1(vtmp, T8B, vtmp, vtmpZ);
1303       add(cnt1, cnt1, 8);
1304       fmovd(tmp2, vtmp);
1305       adds(cnt2, cnt2, 4);
1306     }
1307     br(GE, TAIL);
1308 
1309     eor(rscratch2, tmp1, tmp2);
1310     cbz(rscratch2, NEXT_WORD);
1311     b(DIFF);
1312     bind(TAIL);
1313     eor(rscratch2, tmp1, tmp2);
1314     cbnz(rscratch2, DIFF);
1315     // Last longword.  In the case where length == 4 we compare the
1316     // same longword twice, but that's still faster than another
1317     // conditional branch.
1318     if (str1_isL == str2_isL) {
1319       ldr(tmp1, Address(str1));
1320       ldr(tmp2, Address(str2));
1321     } else if (isLU) {
1322       ldrs(vtmp, Address(str1));
1323       ldr(tmp2, Address(str2));
1324       zip1(vtmp, T8B, vtmp, vtmpZ);
1325       fmovd(tmp1, vtmp);
1326     } else { // UL
1327       ldrs(vtmp, Address(str2));
1328       ldr(tmp1, Address(str1));
1329       zip1(vtmp, T8B, vtmp, vtmpZ);
1330       fmovd(tmp2, vtmp);
1331     }
1332     bind(TAIL_CHECK);
1333     eor(rscratch2, tmp1, tmp2);
1334     cbz(rscratch2, DONE);
1335 
1336     // Find the first different characters in the longwords and
1337     // compute their difference.
1338     bind(DIFF);
         // rscratch2 holds tmp1 ^ tmp2. Byte-reversing it and counting leading
         // zeros gives the bit position of the first difference in memory order;
         // the mask rounds that down to a character boundary (8 bits for LL,
         // 16 bits otherwise). Both words are shifted right by that amount and
         // the low character is zero-extended before subtracting.
1339     rev(rscratch2, rscratch2);
1340     clz(rscratch2, rscratch2);
1341     andr(rscratch2, rscratch2, isLL ? -8 : -16);
1342     lsrv(tmp1, tmp1, rscratch2);
1343     (this->*ext_chr)(tmp1, tmp1);
1344     lsrv(tmp2, tmp2, rscratch2);
1345     (this->*ext_chr)(tmp2, tmp2);
1346     subw(result, tmp1, tmp2);
1347     b(DONE);
1348   }
1349 
1350   bind(STUB);
1351     RuntimeAddress stub = nullptr;
1352     switch(ae) {
1353       case StrIntrinsicNode::LL:
1354         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
1355         break;
1356       case StrIntrinsicNode::UU:
1357         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
1358         break;
1359       case StrIntrinsicNode::LU:
1360         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
1361         break;
1362       case StrIntrinsicNode::UL:
1363         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
1364         break;
1365       default:
1366         ShouldNotReachHere();
1367      }
1368     assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
1369     address call = trampoline_call(stub);
         // A null return from trampoline_call is recorded as a compilation
         // failure and code emission stops here, leaving the remaining labels
         // unbound; they are reset in debug builds before returning.
1370     if (call == nullptr) {
1371       DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
1372       ciEnv::current()->record_failure("CodeCache is full");
1373       return;
1374     }
1375     b(DONE);
1376 
1377   bind(SHORT_STRING);
1378   // Is the minimum length zero?
1379   cbz(cnt2, DONE);
1380   // arrange code to do most branches while loading and loading next characters
1381   // while comparing previous
       // The loop handles two characters per iteration, alternating between the
       // register pairs (tmp1, cnt1) and (tmp2, rscratch1). cnt1 is free to be
       // reused here because the length difference is already in result.
1382   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1383   subs(cnt2, cnt2, 1);
1384   br(EQ, SHORT_LAST_INIT);
1385   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1386   b(SHORT_LOOP_START);
1387   bind(SHORT_LOOP);
1388   subs(cnt2, cnt2, 1);
1389   br(EQ, SHORT_LAST);
1390   bind(SHORT_LOOP_START);
1391   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
1392   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
1393   cmp(tmp1, cnt1);
1394   br(NE, SHORT_LOOP_TAIL);
1395   subs(cnt2, cnt2, 1);
1396   br(EQ, SHORT_LAST2);
1397   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1398   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1399   cmp(tmp2, rscratch1);
1400   br(EQ, SHORT_LOOP);
1401   sub(result, tmp2, rscratch1);
1402   b(DONE);
1403   bind(SHORT_LOOP_TAIL);
1404   sub(result, tmp1, cnt1);
1405   b(DONE);
1406   bind(SHORT_LAST2);
1407   cmp(tmp2, rscratch1);
1408   br(EQ, DONE);
1409   sub(result, tmp2, rscratch1);
1410 
1411   b(DONE);
1412   bind(SHORT_LAST_INIT);
1413   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1414   bind(SHORT_LAST);
1415   cmp(tmp1, cnt1);
1416   br(EQ, DONE);
1417   sub(result, tmp1, cnt1);
1418 
1419   bind(DONE);
1420 
1421   BLOCK_COMMENT("} string_compare");
1422 }
1423 
1424 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
1425                                      FloatRegister src2, Condition cond, bool isQ) {
       // Emits a NEON element-wise comparison of src1 with src2 under cond, writing
       // dst. bt gives the element type (and selects the floating-point or integer
       // compare); isQ is passed to esize2arrangement and selects the 16-byte (true)
       // or 8-byte (false) form of the final NOT.
1426   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1427   FloatRegister zn = src1, zm = src2;
1428   bool needs_negation = false;
       // Only GT/GE/HI/HS/EQ style compares are issued: "less" conditions are
       // rewritten as the mirrored "greater" condition with the operands swapped,
       // and NE is computed as EQ followed by a bitwise NOT of the result.
1429   switch (cond) {
1430     case LT: cond = GT; zn = src2; zm = src1; break;
1431     case LE: cond = GE; zn = src2; zm = src1; break;
1432     case LO: cond = HI; zn = src2; zm = src1; break;
1433     case LS: cond = HS; zn = src2; zm = src1; break;
1434     case NE: cond = EQ; needs_negation = true; break;
1435     default:
1436       break;
1437   }
1438 
1439   if (is_floating_point_type(bt)) {
1440     fcm(cond, dst, size, zn, zm);
1441   } else {
1442     cm(cond, dst, size, zn, zm);
1443   }
1444 
1445   if (needs_negation) {
1446     notr(dst, isQ ? T16B : T8B, dst);
1447   }
1448 }
1449 
1450 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1451                                           Condition cond, bool isQ) {
       // Emits a NEON element-wise comparison of src under cond, writing dst, using
       // the single-source forms of fcm/cm.
       // NOTE(review): presumably these forms compare against zero, as the function
       // name suggests - confirm against the assembler definitions.
       // T_FLOAT and T_DOUBLE use fcm, every other bt uses cm. NE is computed as EQ
       // followed by a bitwise NOT of the result.
1452   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1453   if (bt == T_FLOAT || bt == T_DOUBLE) {
1454     if (cond == Assembler::NE) {
1455       fcm(Assembler::EQ, dst, size, src);
1456       notr(dst, isQ ? T16B : T8B, dst);
1457     } else {
1458       fcm(cond, dst, size, src);
1459     }
1460   } else {
1461     if (cond == Assembler::NE) {
1462       cm(Assembler::EQ, dst, size, src);
1463       notr(dst, isQ ? T16B : T8B, dst);
1464     } else {
1465       cm(cond, dst, size, src);
1466     }
1467   }
1468 }
1469 
1470 // Compress the least significant bit of each byte to the rightmost and clear
1471 // the higher garbage bits.
     // dst is both input and output; no other register is written. The example
     // input below has 0x00 or 0x01 in every byte; each shift-and-or step folds
     // neighbouring groups together until bit i of the low byte holds the low bit
     // of original byte i.
1472 void C2_MacroAssembler::bytemask_compress(Register dst) {
1473   // Example input, dst = 0x01 00 00 00 01 01 00 01
1474   // The "??" bytes are garbage.
1475   orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
1476   orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
1477   orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
1478   andr(dst, dst, 0xff);                   // dst = 0x8D
1479 }
1480 
1481 // Pack the lowest-numbered bit of each mask element in src into a long value
1482 // in dst, at most the first 64 lane elements.
1483 // Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM.
     // bt is the element type of the mask and lane_cnt its number of lanes (a power
     // of two, at most 64). vtmp1 and vtmp2 are scratch vector registers and must
     // be different registers; dst must not be rscratch1.
1484 void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
1485                                          FloatRegister vtmp1, FloatRegister vtmp2) {
1486   assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1487   assert_different_registers(dst, rscratch1);
1488   assert_different_registers(vtmp1, vtmp2);
1489 
1490   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1491   // Example:   src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
1492   // Expected:  dst = 0x658D
1493 
1494   // Convert the mask into vector with sequential bytes.
1495   // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
1496   sve_cpy(vtmp1, size, src, 1, false);
       // For wider element types, narrow down to one byte per lane first so the
       // rest of the routine only has to deal with byte lanes.
1497   if (bt != T_BYTE) {
1498     sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
1499   }
1500 
1501   if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
1502     // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
1503     // is to compress each significant bit of the byte in a cross-lane way. Due
1504     // to the lack of a cross-lane bit-compress instruction, we use BEXT
1505     // (bit-compress in each lane) with the biggest lane size (T = D) then
1506     // concatenate the results.
1507 
1508     // The second source input of BEXT, initialized with 0x01 in each byte.
1509     // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1510     sve_dup(vtmp2, B, 1);
1511 
1512     // BEXT vtmp1.D, vtmp1.D, vtmp2.D
1513     // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1514     // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1515     //         ---------------------------------------
1516     // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1517     sve_bext(vtmp1, D, vtmp1, vtmp2);
1518 
1519     // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the
1520     // result to dst.
1521     // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1522     // dst   = 0x658D
1523     if (lane_cnt <= 8) {
1524       // No need to concatenate.
1525       umov(dst, vtmp1, B, 0);
1526     } else if (lane_cnt <= 16) {
           // Copy byte 8 (the result of the second 64-bit lane) next to byte 0 and
           // read both as one halfword.
1527       ins(vtmp1, B, vtmp1, 1, 8);
1528       umov(dst, vtmp1, H, 0);
1529     } else {
1530       // As the lane count is 64 at most, the final expected value must be in
1531       // the lowest 64 bits after narrowing vtmp1 from D to B.
1532       sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
1533       umov(dst, vtmp1, D, 0);
1534     }
1535   } else if (UseSVE > 0) {
         // Fallback without BEXT: compress 8 mask bytes at a time in a general
         // purpose register and OR each 8-bit result into its position in dst.
1536     // Compress the lowest 8 bytes.
1537     fmovd(dst, vtmp1);
1538     bytemask_compress(dst);
1539     if (lane_cnt <= 8) return;
1540 
1541     // Repeat on higher bytes and join the results.
1542     // Compress 8 bytes in each iteration.
1543     for (int idx = 1; idx < (lane_cnt / 8); idx++) {
1544       sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
1545       bytemask_compress(rscratch1);
1546       orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
1547     }
1548   } else {
1549     assert(false, "unsupported");
1550     ShouldNotReachHere();
1551   }
1552 }
1553 
1554 // Unpack the mask, a long value in src, into predicate register dst based on the
1555 // corresponding data type. Note that dst can support at most 64 lanes.
1556 // Below example gives the expected dst predicate register in different types, with
1557 // a valid src(0x658D) on a 1024-bit vector size machine.
1558 // BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
1559 // SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
1560 // INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
1561 // LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1562 //
1563 // The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D which
1564 // has 24 significant bits would be an invalid input if dst predicate register refers to
1565 // a LONG type 1024-bit vector, which has at most 16 lanes.
     //
     // Requires UseSVE == 2 with the SVE2 bit-permute extension (asserted below).
     // vtmp1 and vtmp2 are scratch vector registers that are overwritten.
1566 void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
1567                                            FloatRegister vtmp1, FloatRegister vtmp2) {
1568   assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
1569          lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
1570   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1571   // Example:   src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
1572   // Expected:  dst = 0b01100101 10001101
1573 
1574   // Put long value from general purpose register into the first lane of vector.
1575   // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1576   sve_dup(vtmp1, B, 0);
1577   mov(vtmp1, D, 0, src);
1578 
1579   // As sve_cmp generates mask value with the minimum unit in byte, we should
1580   // transform the value in the first lane which is mask in bit now to the
1581   // mask in byte, which can be done by SVE2's BDEP instruction.
1582 
1583   // The first source input of BDEP instruction. Deposit each byte in every 8 bytes.
1584   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1585   if (lane_cnt <= 8) {
1586     // Nothing. As only one byte exists.
1587   } else if (lane_cnt <= 16) {
         // Two mask bytes: copy byte 1 into byte 8 (the low byte of the second
         // 64-bit lane), then clear byte 1.
1588     ins(vtmp1, B, vtmp1, 8, 1);
1589     mov(vtmp1, B, 1, zr);
1590   } else {
         // More than two mask bytes: widen every byte to its own 64-bit lane.
1591     sve_vector_extend(vtmp1, D, vtmp1, B);
1592   }
1593 
1594   // The second source input of BDEP instruction, initialized with 0x01 for each byte.
1595   // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1596   sve_dup(vtmp2, B, 1);
1597 
1598   // BDEP vtmp1.D, vtmp1.D, vtmp2.D
1599   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1600   // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1601   //         ---------------------------------------
1602   // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1603   sve_bdep(vtmp1, D, vtmp1, vtmp2);
1604 
1605   if (bt != T_BYTE) {
1606     sve_vector_extend(vtmp1, size, vtmp1, B);
1607   }
1608   // Generate mask according to the given vector, in which the elements have been
1609   // extended to expected type.
1610   // dst = 0b01100101 10001101
1611   sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
1612 }
1613 
1614 // Clobbers: rflags
     // Emits an SVE element-wise comparison of zn with zm under cond, governed by
     // pg, writing the resulting predicate to pd. bt gives the element type and
     // selects the floating-point or integer compare.
1615 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1616                                     FloatRegister zn, FloatRegister zm, Condition cond) {
1617   assert(pg->is_governing(), "This register has to be a governing predicate register");
1618   FloatRegister z1 = zn, z2 = zm;
       // "Less" conditions are rewritten as the mirrored "greater" condition with
       // the operands swapped; every other condition is passed through unchanged.
1619   switch (cond) {
1620     case LE: z1 = zm; z2 = zn; cond = GE; break;
1621     case LT: z1 = zm; z2 = zn; cond = GT; break;
1622     case LO: z1 = zm; z2 = zn; cond = HI; break;
1623     case LS: z1 = zm; z2 = zn; cond = HS; break;
1624     default:
1625       break;
1626   }
1627 
1628   SIMD_RegVariant size = elemType_to_regVariant(bt);
1629   if (is_floating_point_type(bt)) {
1630     sve_fcm(cond, pd, size, pg, z1, z2);
1631   } else {
1632     assert(is_integral_type(bt), "unsupported element type");
1633     sve_cmp(cond, pd, size, pg, z1, z2);
1634   }
1635 }
1636 
1637 // Get index of the last mask lane that is set
1638 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
1639   SIMD_RegVariant size = elemType_to_regVariant(bt);
1640   sve_rev(ptmp, size, src);
1641   sve_brkb(ptmp, ptrue, ptmp, false);
1642   sve_cntp(dst, size, ptrue, ptmp);
1643   movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
1644   subw(dst, rscratch1, dst);
1645 }
1646 
1647 // Extend integer vector src to dst with the same lane count
1648 // but larger element size, e.g. 4B -> 4I
1649 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1650                                            FloatRegister src, BasicType src_bt, bool is_unsigned) {
1651   if (src_bt == T_BYTE) {
1652     if (dst_bt == T_SHORT) {
1653       // 4B/8B to 4S/8S
1654       _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1655     } else {
1656       // 4B to 4I
1657       assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1658       _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1659       _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1660     }
1661   } else if (src_bt == T_SHORT) {
1662     // 4S to 4I
1663     assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1664     _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1665   } else if (src_bt == T_INT) {
1666     // 2I to 2L
1667     assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1668     _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1669   } else {
1670     ShouldNotReachHere();
1671   }
1672 }
1673 
1674 // Narrow integer vector src down to dst with the same lane count
1675 // but smaller element size, e.g. 4I -> 4B
1676 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1677                                            FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1678   if (src_bt == T_SHORT) {
1679     // 4S/8S to 4B/8B
1680     assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1681     assert(dst_bt == T_BYTE, "unsupported");
1682     xtn(dst, T8B, src, T8H);
1683   } else if (src_bt == T_INT) {
1684     // 4I to 4B/4S
1685     assert(src_vlen_in_bytes == 16, "unsupported");
1686     assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1687     xtn(dst, T4H, src, T4S);
1688     if (dst_bt == T_BYTE) {
1689       xtn(dst, T8B, dst, T8H);
1690     }
1691   } else if (src_bt == T_LONG) {
1692     // 2L to 2I
1693     assert(src_vlen_in_bytes == 16, "unsupported");
1694     assert(dst_bt == T_INT, "unsupported");
1695     xtn(dst, T2S, src, T2D);
1696   } else {
1697     ShouldNotReachHere();
1698   }
1699 }
1700 
1701 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1702                                           FloatRegister src, SIMD_RegVariant src_size,
1703                                           bool is_unsigned) {
1704   assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
1705 
1706   if (src_size == B) {
1707     switch (dst_size) {
1708     case H:
1709       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1710       break;
1711     case S:
1712       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1713       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1714       break;
1715     case D:
1716       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1717       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1718       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1719       break;
1720     default:
1721       ShouldNotReachHere();
1722     }
1723   } else if (src_size == H) {
1724     if (dst_size == S) {
1725       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1726     } else { // D
1727       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1728       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1729     }
1730   } else if (src_size == S) {
1731     _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
1732   }
1733 }
1734 
1735 // Vector narrow from src to dst with specified element sizes.
1736 // High part of dst vector will be filled with zero.
1737 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1738                                           FloatRegister src, SIMD_RegVariant src_size,
1739                                           FloatRegister tmp) {
1740   assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1741   assert_different_registers(src, tmp);
1742   sve_dup(tmp, src_size, 0);
1743   if (src_size == D) {
1744     switch (dst_size) {
1745     case S:
1746       sve_uzp1(dst, S, src, tmp);
1747       break;
1748     case H:
1749       assert_different_registers(dst, tmp);
1750       sve_uzp1(dst, S, src, tmp);
1751       sve_uzp1(dst, H, dst, tmp);
1752       break;
1753     case B:
1754       assert_different_registers(dst, tmp);
1755       sve_uzp1(dst, S, src, tmp);
1756       sve_uzp1(dst, H, dst, tmp);
1757       sve_uzp1(dst, B, dst, tmp);
1758       break;
1759     default:
1760       ShouldNotReachHere();
1761     }
1762   } else if (src_size == S) {
1763     if (dst_size == H) {
1764       sve_uzp1(dst, H, src, tmp);
1765     } else { // B
1766       assert_different_registers(dst, tmp);
1767       sve_uzp1(dst, H, src, tmp);
1768       sve_uzp1(dst, B, dst, tmp);
1769     }
1770   } else if (src_size == H) {
1771     sve_uzp1(dst, B, src, tmp);
1772   }
1773 }
1774 
1775 // Extend src predicate to dst predicate with the same lane count but larger
1776 // element size, e.g. 64Byte -> 512Long
1777 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1778                                              uint dst_element_length_in_bytes,
1779                                              uint src_element_length_in_bytes) {
1780   if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1781     sve_punpklo(dst, src);
1782   } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1783     sve_punpklo(dst, src);
1784     sve_punpklo(dst, dst);
1785   } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1786     sve_punpklo(dst, src);
1787     sve_punpklo(dst, dst);
1788     sve_punpklo(dst, dst);
1789   } else {
1790     assert(false, "unsupported");
1791     ShouldNotReachHere();
1792   }
1793 }
1794 
1795 // Narrow src predicate to dst predicate with the same lane count but
1796 // smaller element size, e.g. 512Long -> 64Byte
1797 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1798                                              uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
1799   // The insignificant bits in src predicate are expected to be zero.
1800   // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
1801   // passed as the second argument. An example narrowing operation with a given mask would be -
1802   // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I
1803   // Mask (for 2 Longs) : TF
1804   // Predicate register for the above mask (16 bits) : 00000001 00000000
1805   // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
1806   // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0)
1807   assert_different_registers(src, ptmp);
1808   assert_different_registers(dst, ptmp);
1809   sve_pfalse(ptmp);
1810   if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1811     sve_uzp1(dst, B, src, ptmp);
1812   } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1813     sve_uzp1(dst, H, src, ptmp);
1814     sve_uzp1(dst, B, dst, ptmp);
1815   } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1816     sve_uzp1(dst, S, src, ptmp);
1817     sve_uzp1(dst, H, dst, ptmp);
1818     sve_uzp1(dst, B, dst, ptmp);
1819   } else {
1820     assert(false, "unsupported");
1821     ShouldNotReachHere();
1822   }
1823 }
1824 
1825 // Vector reduction add for integral type with ASIMD instructions.
1826 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1827                                                  Register isrc, FloatRegister vsrc,
1828                                                  unsigned vector_length_in_bytes,
1829                                                  FloatRegister vtmp) {
1830   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1831   assert_different_registers(dst, isrc);
1832   bool isQ = vector_length_in_bytes == 16;
1833 
1834   BLOCK_COMMENT("neon_reduce_add_integral {");
1835     switch(bt) {
1836       case T_BYTE:
1837         addv(vtmp, isQ ? T16B : T8B, vsrc);
1838         smov(dst, vtmp, B, 0);
1839         addw(dst, dst, isrc, ext::sxtb);
1840         break;
1841       case T_SHORT:
1842         addv(vtmp, isQ ? T8H : T4H, vsrc);
1843         smov(dst, vtmp, H, 0);
1844         addw(dst, dst, isrc, ext::sxth);
1845         break;
1846       case T_INT:
1847         isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1848         umov(dst, vtmp, S, 0);
1849         addw(dst, dst, isrc);
1850         break;
1851       case T_LONG:
1852         assert(isQ, "unsupported");
1853         addpd(vtmp, vsrc);
1854         umov(dst, vtmp, D, 0);
1855         add(dst, dst, isrc);
1856         break;
1857       default:
1858         assert(false, "unsupported");
1859         ShouldNotReachHere();
1860     }
1861   BLOCK_COMMENT("} neon_reduce_add_integral");
1862 }
1863 
1864 // Vector reduction multiply for integral type with ASIMD instructions.
1865 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
1866 // Clobbers: rscratch1
1867 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
1868                                                  Register isrc, FloatRegister vsrc,
1869                                                  unsigned vector_length_in_bytes,
1870                                                  FloatRegister vtmp1, FloatRegister vtmp2) {
1871   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1872   bool isQ = vector_length_in_bytes == 16;
1873 
1874   BLOCK_COMMENT("neon_reduce_mul_integral {");
1875     switch(bt) {
1876       case T_BYTE:
1877         if (isQ) {
1878           // Multiply the lower half and higher half of vector iteratively.
1879           // vtmp1 = vsrc[8:15]
1880           ins(vtmp1, D, vsrc, 0, 1);
1881           // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
1882           mulv(vtmp1, T8B, vtmp1, vsrc);
1883           // vtmp2 = vtmp1[4:7]
1884           ins(vtmp2, S, vtmp1, 0, 1);
1885           // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
1886           mulv(vtmp1, T8B, vtmp2, vtmp1);
1887         } else {
1888           ins(vtmp1, S, vsrc, 0, 1);
1889           mulv(vtmp1, T8B, vtmp1, vsrc);
1890         }
1891         // vtmp2 = vtmp1[2:3]
1892         ins(vtmp2, H, vtmp1, 0, 1);
1893         // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
1894         mulv(vtmp2, T8B, vtmp2, vtmp1);
1895         // dst = vtmp2[0] * isrc * vtmp2[1]
1896         umov(rscratch1, vtmp2, B, 0);
1897         mulw(dst, rscratch1, isrc);
1898         sxtb(dst, dst);
1899         umov(rscratch1, vtmp2, B, 1);
1900         mulw(dst, rscratch1, dst);
1901         sxtb(dst, dst);
1902         break;
1903       case T_SHORT:
1904         if (isQ) {
1905           ins(vtmp2, D, vsrc, 0, 1);
1906           mulv(vtmp2, T4H, vtmp2, vsrc);
1907           ins(vtmp1, S, vtmp2, 0, 1);
1908           mulv(vtmp1, T4H, vtmp1, vtmp2);
1909         } else {
1910           ins(vtmp1, S, vsrc, 0, 1);
1911           mulv(vtmp1, T4H, vtmp1, vsrc);
1912         }
1913         umov(rscratch1, vtmp1, H, 0);
1914         mulw(dst, rscratch1, isrc);
1915         sxth(dst, dst);
1916         umov(rscratch1, vtmp1, H, 1);
1917         mulw(dst, rscratch1, dst);
1918         sxth(dst, dst);
1919         break;
1920       case T_INT:
1921         if (isQ) {
1922           ins(vtmp1, D, vsrc, 0, 1);
1923           mulv(vtmp1, T2S, vtmp1, vsrc);
1924         } else {
1925           vtmp1 = vsrc;
1926         }
1927         umov(rscratch1, vtmp1, S, 0);
1928         mul(dst, rscratch1, isrc);
1929         umov(rscratch1, vtmp1, S, 1);
1930         mul(dst, rscratch1, dst);
1931         break;
1932       case T_LONG:
1933         umov(rscratch1, vsrc, D, 0);
1934         mul(dst, isrc, rscratch1);
1935         umov(rscratch1, vsrc, D, 1);
1936         mul(dst, dst, rscratch1);
1937         break;
1938       default:
1939         assert(false, "unsupported");
1940         ShouldNotReachHere();
1941     }
1942   BLOCK_COMMENT("} neon_reduce_mul_integral");
1943 }
1944 
1945 // Vector reduction multiply for floating-point type with ASIMD instructions.
1946 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
1947                                            FloatRegister fsrc, FloatRegister vsrc,
1948                                            unsigned vector_length_in_bytes,
1949                                            FloatRegister vtmp) {
1950   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1951   bool isQ = vector_length_in_bytes == 16;
1952 
1953   BLOCK_COMMENT("neon_reduce_mul_fp {");
1954     switch(bt) {
1955       case T_FLOAT:
1956         fmuls(dst, fsrc, vsrc);
1957         ins(vtmp, S, vsrc, 0, 1);
1958         fmuls(dst, dst, vtmp);
1959         if (isQ) {
1960           ins(vtmp, S, vsrc, 0, 2);
1961           fmuls(dst, dst, vtmp);
1962           ins(vtmp, S, vsrc, 0, 3);
1963           fmuls(dst, dst, vtmp);
1964          }
1965         break;
1966       case T_DOUBLE:
1967         assert(isQ, "unsupported");
1968         fmuld(dst, fsrc, vsrc);
1969         ins(vtmp, D, vsrc, 0, 1);
1970         fmuld(dst, dst, vtmp);
1971         break;
1972       default:
1973         assert(false, "unsupported");
1974         ShouldNotReachHere();
1975     }
1976   BLOCK_COMMENT("} neon_reduce_mul_fp");
1977 }
1978 
1979 // Helper to select logical instruction
1980 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
1981                                                    Register Rn, Register Rm,
1982                                                    enum shift_kind kind, unsigned shift) {
1983   switch(opc) {
1984     case Op_AndReductionV:
1985       is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
1986       break;
1987     case Op_OrReductionV:
1988       is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
1989       break;
1990     case Op_XorReductionV:
1991       is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
1992       break;
1993     default:
1994       assert(false, "unsupported");
1995       ShouldNotReachHere();
1996   }
1997 }
1998 
1999 // Vector reduction logical operations And, Or, Xor
2000 // Clobbers: rscratch1
2001 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
2002                                             Register isrc, FloatRegister vsrc,
2003                                             unsigned vector_length_in_bytes) {
2004   assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
2005          "unsupported");
2006   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2007   assert_different_registers(dst, isrc);
2008   bool isQ = vector_length_in_bytes == 16;
2009 
2010   BLOCK_COMMENT("neon_reduce_logical {");
2011     umov(rscratch1, vsrc, isQ ? D : S, 0);
2012     umov(dst, vsrc, isQ ? D : S, 1);
2013     neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
2014     switch(bt) {
2015       case T_BYTE:
2016         if (isQ) {
2017           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2018         }
2019         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
2020         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
2021         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2022         sxtb(dst, dst);
2023         break;
2024       case T_SHORT:
2025         if (isQ) {
2026           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2027         }
2028         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
2029         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2030         sxth(dst, dst);
2031         break;
2032       case T_INT:
2033         if (isQ) {
2034           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2035         }
2036         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2037         break;
2038       case T_LONG:
2039         assert(isQ, "unsupported");
2040         neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
2041         break;
2042       default:
2043         assert(false, "unsupported");
2044         ShouldNotReachHere();
2045     }
2046   BLOCK_COMMENT("} neon_reduce_logical");
2047 }
2048 
2049 // Vector reduction min/max for integral type with ASIMD instructions.
2050 // Note: vtmp is not used and expected to be fnoreg for T_LONG case.
2051 // Clobbers: rscratch1, rflags
2052 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
2053                                                     Register isrc, FloatRegister vsrc,
2054                                                     unsigned vector_length_in_bytes,
2055                                                     FloatRegister vtmp) {
2056   assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
2057   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2058   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
2059   assert_different_registers(dst, isrc);
2060   bool isQ = vector_length_in_bytes == 16;
2061   bool is_min = opc == Op_MinReductionV;
2062 
2063   BLOCK_COMMENT("neon_reduce_minmax_integral {");
2064     if (bt == T_LONG) {
2065       assert(vtmp == fnoreg, "should be");
2066       assert(isQ, "should be");
2067       umov(rscratch1, vsrc, D, 0);
2068       cmp(isrc, rscratch1);
2069       csel(dst, isrc, rscratch1, is_min ? LT : GT);
2070       umov(rscratch1, vsrc, D, 1);
2071       cmp(dst, rscratch1);
2072       csel(dst, dst, rscratch1, is_min ? LT : GT);
2073     } else {
2074       SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
2075       if (size == T2S) {
2076         is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
2077       } else {
2078         is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
2079       }
2080       if (bt == T_INT) {
2081         umov(dst, vtmp, S, 0);
2082       } else {
2083         smov(dst, vtmp, elemType_to_regVariant(bt), 0);
2084       }
2085       cmpw(dst, isrc);
2086       cselw(dst, dst, isrc, is_min ? LT : GT);
2087     }
2088   BLOCK_COMMENT("} neon_reduce_minmax_integral");
2089 }
2090 
2091 // Vector reduction for integral type with SVE instruction.
2092 // Supported operations are Add, And, Or, Xor, Max, Min.
2093 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
2094 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
2095                                             FloatRegister src2, PRegister pg, FloatRegister tmp) {
2096   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2097   assert(pg->is_governing(), "This register has to be a governing predicate register");
2098   assert_different_registers(src1, dst);
2099   // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
2100   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2101   switch (opc) {
2102     case Op_AddReductionVI: {
2103       sve_uaddv(tmp, size, pg, src2);
2104       if (bt == T_BYTE) {
2105         smov(dst, tmp, size, 0);
2106         addw(dst, src1, dst, ext::sxtb);
2107       } else if (bt == T_SHORT) {
2108         smov(dst, tmp, size, 0);
2109         addw(dst, src1, dst, ext::sxth);
2110       } else {
2111         umov(dst, tmp, size, 0);
2112         addw(dst, dst, src1);
2113       }
2114       break;
2115     }
2116     case Op_AddReductionVL: {
2117       sve_uaddv(tmp, size, pg, src2);
2118       umov(dst, tmp, size, 0);
2119       add(dst, dst, src1);
2120       break;
2121     }
2122     case Op_AndReductionV: {
2123       sve_andv(tmp, size, pg, src2);
2124       if (bt == T_INT || bt == T_LONG) {
2125         umov(dst, tmp, size, 0);
2126       } else {
2127         smov(dst, tmp, size, 0);
2128       }
2129       if (bt == T_LONG) {
2130         andr(dst, dst, src1);
2131       } else {
2132         andw(dst, dst, src1);
2133       }
2134       break;
2135     }
2136     case Op_OrReductionV: {
2137       sve_orv(tmp, size, pg, src2);
2138       if (bt == T_INT || bt == T_LONG) {
2139         umov(dst, tmp, size, 0);
2140       } else {
2141         smov(dst, tmp, size, 0);
2142       }
2143       if (bt == T_LONG) {
2144         orr(dst, dst, src1);
2145       } else {
2146         orrw(dst, dst, src1);
2147       }
2148       break;
2149     }
2150     case Op_XorReductionV: {
2151       sve_eorv(tmp, size, pg, src2);
2152       if (bt == T_INT || bt == T_LONG) {
2153         umov(dst, tmp, size, 0);
2154       } else {
2155         smov(dst, tmp, size, 0);
2156       }
2157       if (bt == T_LONG) {
2158         eor(dst, dst, src1);
2159       } else {
2160         eorw(dst, dst, src1);
2161       }
2162       break;
2163     }
2164     case Op_MaxReductionV: {
2165       sve_smaxv(tmp, size, pg, src2);
2166       if (bt == T_INT || bt == T_LONG) {
2167         umov(dst, tmp, size, 0);
2168       } else {
2169         smov(dst, tmp, size, 0);
2170       }
2171       if (bt == T_LONG) {
2172         cmp(dst, src1);
2173         csel(dst, dst, src1, Assembler::GT);
2174       } else {
2175         cmpw(dst, src1);
2176         cselw(dst, dst, src1, Assembler::GT);
2177       }
2178       break;
2179     }
2180     case Op_MinReductionV: {
2181       sve_sminv(tmp, size, pg, src2);
2182       if (bt == T_INT || bt == T_LONG) {
2183         umov(dst, tmp, size, 0);
2184       } else {
2185         smov(dst, tmp, size, 0);
2186       }
2187       if (bt == T_LONG) {
2188         cmp(dst, src1);
2189         csel(dst, dst, src1, Assembler::LT);
2190       } else {
2191         cmpw(dst, src1);
2192         cselw(dst, dst, src1, Assembler::LT);
2193       }
2194       break;
2195     }
2196     default:
2197       assert(false, "unsupported");
2198       ShouldNotReachHere();
2199   }
2200 
2201   if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
2202     if (bt == T_BYTE) {
2203       sxtb(dst, dst);
2204     } else if (bt == T_SHORT) {
2205       sxth(dst, dst);
2206     }
2207   }
2208 }
2209 
2210 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
2211 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
2212 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
2213 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
2214   uint32_t max_vector_length = Matcher::max_vector_size(bt);
2215   assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
2216 
2217   // Set all elements to false if the input "lane_cnt" is zero.
2218   if (lane_cnt == 0) {
2219     sve_pfalse(dst);
2220     return;
2221   }
2222 
2223   SIMD_RegVariant size = elemType_to_regVariant(bt);
2224   assert(size != Q, "invalid size");
2225 
2226   // Set all true if "lane_cnt" equals to the max lane count.
2227   if (lane_cnt == max_vector_length) {
2228     sve_ptrue(dst, size, /* ALL */ 0b11111);
2229     return;
2230   }
2231 
2232   // Fixed numbers for "ptrue".
2233   switch(lane_cnt) {
2234   case 1: /* VL1 */
2235   case 2: /* VL2 */
2236   case 3: /* VL3 */
2237   case 4: /* VL4 */
2238   case 5: /* VL5 */
2239   case 6: /* VL6 */
2240   case 7: /* VL7 */
2241   case 8: /* VL8 */
2242     sve_ptrue(dst, size, lane_cnt);
2243     return;
2244   case 16:
2245     sve_ptrue(dst, size, /* VL16 */ 0b01001);
2246     return;
2247   case 32:
2248     sve_ptrue(dst, size, /* VL32 */ 0b01010);
2249     return;
2250   case 64:
2251     sve_ptrue(dst, size, /* VL64 */ 0b01011);
2252     return;
2253   case 128:
2254     sve_ptrue(dst, size, /* VL128 */ 0b01100);
2255     return;
2256   case 256:
2257     sve_ptrue(dst, size, /* VL256 */ 0b01101);
2258     return;
2259   default:
2260     break;
2261   }
2262 
2263   // Special patterns for "ptrue".
2264   if (lane_cnt == round_down_power_of_2(max_vector_length)) {
2265     sve_ptrue(dst, size, /* POW2 */ 0b00000);
2266   } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
2267     sve_ptrue(dst, size, /* MUL4 */ 0b11101);
2268   } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
2269     sve_ptrue(dst, size, /* MUL3 */ 0b11110);
2270   } else {
2271     // Encode to "whileltw" for the remaining cases.
2272     mov(rscratch1, lane_cnt);
2273     sve_whileltw(dst, size, zr, rscratch1);
2274   }
2275 }
2276 
2277 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
2278 // Any remaining elements of dst will be filled with zero.
2279 // Clobbers: rscratch1
2280 // Preserves: src, mask
2281 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
2282                                            FloatRegister vtmp1, FloatRegister vtmp2,
2283                                            PRegister pgtmp) {
2284   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2285   assert_different_registers(dst, src, vtmp1, vtmp2);
2286   assert_different_registers(mask, pgtmp);
2287 
2288   // Example input:   src   = 8888 7777 6666 5555 4444 3333 2222 1111
2289   //                  mask  = 0001 0000 0000 0001 0001 0000 0001 0001
2290   // Expected result: dst   = 0000 0000 0000 8888 5555 4444 2222 1111
2291   sve_dup(vtmp2, H, 0);
2292 
2293   // Extend lowest half to type INT.
2294   // dst = 00004444 00003333 00002222 00001111
2295   sve_uunpklo(dst, S, src);
2296   // pgtmp = 00000001 00000000 00000001 00000001
2297   sve_punpklo(pgtmp, mask);
2298   // Pack the active elements in size of type INT to the right,
2299   // and fill the remainings with zero.
2300   // dst = 00000000 00004444 00002222 00001111
2301   sve_compact(dst, S, dst, pgtmp);
2302   // Narrow the result back to type SHORT.
2303   // dst = 0000 0000 0000 0000 0000 4444 2222 1111
2304   sve_uzp1(dst, H, dst, vtmp2);
2305   // Count the active elements of lowest half.
2306   // rscratch1 = 3
2307   sve_cntp(rscratch1, S, ptrue, pgtmp);
2308 
2309   // Repeat to the highest half.
2310   // pgtmp = 00000001 00000000 00000000 00000001
2311   sve_punpkhi(pgtmp, mask);
2312   // vtmp1 = 00008888 00007777 00006666 00005555
2313   sve_uunpkhi(vtmp1, S, src);
2314   // vtmp1 = 00000000 00000000 00008888 00005555
2315   sve_compact(vtmp1, S, vtmp1, pgtmp);
2316   // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2317   sve_uzp1(vtmp1, H, vtmp1, vtmp2);
2318 
2319   // Compressed low:   dst   = 0000 0000 0000 0000 0000 4444 2222 1111
2320   // Compressed high:  vtmp1 = 0000 0000 0000 0000 0000 0000 8888  5555
2321   // Left shift(cross lane) compressed high with TRUE_CNT lanes,
2322   // TRUE_CNT is the number of active elements in the compressed low.
2323   neg(rscratch1, rscratch1);
2324   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2325   sve_index(vtmp2, H, rscratch1, 1);
2326   // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
2327   sve_tbl(vtmp1, H, vtmp1, vtmp2);
2328 
2329   // Combine the compressed high(after shifted) with the compressed low.
2330   // dst = 0000 0000 0000 8888 5555 4444 2222 1111
2331   sve_orr(dst, dst, vtmp1);
2332 }
2333 
2334 // Clobbers: rscratch1, rscratch2
2335 // Preserves: src, mask
2336 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
2337                                           FloatRegister vtmp1, FloatRegister vtmp2,
2338                                           FloatRegister vtmp3, FloatRegister vtmp4,
2339                                           PRegister ptmp, PRegister pgtmp) {
2340   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2341   assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
2342   assert_different_registers(mask, ptmp, pgtmp);
2343   // Example input:   src   = 88 77 66 55 44 33 22 11
2344   //                  mask  = 01 00 00 01 01 00 01 01
2345   // Expected result: dst   = 00 00 00 88 55 44 22 11
2346 
2347   sve_dup(vtmp4, B, 0);
2348   // Extend lowest half to type SHORT.
2349   // vtmp1 = 0044 0033 0022 0011
2350   sve_uunpklo(vtmp1, H, src);
2351   // ptmp = 0001 0000 0001 0001
2352   sve_punpklo(ptmp, mask);
2353   // Count the active elements of lowest half.
2354   // rscratch2 = 3
2355   sve_cntp(rscratch2, H, ptrue, ptmp);
2356   // Pack the active elements in size of type SHORT to the right,
2357   // and fill the remainings with zero.
2358   // dst = 0000 0044 0022 0011
2359   sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
2360   // Narrow the result back to type BYTE.
2361   // dst = 00 00 00 00 00 44 22 11
2362   sve_uzp1(dst, B, dst, vtmp4);
2363 
2364   // Repeat to the highest half.
2365   // ptmp = 0001 0000 0000 0001
2366   sve_punpkhi(ptmp, mask);
2367   // vtmp1 = 0088 0077 0066 0055
2368   sve_uunpkhi(vtmp2, H, src);
2369   // vtmp1 = 0000 0000 0088 0055
2370   sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);
2371 
2372   sve_dup(vtmp4, B, 0);
2373   // vtmp1 = 00 00 00 00 00 00 88 55
2374   sve_uzp1(vtmp1, B, vtmp1, vtmp4);
2375 
2376   // Compressed low:   dst   = 00 00 00 00 00 44 22 11
2377   // Compressed high:  vtmp1 = 00 00 00 00 00 00 88 55
2378   // Left shift(cross lane) compressed high with TRUE_CNT lanes,
2379   // TRUE_CNT is the number of active elements in the compressed low.
2380   neg(rscratch2, rscratch2);
2381   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2382   sve_index(vtmp2, B, rscratch2, 1);
2383   // vtmp1 = 00 00 00 88 55 00 00 00
2384   sve_tbl(vtmp1, B, vtmp1, vtmp2);
2385   // Combine the compressed high(after shifted) with the compressed low.
2386   // dst = 00 00 00 88 55 44 22 11
2387   sve_orr(dst, dst, vtmp1);
2388 }
2389 
2390 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2391   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2392   SIMD_Arrangement size = isQ ? T16B : T8B;
2393   if (bt == T_BYTE) {
2394     rbit(dst, size, src);
2395   } else {
2396     neon_reverse_bytes(dst, src, bt, isQ);
2397     rbit(dst, size, dst);
2398   }
2399 }
2400 
2401 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2402   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2403   SIMD_Arrangement size = isQ ? T16B : T8B;
2404   switch (bt) {
2405     case T_BYTE:
2406       if (dst != src) {
2407         orr(dst, size, src, src);
2408       }
2409       break;
2410     case T_SHORT:
2411       rev16(dst, size, src);
2412       break;
2413     case T_INT:
2414       rev32(dst, size, src);
2415       break;
2416     case T_LONG:
2417       rev64(dst, size, src);
2418       break;
2419     default:
2420       assert(false, "unsupported");
2421       ShouldNotReachHere();
2422   }
2423 }
2424 
2425 // Extract a scalar element from an sve vector at position 'idx'.
2426 // The input elements in src are expected to be of integral type.
2427 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2428                                              int idx, FloatRegister vtmp) {
2429   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2430   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2431   if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
2432     if (bt == T_INT || bt == T_LONG) {
2433       umov(dst, src, size, idx);
2434     } else {
2435       smov(dst, src, size, idx);
2436     }
2437   } else {
2438     sve_orr(vtmp, src, src);
2439     sve_ext(vtmp, vtmp, idx << size);
2440     if (bt == T_INT || bt == T_LONG) {
2441       umov(dst, vtmp, size, 0);
2442     } else {
2443       smov(dst, vtmp, size, 0);
2444     }
2445   }
2446 }
2447 
2448 // java.lang.Math::round intrinsics
2449 
2450 // Clobbers: rscratch1, rflags
2451 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2452                                           FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
2453   assert_different_registers(tmp1, tmp2, tmp3, src, dst);
2454   switch (T) {
2455     case T2S:
2456     case T4S:
2457       fmovs(tmp1, T, 0.5f);
2458       mov(rscratch1, jint_cast(0x1.0p23f));
2459       break;
2460     case T2D:
2461       fmovd(tmp1, T, 0.5);
2462       mov(rscratch1, julong_cast(0x1.0p52));
2463       break;
2464     default:
2465       assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2466   }
2467   fadd(tmp1, T, tmp1, src);
2468   fcvtms(tmp1, T, tmp1);
2469   // tmp1 = floor(src + 0.5, ties to even)
2470 
2471   fcvtas(dst, T, src);
2472   // dst = round(src), ties to away
2473 
2474   fneg(tmp3, T, src);
2475   dup(tmp2, T, rscratch1);
2476   cm(HS, tmp3, T, tmp3, tmp2);
2477   // tmp3 is now a set of flags
2478 
2479   bif(dst, T16B, tmp1, tmp3);
2480   // result in dst
2481 }
2482 
2483 // Clobbers: rscratch1, rflags
2484 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2485                                          FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
2486   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2487   assert_different_registers(tmp1, tmp2, src, dst);
2488 
2489   switch (T) {
2490     case S:
2491       mov(rscratch1, jint_cast(0x1.0p23f));
2492       break;
2493     case D:
2494       mov(rscratch1, julong_cast(0x1.0p52));
2495       break;
2496     default:
2497       assert(T == S || T == D, "invalid register variant");
2498   }
2499 
2500   sve_frinta(dst, T, ptrue, src);
2501   // dst = round(src), ties to away
2502 
2503   Label none;
2504 
2505   sve_fneg(tmp1, T, ptrue, src);
2506   sve_dup(tmp2, T, rscratch1);
2507   sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
2508   br(EQ, none);
2509   {
2510     sve_cpy(tmp1, T, pgtmp, 0.5);
2511     sve_fadd(tmp1, T, pgtmp, src);
2512     sve_frintm(dst, T, pgtmp, tmp1);
2513     // dst = floor(src + 0.5, ties to even)
2514   }
2515   bind(none);
2516 
2517   sve_fcvtzs(dst, T, ptrue, dst, T);
2518   // result in dst
2519 }
2520 
2521 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2522                                            FloatRegister one, SIMD_Arrangement T) {
2523   assert_different_registers(dst, src, zero, one);
2524   assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2525 
2526   facgt(dst, T, src, zero);
2527   ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
2528   bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2529 }
2530 
2531 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2532                                           FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
2533     assert_different_registers(dst, src, zero, one, vtmp);
2534     assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2535 
2536     sve_orr(vtmp, src, src);
2537     sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pmtp=0 for +-0.0 and NaN. 0x1 otherwise
2538     switch (T) {
2539     case S:
2540       sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
2541       sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
2542                                         // on the sign of the float value
2543       break;
2544     case D:
2545       sve_and(vtmp, T, min_jlong);
2546       sve_orr(vtmp, T, jlong_cast(1.0));
2547       break;
2548     default:
2549       assert(false, "unsupported");
2550       ShouldNotReachHere();
2551     }
2552     sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
2553                                        // Result in dst
2554 }
2555 
2556 bool C2_MacroAssembler::in_scratch_emit_size() {
2557   if (ciEnv::current()->task() != nullptr) {
2558     PhaseOutput* phase_output = Compile::current()->output();
2559     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2560       return true;
2561     }
2562   }
2563   return MacroAssembler::in_scratch_emit_size();
2564 }
2565 
2566 void C2_MacroAssembler::load_nklass_compact(Register dst, Register obj, Register index, int scale, int disp) {
2567   // Note: Don't clobber obj anywhere in that method!
2568 
2569   // The incoming address is pointing into obj-start + klass_offset_in_bytes. We need to extract
2570   // obj-start, so that we can load from the object's mark-word instead. Usually the address
2571   // comes as obj-start in obj and klass_offset_in_bytes in disp. However, sometimes C2
2572   // emits code that pre-computes obj-start + klass_offset_in_bytes into a register, and
2573   // then passes that register as obj and 0 in disp. The following code extracts the base
2574   // and offset to load the mark-word.
2575   int offset = oopDesc::mark_offset_in_bytes() + disp - oopDesc::klass_offset_in_bytes();
2576   if (index == noreg) {
2577     ldr(dst, Address(obj, offset));
2578   } else {
2579     lea(dst, Address(obj, index, Address::lsl(scale)));
2580     ldr(dst, Address(dst, offset));
2581   }
2582   lsr(dst, dst, markWord::klass_shift);
2583 }