New src/hotspot/cpu/aarch64/c2_MacroAssembler

   1 /*
   2  * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "opto/c2_MacroAssembler.hpp"
  29 #include "opto/compile.hpp"
  30 #include "opto/intrinsicnode.hpp"
  31 #include "opto/matcher.hpp"
  32 #include "opto/output.hpp"
  33 #include "opto/subnode.hpp"
  34 #include "runtime/stubRoutines.hpp"
  35 #include "utilities/globalDefinitions.hpp"
  36 
  37 #ifdef PRODUCT
  38 #define BLOCK_COMMENT(str) /* nothing */
  39 #define STOP(error) stop(error)
  40 #else
  41 #define BLOCK_COMMENT(str) block_comment(str)
  42 #define STOP(error) block_comment(error); stop(error)
  43 #endif
  44 
  45 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  46 
  47 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
  48 
  49 void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg,
  50                                   Register tmp2Reg, Register tmp3Reg) {
       // Emits the inline monitor-enter fast path for the locking modes other
       // than LM_LIGHTWEIGHT (asserted below). The emitted code reports its
       // outcome in the condition flags at label 'cont': EQ indicates success,
       // NE indicates failure. On EQ the thread's held-monitor count is
       // incremented.
       //   objectReg - the object being locked
       //   boxReg    - base of the box whose BasicLock displaced-header slot is written
       //   tmpReg    - temp; receives the object's markWord (disp_hdr)
       //   tmp2Reg   - temp
       //   tmp3Reg   - not referenced anywhere in this body
       // The emitted code also writes rscratch1.
  51   Register oop = objectReg;
  52   Register box = boxReg;
  53   Register disp_hdr = tmpReg;
  54   Register tmp = tmp2Reg;
  55   Label cont;
  56   Label object_has_monitor;
  57   Label count, no_count;
  58 
  59   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  60   assert_different_registers(oop, box, tmp, disp_hdr);
  61 
  62   // Load markWord from object into displaced_header.
  63   ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));
  64 
  65   if (DiagnoseSyncOnValueBasedClasses != 0) {
         // Test the klass access flags for JVM_ACC_IS_VALUE_BASED_CLASS. If the
         // bit is set the flags are NE, and 'cont' treats NE as failure.
  66     load_klass(tmp, oop);
  67     ldrw(tmp, Address(tmp, Klass::access_flags_offset()));
  68     tstw(tmp, JVM_ACC_IS_VALUE_BASED_CLASS);
  69     br(Assembler::NE, cont);
  70   }
  71 
  72   // Check for existing monitor
  73   tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor);
  74 
  75   if (LockingMode == LM_MONITOR) {
  76     tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
  77     b(cont);
  78   } else {
  79     assert(LockingMode == LM_LEGACY, "must be");
  80     // Set tmp to be (markWord of object | UNLOCK_VALUE).
  81     orr(tmp, disp_hdr, markWord::unlocked_value);
  82 
  83     // Initialize the box. (Must happen before we update the object mark!)
  84     str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
  85 
  86     // Compare object markWord with an unlocked value (tmp) and if
  87     // equal exchange the stack address of our box with object markWord.
  88     // On failure disp_hdr contains the possibly locked markWord.
  89     cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true,
  90             /*release*/ true, /*weak*/ false, disp_hdr);
  91     br(Assembler::EQ, cont);
  92 
  93     assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
  94 
  95     // If the compare-and-exchange succeeded, then we found an unlocked
  96     // object and have now locked it; execution continues at label cont.
  97 
  98     // Check if the owner is self by comparing the value in the
  99     // markWord of object (disp_hdr) with the stack pointer.
 100     mov(rscratch1, sp);
 101     sub(disp_hdr, disp_hdr, rscratch1);
 102     mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place));
 103     // If the masked result is zero this is a recursive lock, and that zero is
 104     // what gets stored as the displaced header in the box to mark it as such.
 105     ands(tmp/*==0?*/, disp_hdr, tmp);   // Sets flags for result
 106     str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 107     b(cont);
 108   }
 109 
 110   // Handle existing monitor.
 111   bind(object_has_monitor);
 112 
 113   // The object's monitor m is unlocked iff m->owner == NULL,
 114   // otherwise m->owner may contain a thread or a stack address.
 115   //
 116   // Try to CAS m->owner from NULL to current thread.
       // disp_hdr still holds the markWord with the monitor tag bit set, so the
       // tag is subtracted from the field offset when forming the address.
 117   add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value));
 118   cmpxchg(tmp, zr, rthread, Assembler::xword, /*acquire*/ true,
 119           /*release*/ true, /*weak*/ false, rscratch1); // Sets flags for result
 120 
 121   // Store a non-null value into the box to avoid looking like a re-entrant
 122   // lock. The fast-path monitor unlock code checks for
 123   // markWord::monitor_value so use markWord::unused_mark which has the
 124   // relevant bit set, and also matches ObjectSynchronizer::enter.
       // NOTE(review): the mov/str below are expected to leave the flags produced
       // by the cmpxchg above intact, since the br(EQ) that follows relies on them.
 125   mov(tmp, (address)markWord::unused_mark().value());
 126   str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 127 
 128   br(Assembler::EQ, cont); // CAS success means locking succeeded
 129 
       // CAS failed: rscratch1 is the cmpxchg result register; compare it with
       // the current thread.
 130   cmp(rscratch1, rthread);
 131   br(Assembler::NE, cont); // Check for recursive locking
 132 
 133   // Recursive lock case
 134   increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1);
 135   // flag == EQ still from the cmp above, checking if this is a reentrant lock
 136 
 137   bind(cont);
 138   // flag == EQ indicates success
 139   // flag == NE indicates failure
 140   br(Assembler::NE, no_count);
 141 
       // Success path: bump the thread's held-monitor count. Nothing in this
       // function branches to 'count'; it is reached by fall-through only.
 142   bind(count);
 143   increment(Address(rthread, JavaThread::held_monitor_count_offset()));
 144 
 145   bind(no_count);
 146 }
 147 
 148 void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg,
 149                                     Register tmp2Reg) {
       // Emits the inline monitor-exit fast path for the locking modes other
       // than LM_LIGHTWEIGHT (asserted below). The emitted code reports its
       // outcome in the condition flags at label 'cont': EQ indicates success,
       // NE indicates failure. On EQ the thread's held-monitor count is
       // decremented.
       //   objectReg - the object being unlocked
       //   boxReg    - base of the box whose BasicLock displaced-header slot is read
       //   tmpReg    - temp (disp_hdr)
       //   tmp2Reg   - temp
       // The emitted code also writes rscratch1.
 150   Register oop = objectReg;
 151   Register box = boxReg;
 152   Register disp_hdr = tmpReg;
 153   Register tmp = tmp2Reg;
 154   Label cont;
 155   Label object_has_monitor;
 156   Label count, no_count;
 157 
 158   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 159   assert_different_registers(oop, box, tmp, disp_hdr);
 160 
 161   if (LockingMode == LM_LEGACY) {
 162     // Find the lock address and load the displaced header from the stack.
 163     ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 164 
 165     // If the displaced header is 0, we have a recursive unlock.
         // The cmp leaves EQ in that case, so 'cont' reports success.
 166     cmp(disp_hdr, zr);
 167     br(Assembler::EQ, cont);
 168   }
 169 
 170   // Handle existing monitor.
 171   ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
 172   tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor);
 173 
 174   if (LockingMode == LM_MONITOR) {
 175     tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
 176     b(cont);
 177   } else {
 178     assert(LockingMode == LM_LEGACY, "must be");
 179     // Check if it is still a light weight lock, this is true if we
 180     // see the stack address of the basicLock in the markWord of the
 181     // object.
 182 
         // Expected value is the box address, new value is the displaced header.
         // The flags this cmpxchg produces are what 'cont' inspects.
 183     cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false,
 184             /*release*/ true, /*weak*/ false, tmp);
 185     b(cont);
 186   }
 187 
 188   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
 189 
 190   // Handle existing monitor.
 191   bind(object_has_monitor);
 192   STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
 193   add(tmp, tmp, -(int)markWord::monitor_value); // monitor
 194 
 195   ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
 196 
 197   Label notRecursive;
 198   cbz(disp_hdr, notRecursive);
 199 
 200   // Recursive lock
       // Decrement the recursion count and store it back.
 201   sub(disp_hdr, disp_hdr, 1u);
 202   str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
 203   cmp(disp_hdr, disp_hdr); // Sets flags for result (a register equals itself, so EQ)
 204   b(cont);
 205 
 206   bind(notRecursive);
 207   ldr(rscratch1, Address(tmp, ObjectMonitor::EntryList_offset()));
 208   ldr(disp_hdr, Address(tmp, ObjectMonitor::cxq_offset()));
 209   orr(rscratch1, rscratch1, disp_hdr); // Will be 0 if both are 0.
 210   cmp(rscratch1, zr); // Sets flags for result
 211   cbnz(rscratch1, cont); // either field non-zero: leave with NE (failure) from the cmp above
 212   // need a release store here
       // NOTE(review): the lea/stlr below are expected to leave the EQ produced by
       // the cmp above intact, since execution falls through into 'cont'.
 213   lea(tmp, Address(tmp, ObjectMonitor::owner_offset()));
 214   stlr(zr, tmp); // set unowned
 215 
 216   bind(cont);
 217   // flag == EQ indicates success
 218   // flag == NE indicates failure
 219   br(Assembler::NE, no_count);
 220 
       // Success path: drop the thread's held-monitor count. Nothing in this
       // function branches to 'count'; it is reached by fall-through only.
 221   bind(count);
 222   decrement(Address(rthread, JavaThread::held_monitor_count_offset()));
 223 
 224   bind(no_count);
 225 }
 226 
 227 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register t1,
 228                                               Register t2, Register t3) {
       // Emits the inline monitor-enter fast path for LockingMode ==
       // LM_LIGHTWEIGHT (asserted below).
       //   obj        - the object being locked
       //   t1, t2, t3 - temps; must all differ from obj and from each other
       // Outcome is reported in the condition flags at the end of the emitted
       // code: EQ via label 'locked' (success, the thread's held-monitor count is
       // incremented), NE via label 'slow_path' (failure).
 229   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 230   assert_different_registers(obj, t1, t2, t3);
 231 
 232   // Handle inflated monitor.
 233   Label inflated;
 234   // Finish fast lock successfully. MUST branch to with flag == EQ
 235   Label locked;
 236   // Finish fast lock unsuccessfully. MUST branch to with flag == NE
 237   Label slow_path;
 238 
 239   if (DiagnoseSyncOnValueBasedClasses != 0) {
         // Test the klass access flags for JVM_ACC_IS_VALUE_BASED_CLASS. If the
         // bit is set the flags are NE, as 'slow_path' requires.
 240     load_klass(t1, obj);
 241     ldrw(t1, Address(t1, Klass::access_flags_offset()));
 242     tstw(t1, JVM_ACC_IS_VALUE_BASED_CLASS);
 243     br(Assembler::NE, slow_path);
 244   }
 245 
 246   const Register t1_mark = t1;
 247 
 248   { // Lightweight locking
 249 
 250     // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ
 251     Label push;
 252 
 253     const Register t2_top = t2;
 254     const Register t3_t = t3;
 255 
 256     // Check if lock-stack is full.
         // t2_top is the 32-bit lock-stack top, used below as an offset from rthread.
 257     ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 258     cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
 259     br(Assembler::GT, slow_path);
 260 
 261     // Check if recursive.
         // Load the entry one oop below the top and compare it with obj.
 262     subw(t3_t, t2_top, oopSize);
 263     ldr(t3_t, Address(rthread, t3_t));
 264     cmp(obj, t3_t);
 265     br(Assembler::EQ, push);
 266 
 267     // Relaxed normal load to check for monitor. Optimization for monitor case.
 268     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 269     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 270 
 271     // Not inflated
 272     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");
 273 
 274     // Try to lock. Transition lock-bits 0b01 => 0b00
         // The orr builds the expected value (unlocked bit set); the eor clears
         // that bit to build the new value.
 275     orr(t1_mark, t1_mark, markWord::unlocked_value);
 276     eor(t3_t, t1_mark, markWord::unlocked_value);
 277     cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
 278             /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
 279     br(Assembler::NE, slow_path);
 280 
 281     bind(push);
 282     // After successful lock, push object on lock-stack.
         // NOTE(review): 'push' is reached with EQ on both paths; the str/addw/strw
         // below are expected to leave the flags intact for 'locked'.
 283     str(obj, Address(rthread, t2_top));
 284     addw(t2_top, t2_top, oopSize);
 285     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 286     b(locked);
 287   }
 288 
 289   { // Handle inflated monitor.
 290     bind(inflated);
 291 
 292     // mark contains the tagged ObjectMonitor*.
 293     const Register t1_tagged_monitor = t1_mark;
 294     const uintptr_t monitor_tag = markWord::monitor_value;
 295     const Register t2_owner_addr = t2;
 296     const Register t3_owner = t3;
 297 
 298     // Compute owner address.
         // The tag is subtracted from the field offset because t1 still holds the
         // tagged pointer.
 299     lea(t2_owner_addr, Address(t1_tagged_monitor, (in_bytes(ObjectMonitor::owner_offset()) - monitor_tag)));
 300 
 301     // CAS owner (null => current thread).
 302     cmpxchg(t2_owner_addr, zr, rthread, Assembler::xword, /*acquire*/ true,
 303             /*release*/ false, /*weak*/ false, t3_owner);
 304     br(Assembler::EQ, locked);
 305 
 306     // Check if recursive.
         // t3_owner is the cmpxchg result register; compare it with the current thread.
 307     cmp(t3_owner, rthread);
 308     br(Assembler::NE, slow_path);
 309 
 310     // Recursive.
         // Falls through to 'locked'. NOTE(review): relies on the increment leaving
         // the EQ from the cmp above intact.
 311     increment(Address(t1_tagged_monitor, in_bytes(ObjectMonitor::recursions_offset()) - monitor_tag), 1);
 312   }
 313 
 314   bind(locked);
 315   increment(Address(rthread, JavaThread::held_monitor_count_offset()));
 316 
 317 #ifdef ASSERT
 318   // Check that locked label is reached with Flags == EQ.
       // On EQ this jumps past the slow_path check below; otherwise it stops.
 319   Label flag_correct;
 320   br(Assembler::EQ, flag_correct);
 321   stop("Fast Lock Flag != EQ");
 322 #endif
 323 
       // Without ASSERT, 'locked' simply falls through to here; no further code is
       // emitted and only the flags distinguish the two outcomes.
 324   bind(slow_path);
 325 #ifdef ASSERT
 326   // Check that slow_path label is reached with Flags == NE.
 327   br(Assembler::NE, flag_correct);
 328   stop("Fast Lock Flag != NE");
 329   bind(flag_correct);
 330 #endif
 331   // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
 332 }
 333 
 334 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register t1, Register t2,
 335                                                 Register t3) {
       // Emits the inline monitor-exit fast path for LockingMode ==
       // LM_LIGHTWEIGHT (asserted below).
       //   obj        - the object being unlocked
       //   t1, t2, t3 - temps; must all differ from obj and from each other
       // Outcome is reported in the condition flags at the end of the emitted
       // code: EQ via label 'unlocked' (success, the thread's held-monitor count
       // is decremented), NE via label 'slow_path' (failure). The emitted code
       // also writes rscratch1.
 336   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 337   assert_different_registers(obj, t1, t2, t3);
 338 
 339   // Handle inflated monitor.
 340   Label inflated, inflated_load_monitor;
 341   // Finish fast unlock successfully. MUST branch to with flag == EQ
 342   Label unlocked;
 343   // Finish fast unlock unsuccessfully. MUST branch to with flag == NE
 344   Label slow_path;
 345 
 346   const Register t1_mark = t1;
 347   const Register t2_top = t2;
 348   const Register t3_t = t3;
 349 
 350   { // Lightweight unlock
 351 
 352     // Check if obj is top of lock-stack.
         // t2_top becomes the offset (from rthread) of the topmost entry.
 353     ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 354     subw(t2_top, t2_top, oopSize);
 355     ldr(t3_t, Address(rthread, t2_top));
 356     cmp(obj, t3_t);
 357     // Top of lock stack was not obj. Must be monitor.
 358     br(Assembler::NE, inflated_load_monitor);
 359 
 360     // Pop lock-stack.
         // Debug builds also zero the popped slot.
 361     DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
 362     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 363 
 364     // Check if recursive.
         // If the entry below the popped one is also obj, we are done (EQ).
 365     subw(t3_t, t2_top, oopSize);
 366     ldr(t3_t, Address(rthread, t3_t));
 367     cmp(obj, t3_t);
 368     br(Assembler::EQ, unlocked);
 369 
 370     // Not recursive.
 371     // Load Mark.
 372     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 373 
 374     // Check header for monitor (0b10).
 375     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 376 
 377     // Try to unlock. Transition lock bits 0b00 => 0b01
 378     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
 379     orr(t3_t, t1_mark, markWord::unlocked_value);
 380     cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
 381             /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
 382     br(Assembler::EQ, unlocked);
 383 
 384     // Compare and exchange failed.
 385     // Restore lock-stack and handle the unlock in runtime.
         // Debug builds put obj back into the slot they zeroed above.
 386     DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
 387     addw(t2_top, t2_top, oopSize);
         // The lock-stack top is accessed as a 32-bit value (ldrw/strw) everywhere
         // else in the lightweight paths, so write it back with a 32-bit store too.
         // NOTE(review): slow_path expects NE; this relies on the addw/strw leaving
         // the NE from the failed cmpxchg intact.
 388     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 389     b(slow_path);
 390   }
 391 
 392 
 393   { // Handle inflated monitor.
 394     bind(inflated_load_monitor);
 395     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 396 #ifdef ASSERT
         // Debug builds verify that the mark really has the monitor bit set.
 397     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 398     stop("Fast Unlock not monitor");
 399 #endif
 400 
 401     bind(inflated);
 402 
 403 #ifdef ASSERT
         // Debug-only scan: step t2_top down one entry at a time, looping back to
         // 'inflated' while the entry differs from obj, until t2_top drops below
         // the lock-stack base offset. Stops the VM if obj is found.
 404     Label check_done;
 405     subw(t2_top, t2_top, oopSize);
 406     cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
 407     br(Assembler::LT, check_done);
 408     ldr(t3_t, Address(rthread, t2_top));
 409     cmp(obj, t3_t);
 410     br(Assembler::NE, inflated);
 411     stop("Fast Unlock lock on stack");
 412     bind(check_done);
 413 #endif
 414 
 415     // mark contains the tagged ObjectMonitor*.
 416     const Register t1_monitor = t1_mark;
 417     const uintptr_t monitor_tag = markWord::monitor_value;
 418 
 419     // Untag the monitor.
 420     sub(t1_monitor, t1_mark, monitor_tag);
 421 
 422     const Register t2_recursions = t2;
 423     Label not_recursive;
 424 
 425     // Check if recursive.
 426     ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
 427     cbz(t2_recursions, not_recursive);
 428 
 429     // Recursive unlock.
         // Decrement the recursion count and store it back.
 430     sub(t2_recursions, t2_recursions, 1u);
 431     str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
 432     // Set flag == EQ
 433     cmp(t2_recursions, t2_recursions);
 434     b(unlocked);
 435 
 436     bind(not_recursive);
 437 
 438     Label release;
 439     const Register t2_owner_addr = t2;
 440 
 441     // Compute owner address.
 442     lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));
 443 
 444     // Check if the entry lists are empty.
         // rscratch1 = EntryList | cxq; the cmp yields EQ only if both are zero.
 445     ldr(rscratch1, Address(t1_monitor, ObjectMonitor::EntryList_offset()));
 446     ldr(t3_t, Address(t1_monitor, ObjectMonitor::cxq_offset()));
 447     orr(rscratch1, rscratch1, t3_t);
 448     cmp(rscratch1, zr);
 449     br(Assembler::EQ, release);
 450 
 451     // The owner may be anonymous and we removed the last obj entry in
 452     // the lock-stack. This loses the information about the owner.
 453     // Write the thread to the owner field so the runtime knows the owner.
         // Flags are NE here from the cmp above, as slow_path requires.
 454     str(rthread, Address(t2_owner_addr));
 455     b(slow_path);
 456 
 457     bind(release);
 458     // Set owner to null.
 459     // Release to satisfy the JMM
         // Falls through to 'unlocked' with the EQ from the cmp above.
 460     stlr(zr, t2_owner_addr);
 461   }
 462 
 463   bind(unlocked);
 464   decrement(Address(rthread, JavaThread::held_monitor_count_offset()));
 465 
 466 #ifdef ASSERT
 467   // Check that unlocked label is reached with Flags == EQ.
       // On EQ this jumps past the slow_path check below; otherwise it stops.
 468   Label flag_correct;
 469   br(Assembler::EQ, flag_correct);
 470   stop("Fast Unlock Flag != EQ");
 471 #endif
 472 
       // Without ASSERT, 'unlocked' simply falls through to here; no further code
       // is emitted and only the flags distinguish the two outcomes.
 473   bind(slow_path);
 474 #ifdef ASSERT
 475   // Check that slow_path label is reached with Flags == NE.
 476   br(Assembler::NE, flag_correct);
 477   stop("Fast Unlock Flag != NE");
 478   bind(flag_correct);
 479 #endif
 480   // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
 481 }
 482 
 483 // Search for str1 in str2 and return index or -1
 484 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
 485 void C2_MacroAssembler::string_indexof(Register str2, Register str1,
 486                                        Register cnt2, Register cnt1,
 487                                        Register tmp1, Register tmp2,
 488                                        Register tmp3, Register tmp4,
 489                                        Register tmp5, Register tmp6,
 490                                        int icnt1, Register result, int ae) {
 491   // NOTE: tmp5, tmp6 can be zr depending on specific method version
 492   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
 493 
 494   Register ch1 = rscratch1;
 495   Register ch2 = rscratch2;
 496   Register cnt1tmp = tmp1;
 497   Register cnt2tmp = tmp2;
 498   Register cnt1_neg = cnt1;
 499   Register cnt2_neg = cnt2;
 500   Register result_tmp = tmp4;
 501 
 502   bool isL = ae == StrIntrinsicNode::LL;
 503 
 504   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
 505   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
 506   int str1_chr_shift = str1_isL ? 0:1;
 507   int str2_chr_shift = str2_isL ? 0:1;
 508   int str1_chr_size = str1_isL ? 1:2;
 509   int str2_chr_size = str2_isL ? 1:2;
 510   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
 511                                       (chr_insn)&MacroAssembler::ldrh;
 512   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
 513                                       (chr_insn)&MacroAssembler::ldrh;
 514   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
 515   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
 516 
 517   // Note, inline_string_indexOf() generates checks:
 518   // if (substr.count > string.count) return -1;
 519   // if (substr.count == 0) return 0;
 520 
 521   // We have two strings, a source string in str2, cnt2 and a pattern string
 522   // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
 523 
 524   // For larger pattern and source we use a simplified Boyer Moore algorithm.
 525   // With a small pattern and source we use linear scan.
 526 
 527   if (icnt1 == -1) {
 528     sub(result_tmp, cnt2, cnt1);
 529     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
 530     br(LT, LINEARSEARCH);
 531     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
 532     subs(zr, cnt1, 256);
 533     lsr(tmp1, cnt2, 2);
 534     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
 535     br(GE, LINEARSTUB);
 536   }
 537 
 538 // The Boyer Moore algorithm is based on the description here:-
 539 //
 540 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
 541 //
 542 // This describes an algorithm with 2 shift rules. The 'Bad Character' rule
 543 // and the 'Good Suffix' rule.
 544 //
 545 // These rules are essentially heuristics for how far we can shift the
 546 // pattern along the search string.
 547 //
 548 // The implementation here uses the 'Bad Character' rule only because of the
 549 // complexity of initialisation for the 'Good Suffix' rule.
 550 //
 551 // This is also known as the Boyer-Moore-Horspool algorithm:-
 552 //
 553 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
 554 //
 555 // This particular implementation has few java-specific optimizations.
 556 //
 557 // #define ASIZE 256
 558 //
 559 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
 560 //       int i, j;
 561 //       unsigned c;
 562 //       unsigned char bc[ASIZE];
 563 //
 564 //       /* Preprocessing */
 565 //       for (i = 0; i < ASIZE; ++i)
 566 //          bc[i] = m;
 567 //       for (i = 0; i < m - 1; ) {
 568 //          c = x[i];
 569 //          ++i;
 570 //          // c < 256 for Latin1 string, so, no need for branch
 571 //          #ifdef PATTERN_STRING_IS_LATIN1
 572 //          bc[c] = m - i;
 573 //          #else
 574 //          if (c < ASIZE) bc[c] = m - i;
 575 //          #endif
 576 //       }
 577 //
 578 //       /* Searching */
 579 //       j = 0;
 580 //       while (j <= n - m) {
 581 //          c = y[i+j];
 582 //          if (x[m-1] == c)
 583 //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
 584 //          if (i < 0) return j;
 585 //          // c < 256 for Latin1 string, so, no need for branch
 586 //          #ifdef SOURCE_STRING_IS_LATIN1
 587 //          // LL case: (c< 256) always true. Remove branch
 588 //          j += bc[y[j+m-1]];
 589 //          #endif
 590 //          #ifndef PATTERN_STRING_IS_UTF
 591 //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
 592 //          if (c < ASIZE)
 593 //            j += bc[y[j+m-1]];
 594 //          else
 595 //            j += 1
 596 //          #endif
 597 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
 598 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
 599 //          if (c < ASIZE)
 600 //            j += bc[y[j+m-1]];
 601 //          else
 602 //            j += m
 603 //          #endif
 604 //       }
 605 //    }
 606 
 607   if (icnt1 == -1) {
 608     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
 609         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
 610     Register cnt1end = tmp2;
 611     Register str2end = cnt2;
 612     Register skipch = tmp2;
 613 
 614     // str1 length is >=8, so, we can read at least 1 register for cases when
 615     // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
 616     // UL case. We'll re-read last character in inner pre-loop code to have
 617     // single outer pre-loop load
 618     const int firstStep = isL ? 7 : 3;
 619 
 620     const int ASIZE = 256;
 621     const int STORED_BYTES = 32; // amount of bytes stored per instruction
 622     sub(sp, sp, ASIZE);
 623     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
 624     mov(ch1, sp);
 625     BIND(BM_INIT_LOOP);
 626       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
 627       subs(tmp5, tmp5, 1);
 628       br(GT, BM_INIT_LOOP);
 629 
 630       sub(cnt1tmp, cnt1, 1);
 631       mov(tmp5, str2);
 632       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
 633       sub(ch2, cnt1, 1);
 634       mov(tmp3, str1);
 635     BIND(BCLOOP);
 636       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
 637       if (!str1_isL) {
 638         subs(zr, ch1, ASIZE);
 639         br(HS, BCSKIP);
 640       }
 641       strb(ch2, Address(sp, ch1));
 642     BIND(BCSKIP);
 643       subs(ch2, ch2, 1);
 644       br(GT, BCLOOP);
 645 
 646       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
 647       if (str1_isL == str2_isL) {
 648         // load last 8 bytes (8LL/4UU symbols)
 649         ldr(tmp6, Address(tmp6, -wordSize));
 650       } else {
 651         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
 652         // convert Latin1 to UTF. We'll have to wait until load completed, but
 653         // it's still faster than per-character loads+checks
 654         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
 655         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
 656         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
 657         andr(tmp6, tmp6, 0xFF); // str1[N-4]
 658         orr(ch2, ch1, ch2, LSL, 16);
 659         orr(tmp6, tmp6, tmp3, LSL, 48);
 660         orr(tmp6, tmp6, ch2, LSL, 16);
 661       }
 662     BIND(BMLOOPSTR2);
 663       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 664       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
 665       if (str1_isL == str2_isL) {
 666         // re-init tmp3. It's for free because it's executed in parallel with
 667         // load above. Alternative is to initialize it before loop, but it'll
 668         // affect performance on in-order systems with 2 or more ld/st pipelines
 669         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
 670       }
 671       if (!isL) { // UU/UL case
 672         lsl(ch2, cnt1tmp, 1); // offset in bytes
 673       }
 674       cmp(tmp3, skipch);
 675       br(NE, BMSKIP);
 676       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
 677       mov(ch1, tmp6);
 678       if (isL) {
 679         b(BMLOOPSTR1_AFTER_LOAD);
 680       } else {
 681         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
 682         b(BMLOOPSTR1_CMP);
 683       }
 684     BIND(BMLOOPSTR1);
 685       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
 686       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 687     BIND(BMLOOPSTR1_AFTER_LOAD);
 688       subs(cnt1tmp, cnt1tmp, 1);
 689       br(LT, BMLOOPSTR1_LASTCMP);
 690     BIND(BMLOOPSTR1_CMP);
 691       cmp(ch1, ch2);
 692       br(EQ, BMLOOPSTR1);
 693     BIND(BMSKIP);
 694       if (!isL) {
 695         // if we've met UTF symbol while searching Latin1 pattern, then we can
 696         // skip cnt1 symbols
 697         if (str1_isL != str2_isL) {
 698           mov(result_tmp, cnt1);
 699         } else {
 700           mov(result_tmp, 1);
 701         }
 702         subs(zr, skipch, ASIZE);
 703         br(HS, BMADV);
 704       }
 705       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
 706     BIND(BMADV);
 707       sub(cnt1tmp, cnt1, 1);
 708       add(str2, str2, result_tmp, LSL, str2_chr_shift);
 709       cmp(str2, str2end);
 710       br(LE, BMLOOPSTR2);
 711       add(sp, sp, ASIZE);
 712       b(NOMATCH);
 713     BIND(BMLOOPSTR1_LASTCMP);
 714       cmp(ch1, ch2);
 715       br(NE, BMSKIP);
 716     BIND(BMMATCH);
 717       sub(result, str2, tmp5);
 718       if (!str2_isL) lsr(result, result, 1);
 719       add(sp, sp, ASIZE);
 720       b(DONE);
 721 
 722     BIND(LINEARSTUB);
 723     cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
 724     br(LT, LINEAR_MEDIUM);
 725     mov(result, zr);
 726     RuntimeAddress stub = nullptr;
 727     if (isL) {
 728       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
 729       assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
 730     } else if (str1_isL) {
 731       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
 732        assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
 733     } else {
 734       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
 735       assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
 736     }
 737     address call = trampoline_call(stub);
 738     if (call == nullptr) {
 739       DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
 740       ciEnv::current()->record_failure("CodeCache is full");
 741       return;
 742     }
 743     b(DONE);
 744   }
 745 
 746   BIND(LINEARSEARCH);
 747   {
 748     Label DO1, DO2, DO3;
 749 
 750     Register str2tmp = tmp2;
 751     Register first = tmp3;
 752 
 753     if (icnt1 == -1)
 754     {
 755         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
 756 
 757         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
 758         br(LT, DOSHORT);
 759       BIND(LINEAR_MEDIUM);
 760         (this->*str1_load_1chr)(first, Address(str1));
 761         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
 762         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
 763         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 764         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 765 
 766       BIND(FIRST_LOOP);
 767         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 768         cmp(first, ch2);
 769         br(EQ, STR1_LOOP);
 770       BIND(STR2_NEXT);
 771         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 772         br(LE, FIRST_LOOP);
 773         b(NOMATCH);
 774 
 775       BIND(STR1_LOOP);
 776         adds(cnt1tmp, cnt1_neg, str1_chr_size);
 777         add(cnt2tmp, cnt2_neg, str2_chr_size);
 778         br(GE, MATCH);
 779 
 780       BIND(STR1_NEXT);
 781         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
 782         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 783         cmp(ch1, ch2);
 784         br(NE, STR2_NEXT);
 785         adds(cnt1tmp, cnt1tmp, str1_chr_size);
 786         add(cnt2tmp, cnt2tmp, str2_chr_size);
 787         br(LT, STR1_NEXT);
 788         b(MATCH);
 789 
 790       BIND(DOSHORT);
 791       if (str1_isL == str2_isL) {
 792         cmp(cnt1, (u1)2);
 793         br(LT, DO1);
 794         br(GT, DO3);
 795       }
 796     }
 797 
 798     if (icnt1 == 4) {
 799       Label CH1_LOOP;
 800 
 801         (this->*load_4chr)(ch1, str1);
 802         sub(result_tmp, cnt2, 4);
 803         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 804         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 805 
 806       BIND(CH1_LOOP);
 807         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
 808         cmp(ch1, ch2);
 809         br(EQ, MATCH);
 810         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 811         br(LE, CH1_LOOP);
 812         b(NOMATCH);
 813       }
 814 
 815     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
 816       Label CH1_LOOP;
 817 
 818       BIND(DO2);
 819         (this->*load_2chr)(ch1, str1);
 820         if (icnt1 == 2) {
 821           sub(result_tmp, cnt2, 2);
 822         }
 823         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 824         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 825       BIND(CH1_LOOP);
 826         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 827         cmp(ch1, ch2);
 828         br(EQ, MATCH);
 829         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 830         br(LE, CH1_LOOP);
 831         b(NOMATCH);
 832     }
 833 
 834     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
 835       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
 836 
 837       BIND(DO3);
 838         (this->*load_2chr)(first, str1);
 839         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
 840         if (icnt1 == 3) {
 841           sub(result_tmp, cnt2, 3);
 842         }
 843         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 844         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 845       BIND(FIRST_LOOP);
 846         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 847         cmpw(first, ch2);
 848         br(EQ, STR1_LOOP);
 849       BIND(STR2_NEXT);
 850         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 851         br(LE, FIRST_LOOP);
 852         b(NOMATCH);
 853 
 854       BIND(STR1_LOOP);
 855         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
 856         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 857         cmp(ch1, ch2);
 858         br(NE, STR2_NEXT);
 859         b(MATCH);
 860     }
 861 
 862     if (icnt1 == -1 || icnt1 == 1) {
 863       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
 864 
 865       BIND(DO1);
 866         (this->*str1_load_1chr)(ch1, str1);
 867         cmp(cnt2, (u1)8);
 868         br(LT, DO1_SHORT);
 869 
 870         sub(result_tmp, cnt2, 8/str2_chr_size);
 871         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 872         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 873         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 874 
 875         if (str2_isL) {
 876           orr(ch1, ch1, ch1, LSL, 8);
 877         }
 878         orr(ch1, ch1, ch1, LSL, 16);
 879         orr(ch1, ch1, ch1, LSL, 32);
 880       BIND(CH1_LOOP);
 881         ldr(ch2, Address(str2, cnt2_neg));
 882         eor(ch2, ch1, ch2);
 883         sub(tmp1, ch2, tmp3);
 884         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 885         bics(tmp1, tmp1, tmp2);
 886         br(NE, HAS_ZERO);
 887         adds(cnt2_neg, cnt2_neg, 8);
 888         br(LT, CH1_LOOP);
 889 
 890         cmp(cnt2_neg, (u1)8);
 891         mov(cnt2_neg, 0);
 892         br(LT, CH1_LOOP);
 893         b(NOMATCH);
 894 
 895       BIND(HAS_ZERO);
 896         rev(tmp1, tmp1);
 897         clz(tmp1, tmp1);
 898         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
 899         b(MATCH);
 900 
 901       BIND(DO1_SHORT);
 902         mov(result_tmp, cnt2);
 903         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
 904         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
 905       BIND(DO1_LOOP);
 906         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 907         cmpw(ch1, ch2);
 908         br(EQ, MATCH);
 909         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 910         br(LT, DO1_LOOP);
 911     }
 912   }
 913   BIND(NOMATCH);
 914     mov(result, -1);
 915     b(DONE);
 916   BIND(MATCH);
 917     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
 918   BIND(DONE);
 919 }
 920 
 921 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); // pointer to a MacroAssembler load emitter; string_compare binds ldrb/ldrh to it
 922 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn); // pointer to a MacroAssembler extend emitter; string_compare binds uxtbw/uxthw to it
 923 
 924 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
 925                                             Register ch, Register result,
 926                                             Register tmp1, Register tmp2, Register tmp3)
 927 {
       // Emits code that searches the cnt1 16-bit chars starting at str1 for ch.
       // On exit result holds the char index of the first match, or -1 when no
       // char matches (including cnt1 == 0).
       //
       // Strings of at least 4 chars are scanned one 8-byte word (4 chars) at a
       // time; shorter strings are scanned one char at a time (DO1_SHORT).
       //
       // Registers written by the emitted code: str1, cnt1, ch (word path only),
       // result, tmp1, tmp2, tmp3, rscratch1, rscratch2 and the condition flags.
 928   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
       // cnt1_neg is the same register as cnt1: once the count has been folded
       // into str1 and result_tmp it is reused as a negative byte offset.
 929   Register cnt1_neg = cnt1;
 930   Register ch1 = rscratch1;
 931   Register result_tmp = rscratch2;
 932
 933   cbz(cnt1, NOMATCH);
 934
 935   cmp(cnt1, (u1)4);
 936   br(LT, DO1_SHORT);
 937
       // Replicate ch into all four 16-bit lanes of the 64-bit register
       // (assumes the bits of ch above bit 15 are zero on entry -- TODO confirm).
 938   orr(ch, ch, ch, LSL, 16);
 939   orr(ch, ch, ch, LSL, 32);
 940
       // Point str1 at the last 8-byte word of the string (start + (cnt1 - 4)
       // chars), keep that char index in result_tmp and walk a negative byte
       // offset from -(cnt1 - 4) * 2 up towards 0.
 941   sub(cnt1, cnt1, 4);
 942   mov(result_tmp, cnt1);
 943   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
 944   sub(cnt1_neg, zr, cnt1, LSL, 1);
 945
 946   mov(tmp3, 0x0001000100010001);
 947
 948   BIND(CH1_LOOP);
 949     ldr(ch1, Address(str1, cnt1_neg));
         // After the eor a 16-bit lane is zero exactly where the char equals ch.
 950     eor(ch1, ch, ch1);
         // (x - 0x0001..) & ~(x | 0x7fff..) is non-zero iff some 16-bit lane of x
         // is zero; bics also sets the flags tested by the branch below.
 951     sub(tmp1, ch1, tmp3);
 952     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
 953     bics(tmp1, tmp1, tmp2);
 954     br(NE, HAS_ZERO);
 955     adds(cnt1_neg, cnt1_neg, 8);
 956     br(LT, CH1_LOOP);
 957
         // The offset is now >= 0. If it is still below 8 the word at offset 0
         // (the last 8 bytes of the string) has not been examined yet, so run one
         // more iteration there. The branch consumes the flags of the cmp; the
         // mov in between is expected not to alter them.
 958     cmp(cnt1_neg, (u1)8);
 959     mov(cnt1_neg, 0);
 960     br(LT, CH1_LOOP);
 961     b(NOMATCH);
 962
 963   BIND(HAS_ZERO);
         // Byte-reverse so that the lowest-addressed marker becomes the most
         // significant one, count the zero bits in front of it and turn that bit
         // count into a byte offset (>> 3) that is added to the word offset.
 964     rev(tmp1, tmp1);
 965     clz(tmp1, tmp1);
 966     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
 967     b(MATCH);
 968
 969   BIND(DO1_SHORT);
         // Fewer than 4 chars: str1 is moved to the end of the string and a
         // negative byte offset is stepped by one char (2 bytes) per iteration.
 970     mov(result_tmp, cnt1);
 971     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
 972     sub(cnt1_neg, zr, cnt1, LSL, 1);
 973   BIND(DO1_LOOP);
 974     ldrh(ch1, Address(str1, cnt1_neg));
 975     cmpw(ch, ch1);
 976     br(EQ, MATCH);
 977     adds(cnt1_neg, cnt1_neg, 2);
 978     br(LT, DO1_LOOP);
         // Falling out of the loop means no char matched.
 979   BIND(NOMATCH);
 980     mov(result, -1);
 981     b(DONE);
 982   BIND(MATCH);
         // result_tmp is the char index str1 points at; cnt1_neg is a (<= 0) byte
         // offset from there, converted to chars by the arithmetic shift.
 983     add(result, result_tmp, cnt1_neg, ASR, 1);
 984   BIND(DONE);
 985 }
 986 
 987 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
 988                                                 Register ch, Register result,
 989                                                 FloatRegister ztmp1,
 990                                                 FloatRegister ztmp2,
 991                                                 PRegister tmp_pg,
 992                                                 PRegister tmp_pdn, bool isL)
 993 {
       // Emits an SVE loop that searches the cnt1 chars starting at str1 for ch.
       // isL selects the element size: 8-bit elements when true, 16-bit elements
       // otherwise. On exit result holds the index of the first matching element,
       // or -1 when there is none (including cnt1 == 0).
       //
       // Registers written by the emitted code: result, ztmp1, ztmp2, tmp_pg,
       // tmp_pdn, rscratch1, rscratch2 and the condition flags.
       //
 994   // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
 995   assert(tmp_pg->is_governing(),
 996          "this register has to be a governing predicate register");
 997
 998   Label LOOP, MATCH, DONE, NOMATCH;
 999   Register vec_len = rscratch1;
1000   Register idx = rscratch2;
1001
1002   SIMD_RegVariant T = (isL == true) ? B : H;
1003
1004   cbz(cnt1, NOMATCH);
1005
1006   // Assign the particular char throughout the vector.
1007   sve_dup(ztmp2, T, ch);
       // vec_len = number of elements of the selected size held by one vector.
1008   if (isL) {
1009     sve_cntb(vec_len);
1010   } else {
1011     sve_cnth(vec_len);
1012   }
1013   mov(idx, 0);
1014
1015   // Generate a predicate to control the reading of input string.
1016   sve_whilelt(tmp_pg, T, idx, cnt1);
1017
1018   BIND(LOOP);
1019     // Read a vector of 8- or 16-bit data depending on the string type. Note
1020     // that inactive elements indicated by the predicate register won't cause
1021     // a data read from memory to the destination vector.
1022     if (isL) {
1023       sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
1024     } else {
1025       sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
1026     }
         // idx is advanced before the compare; the MATCH path undoes this.
1027     add(idx, idx, vec_len);
1028
1029     // Perform the comparison. An element of the destination predicate is set
1030     // to active if the particular char is matched.
1031     sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);
1032
1033     // Branch if the particular char is found.
         // (The branch tests the flags left by the compare above.)
1034     br(NE, MATCH);
1035
1036     sve_whilelt(tmp_pg, T, idx, cnt1);
1037
1038     // Loop back if the particular char not found.
         // (MI is taken while the new predicate still covers elements below cnt1,
         // per the flags WHILELT sets -- see the SVE instruction reference.)
1039     br(MI, LOOP);
1040
1041   BIND(NOMATCH);
1042     mov(result, -1);
1043     b(DONE);
1044
1045   BIND(MATCH);
1046     // Undo the index increment.
1047     sub(idx, idx, vec_len);
1048
1049     // Crop the vector to find its location.
         // brka leaves active the elements up to and including the first match;
         // incp then adds that element count to result, which was pre-set to
         // idx - 1, giving idx + (position of the first match in the vector).
1050     sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
1051     add(result, idx, -1);
1052     sve_incp(result, T, tmp_pdn);
1053   BIND(DONE);
1054 }
1055 
1056 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
1057                                             Register ch, Register result,
1058                                             Register tmp1, Register tmp2, Register tmp3)
1059 {
       // Emits code that searches the cnt1 8-bit chars starting at str1 for ch.
       // On exit result holds the index of the first match, or -1 when no char
       // matches (including cnt1 == 0).
       //
       // Strings of at least 8 chars are scanned one 8-byte word at a time;
       // shorter strings are scanned one byte at a time (DO1_SHORT).
       //
       // Registers written by the emitted code: str1, cnt1, ch (word path only),
       // result, tmp1, tmp2, tmp3, rscratch1, rscratch2 and the condition flags.
1060   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
       // cnt1_neg is the same register as cnt1: once the count has been folded
       // into str1 and result_tmp it is reused as a negative byte offset.
1061   Register cnt1_neg = cnt1;
1062   Register ch1 = rscratch1;
1063   Register result_tmp = rscratch2;
1064
1065   cbz(cnt1, NOMATCH);
1066
1067   cmp(cnt1, (u1)8);
1068   br(LT, DO1_SHORT);
1069
       // Replicate ch into all eight byte lanes of the 64-bit register
       // (assumes the bits of ch above bit 7 are zero on entry -- TODO confirm).
1070   orr(ch, ch, ch, LSL, 8);
1071   orr(ch, ch, ch, LSL, 16);
1072   orr(ch, ch, ch, LSL, 32);
1073
       // Point str1 at the last 8-byte word of the string (start + cnt1 - 8),
       // keep that index in result_tmp and walk a negative byte offset from
       // -(cnt1 - 8) up towards 0.
1074   sub(cnt1, cnt1, 8);
1075   mov(result_tmp, cnt1);
1076   lea(str1, Address(str1, cnt1));
1077   sub(cnt1_neg, zr, cnt1);
1078
1079   mov(tmp3, 0x0101010101010101);
1080
1081   BIND(CH1_LOOP);
1082     ldr(ch1, Address(str1, cnt1_neg));
         // After the eor a byte lane is zero exactly where the char equals ch.
1083     eor(ch1, ch, ch1);
         // (x - 0x01..) & ~(x | 0x7f..) is non-zero iff some byte lane of x is
         // zero; bics also sets the flags tested by the branch below.
1084     sub(tmp1, ch1, tmp3);
1085     orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
1086     bics(tmp1, tmp1, tmp2);
1087     br(NE, HAS_ZERO);
1088     adds(cnt1_neg, cnt1_neg, 8);
1089     br(LT, CH1_LOOP);
1090
         // The offset is now >= 0. If it is still below 8 the word at offset 0
         // (the last 8 bytes of the string) has not been examined yet, so run one
         // more iteration there. The branch consumes the flags of the cmp; the
         // mov in between is expected not to alter them.
1091     cmp(cnt1_neg, (u1)8);
1092     mov(cnt1_neg, 0);
1093     br(LT, CH1_LOOP);
1094     b(NOMATCH);
1095
1096   BIND(HAS_ZERO);
         // Byte-reverse so that the lowest-addressed marker becomes the most
         // significant one, count the zero bits in front of it and turn that bit
         // count into a byte offset (>> 3) that is added to the word offset.
1097     rev(tmp1, tmp1);
1098     clz(tmp1, tmp1);
1099     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
1100     b(MATCH);
1101
1102   BIND(DO1_SHORT);
         // Fewer than 8 chars: str1 is moved to the end of the string and a
         // negative byte offset is stepped by one byte per iteration.
1103     mov(result_tmp, cnt1);
1104     lea(str1, Address(str1, cnt1));
1105     sub(cnt1_neg, zr, cnt1);
1106   BIND(DO1_LOOP);
1107     ldrb(ch1, Address(str1, cnt1_neg));
         // NOTE(review): this is a 64-bit compare, whereas string_indexof_char
         // uses cmpw at the same point; presumably the upper 32 bits of ch are
         // known to be zero here -- confirm.
1108     cmp(ch, ch1);
1109     br(EQ, MATCH);
1110     adds(cnt1_neg, cnt1_neg, 1);
1111     br(LT, DO1_LOOP);
         // Falling out of the loop means no char matched.
1112   BIND(NOMATCH);
1113     mov(result, -1);
1114     b(DONE);
1115   BIND(MATCH);
         // result_tmp is the index str1 points at; cnt1_neg is a (<= 0) byte
         // offset from there, and one byte is one char for this variant.
1116     add(result, result_tmp, cnt1_neg);
1117   BIND(DONE);
1118 }
1119 
1120 // Compare strings.
     //
     // Emits code that compares the string at str1 with the string at str2 and
     // leaves in result either the difference between the first pair of chars
     // that differ, or -- when the shorter string is a prefix of the other -- the
     // difference of the two lengths in chars (0 for equal strings).
     //
     // ae is one of StrIntrinsicNode::LL, UU, LU, UL. The first letter describes
     // str1 and the second str2: L strings use 1 byte per char, U strings 2.
     // cnt1 and cnt2 are byte counts on entry (see the note in the body).
     //
     // Registers written by the emitted code: str1, str2, cnt1, cnt2, result,
     // tmp1, tmp2, rscratch1, rscratch2, vtmp1, vtmp2 and the condition flags.
     // vtmp3, pgtmp1 and pgtmp2 are not referenced in this body (presumably they
     // are reserved for the out-of-line stubs -- TODO confirm).
1121 void C2_MacroAssembler::string_compare(Register str1, Register str2,
1122     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
1123     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
1124     PRegister pgtmp1, PRegister pgtmp2, int ae) {
1125   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
1126       DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
1127       SHORT_LOOP_START, TAIL_CHECK;
1128
1129   bool isLL = ae == StrIntrinsicNode::LL;
1130   bool isLU = ae == StrIntrinsicNode::LU;
1131   bool isUL = ae == StrIntrinsicNode::UL;
1132
1133   // The stub threshold for LL strings is: 72 (64 + 8) chars
1134   // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
1135   // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
1136   const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);
1137
1138   bool str1_isL = isLL || isLU;
1139   bool str2_isL = isLL || isUL;
1140
1141   int str1_chr_shift = str1_isL ? 0 : 1;
1142   int str2_chr_shift = str2_isL ? 0 : 1;
1143   int str1_chr_size = str1_isL ? 1 : 2;
1144   int str2_chr_size = str2_isL ? 1 : 2;
       // Number of chars compared per 8-byte word: 8 for LL, 4 otherwise (in the
       // mixed cases the 1-byte string is widened to 2 bytes per char first).
1145   int minCharsInWord = isLL ? wordSize : wordSize/2;
1146
       // vtmpZ is kept all-zero in the LU/UL paths and interleaved with loaded
       // 1-byte chars (zip1) to widen them to 2 bytes each.
1147   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
1148   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
1149                                       (chr_insn)&MacroAssembler::ldrh;
1150   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
1151                                       (chr_insn)&MacroAssembler::ldrh;
1152   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
1153                             (uxt_insn)&MacroAssembler::uxthw;
1154
1155   BLOCK_COMMENT("string_compare {");
1156
1157   // Bizarrely, the counts are passed in bytes, regardless of whether they
1158   // are L or U strings, however the result is always in characters.
1159   if (!str1_isL) asrw(cnt1, cnt1, 1);
1160   if (!str2_isL) asrw(cnt2, cnt2, 1);
1161
1162   // Compute the minimum of the string lengths and save the difference.
       // result keeps this length difference unless a differing char is found.
1163   subsw(result, cnt1, cnt2);
1164   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
1165
1166   // A very short string
1167   cmpw(cnt2, minCharsInWord);
1168   br(Assembler::LE, SHORT_STRING);
1169
1170   // Compare longwords
1171   // load first parts of strings and finish initialization while loading
1172   {
1173     if (str1_isL == str2_isL) { // LL or UU
1174       ldr(tmp1, Address(str1));
           // Same address: nothing can differ, result is the length difference.
1175       cmp(str1, str2);
1176       br(Assembler::EQ, DONE);
1177       ldr(tmp2, Address(str2));
1178       cmp(cnt2, stub_threshold);
1179       br(GE, STUB);
1180       subsw(cnt2, cnt2, minCharsInWord);
1181       br(EQ, TAIL_CHECK);
           // Point both strings at their last 8-byte word and turn cnt2 into a
           // negative byte offset relative to that position.
1182       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1183       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1184       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1185     } else if (isLU) {
           // str1 is the 1-byte string: load 4 of its chars (4 bytes) and widen
           // them to 8 bytes with zip1 so they can be compared against 4 chars of
           // str2. cnt1 becomes the byte offset into str1, cnt2 the one into str2.
1186       ldrs(vtmp, Address(str1));
1187       ldr(tmp2, Address(str2));
1188       cmp(cnt2, stub_threshold);
1189       br(GE, STUB);
1190       subw(cnt2, cnt2, 4);
1191       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1192       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1193       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1194       zip1(vtmp, T8B, vtmp, vtmpZ);
1195       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1196       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1197       add(cnt1, cnt1, 4);
1198       fmovd(tmp1, vtmp);
1199     } else { // UL case
           // Mirror image of the LU case: str2 is the 1-byte string that gets
           // widened; cnt1 is the byte offset into str1, cnt2 the one into str2.
1200       ldr(tmp1, Address(str1));
1201       ldrs(vtmp, Address(str2));
1202       cmp(cnt2, stub_threshold);
1203       br(GE, STUB);
1204       subw(cnt2, cnt2, 4);
1205       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1206       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1207       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1208       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1209       zip1(vtmp, T8B, vtmp, vtmpZ);
1210       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1211       add(cnt1, cnt1, 8);
1212       fmovd(tmp2, vtmp);
1213     }
         // Step the str2 offset past the word already loaded (4 bytes of str2 per
         // word in the UL case, 8 otherwise). Once it reaches 0 only the final
         // word remains, which TAIL handles.
1214     adds(cnt2, cnt2, isUL ? 4 : 8);
1215     br(GE, TAIL);
1216     eor(rscratch2, tmp1, tmp2);
1217     cbnz(rscratch2, DIFF);
1218     // main loop
1219     bind(NEXT_WORD);
1220     if (str1_isL == str2_isL) {
1221       ldr(tmp1, Address(str1, cnt2));
1222       ldr(tmp2, Address(str2, cnt2));
1223       adds(cnt2, cnt2, 8);
1224     } else if (isLU) {
1225       ldrs(vtmp, Address(str1, cnt1));
1226       ldr(tmp2, Address(str2, cnt2));
1227       add(cnt1, cnt1, 4);
1228       zip1(vtmp, T8B, vtmp, vtmpZ);
1229       fmovd(tmp1, vtmp);
1230       adds(cnt2, cnt2, 8);
1231     } else { // UL
1232       ldrs(vtmp, Address(str2, cnt2));
1233       ldr(tmp1, Address(str1, cnt1));
1234       zip1(vtmp, T8B, vtmp, vtmpZ);
1235       add(cnt1, cnt1, 8);
1236       fmovd(tmp2, vtmp);
1237       adds(cnt2, cnt2, 4);
1238     }
         // The branch tests the flags of the adds on cnt2 in each variant above.
1239     br(GE, TAIL);
1240
1241     eor(rscratch2, tmp1, tmp2);
1242     cbz(rscratch2, NEXT_WORD);
1243     b(DIFF);
1244     bind(TAIL);
         // Check the pair of words loaded last before fetching the final pair.
1245     eor(rscratch2, tmp1, tmp2);
1246     cbnz(rscratch2, DIFF);
1247     // Last longword.  In the case where length == 4 we compare the
1248     // same longword twice, but that's still faster than another
1249     // conditional branch.
1250     if (str1_isL == str2_isL) {
1251       ldr(tmp1, Address(str1));
1252       ldr(tmp2, Address(str2));
1253     } else if (isLU) {
1254       ldrs(vtmp, Address(str1));
1255       ldr(tmp2, Address(str2));
1256       zip1(vtmp, T8B, vtmp, vtmpZ);
1257       fmovd(tmp1, vtmp);
1258     } else { // UL
1259       ldrs(vtmp, Address(str2));
1260       ldr(tmp1, Address(str1));
1261       zip1(vtmp, T8B, vtmp, vtmpZ);
1262       fmovd(tmp2, vtmp);
1263     }
1264     bind(TAIL_CHECK);
1265     eor(rscratch2, tmp1, tmp2);
1266     cbz(rscratch2, DONE);
1267
1268     // Find the first different characters in the longwords and
1269     // compute their difference.
1270     bind(DIFF);
         // rscratch2 holds tmp1 ^ tmp2. rev + clz give the bit distance to the
         // lowest-addressed differing bit; masking with -8 / -16 rounds it down to
         // a char boundary. Both words are shifted right by that amount and the
         // low char (8 or 16 bits, per ext_chr) of each is extracted.
1271     rev(rscratch2, rscratch2);
1272     clz(rscratch2, rscratch2);
1273     andr(rscratch2, rscratch2, isLL ? -8 : -16);
1274     lsrv(tmp1, tmp1, rscratch2);
1275     (this->*ext_chr)(tmp1, tmp1);
1276     lsrv(tmp2, tmp2, rscratch2);
1277     (this->*ext_chr)(tmp2, tmp2);
1278     subw(result, tmp1, tmp2);
1279     b(DONE);
1280   }
1281
       // Strings whose common length reaches stub_threshold are handed to the
       // out-of-line stub matching the encoding pair.
1282   bind(STUB);
1283     RuntimeAddress stub = nullptr;
1284     switch(ae) {
1285       case StrIntrinsicNode::LL:
1286         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
1287         break;
1288       case StrIntrinsicNode::UU:
1289         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
1290         break;
1291       case StrIntrinsicNode::LU:
1292         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
1293         break;
1294       case StrIntrinsicNode::UL:
1295         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
1296         break;
1297       default:
1298         ShouldNotReachHere();
1299      }
1300     assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
1301     address call = trampoline_call(stub);
         // A null return from trampoline_call is reported as a compilation
         // failure and code emission stops here; the labels not yet bound are
         // reset in debug builds.
1302     if (call == nullptr) {
1303       DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
1304       ciEnv::current()->record_failure("CodeCache is full");
1305       return;
1306     }
1307     b(DONE);
1308
       // Char-at-a-time path for common lengths of at most minCharsInWord chars.
       // cnt1 is no longer needed as a count here and is reused to hold a char
       // loaded from str2.
1309   bind(SHORT_STRING);
1310   // Is the minimum length zero?
1311   cbz(cnt2, DONE);
1312   // arrange code to do most branches while loading and loading next characters
1313   // while comparing previous
1314   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1315   subs(cnt2, cnt2, 1);
1316   br(EQ, SHORT_LAST_INIT);
1317   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1318   b(SHORT_LOOP_START);
1319   bind(SHORT_LOOP);
1320   subs(cnt2, cnt2, 1);
1321   br(EQ, SHORT_LAST);
1322   bind(SHORT_LOOP_START);
       // Two char pairs are in flight: (tmp1, cnt1) and (tmp2, rscratch1). Each
       // half of the loop loads one pair while the other pair is compared.
1323   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
1324   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
1325   cmp(tmp1, cnt1);
1326   br(NE, SHORT_LOOP_TAIL);
1327   subs(cnt2, cnt2, 1);
1328   br(EQ, SHORT_LAST2);
1329   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1330   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1331   cmp(tmp2, rscratch1);
1332   br(EQ, SHORT_LOOP);
1333   sub(result, tmp2, rscratch1);
1334   b(DONE);
1335   bind(SHORT_LOOP_TAIL);
1336   sub(result, tmp1, cnt1);
1337   b(DONE);
1338   bind(SHORT_LAST2);
1339   cmp(tmp2, rscratch1);
1340   br(EQ, DONE);
1341   sub(result, tmp2, rscratch1);
1342
1343   b(DONE);
1344   bind(SHORT_LAST_INIT);
1345   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1346   bind(SHORT_LAST);
1347   cmp(tmp1, cnt1);
1348   br(EQ, DONE);
1349   sub(result, tmp1, cnt1);
1350
1351   bind(DONE);
1352
1353   BLOCK_COMMENT("} string_compare");
1354 }
1355 
1356 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
1357                                      FloatRegister src2, Condition cond, bool isQ) {
1358   // LT/LE/LO/LS are emitted as GT/GE/HI/HS with the two sources exchanged;
1359   // NE is emitted as EQ followed by notr on dst. Other conditions pass through.
1360   const bool mirrored = (cond == LT || cond == LE || cond == LO || cond == LS);
1361   const bool negate = (cond == NE);
1362   if (negate) {
1363     cond = EQ;
1364   } else if (mirrored) {
1365     cond = (cond == LT) ? GT : (cond == LE) ? GE : (cond == LO) ? HI : HS;
1366   }
1367   FloatRegister lhs = mirrored ? src2 : src1;
1368   FloatRegister rhs = mirrored ? src1 : src2;
1369
1370   SIMD_Arrangement arrangement = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1371   if (is_floating_point_type(bt)) {
1372     fcm(cond, dst, arrangement, lhs, rhs);
1373   } else {
1374     cm(cond, dst, arrangement, lhs, rhs);
1375   }
1376
1377   if (negate) {
1378     notr(dst, isQ ? T16B : T8B, dst);
1379   }
1380 }
1381 
1382 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1383                                           Condition cond, bool isQ) {
1384   // Emits the single-source fcm/cm form for src. NE is emitted as EQ with the
1385   // result inverted by notr; every other condition is passed through unchanged.
1386   const bool is_fp = (bt == T_FLOAT || bt == T_DOUBLE);
1387   const bool negate = (cond == Assembler::NE);
1388   const Condition emitted = negate ? Assembler::EQ : cond;
1389   SIMD_Arrangement arrangement = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1390
1391   if (is_fp) {
1392     fcm(emitted, dst, arrangement, src);
1393   } else {
1394     cm(emitted, dst, arrangement, src);
1395   }
1396
1397   if (negate) {
1398     notr(dst, isQ ? T16B : T8B, dst);
1399   }
1400 }
1401 
1402 // Compress the least significant bit of each byte to the rightmost and clear
1403 // the higher garbage bits.
     // On exit bit i of dst (i = 0..7) holds bit 0 of input byte i and bits 8..63
     // are zero. Assumes every input byte is 0x00 or 0x01, as in the example
     // below; any other set bit would be merged into the result by the orr steps.
1404 void C2_MacroAssembler::bytemask_compress(Register dst) {
1405   // Example input, dst = 0x01 00 00 00 01 01 00 01
1406   // The "??" bytes are garbage.
1407   orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
1408   orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
1409   orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
1410   andr(dst, dst, 0xff);                   // dst = 0x8D
1411 }
1412 
1413 // Pack the lowest-numbered bit of each mask element in src into a long value
1414 // in dst, at most the first 64 lane elements.
1415 // Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM.
     // bt is the element type the predicate src describes and lane_cnt the number
     // of lanes to pack (a power of two, at most 64). vtmp1 and vtmp2 are scratch
     // vector registers that must differ; dst must not be rscratch1.
1416 void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
1417                                          FloatRegister vtmp1, FloatRegister vtmp2) {
1418   assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1419   assert_different_registers(dst, rscratch1);
1420   assert_different_registers(vtmp1, vtmp2);
1421
1422   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1423   // Example:   src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
1424   // Expected:  dst = 0x658D
1425
1426   // Convert the mask into vector with sequential bytes.
1427   // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
1428   sve_cpy(vtmp1, size, src, 1, false);
       // For wider element types the 0/1 elements are narrowed so that each lane
       // occupies exactly one byte of vtmp1.
1429   if (bt != T_BYTE) {
1430     sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
1431   }
1432
1433   if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
1434     // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
1435     // is to compress each significant bit of the byte in a cross-lane way. Due
1436     // to the lack of a cross-lane bit-compress instruction, we use BEXT
1437     // (bit-compress in each lane) with the biggest lane size (T = D) then
1438     // concatenate the results.
1439
1440     // The second source input of BEXT, initialized with 0x01 in each byte.
1441     // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1442     sve_dup(vtmp2, B, 1);
1443
1444     // BEXT vtmp1.D, vtmp1.D, vtmp2.D
1445     // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1446     // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1447     //         ---------------------------------------
1448     // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1449     sve_bext(vtmp1, D, vtmp1, vtmp2);
1450
1451     // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the
1452     // result to dst.
1453     // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1454     // dst   = 0x658D
1455     if (lane_cnt <= 8) {
1456       // No need to concatenate.
1457       umov(dst, vtmp1, B, 0);
1458     } else if (lane_cnt <= 16) {
           // Copy byte 8 (low byte of the second 64-bit lane) next to byte 0 and
           // read the two bytes as one halfword.
1459       ins(vtmp1, B, vtmp1, 1, 8);
1460       umov(dst, vtmp1, H, 0);
1461     } else {
1462       // As the lane count is 64 at most, the final expected value must be in
1463       // the lowest 64 bits after narrowing vtmp1 from D to B.
1464       sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
1465       umov(dst, vtmp1, D, 0);
1466     }
1467   } else if (UseSVE > 0) {
1468     // Compress the lowest 8 bytes.
1469     fmovd(dst, vtmp1);
1470     bytemask_compress(dst);
1471     if (lane_cnt <= 8) return;
1472
1473     // Repeat on higher bytes and join the results.
1474     // Compress 8 bytes in each iteration.
         // The 64-bit element idx of vtmp1 is compressed to one byte, which is
         // merged into dst at bit position 8 * idx.
1475     for (int idx = 1; idx < (lane_cnt / 8); idx++) {
1476       sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
1477       bytemask_compress(rscratch1);
1478       orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
1479     }
1480   } else {
1481     assert(false, "unsupported");
1482     ShouldNotReachHere();
1483   }
1484 }
1485 
1486 // Unpack the mask, a long value in src, into predicate register dst based on the
1487 // corresponding data type. Note that dst can support at most 64 lanes.
1488 // Below example gives the expected dst predicate register in different types, with
1489 // a valid src(0x658D) on a 1024-bit vector size machine.
1490 // BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
1491 // SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
1492 // INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
1493 // LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1494 //
1495 // The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D which
1496 // has 24 significant bits would be an invalid input if dst predicate register refers to
1497 // a LONG type 1024-bit vector, which has at most 16 lanes.
     //
     // Only available with UseSVE == 2 and the SVE bit-permute extension (asserted
     // below). vtmp1 and vtmp2 are scratch vector registers overwritten here; the
     // final sve_cmp also writes dst.
1498 void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
1499                                            FloatRegister vtmp1, FloatRegister vtmp2) {
1500   assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
1501          lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
1502   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1503   // Example:   src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
1504   // Expected:  dst = 0b01100101 10001101
1505
1506   // Put long value from general purpose register into the first lane of vector.
1507   // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1508   sve_dup(vtmp1, B, 0);
1509   mov(vtmp1, D, 0, src);
1510
1511   // As sve_cmp generates mask value with the minimum unit in byte, we should
1512   // transform the value in the first lane which is mask in bit now to the
1513   // mask in byte, which can be done by SVE2's BDEP instruction.
1514
1515   // The first source input of BDEP instruction. Deposit each byte in every 8 bytes.
1516   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1517   if (lane_cnt <= 8) {
1518     // Nothing. As only one byte exists.
1519   } else if (lane_cnt <= 16) {
         // Move byte 1 of the mask to byte 8 (low byte of the second 64-bit lane)
         // and clear its old position.
1520     ins(vtmp1, B, vtmp1, 8, 1);
1521     mov(vtmp1, B, 1, zr);
1522   } else {
1523     sve_vector_extend(vtmp1, D, vtmp1, B);
1524   }
1525
1526   // The second source input of BDEP instruction, initialized with 0x01 for each byte.
1527   // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1528   sve_dup(vtmp2, B, 1);
1529
1530   // BDEP vtmp1.D, vtmp1.D, vtmp2.D
1531   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1532   // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1533   //         ---------------------------------------
1534   // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1535   sve_bdep(vtmp1, D, vtmp1, vtmp2);
1536
       // Widen the 0/1 bytes to the element size of bt so that each lane of the
       // compare below sees one mask value.
1537   if (bt != T_BYTE) {
1538     sve_vector_extend(vtmp1, size, vtmp1, B);
1539   }
1540   // Generate mask according to the given vector, in which the elements have been
1541   // extended to expected type.
1542   // dst = 0b01100101 10001101
1543   sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
1544 }
1545 
1546 // Clobbers: rflags
1547 // Emits an SVE compare of zn against zm under governing predicate pg into pd.
1548 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1549                                     FloatRegister zn, FloatRegister zm, Condition cond) {
1550   assert(pg->is_governing(), "This register has to be a governing predicate register");
1551
1552   // LE/LT/LO/LS are emitted as GE/GT/HI/HS with the two vector operands exchanged.
1553   const bool mirrored = (cond == LE || cond == LT || cond == LO || cond == LS);
1554   if (mirrored) {
1555     cond = (cond == LE) ? GE : (cond == LT) ? GT : (cond == LO) ? HI : HS;
1556   }
1557   FloatRegister lhs = mirrored ? zm : zn;
1558   FloatRegister rhs = mirrored ? zn : zm;
1559
1560   SIMD_RegVariant variant = elemType_to_regVariant(bt);
1561   if (!is_floating_point_type(bt)) {
1562     assert(is_integral_type(bt), "unsupported element type");
1563     sve_cmp(cond, pd, variant, pg, lhs, rhs);
1564   } else {
1565     sve_fcm(cond, pd, variant, pg, lhs, rhs);
1566   }
1567 }
1568 
1569 // Get index of the last mask lane that is set
     // bt gives the lane size of the mask in src; ptmp is a scratch predicate.
     // Writes dst, ptmp and rscratch1.
1570 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
1571   SIMD_RegVariant size = elemType_to_regVariant(bt);
       // Reverse the lane order, so the last set lane of src becomes the first
       // set lane of ptmp.
1572   sve_rev(ptmp, size, src);
       // Keep only the lanes in front of that first set lane and count them
       // (per the SVE BRKB/CNTP semantics); the count is the number of lanes
       // that follow the last set lane in src.
1573   sve_brkb(ptmp, ptrue, ptmp, false);
1574   sve_cntp(dst, size, ptrue, ptmp);
       // dst = (lanes per MaxVectorSize-byte vector - 1) - count.
1575   movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
1576   subw(dst, rscratch1, dst);
1577 }
1578 
1579 // Extend integer vector src to dst with the same lane count
1580 // but larger element size, e.g. 4B -> 4I. Each widening step is one sxtl.
1581 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1582                                            FloatRegister src, BasicType src_bt) {
1583   switch (src_bt) {
1584   case T_BYTE:
1585     if (dst_bt != T_SHORT) { // 4B to 4I, in two steps: B -> H, then H -> S
1586       assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1587       sxtl(dst, T8H, src, T8B);
1588       sxtl(dst, T4S, dst, T4H);
1589     } else { // 4B/8B to 4S/8S
1590       assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1591       sxtl(dst, T8H, src, T8B);
1592     }
1593     return;
1594   case T_SHORT: // 4S to 4I
1595     assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1596     sxtl(dst, T4S, src, T4H);
1597     return;
1598   case T_INT: // 2I to 2L
1599     assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1600     sxtl(dst, T2D, src, T2S);
1601     return;
1602   default:
1603     ShouldNotReachHere();
1604   }
1605 }
1606 
1607 // Narrow integer vector src down to dst with the same lane count
1608 // but smaller element size, e.g. 4I -> 4B. Each narrowing step is one xtn.
1609 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1610                                            FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1611   switch (src_bt) {
1612   case T_SHORT: // 4S/8S to 4B/8B
1613     assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1614     assert(dst_bt == T_BYTE, "unsupported");
1615     xtn(dst, T8B, src, T8H);
1616     return;
1617   case T_INT: // 4I to 4B/4S
1618     assert(src_vlen_in_bytes == 16, "unsupported");
1619     assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1620     xtn(dst, T4H, src, T4S);
1621     // A second step, H -> B, is only needed when the target element is a byte.
1622     if (dst_bt == T_BYTE) xtn(dst, T8B, dst, T8H);
1623     return;
1624   case T_LONG: // 2L to 2I
1625     assert(src_vlen_in_bytes == 16, "unsupported");
1626     assert(dst_bt == T_INT, "unsupported");
1627     xtn(dst, T2S, src, T2D);
1628     return;
1629   default:
1630     ShouldNotReachHere();
1631   }
1632 }
1633 
1634 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1635                                           FloatRegister src, SIMD_RegVariant src_size) {
1636   assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
1637   if (src_size == B) {
1638     switch (dst_size) {
1639     case H:
1640       sve_sunpklo(dst, H, src);
1641       break;
1642     case S:
1643       sve_sunpklo(dst, H, src);
1644       sve_sunpklo(dst, S, dst);
1645       break;
1646     case D:
1647       sve_sunpklo(dst, H, src);
1648       sve_sunpklo(dst, S, dst);
1649       sve_sunpklo(dst, D, dst);
1650       break;
1651     default:
1652       ShouldNotReachHere();
1653     }
1654   } else if (src_size == H) {
1655     if (dst_size == S) {
1656       sve_sunpklo(dst, S, src);
1657     } else { // D
1658       sve_sunpklo(dst, S, src);
1659       sve_sunpklo(dst, D, dst);
1660     }
1661   } else if (src_size == S) {
1662     sve_sunpklo(dst, D, src);
1663   }
1664 }
1665 
1666 // Vector narrow from src to dst with specified element sizes.
1667 // High part of dst vector will be filled with zero.
1668 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1669                                           FloatRegister src, SIMD_RegVariant src_size,
1670                                           FloatRegister tmp) {
1671   assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1672   assert_different_registers(src, tmp);
1673   sve_dup(tmp, src_size, 0);
1674   if (src_size == D) {
1675     switch (dst_size) {
1676     case S:
1677       sve_uzp1(dst, S, src, tmp);
1678       break;
1679     case H:
1680       assert_different_registers(dst, tmp);
1681       sve_uzp1(dst, S, src, tmp);
1682       sve_uzp1(dst, H, dst, tmp);
1683       break;
1684     case B:
1685       assert_different_registers(dst, tmp);
1686       sve_uzp1(dst, S, src, tmp);
1687       sve_uzp1(dst, H, dst, tmp);
1688       sve_uzp1(dst, B, dst, tmp);
1689       break;
1690     default:
1691       ShouldNotReachHere();
1692     }
1693   } else if (src_size == S) {
1694     if (dst_size == H) {
1695       sve_uzp1(dst, H, src, tmp);
1696     } else { // B
1697       assert_different_registers(dst, tmp);
1698       sve_uzp1(dst, H, src, tmp);
1699       sve_uzp1(dst, B, dst, tmp);
1700     }
1701   } else if (src_size == H) {
1702     sve_uzp1(dst, B, src, tmp);
1703   }
1704 }
1705 
1706 // Extend src predicate to dst predicate with the same lane count but larger
1707 // element size, e.g. 64Byte -> 512Long
1708 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1709                                              uint dst_element_length_in_bytes,
1710                                              uint src_element_length_in_bytes) {
1711   if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1712     sve_punpklo(dst, src);
1713   } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1714     sve_punpklo(dst, src);
1715     sve_punpklo(dst, dst);
1716   } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1717     sve_punpklo(dst, src);
1718     sve_punpklo(dst, dst);
1719     sve_punpklo(dst, dst);
1720   } else {
1721     assert(false, "unsupported");
1722     ShouldNotReachHere();
1723   }
1724 }
1725 
1726 // Narrow src predicate to dst predicate with the same lane count but
1727 // smaller element size, e.g. 512Long -> 64Byte
1728 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1729                                              uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
1730   // The insignificant bits in src predicate are expected to be zero.
1731   // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
1732   // passed as the second argument. An example narrowing operation with a given mask would be -
1733   // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I
1734   // Mask (for 2 Longs) : TF
1735   // Predicate register for the above mask (16 bits) : 00000001 00000000
1736   // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
1737   // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0)
1738   assert_different_registers(src, ptmp);
1739   assert_different_registers(dst, ptmp);
1740   sve_pfalse(ptmp);
1741   if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1742     sve_uzp1(dst, B, src, ptmp);
1743   } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1744     sve_uzp1(dst, H, src, ptmp);
1745     sve_uzp1(dst, B, dst, ptmp);
1746   } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1747     sve_uzp1(dst, S, src, ptmp);
1748     sve_uzp1(dst, H, dst, ptmp);
1749     sve_uzp1(dst, B, dst, ptmp);
1750   } else {
1751     assert(false, "unsupported");
1752     ShouldNotReachHere();
1753   }
1754 }
1755 
1756 // Vector reduction add for integral type with ASIMD instructions.
1757 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1758                                                  Register isrc, FloatRegister vsrc,
1759                                                  unsigned vector_length_in_bytes,
1760                                                  FloatRegister vtmp) {
1761   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1762   assert_different_registers(dst, isrc);
1763   bool isQ = vector_length_in_bytes == 16;
1764 
1765   BLOCK_COMMENT("neon_reduce_add_integral {");
1766     switch(bt) {
1767       case T_BYTE:
1768         addv(vtmp, isQ ? T16B : T8B, vsrc);
1769         smov(dst, vtmp, B, 0);
1770         addw(dst, dst, isrc, ext::sxtb);
1771         break;
1772       case T_SHORT:
1773         addv(vtmp, isQ ? T8H : T4H, vsrc);
1774         smov(dst, vtmp, H, 0);
1775         addw(dst, dst, isrc, ext::sxth);
1776         break;
1777       case T_INT:
1778         isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1779         umov(dst, vtmp, S, 0);
1780         addw(dst, dst, isrc);
1781         break;
1782       case T_LONG:
1783         assert(isQ, "unsupported");
1784         addpd(vtmp, vsrc);
1785         umov(dst, vtmp, D, 0);
1786         add(dst, dst, isrc);
1787         break;
1788       default:
1789         assert(false, "unsupported");
1790         ShouldNotReachHere();
1791     }
1792   BLOCK_COMMENT("} neon_reduce_add_integral");
1793 }
1794 
1795 // Vector reduction multiply for integral type with ASIMD instructions.
1796 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
1797 // Clobbers: rscratch1
1798 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
1799                                                  Register isrc, FloatRegister vsrc,
1800                                                  unsigned vector_length_in_bytes,
1801                                                  FloatRegister vtmp1, FloatRegister vtmp2) {
1802   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1803   bool isQ = vector_length_in_bytes == 16;
1804 
1805   BLOCK_COMMENT("neon_reduce_mul_integral {");
1806     switch(bt) {
1807       case T_BYTE:
1808         if (isQ) {
1809           // Multiply the lower half and higher half of vector iteratively.
1810           // vtmp1 = vsrc[8:15]
1811           ins(vtmp1, D, vsrc, 0, 1);
1812           // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
1813           mulv(vtmp1, T8B, vtmp1, vsrc);
1814           // vtmp2 = vtmp1[4:7]
1815           ins(vtmp2, S, vtmp1, 0, 1);
1816           // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
1817           mulv(vtmp1, T8B, vtmp2, vtmp1);
1818         } else {
1819           ins(vtmp1, S, vsrc, 0, 1);
1820           mulv(vtmp1, T8B, vtmp1, vsrc);
1821         }
1822         // vtmp2 = vtmp1[2:3]
1823         ins(vtmp2, H, vtmp1, 0, 1);
1824         // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
1825         mulv(vtmp2, T8B, vtmp2, vtmp1);
1826         // dst = vtmp2[0] * isrc * vtmp2[1]
1827         umov(rscratch1, vtmp2, B, 0);
1828         mulw(dst, rscratch1, isrc);
1829         sxtb(dst, dst);
1830         umov(rscratch1, vtmp2, B, 1);
1831         mulw(dst, rscratch1, dst);
1832         sxtb(dst, dst);
1833         break;
1834       case T_SHORT:
1835         if (isQ) {
1836           ins(vtmp2, D, vsrc, 0, 1);
1837           mulv(vtmp2, T4H, vtmp2, vsrc);
1838           ins(vtmp1, S, vtmp2, 0, 1);
1839           mulv(vtmp1, T4H, vtmp1, vtmp2);
1840         } else {
1841           ins(vtmp1, S, vsrc, 0, 1);
1842           mulv(vtmp1, T4H, vtmp1, vsrc);
1843         }
1844         umov(rscratch1, vtmp1, H, 0);
1845         mulw(dst, rscratch1, isrc);
1846         sxth(dst, dst);
1847         umov(rscratch1, vtmp1, H, 1);
1848         mulw(dst, rscratch1, dst);
1849         sxth(dst, dst);
1850         break;
1851       case T_INT:
1852         if (isQ) {
1853           ins(vtmp1, D, vsrc, 0, 1);
1854           mulv(vtmp1, T2S, vtmp1, vsrc);
1855         } else {
1856           vtmp1 = vsrc;
1857         }
1858         umov(rscratch1, vtmp1, S, 0);
1859         mul(dst, rscratch1, isrc);
1860         umov(rscratch1, vtmp1, S, 1);
1861         mul(dst, rscratch1, dst);
1862         break;
1863       case T_LONG:
1864         umov(rscratch1, vsrc, D, 0);
1865         mul(dst, isrc, rscratch1);
1866         umov(rscratch1, vsrc, D, 1);
1867         mul(dst, dst, rscratch1);
1868         break;
1869       default:
1870         assert(false, "unsupported");
1871         ShouldNotReachHere();
1872     }
1873   BLOCK_COMMENT("} neon_reduce_mul_integral");
1874 }
1875 
1876 // Vector reduction multiply for floating-point type with ASIMD instructions.
1877 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
1878                                            FloatRegister fsrc, FloatRegister vsrc,
1879                                            unsigned vector_length_in_bytes,
1880                                            FloatRegister vtmp) {
1881   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1882   bool isQ = vector_length_in_bytes == 16;
1883 
1884   BLOCK_COMMENT("neon_reduce_mul_fp {");
1885     switch(bt) {
1886       case T_FLOAT:
1887         fmuls(dst, fsrc, vsrc);
1888         ins(vtmp, S, vsrc, 0, 1);
1889         fmuls(dst, dst, vtmp);
1890         if (isQ) {
1891           ins(vtmp, S, vsrc, 0, 2);
1892           fmuls(dst, dst, vtmp);
1893           ins(vtmp, S, vsrc, 0, 3);
1894           fmuls(dst, dst, vtmp);
1895          }
1896         break;
1897       case T_DOUBLE:
1898         assert(isQ, "unsupported");
1899         fmuld(dst, fsrc, vsrc);
1900         ins(vtmp, D, vsrc, 0, 1);
1901         fmuld(dst, dst, vtmp);
1902         break;
1903       default:
1904         assert(false, "unsupported");
1905         ShouldNotReachHere();
1906     }
1907   BLOCK_COMMENT("} neon_reduce_mul_fp");
1908 }
1909 
1910 // Helper to select logical instruction
1911 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
1912                                                    Register Rn, Register Rm,
1913                                                    enum shift_kind kind, unsigned shift) {
1914   switch(opc) {
1915     case Op_AndReductionV:
1916       is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
1917       break;
1918     case Op_OrReductionV:
1919       is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
1920       break;
1921     case Op_XorReductionV:
1922       is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
1923       break;
1924     default:
1925       assert(false, "unsupported");
1926       ShouldNotReachHere();
1927   }
1928 }
1929 
1930 // Vector reduction logical operations And, Or, Xor
1931 // Clobbers: rscratch1
1932 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
1933                                             Register isrc, FloatRegister vsrc,
1934                                             unsigned vector_length_in_bytes) {
1935   assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
1936          "unsupported");
1937   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1938   assert_different_registers(dst, isrc);
1939   bool isQ = vector_length_in_bytes == 16;
1940 
1941   BLOCK_COMMENT("neon_reduce_logical {");
1942     umov(rscratch1, vsrc, isQ ? D : S, 0);
1943     umov(dst, vsrc, isQ ? D : S, 1);
1944     neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
1945     switch(bt) {
1946       case T_BYTE:
1947         if (isQ) {
1948           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1949         }
1950         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1951         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
1952         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1953         sxtb(dst, dst);
1954         break;
1955       case T_SHORT:
1956         if (isQ) {
1957           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1958         }
1959         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1960         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1961         sxth(dst, dst);
1962         break;
1963       case T_INT:
1964         if (isQ) {
1965           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1966         }
1967         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1968         break;
1969       case T_LONG:
1970         assert(isQ, "unsupported");
1971         neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
1972         break;
1973       default:
1974         assert(false, "unsupported");
1975         ShouldNotReachHere();
1976     }
1977   BLOCK_COMMENT("} neon_reduce_logical");
1978 }
1979 
1980 // Vector reduction min/max for integral type with ASIMD instructions.
1981 // Note: vtmp is not used and expected to be fnoreg for T_LONG case.
1982 // Clobbers: rscratch1, rflags
1983 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
1984                                                     Register isrc, FloatRegister vsrc,
1985                                                     unsigned vector_length_in_bytes,
1986                                                     FloatRegister vtmp) {
1987   assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
1988   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1989   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
1990   assert_different_registers(dst, isrc);
1991   bool isQ = vector_length_in_bytes == 16;
1992   bool is_min = opc == Op_MinReductionV;
1993 
1994   BLOCK_COMMENT("neon_reduce_minmax_integral {");
1995     if (bt == T_LONG) {
1996       assert(vtmp == fnoreg, "should be");
1997       assert(isQ, "should be");
1998       umov(rscratch1, vsrc, D, 0);
1999       cmp(isrc, rscratch1);
2000       csel(dst, isrc, rscratch1, is_min ? LT : GT);
2001       umov(rscratch1, vsrc, D, 1);
2002       cmp(dst, rscratch1);
2003       csel(dst, dst, rscratch1, is_min ? LT : GT);
2004     } else {
2005       SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
2006       if (size == T2S) {
2007         is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
2008       } else {
2009         is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
2010       }
2011       if (bt == T_INT) {
2012         umov(dst, vtmp, S, 0);
2013       } else {
2014         smov(dst, vtmp, elemType_to_regVariant(bt), 0);
2015       }
2016       cmpw(dst, isrc);
2017       cselw(dst, dst, isrc, is_min ? LT : GT);
2018     }
2019   BLOCK_COMMENT("} neon_reduce_minmax_integral");
2020 }
2021 
2022 // Vector reduction for integral type with SVE instruction.
2023 // Supported operations are Add, And, Or, Xor, Max, Min.
2024 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
2025 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
2026                                             FloatRegister src2, PRegister pg, FloatRegister tmp) {
2027   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2028   assert(pg->is_governing(), "This register has to be a governing predicate register");
2029   assert_different_registers(src1, dst);
2030   // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
2031   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2032   switch (opc) {
2033     case Op_AddReductionVI: {
2034       sve_uaddv(tmp, size, pg, src2);
2035       if (bt == T_BYTE) {
2036         smov(dst, tmp, size, 0);
2037         addw(dst, src1, dst, ext::sxtb);
2038       } else if (bt == T_SHORT) {
2039         smov(dst, tmp, size, 0);
2040         addw(dst, src1, dst, ext::sxth);
2041       } else {
2042         umov(dst, tmp, size, 0);
2043         addw(dst, dst, src1);
2044       }
2045       break;
2046     }
2047     case Op_AddReductionVL: {
2048       sve_uaddv(tmp, size, pg, src2);
2049       umov(dst, tmp, size, 0);
2050       add(dst, dst, src1);
2051       break;
2052     }
2053     case Op_AndReductionV: {
2054       sve_andv(tmp, size, pg, src2);
2055       if (bt == T_INT || bt == T_LONG) {
2056         umov(dst, tmp, size, 0);
2057       } else {
2058         smov(dst, tmp, size, 0);
2059       }
2060       if (bt == T_LONG) {
2061         andr(dst, dst, src1);
2062       } else {
2063         andw(dst, dst, src1);
2064       }
2065       break;
2066     }
2067     case Op_OrReductionV: {
2068       sve_orv(tmp, size, pg, src2);
2069       if (bt == T_INT || bt == T_LONG) {
2070         umov(dst, tmp, size, 0);
2071       } else {
2072         smov(dst, tmp, size, 0);
2073       }
2074       if (bt == T_LONG) {
2075         orr(dst, dst, src1);
2076       } else {
2077         orrw(dst, dst, src1);
2078       }
2079       break;
2080     }
2081     case Op_XorReductionV: {
2082       sve_eorv(tmp, size, pg, src2);
2083       if (bt == T_INT || bt == T_LONG) {
2084         umov(dst, tmp, size, 0);
2085       } else {
2086         smov(dst, tmp, size, 0);
2087       }
2088       if (bt == T_LONG) {
2089         eor(dst, dst, src1);
2090       } else {
2091         eorw(dst, dst, src1);
2092       }
2093       break;
2094     }
2095     case Op_MaxReductionV: {
2096       sve_smaxv(tmp, size, pg, src2);
2097       if (bt == T_INT || bt == T_LONG) {
2098         umov(dst, tmp, size, 0);
2099       } else {
2100         smov(dst, tmp, size, 0);
2101       }
2102       if (bt == T_LONG) {
2103         cmp(dst, src1);
2104         csel(dst, dst, src1, Assembler::GT);
2105       } else {
2106         cmpw(dst, src1);
2107         cselw(dst, dst, src1, Assembler::GT);
2108       }
2109       break;
2110     }
2111     case Op_MinReductionV: {
2112       sve_sminv(tmp, size, pg, src2);
2113       if (bt == T_INT || bt == T_LONG) {
2114         umov(dst, tmp, size, 0);
2115       } else {
2116         smov(dst, tmp, size, 0);
2117       }
2118       if (bt == T_LONG) {
2119         cmp(dst, src1);
2120         csel(dst, dst, src1, Assembler::LT);
2121       } else {
2122         cmpw(dst, src1);
2123         cselw(dst, dst, src1, Assembler::LT);
2124       }
2125       break;
2126     }
2127     default:
2128       assert(false, "unsupported");
2129       ShouldNotReachHere();
2130   }
2131 
2132   if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
2133     if (bt == T_BYTE) {
2134       sxtb(dst, dst);
2135     } else if (bt == T_SHORT) {
2136       sxth(dst, dst);
2137     }
2138   }
2139 }
2140 
2141 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
2142 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
2143 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
2144 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
2145   uint32_t max_vector_length = Matcher::max_vector_size(bt);
2146   assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
2147 
2148   // Set all elements to false if the input "lane_cnt" is zero.
2149   if (lane_cnt == 0) {
2150     sve_pfalse(dst);
2151     return;
2152   }
2153 
2154   SIMD_RegVariant size = elemType_to_regVariant(bt);
2155   assert(size != Q, "invalid size");
2156 
2157   // Set all true if "lane_cnt" equals to the max lane count.
2158   if (lane_cnt == max_vector_length) {
2159     sve_ptrue(dst, size, /* ALL */ 0b11111);
2160     return;
2161   }
2162 
2163   // Fixed numbers for "ptrue".
2164   switch(lane_cnt) {
2165   case 1: /* VL1 */
2166   case 2: /* VL2 */
2167   case 3: /* VL3 */
2168   case 4: /* VL4 */
2169   case 5: /* VL5 */
2170   case 6: /* VL6 */
2171   case 7: /* VL7 */
2172   case 8: /* VL8 */
2173     sve_ptrue(dst, size, lane_cnt);
2174     return;
2175   case 16:
2176     sve_ptrue(dst, size, /* VL16 */ 0b01001);
2177     return;
2178   case 32:
2179     sve_ptrue(dst, size, /* VL32 */ 0b01010);
2180     return;
2181   case 64:
2182     sve_ptrue(dst, size, /* VL64 */ 0b01011);
2183     return;
2184   case 128:
2185     sve_ptrue(dst, size, /* VL128 */ 0b01100);
2186     return;
2187   case 256:
2188     sve_ptrue(dst, size, /* VL256 */ 0b01101);
2189     return;
2190   default:
2191     break;
2192   }
2193 
2194   // Special patterns for "ptrue".
2195   if (lane_cnt == round_down_power_of_2(max_vector_length)) {
2196     sve_ptrue(dst, size, /* POW2 */ 0b00000);
2197   } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
2198     sve_ptrue(dst, size, /* MUL4 */ 0b11101);
2199   } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
2200     sve_ptrue(dst, size, /* MUL3 */ 0b11110);
2201   } else {
2202     // Encode to "whileltw" for the remaining cases.
2203     mov(rscratch1, lane_cnt);
2204     sve_whileltw(dst, size, zr, rscratch1);
2205   }
2206 }
2207 
2208 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
2209 // Any remaining elements of dst will be filled with zero.
2210 // Clobbers: rscratch1
2211 // Preserves: src, mask
2212 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
2213                                            FloatRegister vtmp1, FloatRegister vtmp2,
2214                                            PRegister pgtmp) {
2215   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2216   assert_different_registers(dst, src, vtmp1, vtmp2);
2217   assert_different_registers(mask, pgtmp);
2218 
2219   // Example input:   src   = 8888 7777 6666 5555 4444 3333 2222 1111
2220   //                  mask  = 0001 0000 0000 0001 0001 0000 0001 0001
2221   // Expected result: dst   = 0000 0000 0000 8888 5555 4444 2222 1111
2222   sve_dup(vtmp2, H, 0);
2223 
2224   // Extend lowest half to type INT.
2225   // dst = 00004444 00003333 00002222 00001111
2226   sve_uunpklo(dst, S, src);
2227   // pgtmp = 00000001 00000000 00000001 00000001
2228   sve_punpklo(pgtmp, mask);
2229   // Pack the active elements in size of type INT to the right,
2230   // and fill the remainings with zero.
2231   // dst = 00000000 00004444 00002222 00001111
2232   sve_compact(dst, S, dst, pgtmp);
2233   // Narrow the result back to type SHORT.
2234   // dst = 0000 0000 0000 0000 0000 4444 2222 1111
2235   sve_uzp1(dst, H, dst, vtmp2);
2236   // Count the active elements of lowest half.
2237   // rscratch1 = 3
2238   sve_cntp(rscratch1, S, ptrue, pgtmp);
2239 
2240   // Repeat to the highest half.
2241   // pgtmp = 00000001 00000000 00000000 00000001
2242   sve_punpkhi(pgtmp, mask);
2243   // vtmp1 = 00008888 00007777 00006666 00005555
2244   sve_uunpkhi(vtmp1, S, src);
2245   // vtmp1 = 00000000 00000000 00008888 00005555
2246   sve_compact(vtmp1, S, vtmp1, pgtmp);
2247   // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2248   sve_uzp1(vtmp1, H, vtmp1, vtmp2);
2249 
2250   // Compressed low:   dst   = 0000 0000 0000 0000 0000 4444 2222 1111
2251   // Compressed high:  vtmp1 = 0000 0000 0000 0000 0000 0000 8888  5555
2252   // Left shift(cross lane) compressed high with TRUE_CNT lanes,
2253   // TRUE_CNT is the number of active elements in the compressed low.
2254   neg(rscratch1, rscratch1);
2255   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2256   sve_index(vtmp2, H, rscratch1, 1);
2257   // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
2258   sve_tbl(vtmp1, H, vtmp1, vtmp2);
2259 
2260   // Combine the compressed high(after shifted) with the compressed low.
2261   // dst = 0000 0000 0000 8888 5555 4444 2222 1111
2262   sve_orr(dst, dst, vtmp1);
2263 }
2264 
2265 // Clobbers: rscratch1, rscratch2
2266 // Preserves: src, mask
2267 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
2268                                           FloatRegister vtmp1, FloatRegister vtmp2,
2269                                           FloatRegister vtmp3, FloatRegister vtmp4,
2270                                           PRegister ptmp, PRegister pgtmp) {
2271   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2272   assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
2273   assert_different_registers(mask, ptmp, pgtmp);
2274   // Example input:   src   = 88 77 66 55 44 33 22 11
2275   //                  mask  = 01 00 00 01 01 00 01 01
2276   // Expected result: dst   = 00 00 00 88 55 44 22 11
2277 
2278   sve_dup(vtmp4, B, 0);
2279   // Extend lowest half to type SHORT.
2280   // vtmp1 = 0044 0033 0022 0011
2281   sve_uunpklo(vtmp1, H, src);
2282   // ptmp = 0001 0000 0001 0001
2283   sve_punpklo(ptmp, mask);
2284   // Count the active elements of lowest half.
2285   // rscratch2 = 3
2286   sve_cntp(rscratch2, H, ptrue, ptmp);
2287   // Pack the active elements in size of type SHORT to the right,
2288   // and fill the remainings with zero.
2289   // dst = 0000 0044 0022 0011
2290   sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
2291   // Narrow the result back to type BYTE.
2292   // dst = 00 00 00 00 00 44 22 11
2293   sve_uzp1(dst, B, dst, vtmp4);
2294 
2295   // Repeat to the highest half.
2296   // ptmp = 0001 0000 0000 0001
2297   sve_punpkhi(ptmp, mask);
2298   // vtmp1 = 0088 0077 0066 0055
2299   sve_uunpkhi(vtmp2, H, src);
2300   // vtmp1 = 0000 0000 0088 0055
2301   sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);
2302 
2303   sve_dup(vtmp4, B, 0);
2304   // vtmp1 = 00 00 00 00 00 00 88 55
2305   sve_uzp1(vtmp1, B, vtmp1, vtmp4);
2306 
2307   // Compressed low:   dst   = 00 00 00 00 00 44 22 11
2308   // Compressed high:  vtmp1 = 00 00 00 00 00 00 88 55
2309   // Left shift(cross lane) compressed high with TRUE_CNT lanes,
2310   // TRUE_CNT is the number of active elements in the compressed low.
2311   neg(rscratch2, rscratch2);
2312   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2313   sve_index(vtmp2, B, rscratch2, 1);
2314   // vtmp1 = 00 00 00 88 55 00 00 00
2315   sve_tbl(vtmp1, B, vtmp1, vtmp2);
2316   // Combine the compressed high(after shifted) with the compressed low.
2317   // dst = 00 00 00 88 55 44 22 11
2318   sve_orr(dst, dst, vtmp1);
2319 }
2320 
2321 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2322   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2323   SIMD_Arrangement size = isQ ? T16B : T8B;
2324   if (bt == T_BYTE) {
2325     rbit(dst, size, src);
2326   } else {
2327     neon_reverse_bytes(dst, src, bt, isQ);
2328     rbit(dst, size, dst);
2329   }
2330 }
2331 
2332 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2333   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2334   SIMD_Arrangement size = isQ ? T16B : T8B;
2335   switch (bt) {
2336     case T_BYTE:
2337       if (dst != src) {
2338         orr(dst, size, src, src);
2339       }
2340       break;
2341     case T_SHORT:
2342       rev16(dst, size, src);
2343       break;
2344     case T_INT:
2345       rev32(dst, size, src);
2346       break;
2347     case T_LONG:
2348       rev64(dst, size, src);
2349       break;
2350     default:
2351       assert(false, "unsupported");
2352       ShouldNotReachHere();
2353   }
2354 }
2355 
2356 // Extract a scalar element from an sve vector at position 'idx'.
2357 // The input elements in src are expected to be of integral type.
2358 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2359                                              int idx, FloatRegister vtmp) {
2360   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2361   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2362   if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
2363     if (bt == T_INT || bt == T_LONG) {
2364       umov(dst, src, size, idx);
2365     } else {
2366       smov(dst, src, size, idx);
2367     }
2368   } else {
2369     sve_orr(vtmp, src, src);
2370     sve_ext(vtmp, vtmp, idx << size);
2371     if (bt == T_INT || bt == T_LONG) {
2372       umov(dst, vtmp, size, 0);
2373     } else {
2374       smov(dst, vtmp, size, 0);
2375     }
2376   }
2377 }
2378 
2379 // java.lang.Math::round intrinsics
2380 
2381 // Clobbers: rscratch1, rflags
2382 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2383                                           FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
2384   assert_different_registers(tmp1, tmp2, tmp3, src, dst);
2385   switch (T) {
2386     case T2S:
2387     case T4S:
2388       fmovs(tmp1, T, 0.5f);
2389       mov(rscratch1, jint_cast(0x1.0p23f));
2390       break;
2391     case T2D:
2392       fmovd(tmp1, T, 0.5);
2393       mov(rscratch1, julong_cast(0x1.0p52));
2394       break;
2395     default:
2396       assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2397   }
2398   fadd(tmp1, T, tmp1, src);
2399   fcvtms(tmp1, T, tmp1);
2400   // tmp1 = floor(src + 0.5, ties to even)
2401 
2402   fcvtas(dst, T, src);
2403   // dst = round(src), ties to away
2404 
2405   fneg(tmp3, T, src);
2406   dup(tmp2, T, rscratch1);
2407   cm(HS, tmp3, T, tmp3, tmp2);
2408   // tmp3 is now a set of flags
2409 
2410   bif(dst, T16B, tmp1, tmp3);
2411   // result in dst
2412 }
2413 
2414 // Clobbers: rscratch1, rflags
2415 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2416                                          FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
2417   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2418   assert_different_registers(tmp1, tmp2, src, dst);
2419 
2420   switch (T) {
2421     case S:
2422       mov(rscratch1, jint_cast(0x1.0p23f));
2423       break;
2424     case D:
2425       mov(rscratch1, julong_cast(0x1.0p52));
2426       break;
2427     default:
2428       assert(T == S || T == D, "invalid register variant");
2429   }
2430 
2431   sve_frinta(dst, T, ptrue, src);
2432   // dst = round(src), ties to away
2433 
2434   Label none;
2435 
2436   sve_fneg(tmp1, T, ptrue, src);
2437   sve_dup(tmp2, T, rscratch1);
2438   sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
2439   br(EQ, none);
2440   {
2441     sve_cpy(tmp1, T, pgtmp, 0.5);
2442     sve_fadd(tmp1, T, pgtmp, src);
2443     sve_frintm(dst, T, pgtmp, tmp1);
2444     // dst = floor(src + 0.5, ties to even)
2445   }
2446   bind(none);
2447 
2448   sve_fcvtzs(dst, T, ptrue, dst, T);
2449   // result in dst
2450 }
2451 
2452 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2453                                            FloatRegister one, SIMD_Arrangement T) {
2454   assert_different_registers(dst, src, zero, one);
2455   assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2456 
2457   facgt(dst, T, src, zero);
2458   ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
2459   bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2460 }
2461 
2462 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2463                                           FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
2464     assert_different_registers(dst, src, zero, one, vtmp);
2465     assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2466 
2467     sve_orr(vtmp, src, src);
2468     sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pmtp=0 for +-0.0 and NaN. 0x1 otherwise
2469     switch (T) {
2470     case S:
2471       sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
2472       sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
2473                                         // on the sign of the float value
2474       break;
2475     case D:
2476       sve_and(vtmp, T, min_jlong);
2477       sve_orr(vtmp, T, jlong_cast(1.0));
2478       break;
2479     default:
2480       assert(false, "unsupported");
2481       ShouldNotReachHere();
2482     }
2483     sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
2484                                        // Result in dst
2485 }
2486 
2487 bool C2_MacroAssembler::in_scratch_emit_size() {
2488   if (ciEnv::current()->task() != nullptr) {
2489     PhaseOutput* phase_output = Compile::current()->output();
2490     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2491       return true;
2492     }
2493   }
2494   return MacroAssembler::in_scratch_emit_size();
2495 }
2496 
2497 void C2_MacroAssembler::load_nklass_compact(Register dst, Register obj, Register index, int scale, int disp) {
2498   C2LoadNKlassStub* stub = new (Compile::current()->comp_arena()) C2LoadNKlassStub(dst);
2499   Compile::current()->output()->add_stub(stub);
2500 
2501   // Note: Don't clobber obj anywhere in that method!
2502 
2503   // The incoming address is pointing into obj-start + klass_offset_in_bytes. We need to extract
2504   // obj-start, so that we can load from the object's mark-word instead. Usually the address
2505   // comes as obj-start in obj and klass_offset_in_bytes in disp. However, sometimes C2
2506   // emits code that pre-computes obj-start + klass_offset_in_bytes into a register, and
2507   // then passes that register as obj and 0 in disp. The following code extracts the base
2508   // and offset to load the mark-word.
2509   int offset = oopDesc::mark_offset_in_bytes() + disp - oopDesc::klass_offset_in_bytes();
2510   if (index == noreg) {
2511     ldr(dst, Address(obj, offset));
2512   } else {
2513     lea(dst, Address(obj, index, Address::lsl(scale)));
2514     ldr(dst, Address(dst, offset));
2515   }
2516   // NOTE: We can't use tbnz here, because the target is sometimes too far away
2517   // and cannot be encoded.
2518   tst(dst, markWord::monitor_value);
2519   br(Assembler::NE, stub->entry());
2520   bind(stub->continuation());
2521   lsr(dst, dst, markWord::klass_shift);
2522 }