/*
 * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/matcher.hpp"
#include "opto/output.hpp"
#include "opto/subnode.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/globalDefinitions.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg,
                                  Register tmp2Reg, Register tmp3Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;

  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  assert_different_registers(oop, box, tmp, disp_hdr);

  // Load markWord from object into displaced_header.
  ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmp, oop);
    ldrw(tmp, Address(tmp, Klass::access_flags_offset()));
    tstw(tmp, JVM_ACC_IS_VALUE_BASED_CLASS);
    br(Assembler::NE, cont);
  }

  // Check for existing monitor
  tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Set tmp to be (markWord of object | UNLOCK_VALUE).
    orr(tmp, disp_hdr, markWord::unlocked_value);

    // Initialize the box. (Must happen before we update the object mark!)
    str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // Compare object markWord with an unlocked value (tmp) and if
    // equal exchange the stack address of our box with object markWord.
    // On failure disp_hdr contains the possibly locked markWord.
    cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true,
            /*release*/ true, /*weak*/ false, disp_hdr);
    br(Assembler::EQ, cont);

    assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

    // If the compare-and-exchange succeeded, then we found an unlocked
    // object, have now locked it, and will continue at label cont.

    // Check if the owner is self by comparing the value in the
    // markWord of object (disp_hdr) with the stack pointer.
    mov(rscratch1, sp);
    sub(disp_hdr, disp_hdr, rscratch1);
    mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place));
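    // disp_hdr now holds (old markWord - sp). If the old markWord was a
    // stack-lock address within this thread's current page, the difference is
    // smaller than a page and its lock bits are zero, so the masked test below
    //
    //   ((mark - sp) & (~(page_size - 1) | lock_mask)) == 0
    //
    // amounts to a check for recursive stack-locking.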
    // If the condition holds, this is a recursive stack-lock and hence we can
    // store 0 as the displaced header in the box, which indicates that it is
    // a recursive lock.
    ands(tmp/*==0?*/, disp_hdr, tmp);   // Sets flags for result
    str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
    b(cont);
  }

  // Handle existing monitor.
  bind(object_has_monitor);

  // The object's monitor m is unlocked iff m->owner == nullptr,
  // otherwise m->owner may contain a thread or a stack address.
  //
  // Try to CAS m->owner from null to current thread.
  add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value));
  cmpxchg(tmp, zr, rthread, Assembler::xword, /*acquire*/ true,
          /*release*/ true, /*weak*/ false, tmp3Reg); // Sets flags for result

  // Store a non-null value into the box to avoid looking like a re-entrant
  // lock. The fast-path monitor unlock code checks for
  // markWord::monitor_value so use markWord::unused_mark which has the
  // relevant bit set, and also matches ObjectSynchronizer::enter.
  mov(tmp, (address)markWord::unused_mark().value());
  str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

  br(Assembler::EQ, cont); // CAS success means locking succeeded

  cmp(tmp3Reg, rthread);
  br(Assembler::NE, cont); // Check for recursive locking

  // Recursive lock case
  increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1);
  // flag == EQ still from the cmp above, checking if this is a reentrant lock

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  increment(Address(rthread, JavaThread::held_monitor_count_offset()));

  bind(no_count);
}

void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg,
                                    Register tmp2Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;

  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert_different_registers(oop, box, tmp, disp_hdr);

  if (LockingMode == LM_LEGACY) {
    // Find the lock address and load the displaced header from the stack.
    ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // If the displaced header is 0, we have a recursive unlock.
    cmp(disp_hdr, zr);
    br(Assembler::EQ, cont);
  }

  // Handle existing monitor.
  ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
  tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Check if it is still a lightweight lock; this is true if we
    // see the stack address of the basicLock in the markWord of the
    // object.

    cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false,
            /*release*/ true, /*weak*/ false, tmp);
    b(cont);
  }

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // Handle existing monitor.
  bind(object_has_monitor);
  STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
  add(tmp, tmp, -(int)markWord::monitor_value); // monitor

  ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));

  Label notRecursive;
  cbz(disp_hdr, notRecursive);

  // Recursive lock
  sub(disp_hdr, disp_hdr, 1u);
  str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
  cmp(disp_hdr, disp_hdr); // Sets flags for result
  b(cont);

  bind(notRecursive);
  ldr(rscratch1, Address(tmp, ObjectMonitor::EntryList_offset()));
  ldr(disp_hdr, Address(tmp, ObjectMonitor::cxq_offset()));
  orr(rscratch1, rscratch1, disp_hdr); // Will be 0 if both are 0.
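  // If either the EntryList or the cxq is non-empty there may be waiting
  // threads, so we must not simply drop the lock here or a successor might
  // never be woken; take the slow path instead (flags == NE from the cmp
  // below).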
  cmp(rscratch1, zr); // Sets flags for result
  cbnz(rscratch1, cont);
  // need a release store here
  lea(tmp, Address(tmp, ObjectMonitor::owner_offset()));
  stlr(zr, tmp); // set unowned

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  decrement(Address(rthread, JavaThread::held_monitor_count_offset()));

  bind(no_count);
}

void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register t1,
                                              Register t2, Register t3) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert_different_registers(obj, t1, t2, t3);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. MUST be branched to with flag == EQ.
  Label locked;
  // Finish fast lock unsuccessfully. MUST be branched to with flag == NE.
  Label slow_path;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(t1, obj);
    ldrw(t1, Address(t1, Klass::access_flags_offset()));
    tstw(t1, JVM_ACC_IS_VALUE_BASED_CLASS);
    br(Assembler::NE, slow_path);
  }

  const Register t1_mark = t1;

  { // Lightweight locking

    // Push lock to the lock stack and finish successfully. MUST be branched to with flag == EQ.
    Label push;

    const Register t2_top = t2;
    const Register t3_t = t3;

    // Check if lock-stack is full.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
    br(Assembler::GT, slow_path);

    // Check if recursive.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, push);

    // Relaxed normal load to check for monitor. Optimization for monitor case.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);

    // Not inflated
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");

    // Try to lock. Transition lock-bits 0b01 => 0b00
    orr(t1_mark, t1_mark, markWord::unlocked_value);
    eor(t3_t, t1_mark, markWord::unlocked_value);
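    // t1_mark is now the expected value (the mark with the unlocked bit set,
    // lock-bits 0b01); t3_t is the desired value (the same mark with the
    // lock-bits cleared to 0b00, i.e. fast-locked).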
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
    br(Assembler::NE, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    str(obj, Address(rthread, t2_top));
    addw(t2_top, t2_top, oopSize);
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    // mark contains the tagged ObjectMonitor*.
    const Register t1_tagged_monitor = t1_mark;
    const uintptr_t monitor_tag = markWord::monitor_value;
    const Register t2_owner_addr = t2;
    const Register t3_owner = t3;

    // Compute owner address.
    lea(t2_owner_addr, Address(t1_tagged_monitor, (in_bytes(ObjectMonitor::owner_offset()) - monitor_tag)));

    // CAS owner (null => current thread).
    cmpxchg(t2_owner_addr, zr, rthread, Assembler::xword, /*acquire*/ true,
            /*release*/ false, /*weak*/ false, t3_owner);
    br(Assembler::EQ, locked);

    // Check if recursive.
    cmp(t3_owner, rthread);
    br(Assembler::NE, slow_path);

    // Recursive.
    increment(Address(t1_tagged_monitor, in_bytes(ObjectMonitor::recursions_offset()) - monitor_tag), 1);
  }

  bind(locked);
  increment(Address(rthread, JavaThread::held_monitor_count_offset()));

#ifdef ASSERT
  // Check that locked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Lock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Lock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}

void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register t1, Register t2,
                                                Register t3) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert_different_registers(obj, t1, t2, t3);

  // Handle inflated monitor.
  Label inflated, inflated_load_monitor;
  // Finish fast unlock successfully. MUST be branched to with flag == EQ.
  Label unlocked;
  // Finish fast unlock unsuccessfully. MUST be branched to with flag == NE.
  Label slow_path;

  const Register t1_mark = t1;
  const Register t2_top = t2;
  const Register t3_t = t3;

  { // Lightweight unlock

    // Check if obj is top of lock-stack.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    subw(t2_top, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    // Top of lock stack was not obj. Must be monitor.
    br(Assembler::NE, inflated_load_monitor);

    // Pop lock-stack.
    DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));

    // Check if recursive.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, unlocked);

    // Not recursive.
    // Load Mark.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Check header for monitor (0b10).
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);

    // Try to unlock. Transition lock bits 0b00 => 0b01
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
    orr(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
    br(Assembler::EQ, unlocked);

    // Compare and exchange failed.
    // Restore lock-stack and handle the unlock in runtime.
    DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
    addw(t2_top, t2_top, oopSize);
    str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(slow_path);
  }


  { // Handle inflated monitor.
    bind(inflated_load_monitor);
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
#ifdef ASSERT
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

#ifdef ASSERT
    Label check_done;
    subw(t2_top, t2_top, oopSize);
    cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
    br(Assembler::LT, check_done);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    br(Assembler::NE, inflated);
    stop("Fast Unlock lock on stack");
    bind(check_done);
#endif

    // mark contains the tagged ObjectMonitor*.
    const Register t1_monitor = t1_mark;
    const uintptr_t monitor_tag = markWord::monitor_value;

    // Untag the monitor.
    sub(t1_monitor, t1_mark, monitor_tag);

    const Register t2_recursions = t2;
    Label not_recursive;

    // Check if recursive.
    ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
    cbz(t2_recursions, not_recursive);

    // Recursive unlock.
    sub(t2_recursions, t2_recursions, 1u);
    str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
    // Set flag == EQ
    cmp(t2_recursions, t2_recursions);
    b(unlocked);

    bind(not_recursive);

    Label release;
    const Register t2_owner_addr = t2;

    // Compute owner address.
    lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));

    // Check if the entry lists are empty.
    ldr(rscratch1, Address(t1_monitor, ObjectMonitor::EntryList_offset()));
    ldr(t3_t, Address(t1_monitor, ObjectMonitor::cxq_offset()));
    orr(rscratch1, rscratch1, t3_t);
    cmp(rscratch1, zr);
    br(Assembler::EQ, release);

    // The owner may be anonymous and we removed the last obj entry in
    // the lock-stack. This loses the information about the owner.
    // Write the thread to the owner field so the runtime knows the owner.
    str(rthread, Address(t2_owner_addr));
    b(slow_path);

    bind(release);
    // Set owner to null.
    // Release to satisfy the JMM
    stlr(zr, t2_owner_addr);
  }

  bind(unlocked);
  decrement(Address(rthread, JavaThread::held_monitor_count_offset()));

#ifdef ASSERT
  // Check that unlocked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Unlock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Unlock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}

// Search for str1 in str2 and return index or -1
// Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For a larger pattern and source we use a simplified Boyer-Moore algorithm.
  // With a small pattern and source we use a linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

// The Boyer-Moore algorithm is based on the description here:
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// This describes an algorithm with 2 shift rules: the 'Bad Character' rule
// and the 'Good Suffix' rule.
//
// These rules are essentially heuristics for how far we can shift the
// pattern along the search string.
//
// The implementation here uses the 'Bad Character' rule only because of the
// complexity of initialisation for the 'Good Suffix' rule.
//
// This is also known as the Boyer-Moore-Horspool algorithm:
//
// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
//
// This particular implementation has a few java-specific optimizations.
//
// #define ASIZE 256
//
//    int bm(unsigned char *x, int m, unsigned char *y, int n) {
//       int i, j;
//       unsigned c;
//       unsigned char bc[ASIZE];
//
//       /* Preprocessing */
//       for (i = 0; i < ASIZE; ++i)
//          bc[i] = m;
//       for (i = 0; i < m - 1; ) {
//          c = x[i];
//          ++i;
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef PATTERN_STRING_IS_LATIN1
//          bc[c] = m - i;
//          #else
//          if (c < ASIZE) bc[c] = m - i;
//          #endif
//       }
//
//       /* Searching */
//       j = 0;
//       while (j <= n - m) {
//          c = y[j+m-1];
//          if (x[m-1] == c)
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//          if (i < 0) return j;
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef SOURCE_STRING_IS_LATIN1
//          // LL case: (c < 256) always true. Remove branch
//          j += bc[y[j+m-1]];
//          #endif
//          #ifndef PATTERN_STRING_IS_UTF
//          // UU case: need if (c < ASIZE) check. Skip 1 character if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += 1;
//          #endif
//          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
//          // UL case: need if (c < ASIZE) check. Skip <pattern length> if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += m;
//          #endif
//       }
//       return -1;
//    }

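// Register mapping in the implementation below (roughly): x -> str1 (the
// pattern), m -> cnt1, y -> str2 (the source), n -> cnt2, and bc[] -> the
// ASIZE-byte skip table allocated on the stack at sp.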
  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >= 8, so we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed (8 LL or 4 UU symbols) and half a
    // register for the UL case. We'll re-read the last character in the inner
    // pre-loop code to have a single outer pre-loop load.
    const int firstStep = isL ? 7 : 3;

    const int ASIZE = 256;
    const int STORED_BYTES = 32; // number of bytes stored per instruction
    sub(sp, sp, ASIZE);
    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
    mov(ch1, sp);
    BIND(BM_INIT_LOOP);
      stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
      subs(tmp5, tmp5, 1);
      br(GT, BM_INIT_LOOP);

      sub(cnt1tmp, cnt1, 1);
      mov(tmp5, str2);
      add(str2end, str2, result_tmp, LSL, str2_chr_shift);
      sub(ch2, cnt1, 1);
      mov(tmp3, str1);
    BIND(BCLOOP);
      (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
      if (!str1_isL) {
        subs(zr, ch1, ASIZE);
        br(HS, BCSKIP);
      }
      strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
      subs(ch2, ch2, 1);
      br(GT, BCLOOP);

      add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
      if (str1_isL == str2_isL) {
        // load last 8 bytes (8 LL / 4 UU symbols)
        ldr(tmp6, Address(tmp6, -wordSize));
      } else {
        ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes (4 symbols)
        // convert Latin1 to UTF. We'll have to wait until the load completes,
        // but it's still faster than per-character loads+checks
        lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
        ubfx(ch1, tmp6, 8, 8); // str1[N-2]
        ubfx(ch2, tmp6, 16, 8); // str1[N-3]
        andr(tmp6, tmp6, 0xFF); // str1[N-4]
        orr(ch2, ch1, ch2, LSL, 16);
        orr(tmp6, tmp6, tmp3, LSL, 48);
        orr(tmp6, tmp6, ch2, LSL, 16);
      }
    BIND(BMLOOPSTR2);
      (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
      sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
      if (str1_isL == str2_isL) {
        // re-init tmp3. It's for free because it's executed in parallel with
        // the load above. The alternative is to initialize it before the loop,
        // but it'll affect performance on in-order systems with 2 or more
        // ld/st pipelines.
        lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
      }
      if (!isL) { // UU/UL case
        lsl(ch2, cnt1tmp, 1); // offset in bytes
      }
      cmp(tmp3, skipch);
      br(NE, BMSKIP);
      ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
      mov(ch1, tmp6);
      if (isL) {
        b(BMLOOPSTR1_AFTER_LOAD);
      } else {
        sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
        b(BMLOOPSTR1_CMP);
      }
    BIND(BMLOOPSTR1);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    BIND(BMLOOPSTR1_AFTER_LOAD);
      subs(cnt1tmp, cnt1tmp, 1);
      br(LT, BMLOOPSTR1_LASTCMP);
    BIND(BMLOOPSTR1_CMP);
      cmp(ch1, ch2);
      br(EQ, BMLOOPSTR1);
    BIND(BMSKIP);
      if (!isL) {
        // if we've met a UTF symbol while searching a Latin1 pattern, then we
        // can skip cnt1 symbols
        if (str1_isL != str2_isL) {
          mov(result_tmp, cnt1);
        } else {
          mov(result_tmp, 1);
        }
        subs(zr, skipch, ASIZE);
        br(HS, BMADV);
      }
      ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
      sub(cnt1tmp, cnt1, 1);
      add(str2, str2, result_tmp, LSL, str2_chr_shift);
      cmp(str2, str2end);
      br(LE, BMLOOPSTR2);
      add(sp, sp, ASIZE);
      b(NOMATCH);
    BIND(BMLOOPSTR1_LASTCMP);
      cmp(ch1, ch2);
      br(NE, BMSKIP);
    BIND(BMMATCH);
      sub(result, str2, tmp5);
      if (!str2_isL) lsr(result, result, 1);
      add(sp, sp, ASIZE);
      b(DONE);

    BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
    br(LT, LINEAR_MEDIUM);
    mov(result, zr);
    RuntimeAddress stub = nullptr;
    if (isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
      assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
    } else if (str1_isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
    } else {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
      assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
    }
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);
  }

  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    Register str2tmp = tmp2;
    Register first = tmp3;

    if (icnt1 == -1)
    {
        Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

        cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
        br(LT, DOSHORT);
      BIND(LINEAR_MEDIUM);
        (this->*str1_load_1chr)(first, Address(str1));
        lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
        sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

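      // Loop technique used throughout these intrinsics: str1/str2 have been
      // advanced to the end of their scan regions, and cnt1_neg/cnt2_neg hold
      // negative byte offsets counted up towards zero, so the loop bound
      // check is just the flags from the 'adds'. A minimal sketch:
      //
      //   for (long off = -len; off <= 0; off += chr_size)
      //     ch = *(end + off);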
      BIND(FIRST_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmp(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        adds(cnt1tmp, cnt1_neg, str1_chr_size);
        add(cnt2tmp, cnt2_neg, str2_chr_size);
        br(GE, MATCH);

      BIND(STR1_NEXT);
        (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        adds(cnt1tmp, cnt1tmp, str1_chr_size);
        add(cnt2tmp, cnt2tmp, str2_chr_size);
        br(LT, STR1_NEXT);
        b(MATCH);

      BIND(DOSHORT);
      if (str1_isL == str2_isL) {
        cmp(cnt1, (u1)2);
        br(LT, DO1);
        br(GT, DO3);
      }
    }

    if (icnt1 == 4) {
      Label CH1_LOOP;

        (this->*load_4chr)(ch1, str1);
        sub(result_tmp, cnt2, 4);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(CH1_LOOP);
        (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
      Label CH1_LOOP;

      BIND(DO2);
        (this->*load_2chr)(ch1, str1);
        if (icnt1 == 2) {
          sub(result_tmp, cnt2, 2);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(CH1_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

      BIND(DO3);
        (this->*load_2chr)(first, str1);
        (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
        if (icnt1 == 3) {
          sub(result_tmp, cnt2, 3);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(FIRST_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmpw(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        b(MATCH);
    }

    if (icnt1 == -1 || icnt1 == 1) {
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;

      BIND(DO1);
        (this->*str1_load_1chr)(ch1, str1);
        cmp(cnt2, (u1)8);
        br(LT, DO1_SHORT);

        sub(result_tmp, cnt2, 8/str2_chr_size);
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
        mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

        if (str2_isL) {
          orr(ch1, ch1, ch1, LSL, 8);
        }
        orr(ch1, ch1, ch1, LSL, 16);
        orr(ch1, ch1, ch1, LSL, 32);
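        // SWAR zero-byte detection: ch1 holds the sought character replicated
        // into every byte (or halfword) lane. After the 'eor' below, a match
        // becomes a zero lane, and the classic test
        //
        //   (v - 0x0101..01) & ~v & 0x8080..80
        //
        // is non-zero iff v has a zero lane; ~v & 0x8080..80 is computed as
        // ~(v | 0x7f7f..7f), so a single 'bics' does the AND-NOT and sets
        // the flags.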
      BIND(CH1_LOOP);
        ldr(ch2, Address(str2, cnt2_neg));
        eor(ch2, ch1, ch2);
        sub(tmp1, ch2, tmp3);
        orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
        bics(tmp1, tmp1, tmp2);
        br(NE, HAS_ZERO);
        adds(cnt2_neg, cnt2_neg, 8);
        br(LT, CH1_LOOP);

        cmp(cnt2_neg, (u1)8);
        mov(cnt2_neg, 0);
        br(LT, CH1_LOOP);
        b(NOMATCH);

      BIND(HAS_ZERO);
        rev(tmp1, tmp1);
        clz(tmp1, tmp1);
        add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
        b(MATCH);

      BIND(DO1_SHORT);
        mov(result_tmp, cnt2);
        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
      BIND(DO1_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmpw(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);

void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);

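  // Same SWAR zero-detection trick as in string_indexof above, applied to
  // 16-bit char lanes, hence the 0x0001../0x7fff.. constants.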
  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7fff7fff7fff7fff);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1, Address::uxtw(1)));
    sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
    ldrh(ch1, Address(str1, cnt1_neg));
    cmpw(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 2);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}

void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
                                                Register ch, Register result,
                                                FloatRegister ztmp1,
                                                FloatRegister ztmp2,
                                                PRegister tmp_pg,
                                                PRegister tmp_pdn, bool isL)
{
  // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
  assert(tmp_pg->is_governing(),
         "this register has to be a governing predicate register");

  Label LOOP, MATCH, DONE, NOMATCH;
  Register vec_len = rscratch1;
  Register idx = rscratch2;

  SIMD_RegVariant T = isL ? B : H;

  cbz(cnt1, NOMATCH);

  // Assign the particular char throughout the vector.
  sve_dup(ztmp2, T, ch);
  if (isL) {
    sve_cntb(vec_len);
  } else {
    sve_cnth(vec_len);
  }
  mov(idx, 0);

  // Generate a predicate to control the reading of input string.
  sve_whilelt(tmp_pg, T, idx, cnt1);

  BIND(LOOP);
    // Read a vector of 8- or 16-bit data depending on the string type. Note
    // that inactive elements indicated by the predicate register won't cause
    // a data read from memory to the destination vector.
    if (isL) {
      sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
    } else {
      sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
    }
    add(idx, idx, vec_len);

    // Perform the comparison. An element of the destination predicate is set
    // to active if the particular char is matched.
    sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);

    // Branch if the particular char is found.
    br(NE, MATCH);

    sve_whilelt(tmp_pg, T, idx, cnt1);

    // Loop back if the particular char is not found.
    br(MI, LOOP);

  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);

  BIND(MATCH);
    // Undo the index increment.
    sub(idx, idx, vec_len);

    // Crop the vector to find its location.
    sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
    add(result, idx, -1);
    sve_incp(result, T, tmp_pdn);
  BIND(DONE);
}

void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
                                             Register ch, Register result,
                                             Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)8);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 8);
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 8);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);

  mov(tmp3, 0x0101010101010101);

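  // Latin1 variant of the same SWAR zero-detection loop, scanning eight
  // byte lanes per 64-bit load.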
  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1));
    sub(cnt1_neg, zr, cnt1);
  BIND(DO1_LOOP);
    ldrb(ch1, Address(str1, cnt1_neg));
    cmp(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 1);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg);
  BIND(DONE);
}

// Compare strings.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; however, the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      ldr(tmp1, Address(str1));
      ldrs(vtmp, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFF);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFF);
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
    RuntimeAddress stub = nullptr;
    switch(ae) {
      case StrIntrinsicNode::LL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
        break;
      case StrIntrinsicNode::UU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
        break;
      case StrIntrinsicNode::LU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
        break;
      case StrIntrinsicNode::UL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
        break;
      default:
        ShouldNotReachHere();
    }
    assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // Arrange code to do most branches while loading, and load the next
  // characters while comparing the previous ones.
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}

void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
                                     FloatRegister src2, Condition cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
  FloatRegister zn = src1, zm = src2;
  bool needs_negation = false;
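  // NEON has no 'less-than'-style register compares (only CMGT/CMGE/CMHI/CMHS
  // and CMEQ, plus their floating-point counterparts), so LT/LE/LO/LS are
  // synthesized by swapping the operands, and NE by inverting an EQ compare.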
  switch (cond) {
    case LT: cond = GT; zn = src2; zm = src1; break;
    case LE: cond = GE; zn = src2; zm = src1; break;
    case LO: cond = HI; zn = src2; zm = src1; break;
    case LS: cond = HS; zn = src2; zm = src1; break;
    case NE: cond = EQ; needs_negation = true; break;
    default:
      break;
  }

  if (is_floating_point_type(bt)) {
    fcm(cond, dst, size, zn, zm);
  } else {
    cm(cond, dst, size, zn, zm);
  }

  if (needs_negation) {
    notr(dst, isQ ? T16B : T8B, dst);
  }
}

void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
                                          Condition cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
  if (bt == T_FLOAT || bt == T_DOUBLE) {
    if (cond == Assembler::NE) {
      fcm(Assembler::EQ, dst, size, src);
      notr(dst, isQ ? T16B : T8B, dst);
    } else {
      fcm(cond, dst, size, src);
    }
  } else {
    if (cond == Assembler::NE) {
      cm(Assembler::EQ, dst, size, src);
      notr(dst, isQ ? T16B : T8B, dst);
    } else {
      cm(cond, dst, size, src);
    }
  }
}

// Compress the least significant bit of each byte to the rightmost and clear
// the higher garbage bits.
void C2_MacroAssembler::bytemask_compress(Register dst) {
  // Example input, dst = 0x01 00 00 00 01 01 00 01
  // The "??" bytes are garbage.
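  // Each orr-with-shift step below doubles the number of gathered mask bits:
  // 1 bit per byte -> 2 bits per 2 bytes -> 4 bits per 4 bytes -> 8 bits.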
  orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
  orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
  orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
  andr(dst, dst, 0xff);                   // dst = 0x8D
}

// Pack the lowest-numbered bit of each mask element in src into a long value
// in dst, at most the first 64 lane elements.
// Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM.
void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
                                         FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
  assert_different_registers(dst, rscratch1);
  assert_different_registers(vtmp1, vtmp2);

  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example:   src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
  // Expected:  dst = 0x658D

  // Convert the mask into vector with sequential bytes.
  // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
  sve_cpy(vtmp1, size, src, 1, false);
  if (bt != T_BYTE) {
    sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
  }

  if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
    // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
    // is to compress each significant bit of the byte in a cross-lane way. Due
    // to the lack of a cross-lane bit-compress instruction, we use BEXT
    // (bit-compress in each lane) with the biggest lane size (T = D) then
    // concatenate the results.

    // The second source input of BEXT, initialized with 0x01 in each byte.
    // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
    sve_dup(vtmp2, B, 1);

    // BEXT vtmp1.D, vtmp1.D, vtmp2.D
    // vtmp1 = 0x0001010000010001 | 0x0100000001010001
    // vtmp2 = 0x0101010101010101 | 0x0101010101010101
    //         ---------------------------------------
    // vtmp1 = 0x0000000000000065 | 0x000000000000008D
    sve_bext(vtmp1, D, vtmp1, vtmp2);

    // Concatenate the least significant 8 bits in each 8 bytes, and extract
    // the result to dst.
    // vtmp1 = 0x0000000000000000 | 0x000000000000658D
    // dst   = 0x658D
    if (lane_cnt <= 8) {
      // No need to concatenate.
      umov(dst, vtmp1, B, 0);
    } else if (lane_cnt <= 16) {
      ins(vtmp1, B, vtmp1, 1, 8);
      umov(dst, vtmp1, H, 0);
    } else {
      // As the lane count is 64 at most, the final expected value must be in
      // the lowest 64 bits after narrowing vtmp1 from D to B.
      sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
      umov(dst, vtmp1, D, 0);
    }
  } else if (UseSVE > 0) {
    // Compress the lowest 8 bytes.
    fmovd(dst, vtmp1);
    bytemask_compress(dst);
    if (lane_cnt <= 8) return;

    // Repeat on higher bytes and join the results.
    // Compress 8 bytes in each iteration.
    for (int idx = 1; idx < (lane_cnt / 8); idx++) {
      sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
      bytemask_compress(rscratch1);
      orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
    }
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

1486 // Unpack the mask, a long value in src, into predicate register dst based on the
1487 // corresponding data type. Note that dst can support at most 64 lanes.
// The example below gives the expected dst predicate register for different types,
// with a valid src (0x658D) on a machine with a 1024-bit vector size.
1490 // BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
1491 // SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
1492 // INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
1493 // LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1494 //
// The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D,
// which has 24 significant bits, would be an invalid input if the dst predicate
// register refers to a LONG-type 1024-bit vector, which has at most 16 lanes.
1498 void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
1499                                            FloatRegister vtmp1, FloatRegister vtmp2) {
1500   assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
1501          lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
1502   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1503   // Example:   src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
  // Expected:  dst = 0b01100101 10001101
1505 
  // Put the long value from the general-purpose register into the first D lane of the vector.
1507   // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1508   sve_dup(vtmp1, B, 0);
1509   mov(vtmp1, D, 0, src);
1510 
  // As sve_cmp generates the mask with a minimum granularity of one byte, we
  // need to transform the bit mask now held in the first lane into a byte
  // mask, which can be done with SVE2's BDEP instruction.
1514 
  // The first source input of the BDEP instruction. Deposit one mask byte into each 8-byte group.
1516   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1517   if (lane_cnt <= 8) {
    // Nothing to do, as only one byte exists.
1519   } else if (lane_cnt <= 16) {
1520     ins(vtmp1, B, vtmp1, 8, 1);
1521     mov(vtmp1, B, 1, zr);
1522   } else {
1523     sve_vector_extend(vtmp1, D, vtmp1, B);
1524   }
1525 
  // The second source input of the BDEP instruction, initialized with 0x01 in each byte.
1527   // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1528   sve_dup(vtmp2, B, 1);
1529 
1530   // BDEP vtmp1.D, vtmp1.D, vtmp2.D
1531   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1532   // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1533   //         ---------------------------------------
1534   // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1535   sve_bdep(vtmp1, D, vtmp1, vtmp2);
1536 
1537   if (bt != T_BYTE) {
1538     sve_vector_extend(vtmp1, size, vtmp1, B);
1539   }
  // Generate the mask according to the given vector, in which the elements have
  // been extended to the expected type.
  // dst = 0b01100101 10001101
1543   sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
1544 }
1545 
1546 // Clobbers: rflags
1547 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1548                                     FloatRegister zn, FloatRegister zm, Condition cond) {
1549   assert(pg->is_governing(), "This register has to be a governing predicate register");
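  // LE/LT/LO/LS are implemented by swapping the two source operands and using
  // the converse condition, so only the GE/GT/HI/HS (and EQ/NE) compare forms
  // are emitted below.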
1550   FloatRegister z1 = zn, z2 = zm;
1551   switch (cond) {
1552     case LE: z1 = zm; z2 = zn; cond = GE; break;
1553     case LT: z1 = zm; z2 = zn; cond = GT; break;
1554     case LO: z1 = zm; z2 = zn; cond = HI; break;
1555     case LS: z1 = zm; z2 = zn; cond = HS; break;
1556     default:
1557       break;
1558   }
1559 
1560   SIMD_RegVariant size = elemType_to_regVariant(bt);
1561   if (is_floating_point_type(bt)) {
1562     sve_fcm(cond, pd, size, pg, z1, z2);
1563   } else {
1564     assert(is_integral_type(bt), "unsupported element type");
1565     sve_cmp(cond, pd, size, pg, z1, z2);
1566   }
1567 }
1568 
1569 // Get index of the last mask lane that is set
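// Reverse the predicate, let BRKB set exactly the lanes that precede the first
// active lane of the reversed value, count them with CNTP, and subtract that
// count from (lane count - 1) to obtain the index.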
1570 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
1571   SIMD_RegVariant size = elemType_to_regVariant(bt);
1572   sve_rev(ptmp, size, src);
1573   sve_brkb(ptmp, ptrue, ptmp, false);
1574   sve_cntp(dst, size, ptrue, ptmp);
1575   movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
1576   subw(dst, rscratch1, dst);
1577 }
1578 
1579 // Extend integer vector src to dst with the same lane count
1580 // but larger element size, e.g. 4B -> 4I
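// Each widening step below is a shift-left-long by zero; _xshll emits the
// unsigned (USHLL) or signed (SSHLL) form depending on is_unsigned, doubling
// the element size per step.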
1581 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1582                                            FloatRegister src, BasicType src_bt, bool is_unsigned) {
1583   if (src_bt == T_BYTE) {
1584     if (dst_bt == T_SHORT) {
1585       // 4B/8B to 4S/8S
1586       _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1587     } else {
1588       // 4B to 4I
1589       assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1590       _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1591       _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1592     }
1593   } else if (src_bt == T_SHORT) {
1594     // 4S to 4I
1595     assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1596     _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1597   } else if (src_bt == T_INT) {
1598     // 2I to 2L
1599     assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1600     _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1601   } else {
1602     ShouldNotReachHere();
1603   }
1604 }
1605 
1606 // Narrow integer vector src down to dst with the same lane count
1607 // but smaller element size, e.g. 4I -> 4B
1608 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1609                                            FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
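  // XTN keeps the low half of each wider element; a 4x narrowing (e.g. 4I -> 4B)
  // chains two XTNs.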
1610   if (src_bt == T_SHORT) {
1611     // 4S/8S to 4B/8B
1612     assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1613     assert(dst_bt == T_BYTE, "unsupported");
1614     xtn(dst, T8B, src, T8H);
1615   } else if (src_bt == T_INT) {
1616     // 4I to 4B/4S
1617     assert(src_vlen_in_bytes == 16, "unsupported");
1618     assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1619     xtn(dst, T4H, src, T4S);
1620     if (dst_bt == T_BYTE) {
1621       xtn(dst, T8B, dst, T8H);
1622     }
1623   } else if (src_bt == T_LONG) {
1624     // 2L to 2I
1625     assert(src_vlen_in_bytes == 16, "unsupported");
1626     assert(dst_bt == T_INT, "unsupported");
1627     xtn(dst, T2S, src, T2D);
1628   } else {
1629     ShouldNotReachHere();
1630   }
1631 }
1632 
1633 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1634                                           FloatRegister src, SIMD_RegVariant src_size,
1635                                           bool is_unsigned) {
1636   assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
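  // Each _sve_xunpk step unpacks and extends the low half of the source
  // (UUNPKLO/SUNPKLO), doubling the element size; extending from B to D
  // therefore takes three steps.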
1637 
1638   if (src_size == B) {
1639     switch (dst_size) {
1640     case H:
1641       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1642       break;
1643     case S:
1644       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1645       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1646       break;
1647     case D:
1648       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1649       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1650       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1651       break;
1652     default:
1653       ShouldNotReachHere();
1654     }
1655   } else if (src_size == H) {
1656     if (dst_size == S) {
1657       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1658     } else { // D
1659       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1660       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1661     }
1662   } else if (src_size == S) {
1663     _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
1664   }
1665 }
1666 
1667 // Vector narrow from src to dst with specified element sizes.
1668 // High part of dst vector will be filled with zero.
1669 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1670                                           FloatRegister src, SIMD_RegVariant src_size,
1671                                           FloatRegister tmp) {
1672   assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1673   assert_different_registers(src, tmp);
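  // tmp is zeroed so that UZP1, which concatenates the even-numbered elements
  // of its two sources, fills the upper part of dst with zeroes.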
1674   sve_dup(tmp, src_size, 0);
1675   if (src_size == D) {
1676     switch (dst_size) {
1677     case S:
1678       sve_uzp1(dst, S, src, tmp);
1679       break;
1680     case H:
1681       assert_different_registers(dst, tmp);
1682       sve_uzp1(dst, S, src, tmp);
1683       sve_uzp1(dst, H, dst, tmp);
1684       break;
1685     case B:
1686       assert_different_registers(dst, tmp);
1687       sve_uzp1(dst, S, src, tmp);
1688       sve_uzp1(dst, H, dst, tmp);
1689       sve_uzp1(dst, B, dst, tmp);
1690       break;
1691     default:
1692       ShouldNotReachHere();
1693     }
1694   } else if (src_size == S) {
1695     if (dst_size == H) {
1696       sve_uzp1(dst, H, src, tmp);
1697     } else { // B
1698       assert_different_registers(dst, tmp);
1699       sve_uzp1(dst, H, src, tmp);
1700       sve_uzp1(dst, B, dst, tmp);
1701     }
1702   } else if (src_size == H) {
1703     sve_uzp1(dst, B, src, tmp);
1704   }
1705 }
1706 
1707 // Extend src predicate to dst predicate with the same lane count but larger
1708 // element size, e.g. 64Byte -> 512Long
1709 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1710                                              uint dst_element_length_in_bytes,
1711                                              uint src_element_length_in_bytes) {
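  // Each sve_punpklo unpacks the low half of the predicate, doubling the
  // per-lane element size.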
1712   if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1713     sve_punpklo(dst, src);
1714   } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1715     sve_punpklo(dst, src);
1716     sve_punpklo(dst, dst);
1717   } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1718     sve_punpklo(dst, src);
1719     sve_punpklo(dst, dst);
1720     sve_punpklo(dst, dst);
1721   } else {
1722     assert(false, "unsupported");
1723     ShouldNotReachHere();
1724   }
1725 }
1726 
1727 // Narrow src predicate to dst predicate with the same lane count but
1728 // smaller element size, e.g. 512Long -> 64Byte
1729 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1730                                              uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
  // The insignificant bits in the src predicate are expected to be zero.
  // To ensure that the higher-order bits of the narrowed result are zero, an
  // all-false predicate is passed as the second source. An example narrowing
  // operation with a given mask: 128Long -> 64Int on a 128-bit machine, i.e. 2L -> 2I.
  // Mask (for 2 longs)                               : TF
  // Predicate register for the above mask (16 bits)  : 00000001 00000000
  // After narrowing (uzp1 dst.b, src.b, ptmp.b)      : 0000 0000 0001 0000
  // which translates to a mask for 2 ints            : TF (the lower half is considered, the upper half is 0)
1739   assert_different_registers(src, ptmp);
1740   assert_different_registers(dst, ptmp);
1741   sve_pfalse(ptmp);
1742   if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1743     sve_uzp1(dst, B, src, ptmp);
1744   } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1745     sve_uzp1(dst, H, src, ptmp);
1746     sve_uzp1(dst, B, dst, ptmp);
1747   } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1748     sve_uzp1(dst, S, src, ptmp);
1749     sve_uzp1(dst, H, dst, ptmp);
1750     sve_uzp1(dst, B, dst, ptmp);
1751   } else {
1752     assert(false, "unsupported");
1753     ShouldNotReachHere();
1754   }
1755 }
1756 
1757 // Vector reduction add for integral type with ASIMD instructions.
1758 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1759                                                  Register isrc, FloatRegister vsrc,
1760                                                  unsigned vector_length_in_bytes,
1761                                                  FloatRegister vtmp) {
1762   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1763   assert_different_registers(dst, isrc);
1764   bool isQ = vector_length_in_bytes == 16;
1765 
1766   BLOCK_COMMENT("neon_reduce_add_integral {");
1767     switch(bt) {
1768       case T_BYTE:
1769         addv(vtmp, isQ ? T16B : T8B, vsrc);
1770         smov(dst, vtmp, B, 0);
1771         addw(dst, dst, isrc, ext::sxtb);
1772         break;
1773       case T_SHORT:
1774         addv(vtmp, isQ ? T8H : T4H, vsrc);
1775         smov(dst, vtmp, H, 0);
1776         addw(dst, dst, isrc, ext::sxth);
1777         break;
1778       case T_INT:
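        // ADDV does not support the 2S arrangement, so use a pairwise add
        // for the 64-bit vector case.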
1779         isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1780         umov(dst, vtmp, S, 0);
1781         addw(dst, dst, isrc);
1782         break;
1783       case T_LONG:
1784         assert(isQ, "unsupported");
1785         addpd(vtmp, vsrc);
1786         umov(dst, vtmp, D, 0);
1787         add(dst, dst, isrc);
1788         break;
1789       default:
1790         assert(false, "unsupported");
1791         ShouldNotReachHere();
1792     }
1793   BLOCK_COMMENT("} neon_reduce_add_integral");
1794 }
1795 
1796 // Vector reduction multiply for integral type with ASIMD instructions.
1797 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
1798 // Clobbers: rscratch1
1799 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
1800                                                  Register isrc, FloatRegister vsrc,
1801                                                  unsigned vector_length_in_bytes,
1802                                                  FloatRegister vtmp1, FloatRegister vtmp2) {
1803   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1804   bool isQ = vector_length_in_bytes == 16;
1805 
1806   BLOCK_COMMENT("neon_reduce_mul_integral {");
1807     switch(bt) {
1808       case T_BYTE:
1809         if (isQ) {
          // Multiply the lower and upper halves of the vector iteratively.
1811           // vtmp1 = vsrc[8:15]
1812           ins(vtmp1, D, vsrc, 0, 1);
1813           // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
1814           mulv(vtmp1, T8B, vtmp1, vsrc);
1815           // vtmp2 = vtmp1[4:7]
1816           ins(vtmp2, S, vtmp1, 0, 1);
1817           // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
1818           mulv(vtmp1, T8B, vtmp2, vtmp1);
1819         } else {
1820           ins(vtmp1, S, vsrc, 0, 1);
1821           mulv(vtmp1, T8B, vtmp1, vsrc);
1822         }
1823         // vtmp2 = vtmp1[2:3]
1824         ins(vtmp2, H, vtmp1, 0, 1);
1825         // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
1826         mulv(vtmp2, T8B, vtmp2, vtmp1);
1827         // dst = vtmp2[0] * isrc * vtmp2[1]
1828         umov(rscratch1, vtmp2, B, 0);
1829         mulw(dst, rscratch1, isrc);
1830         sxtb(dst, dst);
1831         umov(rscratch1, vtmp2, B, 1);
1832         mulw(dst, rscratch1, dst);
1833         sxtb(dst, dst);
1834         break;
1835       case T_SHORT:
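        // Same halving strategy as T_BYTE above, starting from 4H lanes.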
1836         if (isQ) {
1837           ins(vtmp2, D, vsrc, 0, 1);
1838           mulv(vtmp2, T4H, vtmp2, vsrc);
1839           ins(vtmp1, S, vtmp2, 0, 1);
1840           mulv(vtmp1, T4H, vtmp1, vtmp2);
1841         } else {
1842           ins(vtmp1, S, vsrc, 0, 1);
1843           mulv(vtmp1, T4H, vtmp1, vsrc);
1844         }
1845         umov(rscratch1, vtmp1, H, 0);
1846         mulw(dst, rscratch1, isrc);
1847         sxth(dst, dst);
1848         umov(rscratch1, vtmp1, H, 1);
1849         mulw(dst, rscratch1, dst);
1850         sxth(dst, dst);
1851         break;
1852       case T_INT:
1853         if (isQ) {
1854           ins(vtmp1, D, vsrc, 0, 1);
1855           mulv(vtmp1, T2S, vtmp1, vsrc);
1856         } else {
1857           vtmp1 = vsrc;
1858         }
1859         umov(rscratch1, vtmp1, S, 0);
1860         mul(dst, rscratch1, isrc);
1861         umov(rscratch1, vtmp1, S, 1);
1862         mul(dst, rscratch1, dst);
1863         break;
1864       case T_LONG:
1865         umov(rscratch1, vsrc, D, 0);
1866         mul(dst, isrc, rscratch1);
1867         umov(rscratch1, vsrc, D, 1);
1868         mul(dst, dst, rscratch1);
1869         break;
1870       default:
1871         assert(false, "unsupported");
1872         ShouldNotReachHere();
1873     }
1874   BLOCK_COMMENT("} neon_reduce_mul_integral");
1875 }
1876 
1877 // Vector reduction multiply for floating-point type with ASIMD instructions.
1878 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
1879                                            FloatRegister fsrc, FloatRegister vsrc,
1880                                            unsigned vector_length_in_bytes,
1881                                            FloatRegister vtmp) {
1882   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1883   bool isQ = vector_length_in_bytes == 16;
1884 
1885   BLOCK_COMMENT("neon_reduce_mul_fp {");
1886     switch(bt) {
1887       case T_FLOAT:
1888         fmuls(dst, fsrc, vsrc);
1889         ins(vtmp, S, vsrc, 0, 1);
1890         fmuls(dst, dst, vtmp);
1891         if (isQ) {
1892           ins(vtmp, S, vsrc, 0, 2);
1893           fmuls(dst, dst, vtmp);
1894           ins(vtmp, S, vsrc, 0, 3);
1895           fmuls(dst, dst, vtmp);
1896          }
1897         break;
1898       case T_DOUBLE:
1899         assert(isQ, "unsupported");
1900         fmuld(dst, fsrc, vsrc);
1901         ins(vtmp, D, vsrc, 0, 1);
1902         fmuld(dst, dst, vtmp);
1903         break;
1904       default:
1905         assert(false, "unsupported");
1906         ShouldNotReachHere();
1907     }
1908   BLOCK_COMMENT("} neon_reduce_mul_fp");
1909 }
1910 
1911 // Helper to select logical instruction
1912 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
1913                                                    Register Rn, Register Rm,
1914                                                    enum shift_kind kind, unsigned shift) {
1915   switch(opc) {
1916     case Op_AndReductionV:
1917       is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
1918       break;
1919     case Op_OrReductionV:
1920       is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
1921       break;
1922     case Op_XorReductionV:
1923       is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
1924       break;
1925     default:
1926       assert(false, "unsupported");
1927       ShouldNotReachHere();
1928   }
1929 }
1930 
1931 // Vector reduction logical operations And, Or, Xor
1932 // Clobbers: rscratch1
1933 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
1934                                             Register isrc, FloatRegister vsrc,
1935                                             unsigned vector_length_in_bytes) {
1936   assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
1937          "unsupported");
1938   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1939   assert_different_registers(dst, isrc);
1940   bool isQ = vector_length_in_bytes == 16;
1941 
1942   BLOCK_COMMENT("neon_reduce_logical {");
1943     umov(rscratch1, vsrc, isQ ? D : S, 0);
1944     umov(dst, vsrc, isQ ? D : S, 1);
1945     neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
1946     switch(bt) {
1947       case T_BYTE:
1948         if (isQ) {
1949           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1950         }
1951         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1952         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
1953         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1954         sxtb(dst, dst);
1955         break;
1956       case T_SHORT:
1957         if (isQ) {
1958           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1959         }
1960         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1961         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1962         sxth(dst, dst);
1963         break;
1964       case T_INT:
1965         if (isQ) {
1966           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1967         }
1968         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1969         break;
1970       case T_LONG:
1971         assert(isQ, "unsupported");
1972         neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
1973         break;
1974       default:
1975         assert(false, "unsupported");
1976         ShouldNotReachHere();
1977     }
1978   BLOCK_COMMENT("} neon_reduce_logical");
1979 }
1980 
1981 // Vector reduction min/max for integral type with ASIMD instructions.
// Note: vtmp is not used and is expected to be fnoreg for the T_LONG case.
1983 // Clobbers: rscratch1, rflags
1984 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
1985                                                     Register isrc, FloatRegister vsrc,
1986                                                     unsigned vector_length_in_bytes,
1987                                                     FloatRegister vtmp) {
1988   assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
1989   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1990   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
1991   assert_different_registers(dst, isrc);
1992   bool isQ = vector_length_in_bytes == 16;
1993   bool is_min = opc == Op_MinReductionV;
1994 
1995   BLOCK_COMMENT("neon_reduce_minmax_integral {");
1996     if (bt == T_LONG) {
1997       assert(vtmp == fnoreg, "should be");
1998       assert(isQ, "should be");
1999       umov(rscratch1, vsrc, D, 0);
2000       cmp(isrc, rscratch1);
2001       csel(dst, isrc, rscratch1, is_min ? LT : GT);
2002       umov(rscratch1, vsrc, D, 1);
2003       cmp(dst, rscratch1);
2004       csel(dst, dst, rscratch1, is_min ? LT : GT);
2005     } else {
2006       SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
2007       if (size == T2S) {
2008         is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
2009       } else {
2010         is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
2011       }
2012       if (bt == T_INT) {
2013         umov(dst, vtmp, S, 0);
2014       } else {
2015         smov(dst, vtmp, elemType_to_regVariant(bt), 0);
2016       }
2017       cmpw(dst, isrc);
2018       cselw(dst, dst, isrc, is_min ? LT : GT);
2019     }
2020   BLOCK_COMMENT("} neon_reduce_minmax_integral");
2021 }
2022 
// Vector reduction for integral type with SVE instructions.
2024 // Supported operations are Add, And, Or, Xor, Max, Min.
2025 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
2026 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
2027                                             FloatRegister src2, PRegister pg, FloatRegister tmp) {
2028   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2029   assert(pg->is_governing(), "This register has to be a governing predicate register");
2030   assert_different_registers(src1, dst);
2031   // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
2032   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2033   switch (opc) {
2034     case Op_AddReductionVI: {
2035       sve_uaddv(tmp, size, pg, src2);
2036       if (bt == T_BYTE) {
2037         smov(dst, tmp, size, 0);
2038         addw(dst, src1, dst, ext::sxtb);
2039       } else if (bt == T_SHORT) {
2040         smov(dst, tmp, size, 0);
2041         addw(dst, src1, dst, ext::sxth);
2042       } else {
2043         umov(dst, tmp, size, 0);
2044         addw(dst, dst, src1);
2045       }
2046       break;
2047     }
2048     case Op_AddReductionVL: {
2049       sve_uaddv(tmp, size, pg, src2);
2050       umov(dst, tmp, size, 0);
2051       add(dst, dst, src1);
2052       break;
2053     }
2054     case Op_AndReductionV: {
2055       sve_andv(tmp, size, pg, src2);
2056       if (bt == T_INT || bt == T_LONG) {
2057         umov(dst, tmp, size, 0);
2058       } else {
2059         smov(dst, tmp, size, 0);
2060       }
2061       if (bt == T_LONG) {
2062         andr(dst, dst, src1);
2063       } else {
2064         andw(dst, dst, src1);
2065       }
2066       break;
2067     }
2068     case Op_OrReductionV: {
2069       sve_orv(tmp, size, pg, src2);
2070       if (bt == T_INT || bt == T_LONG) {
2071         umov(dst, tmp, size, 0);
2072       } else {
2073         smov(dst, tmp, size, 0);
2074       }
2075       if (bt == T_LONG) {
2076         orr(dst, dst, src1);
2077       } else {
2078         orrw(dst, dst, src1);
2079       }
2080       break;
2081     }
2082     case Op_XorReductionV: {
2083       sve_eorv(tmp, size, pg, src2);
2084       if (bt == T_INT || bt == T_LONG) {
2085         umov(dst, tmp, size, 0);
2086       } else {
2087         smov(dst, tmp, size, 0);
2088       }
2089       if (bt == T_LONG) {
2090         eor(dst, dst, src1);
2091       } else {
2092         eorw(dst, dst, src1);
2093       }
2094       break;
2095     }
2096     case Op_MaxReductionV: {
2097       sve_smaxv(tmp, size, pg, src2);
2098       if (bt == T_INT || bt == T_LONG) {
2099         umov(dst, tmp, size, 0);
2100       } else {
2101         smov(dst, tmp, size, 0);
2102       }
2103       if (bt == T_LONG) {
2104         cmp(dst, src1);
2105         csel(dst, dst, src1, Assembler::GT);
2106       } else {
2107         cmpw(dst, src1);
2108         cselw(dst, dst, src1, Assembler::GT);
2109       }
2110       break;
2111     }
2112     case Op_MinReductionV: {
2113       sve_sminv(tmp, size, pg, src2);
2114       if (bt == T_INT || bt == T_LONG) {
2115         umov(dst, tmp, size, 0);
2116       } else {
2117         smov(dst, tmp, size, 0);
2118       }
2119       if (bt == T_LONG) {
2120         cmp(dst, src1);
2121         csel(dst, dst, src1, Assembler::LT);
2122       } else {
2123         cmpw(dst, src1);
2124         cselw(dst, dst, src1, Assembler::LT);
2125       }
2126       break;
2127     }
2128     default:
2129       assert(false, "unsupported");
2130       ShouldNotReachHere();
2131   }
2132 
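  // For sub-word types, re-sign-extend the result of a logical reduction:
  // the scalar input src1 is not necessarily sign-extended, so the combined
  // value may have inconsistent upper bits.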
2133   if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
2134     if (bt == T_BYTE) {
2135       sxtb(dst, dst);
2136     } else if (bt == T_SHORT) {
2137       sxth(dst, dst);
2138     }
2139   }
2140 }
2141 
// Set elements of the dst predicate to true for lanes in the range [0, lane_cnt), and
// to false otherwise. The input "lane_cnt" must be smaller than or equal to the supported
// max vector length of the basic type. Clobbers: rscratch1 and rFlagsReg.
2145 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
2146   uint32_t max_vector_length = Matcher::max_vector_size(bt);
2147   assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
2148 
2149   // Set all elements to false if the input "lane_cnt" is zero.
2150   if (lane_cnt == 0) {
2151     sve_pfalse(dst);
2152     return;
2153   }
2154 
2155   SIMD_RegVariant size = elemType_to_regVariant(bt);
2156   assert(size != Q, "invalid size");
2157 
  // Set all elements to true if "lane_cnt" equals the max lane count.
2159   if (lane_cnt == max_vector_length) {
2160     sve_ptrue(dst, size, /* ALL */ 0b11111);
2161     return;
2162   }
2163 
2164   // Fixed numbers for "ptrue".
2165   switch(lane_cnt) {
2166   case 1: /* VL1 */
2167   case 2: /* VL2 */
2168   case 3: /* VL3 */
2169   case 4: /* VL4 */
2170   case 5: /* VL5 */
2171   case 6: /* VL6 */
2172   case 7: /* VL7 */
2173   case 8: /* VL8 */
2174     sve_ptrue(dst, size, lane_cnt);
2175     return;
2176   case 16:
2177     sve_ptrue(dst, size, /* VL16 */ 0b01001);
2178     return;
2179   case 32:
2180     sve_ptrue(dst, size, /* VL32 */ 0b01010);
2181     return;
2182   case 64:
2183     sve_ptrue(dst, size, /* VL64 */ 0b01011);
2184     return;
2185   case 128:
2186     sve_ptrue(dst, size, /* VL128 */ 0b01100);
2187     return;
2188   case 256:
2189     sve_ptrue(dst, size, /* VL256 */ 0b01101);
2190     return;
2191   default:
2192     break;
2193   }
2194 
2195   // Special patterns for "ptrue".
2196   if (lane_cnt == round_down_power_of_2(max_vector_length)) {
2197     sve_ptrue(dst, size, /* POW2 */ 0b00000);
2198   } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
2199     sve_ptrue(dst, size, /* MUL4 */ 0b11101);
2200   } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
2201     sve_ptrue(dst, size, /* MUL3 */ 0b11110);
2202   } else {
2203     // Encode to "whileltw" for the remaining cases.
2204     mov(rscratch1, lane_cnt);
2205     sve_whileltw(dst, size, zr, rscratch1);
2206   }
2207 }
2208 
2209 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
2210 // Any remaining elements of dst will be filled with zero.
2211 // Clobbers: rscratch1
2212 // Preserves: src, mask
2213 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
2214                                            FloatRegister vtmp1, FloatRegister vtmp2,
2215                                            PRegister pgtmp) {
2216   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2217   assert_different_registers(dst, src, vtmp1, vtmp2);
2218   assert_different_registers(mask, pgtmp);
2219 
2220   // Example input:   src   = 8888 7777 6666 5555 4444 3333 2222 1111
2221   //                  mask  = 0001 0000 0000 0001 0001 0000 0001 0001
2222   // Expected result: dst   = 0000 0000 0000 8888 5555 4444 2222 1111
2223   sve_dup(vtmp2, H, 0);
2224 
2225   // Extend lowest half to type INT.
2226   // dst = 00004444 00003333 00002222 00001111
2227   sve_uunpklo(dst, S, src);
2228   // pgtmp = 00000001 00000000 00000001 00000001
2229   sve_punpklo(pgtmp, mask);
  // Pack the active elements of INT size to the right, and fill the
  // remaining elements with zero.
2232   // dst = 00000000 00004444 00002222 00001111
2233   sve_compact(dst, S, dst, pgtmp);
2234   // Narrow the result back to type SHORT.
2235   // dst = 0000 0000 0000 0000 0000 4444 2222 1111
2236   sve_uzp1(dst, H, dst, vtmp2);
  // Count the active elements of the lowest half.
2238   // rscratch1 = 3
2239   sve_cntp(rscratch1, S, ptrue, pgtmp);
2240 
2241   // Repeat to the highest half.
2242   // pgtmp = 00000001 00000000 00000000 00000001
2243   sve_punpkhi(pgtmp, mask);
2244   // vtmp1 = 00008888 00007777 00006666 00005555
2245   sve_uunpkhi(vtmp1, S, src);
2246   // vtmp1 = 00000000 00000000 00008888 00005555
2247   sve_compact(vtmp1, S, vtmp1, pgtmp);
2248   // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2249   sve_uzp1(vtmp1, H, vtmp1, vtmp2);
2250 
2251   // Compressed low:   dst   = 0000 0000 0000 0000 0000 4444 2222 1111
2252   // Compressed high:  vtmp1 = 0000 0000 0000 0000 0000 0000 8888  5555
  // Left shift (cross-lane) the compressed high part by TRUE_CNT lanes, where
  // TRUE_CNT is the number of active elements in the compressed low part.
2255   neg(rscratch1, rscratch1);
2256   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2257   sve_index(vtmp2, H, rscratch1, 1);
2258   // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
2259   sve_tbl(vtmp1, H, vtmp1, vtmp2);
2260 
  // Combine the compressed high part (after the shift) with the compressed low part.
2262   // dst = 0000 0000 0000 8888 5555 4444 2222 1111
2263   sve_orr(dst, dst, vtmp1);
2264 }
2265 
2266 // Clobbers: rscratch1, rscratch2
2267 // Preserves: src, mask
2268 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
2269                                           FloatRegister vtmp1, FloatRegister vtmp2,
2270                                           FloatRegister vtmp3, FloatRegister vtmp4,
2271                                           PRegister ptmp, PRegister pgtmp) {
2272   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2273   assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
2274   assert_different_registers(mask, ptmp, pgtmp);
2275   // Example input:   src   = 88 77 66 55 44 33 22 11
2276   //                  mask  = 01 00 00 01 01 00 01 01
2277   // Expected result: dst   = 00 00 00 88 55 44 22 11
2278 
2279   sve_dup(vtmp4, B, 0);
2280   // Extend lowest half to type SHORT.
2281   // vtmp1 = 0044 0033 0022 0011
2282   sve_uunpklo(vtmp1, H, src);
2283   // ptmp = 0001 0000 0001 0001
2284   sve_punpklo(ptmp, mask);
  // Count the active elements of the lowest half.
2286   // rscratch2 = 3
2287   sve_cntp(rscratch2, H, ptrue, ptmp);
  // Pack the active elements of SHORT size to the right, and fill the
  // remaining elements with zero.
2290   // dst = 0000 0044 0022 0011
2291   sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
2292   // Narrow the result back to type BYTE.
2293   // dst = 00 00 00 00 00 44 22 11
2294   sve_uzp1(dst, B, dst, vtmp4);
2295 
2296   // Repeat to the highest half.
2297   // ptmp = 0001 0000 0000 0001
2298   sve_punpkhi(ptmp, mask);
  // vtmp2 = 0088 0077 0066 0055
2300   sve_uunpkhi(vtmp2, H, src);
2301   // vtmp1 = 0000 0000 0088 0055
2302   sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);
2303 
2304   sve_dup(vtmp4, B, 0);
2305   // vtmp1 = 00 00 00 00 00 00 88 55
2306   sve_uzp1(vtmp1, B, vtmp1, vtmp4);
2307 
2308   // Compressed low:   dst   = 00 00 00 00 00 44 22 11
2309   // Compressed high:  vtmp1 = 00 00 00 00 00 00 88 55
  // Left shift (cross-lane) the compressed high part by TRUE_CNT lanes, where
  // TRUE_CNT is the number of active elements in the compressed low part.
2312   neg(rscratch2, rscratch2);
2313   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2314   sve_index(vtmp2, B, rscratch2, 1);
2315   // vtmp1 = 00 00 00 88 55 00 00 00
2316   sve_tbl(vtmp1, B, vtmp1, vtmp2);
  // Combine the compressed high part (after the shift) with the compressed low part.
2318   // dst = 00 00 00 88 55 44 22 11
2319   sve_orr(dst, dst, vtmp1);
2320 }
2321 
2322 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2323   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2324   SIMD_Arrangement size = isQ ? T16B : T8B;
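  // RBIT reverses the bits within each byte, so elements wider than a byte
  // need their byte order reversed first to get an element-wise bit reversal.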
2325   if (bt == T_BYTE) {
2326     rbit(dst, size, src);
2327   } else {
2328     neon_reverse_bytes(dst, src, bt, isQ);
2329     rbit(dst, size, dst);
2330   }
2331 }
2332 
2333 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2334   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2335   SIMD_Arrangement size = isQ ? T16B : T8B;
2336   switch (bt) {
2337     case T_BYTE:
2338       if (dst != src) {
2339         orr(dst, size, src, src);
2340       }
2341       break;
2342     case T_SHORT:
2343       rev16(dst, size, src);
2344       break;
2345     case T_INT:
2346       rev32(dst, size, src);
2347       break;
2348     case T_LONG:
2349       rev64(dst, size, src);
2350       break;
2351     default:
2352       assert(false, "unsupported");
2353       ShouldNotReachHere();
2354   }
2355 }
2356 
// Extract a scalar element from an SVE vector at position 'idx'.
2358 // The input elements in src are expected to be of integral type.
2359 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2360                                              int idx, FloatRegister vtmp) {
2361   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2362   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (regVariant_to_elemBits(size) * idx < 128) { // generate a lower-cost NEON instruction
2364     if (bt == T_INT || bt == T_LONG) {
2365       umov(dst, src, size, idx);
2366     } else {
2367       smov(dst, src, size, idx);
2368     }
2369   } else {
2370     sve_orr(vtmp, src, src);
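    // The EXT immediate is a byte offset: B/H/S/D encode log2 of the element
    // size, so idx << size equals idx * (element size in bytes).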
2371     sve_ext(vtmp, vtmp, idx << size);
2372     if (bt == T_INT || bt == T_LONG) {
2373       umov(dst, vtmp, size, 0);
2374     } else {
2375       smov(dst, vtmp, size, 0);
2376     }
2377   }
2378 }
2379 
2380 // java.lang.Math::round intrinsics
2381 
2382 // Clobbers: rscratch1, rflags
2383 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2384                                           FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
2385   assert_different_registers(tmp1, tmp2, tmp3, src, dst);
2386   switch (T) {
2387     case T2S:
2388     case T4S:
2389       fmovs(tmp1, T, 0.5f);
2390       mov(rscratch1, jint_cast(0x1.0p23f));
2391       break;
2392     case T2D:
2393       fmovd(tmp1, T, 0.5);
2394       mov(rscratch1, julong_cast(0x1.0p52));
2395       break;
2396     default:
2397       assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2398   }
2399   fadd(tmp1, T, tmp1, src);
2400   fcvtms(tmp1, T, tmp1);
2401   // tmp1 = floor(src + 0.5, ties to even)
2402 
2403   fcvtas(dst, T, src);
2404   // dst = round(src), ties to away
2405 
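  // Decide which result each lane keeps. Comparing the bit patterns of -src and
  // 2^23 (2^52 for T2D) as unsigned integers is true for every non-negative src
  // (negation sets the sign bit) and for negative src with a magnitude of at
  // least 2^23, which is already integral; those lanes keep fcvtas(src), while
  // the remaining small negative lanes take floor(src + 0.5) via BIF below.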
2406   fneg(tmp3, T, src);
2407   dup(tmp2, T, rscratch1);
2408   cm(HS, tmp3, T, tmp3, tmp2);
2409   // tmp3 is now a set of flags
2410 
2411   bif(dst, T16B, tmp1, tmp3);
2412   // result in dst
2413 }
2414 
2415 // Clobbers: rscratch1, rflags
2416 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2417                                          FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
2418   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2419   assert_different_registers(tmp1, tmp2, src, dst);
2420 
2421   switch (T) {
2422     case S:
2423       mov(rscratch1, jint_cast(0x1.0p23f));
2424       break;
2425     case D:
2426       mov(rscratch1, julong_cast(0x1.0p52));
2427       break;
2428     default:
2429       assert(T == S || T == D, "invalid register variant");
2430   }
2431 
2432   sve_frinta(dst, T, ptrue, src);
2433   // dst = round(src), ties to away
2434 
2435   Label none;
2436 
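  // pgtmp selects the lanes that need the floor(src + 0.5) computation: negative
  // src whose magnitude is at most 2^23 (2^52 for D), determined by an unsigned
  // bit-pattern compare against -src. If no lane is selected, skip the fix-up.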
2437   sve_fneg(tmp1, T, ptrue, src);
2438   sve_dup(tmp2, T, rscratch1);
2439   sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
2440   br(EQ, none);
2441   {
2442     sve_cpy(tmp1, T, pgtmp, 0.5);
2443     sve_fadd(tmp1, T, pgtmp, src);
2444     sve_frintm(dst, T, pgtmp, tmp1);
2445     // dst = floor(src + 0.5, ties to even)
2446   }
2447   bind(none);
2448 
2449   sve_fcvtzs(dst, T, ptrue, dst, T);
2450   // result in dst
2451 }
2452 
2453 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2454                                            FloatRegister one, SIMD_Arrangement T) {
2455   assert_different_registers(dst, src, zero, one);
2456   assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2457 
2458   facgt(dst, T, src, zero);
2459   ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
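  // BSL keeps the sign bit of src (the mask bit is 0 there) and takes the
  // remaining bits from 'one', producing copysign(1.0, src); lanes with an
  // all-zero mask pass src (+-0.0 or NaN) through unchanged.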
2460   bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2461 }
2462 
2463 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2464                                           FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
  assert_different_registers(dst, src, zero, one, vtmp);
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");

  sve_orr(vtmp, src, src);
  sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise
  switch (T) {
  case S:
    sve_and(vtmp, T, min_jint); // Extract the sign bit of the float value in every lane of src
    sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
                                      // on the sign of the float value
    break;
  case D:
    sve_and(vtmp, T, min_jlong);
    sve_orr(vtmp, T, jlong_cast(1.0));
    break;
  default:
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
  sve_sel(dst, T, pgtmp, vtmp, src); // Select from src or vtmp based on the predicate register pgtmp.
                                     // Result in dst.
2486 }
2487 
2488 bool C2_MacroAssembler::in_scratch_emit_size() {
2489   if (ciEnv::current()->task() != nullptr) {
2490     PhaseOutput* phase_output = Compile::current()->output();
2491     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2492       return true;
2493     }
2494   }
2495   return MacroAssembler::in_scratch_emit_size();
2496 }