1 /*
   2  * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "opto/c2_MacroAssembler.hpp"
  29 #include "opto/compile.hpp"
  30 #include "opto/intrinsicnode.hpp"
  31 #include "opto/matcher.hpp"
  32 #include "opto/output.hpp"
  33 #include "opto/subnode.hpp"
  34 #include "runtime/stubRoutines.hpp"
  35 
  36 #ifdef PRODUCT
  37 #define BLOCK_COMMENT(str) /* nothing */
  38 #define STOP(error) stop(error)
  39 #else
  40 #define BLOCK_COMMENT(str) block_comment(str)
  41 #define STOP(error) block_comment(error); stop(error)
  42 #endif
  43 
  44 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  45 
  46 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
  47 
  48 void C2_MacroAssembler::entry_barrier() {
  49   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  50   if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
  51     // Dummy labels for just measuring the code size
  52     Label dummy_slow_path;
  53     Label dummy_continuation;
  54     Label dummy_guard;
  55     Label* slow_path = &dummy_slow_path;
  56     Label* continuation = &dummy_continuation;
  57     Label* guard = &dummy_guard;
  58     if (!Compile::current()->output()->in_scratch_emit_size()) {
  59       // Use real labels from actual stub when not emitting code for the purpose of measuring its size
  60       C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
  61       Compile::current()->output()->add_stub(stub);
  62       slow_path = &stub->entry();
  63       continuation = &stub->continuation();
  64       guard = &stub->guard();
  65     }
  66     // In the C2 code, we move the non-hot part of nmethod entry barriers out-of-line to a stub.
  67     bs->nmethod_entry_barrier(this, slow_path, continuation, guard);
  68   }
  69 }
  70 
  71 void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg,
  72                                   Register tmp2Reg, Register tmp3Reg) {
  73   Register oop = objectReg;
  74   Register box = boxReg;
  75   Register disp_hdr = tmpReg;
  76   Register tmp = tmp2Reg;
  77   Label cont;
  78   Label object_has_monitor;
  79   Label count, no_count;
  80 
  81   assert_different_registers(oop, box, tmp, disp_hdr);
  82 
  83   // Load markWord from object into displaced_header.
  84   ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));
  85 
  86   if (DiagnoseSyncOnValueBasedClasses != 0) {
  87     load_klass(tmp, oop);
  88     ldrw(tmp, Address(tmp, Klass::access_flags_offset()));
  89     tstw(tmp, JVM_ACC_IS_VALUE_BASED_CLASS);
  90     br(Assembler::NE, cont);
  91   }
  92 
  93   // Check for existing monitor
  94   tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor);
  95 
  96   if (LockingMode == LM_MONITOR) {
  97     tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
  98     b(cont);
  99   } else if (LockingMode == LM_LEGACY) {
 100     // Set tmp to be (markWord of object | UNLOCK_VALUE).
 101     orr(tmp, disp_hdr, markWord::unlocked_value);
 102 
 103     if (EnableValhalla) {
 104       // Mask inline_type bit such that we go to the slow path if object is an inline type
 105       andr(tmp, tmp, ~((int) markWord::inline_type_bit_in_place));
 106     }
 107 
 108     // Initialize the box. (Must happen before we update the object mark!)
 109     str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 110 
 111     // Compare object markWord with an unlocked value (tmp) and if
 112     // equal exchange the stack address of our box with object markWord.
 113     // On failure disp_hdr contains the possibly locked markWord.
 114     cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true,
 115             /*release*/ true, /*weak*/ false, disp_hdr);
 116     br(Assembler::EQ, cont);
 117 
 118     assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
 119 
 120     // If the compare-and-exchange succeeded, then we found an unlocked
 121     // object, will have now locked it will continue at label cont
 122 
 123     // Check if the owner is self by comparing the value in the
 124     // markWord of object (disp_hdr) with the stack pointer.
 125     mov(rscratch1, sp);
 126     sub(disp_hdr, disp_hdr, rscratch1);
 127     mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place));
 128     // If condition is true we are cont and hence we can store 0 as the
 129     // displaced header in the box, which indicates that it is a recursive lock.
 130     ands(tmp/*==0?*/, disp_hdr, tmp);   // Sets flags for result
 131     str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 132     b(cont);
 133   } else {
 134     assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 135     lightweight_lock(oop, disp_hdr, tmp, tmp3Reg, no_count);
 136     b(count);
 137   }
 138 
 139   // Handle existing monitor.
 140   bind(object_has_monitor);
 141 
 142   // The object's monitor m is unlocked iff m->owner == nullptr,
 143   // otherwise m->owner may contain a thread or a stack address.
 144   //
 145   // Try to CAS m->owner from null to current thread.
 146   add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value));
 147   cmpxchg(tmp, zr, rthread, Assembler::xword, /*acquire*/ true,
 148           /*release*/ true, /*weak*/ false, tmp3Reg); // Sets flags for result
 149 
 150   if (LockingMode != LM_LIGHTWEIGHT) {
 151     // Store a non-null value into the box to avoid looking like a re-entrant
 152     // lock. The fast-path monitor unlock code checks for
 153     // markWord::monitor_value so use markWord::unused_mark which has the
 154     // relevant bit set, and also matches ObjectSynchronizer::enter.
 155     mov(tmp, (address)markWord::unused_mark().value());
 156     str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 157   }
 158   br(Assembler::EQ, cont); // CAS success means locking succeeded
 159 
 160   cmp(tmp3Reg, rthread);
 161   br(Assembler::NE, cont); // Check for recursive locking
 162 
 163   // Recursive lock case
 164   increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1);
 165   // flag == EQ still from the cmp above, checking if this is a reentrant lock
 166 
 167   bind(cont);
 168   // flag == EQ indicates success
 169   // flag == NE indicates failure
 170   br(Assembler::NE, no_count);
 171 
 172   bind(count);
 173   increment(Address(rthread, JavaThread::held_monitor_count_offset()));
 174 
 175   bind(no_count);
 176 }
 177 
 178 void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg,
 179                                     Register tmp2Reg) {
 180   Register oop = objectReg;
 181   Register box = boxReg;
 182   Register disp_hdr = tmpReg;
 183   Register tmp = tmp2Reg;
 184   Label cont;
 185   Label object_has_monitor;
 186   Label count, no_count;
 187 
 188   assert_different_registers(oop, box, tmp, disp_hdr);
 189 
 190   if (LockingMode == LM_LEGACY) {
 191     // Find the lock address and load the displaced header from the stack.
 192     ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 193 
 194     // If the displaced header is 0, we have a recursive unlock.
 195     cmp(disp_hdr, zr);
 196     br(Assembler::EQ, cont);
 197   }
 198 
 199   // Handle existing monitor.
 200   ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
 201   tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor);
 202 
 203   if (LockingMode == LM_MONITOR) {
 204     tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
 205     b(cont);
 206   } else if (LockingMode == LM_LEGACY) {
 207     // Check if it is still a light weight lock, this is is true if we
 208     // see the stack address of the basicLock in the markWord of the
 209     // object.
 210 
 211     cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false,
 212             /*release*/ true, /*weak*/ false, tmp);
 213     b(cont);
 214   } else {
 215     assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 216     lightweight_unlock(oop, tmp, box, disp_hdr, no_count);
 217     b(count);
 218   }
 219 
 220   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
 221 
 222   // Handle existing monitor.
 223   bind(object_has_monitor);
 224   STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
 225   add(tmp, tmp, -(int)markWord::monitor_value); // monitor
 226 
 227   if (LockingMode == LM_LIGHTWEIGHT) {
 228     // If the owner is anonymous, we need to fix it -- in an outline stub.
 229     Register tmp2 = disp_hdr;
 230     ldr(tmp2, Address(tmp, ObjectMonitor::owner_offset()));
 231     // We cannot use tbnz here, the target might be too far away and cannot
 232     // be encoded.
 233     tst(tmp2, (uint64_t)ObjectMonitor::ANONYMOUS_OWNER);
 234     C2HandleAnonOMOwnerStub* stub = new (Compile::current()->comp_arena()) C2HandleAnonOMOwnerStub(tmp, tmp2);
 235     Compile::current()->output()->add_stub(stub);
 236     br(Assembler::NE, stub->entry());
 237     bind(stub->continuation());
 238   }
 239 
 240   ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
 241 
 242   Label notRecursive;
 243   cbz(disp_hdr, notRecursive);
 244 
 245   // Recursive lock
 246   sub(disp_hdr, disp_hdr, 1u);
 247   str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
 248   cmp(disp_hdr, disp_hdr); // Sets flags for result
 249   b(cont);
 250 
 251   bind(notRecursive);
 252   ldr(rscratch1, Address(tmp, ObjectMonitor::EntryList_offset()));
 253   ldr(disp_hdr, Address(tmp, ObjectMonitor::cxq_offset()));
 254   orr(rscratch1, rscratch1, disp_hdr); // Will be 0 if both are 0.
 255   cmp(rscratch1, zr); // Sets flags for result
 256   cbnz(rscratch1, cont);
 257   // need a release store here
 258   lea(tmp, Address(tmp, ObjectMonitor::owner_offset()));
 259   stlr(zr, tmp); // set unowned
 260 
 261   bind(cont);
 262   // flag == EQ indicates success
 263   // flag == NE indicates failure
 264   br(Assembler::NE, no_count);
 265 
 266   bind(count);
 267   decrement(Address(rthread, JavaThread::held_monitor_count_offset()));
 268 
 269   bind(no_count);
 270 }
 271 
 272 // Search for str1 in str2 and return index or -1
 273 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
 274 void C2_MacroAssembler::string_indexof(Register str2, Register str1,
 275                                        Register cnt2, Register cnt1,
 276                                        Register tmp1, Register tmp2,
 277                                        Register tmp3, Register tmp4,
 278                                        Register tmp5, Register tmp6,
 279                                        int icnt1, Register result, int ae) {
 280   // NOTE: tmp5, tmp6 can be zr depending on specific method version
 281   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
 282 
 283   Register ch1 = rscratch1;
 284   Register ch2 = rscratch2;
 285   Register cnt1tmp = tmp1;
 286   Register cnt2tmp = tmp2;
 287   Register cnt1_neg = cnt1;
 288   Register cnt2_neg = cnt2;
 289   Register result_tmp = tmp4;
 290 
 291   bool isL = ae == StrIntrinsicNode::LL;
 292 
 293   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
 294   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
 295   int str1_chr_shift = str1_isL ? 0:1;
 296   int str2_chr_shift = str2_isL ? 0:1;
 297   int str1_chr_size = str1_isL ? 1:2;
 298   int str2_chr_size = str2_isL ? 1:2;
 299   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
 300                                       (chr_insn)&MacroAssembler::ldrh;
 301   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
 302                                       (chr_insn)&MacroAssembler::ldrh;
 303   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
 304   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
 305 
 306   // Note, inline_string_indexOf() generates checks:
 307   // if (substr.count > string.count) return -1;
 308   // if (substr.count == 0) return 0;
 309 
 310   // We have two strings, a source string in str2, cnt2 and a pattern string
 311   // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
 312 
 313   // For larger pattern and source we use a simplified Boyer Moore algorithm.
 314   // With a small pattern and source we use linear scan.
 315 
 316   if (icnt1 == -1) {
 317     sub(result_tmp, cnt2, cnt1);
 318     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
 319     br(LT, LINEARSEARCH);
 320     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
 321     subs(zr, cnt1, 256);
 322     lsr(tmp1, cnt2, 2);
 323     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
 324     br(GE, LINEARSTUB);
 325   }
 326 
 327 // The Boyer Moore alogorithm is based on the description here:-
 328 //
 329 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
 330 //
 331 // This describes and algorithm with 2 shift rules. The 'Bad Character' rule
 332 // and the 'Good Suffix' rule.
 333 //
 334 // These rules are essentially heuristics for how far we can shift the
 335 // pattern along the search string.
 336 //
 337 // The implementation here uses the 'Bad Character' rule only because of the
 338 // complexity of initialisation for the 'Good Suffix' rule.
 339 //
 340 // This is also known as the Boyer-Moore-Horspool algorithm:-
 341 //
 342 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
 343 //
 344 // This particular implementation has few java-specific optimizations.
 345 //
 346 // #define ASIZE 256
 347 //
 348 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
 349 //       int i, j;
 350 //       unsigned c;
 351 //       unsigned char bc[ASIZE];
 352 //
 353 //       /* Preprocessing */
 354 //       for (i = 0; i < ASIZE; ++i)
 355 //          bc[i] = m;
 356 //       for (i = 0; i < m - 1; ) {
 357 //          c = x[i];
 358 //          ++i;
 359 //          // c < 256 for Latin1 string, so, no need for branch
 360 //          #ifdef PATTERN_STRING_IS_LATIN1
 361 //          bc[c] = m - i;
 362 //          #else
 363 //          if (c < ASIZE) bc[c] = m - i;
 364 //          #endif
 365 //       }
 366 //
 367 //       /* Searching */
 368 //       j = 0;
 369 //       while (j <= n - m) {
 370 //          c = y[i+j];
 371 //          if (x[m-1] == c)
 372 //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
 373 //          if (i < 0) return j;
 374 //          // c < 256 for Latin1 string, so, no need for branch
 375 //          #ifdef SOURCE_STRING_IS_LATIN1
 376 //          // LL case: (c< 256) always true. Remove branch
 377 //          j += bc[y[j+m-1]];
 378 //          #endif
 379 //          #ifndef PATTERN_STRING_IS_UTF
 380 //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
 381 //          if (c < ASIZE)
 382 //            j += bc[y[j+m-1]];
 383 //          else
 384 //            j += 1
 385 //          #endif
 386 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
 387 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
 388 //          if (c < ASIZE)
 389 //            j += bc[y[j+m-1]];
 390 //          else
 391 //            j += m
 392 //          #endif
 393 //       }
 394 //    }
 395 
 396   if (icnt1 == -1) {
 397     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
 398         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
 399     Register cnt1end = tmp2;
 400     Register str2end = cnt2;
 401     Register skipch = tmp2;
 402 
 403     // str1 length is >=8, so, we can read at least 1 register for cases when
 404     // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
 405     // UL case. We'll re-read last character in inner pre-loop code to have
 406     // single outer pre-loop load
 407     const int firstStep = isL ? 7 : 3;
 408 
 409     const int ASIZE = 256;
 410     const int STORED_BYTES = 32; // amount of bytes stored per instruction
 411     sub(sp, sp, ASIZE);
 412     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
 413     mov(ch1, sp);
 414     BIND(BM_INIT_LOOP);
 415       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
 416       subs(tmp5, tmp5, 1);
 417       br(GT, BM_INIT_LOOP);
 418 
 419       sub(cnt1tmp, cnt1, 1);
 420       mov(tmp5, str2);
 421       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
 422       sub(ch2, cnt1, 1);
 423       mov(tmp3, str1);
 424     BIND(BCLOOP);
 425       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
 426       if (!str1_isL) {
 427         subs(zr, ch1, ASIZE);
 428         br(HS, BCSKIP);
 429       }
 430       strb(ch2, Address(sp, ch1));
 431     BIND(BCSKIP);
 432       subs(ch2, ch2, 1);
 433       br(GT, BCLOOP);
 434 
 435       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
 436       if (str1_isL == str2_isL) {
 437         // load last 8 bytes (8LL/4UU symbols)
 438         ldr(tmp6, Address(tmp6, -wordSize));
 439       } else {
 440         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
 441         // convert Latin1 to UTF. We'll have to wait until load completed, but
 442         // it's still faster than per-character loads+checks
 443         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
 444         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
 445         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
 446         andr(tmp6, tmp6, 0xFF); // str1[N-4]
 447         orr(ch2, ch1, ch2, LSL, 16);
 448         orr(tmp6, tmp6, tmp3, LSL, 48);
 449         orr(tmp6, tmp6, ch2, LSL, 16);
 450       }
 451     BIND(BMLOOPSTR2);
 452       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 453       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
 454       if (str1_isL == str2_isL) {
 455         // re-init tmp3. It's for free because it's executed in parallel with
 456         // load above. Alternative is to initialize it before loop, but it'll
 457         // affect performance on in-order systems with 2 or more ld/st pipelines
 458         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
 459       }
 460       if (!isL) { // UU/UL case
 461         lsl(ch2, cnt1tmp, 1); // offset in bytes
 462       }
 463       cmp(tmp3, skipch);
 464       br(NE, BMSKIP);
 465       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
 466       mov(ch1, tmp6);
 467       if (isL) {
 468         b(BMLOOPSTR1_AFTER_LOAD);
 469       } else {
 470         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
 471         b(BMLOOPSTR1_CMP);
 472       }
 473     BIND(BMLOOPSTR1);
 474       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
 475       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 476     BIND(BMLOOPSTR1_AFTER_LOAD);
 477       subs(cnt1tmp, cnt1tmp, 1);
 478       br(LT, BMLOOPSTR1_LASTCMP);
 479     BIND(BMLOOPSTR1_CMP);
 480       cmp(ch1, ch2);
 481       br(EQ, BMLOOPSTR1);
 482     BIND(BMSKIP);
 483       if (!isL) {
 484         // if we've met UTF symbol while searching Latin1 pattern, then we can
 485         // skip cnt1 symbols
 486         if (str1_isL != str2_isL) {
 487           mov(result_tmp, cnt1);
 488         } else {
 489           mov(result_tmp, 1);
 490         }
 491         subs(zr, skipch, ASIZE);
 492         br(HS, BMADV);
 493       }
 494       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
 495     BIND(BMADV);
 496       sub(cnt1tmp, cnt1, 1);
 497       add(str2, str2, result_tmp, LSL, str2_chr_shift);
 498       cmp(str2, str2end);
 499       br(LE, BMLOOPSTR2);
 500       add(sp, sp, ASIZE);
 501       b(NOMATCH);
 502     BIND(BMLOOPSTR1_LASTCMP);
 503       cmp(ch1, ch2);
 504       br(NE, BMSKIP);
 505     BIND(BMMATCH);
 506       sub(result, str2, tmp5);
 507       if (!str2_isL) lsr(result, result, 1);
 508       add(sp, sp, ASIZE);
 509       b(DONE);
 510 
 511     BIND(LINEARSTUB);
 512     cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
 513     br(LT, LINEAR_MEDIUM);
 514     mov(result, zr);
 515     RuntimeAddress stub = nullptr;
 516     if (isL) {
 517       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
 518       assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
 519     } else if (str1_isL) {
 520       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
 521        assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
 522     } else {
 523       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
 524       assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
 525     }
 526     address call = trampoline_call(stub);
 527     if (call == nullptr) {
 528       DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
 529       ciEnv::current()->record_failure("CodeCache is full");
 530       return;
 531     }
 532     b(DONE);
 533   }
 534 
 535   BIND(LINEARSEARCH);
 536   {
 537     Label DO1, DO2, DO3;
 538 
 539     Register str2tmp = tmp2;
 540     Register first = tmp3;
 541 
 542     if (icnt1 == -1)
 543     {
 544         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
 545 
 546         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
 547         br(LT, DOSHORT);
 548       BIND(LINEAR_MEDIUM);
 549         (this->*str1_load_1chr)(first, Address(str1));
 550         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
 551         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
 552         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 553         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 554 
 555       BIND(FIRST_LOOP);
 556         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 557         cmp(first, ch2);
 558         br(EQ, STR1_LOOP);
 559       BIND(STR2_NEXT);
 560         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 561         br(LE, FIRST_LOOP);
 562         b(NOMATCH);
 563 
 564       BIND(STR1_LOOP);
 565         adds(cnt1tmp, cnt1_neg, str1_chr_size);
 566         add(cnt2tmp, cnt2_neg, str2_chr_size);
 567         br(GE, MATCH);
 568 
 569       BIND(STR1_NEXT);
 570         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
 571         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 572         cmp(ch1, ch2);
 573         br(NE, STR2_NEXT);
 574         adds(cnt1tmp, cnt1tmp, str1_chr_size);
 575         add(cnt2tmp, cnt2tmp, str2_chr_size);
 576         br(LT, STR1_NEXT);
 577         b(MATCH);
 578 
 579       BIND(DOSHORT);
 580       if (str1_isL == str2_isL) {
 581         cmp(cnt1, (u1)2);
 582         br(LT, DO1);
 583         br(GT, DO3);
 584       }
 585     }
 586 
 587     if (icnt1 == 4) {
 588       Label CH1_LOOP;
 589 
 590         (this->*load_4chr)(ch1, str1);
 591         sub(result_tmp, cnt2, 4);
 592         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 593         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 594 
 595       BIND(CH1_LOOP);
 596         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
 597         cmp(ch1, ch2);
 598         br(EQ, MATCH);
 599         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 600         br(LE, CH1_LOOP);
 601         b(NOMATCH);
 602       }
 603 
 604     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
 605       Label CH1_LOOP;
 606 
 607       BIND(DO2);
 608         (this->*load_2chr)(ch1, str1);
 609         if (icnt1 == 2) {
 610           sub(result_tmp, cnt2, 2);
 611         }
 612         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 613         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 614       BIND(CH1_LOOP);
 615         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 616         cmp(ch1, ch2);
 617         br(EQ, MATCH);
 618         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 619         br(LE, CH1_LOOP);
 620         b(NOMATCH);
 621     }
 622 
 623     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
 624       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
 625 
 626       BIND(DO3);
 627         (this->*load_2chr)(first, str1);
 628         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
 629         if (icnt1 == 3) {
 630           sub(result_tmp, cnt2, 3);
 631         }
 632         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 633         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 634       BIND(FIRST_LOOP);
 635         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 636         cmpw(first, ch2);
 637         br(EQ, STR1_LOOP);
 638       BIND(STR2_NEXT);
 639         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 640         br(LE, FIRST_LOOP);
 641         b(NOMATCH);
 642 
 643       BIND(STR1_LOOP);
 644         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
 645         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 646         cmp(ch1, ch2);
 647         br(NE, STR2_NEXT);
 648         b(MATCH);
 649     }
 650 
 651     if (icnt1 == -1 || icnt1 == 1) {
 652       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
 653 
 654       BIND(DO1);
 655         (this->*str1_load_1chr)(ch1, str1);
 656         cmp(cnt2, (u1)8);
 657         br(LT, DO1_SHORT);
 658 
 659         sub(result_tmp, cnt2, 8/str2_chr_size);
 660         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 661         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 662         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 663 
 664         if (str2_isL) {
 665           orr(ch1, ch1, ch1, LSL, 8);
 666         }
 667         orr(ch1, ch1, ch1, LSL, 16);
 668         orr(ch1, ch1, ch1, LSL, 32);
 669       BIND(CH1_LOOP);
 670         ldr(ch2, Address(str2, cnt2_neg));
 671         eor(ch2, ch1, ch2);
 672         sub(tmp1, ch2, tmp3);
 673         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 674         bics(tmp1, tmp1, tmp2);
 675         br(NE, HAS_ZERO);
 676         adds(cnt2_neg, cnt2_neg, 8);
 677         br(LT, CH1_LOOP);
 678 
 679         cmp(cnt2_neg, (u1)8);
 680         mov(cnt2_neg, 0);
 681         br(LT, CH1_LOOP);
 682         b(NOMATCH);
 683 
 684       BIND(HAS_ZERO);
 685         rev(tmp1, tmp1);
 686         clz(tmp1, tmp1);
 687         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
 688         b(MATCH);
 689 
 690       BIND(DO1_SHORT);
 691         mov(result_tmp, cnt2);
 692         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
 693         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
 694       BIND(DO1_LOOP);
 695         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 696         cmpw(ch1, ch2);
 697         br(EQ, MATCH);
 698         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 699         br(LT, DO1_LOOP);
 700     }
 701   }
 702   BIND(NOMATCH);
 703     mov(result, -1);
 704     b(DONE);
 705   BIND(MATCH);
 706     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
 707   BIND(DONE);
 708 }
 709 
 710 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
 711 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
 712 
 713 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
 714                                             Register ch, Register result,
 715                                             Register tmp1, Register tmp2, Register tmp3)
 716 {
 717   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
 718   Register cnt1_neg = cnt1;
 719   Register ch1 = rscratch1;
 720   Register result_tmp = rscratch2;
 721 
 722   cbz(cnt1, NOMATCH);
 723 
 724   cmp(cnt1, (u1)4);
 725   br(LT, DO1_SHORT);
 726 
 727   orr(ch, ch, ch, LSL, 16);
 728   orr(ch, ch, ch, LSL, 32);
 729 
 730   sub(cnt1, cnt1, 4);
 731   mov(result_tmp, cnt1);
 732   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
 733   sub(cnt1_neg, zr, cnt1, LSL, 1);
 734 
 735   mov(tmp3, 0x0001000100010001);
 736 
 737   BIND(CH1_LOOP);
 738     ldr(ch1, Address(str1, cnt1_neg));
 739     eor(ch1, ch, ch1);
 740     sub(tmp1, ch1, tmp3);
 741     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
 742     bics(tmp1, tmp1, tmp2);
 743     br(NE, HAS_ZERO);
 744     adds(cnt1_neg, cnt1_neg, 8);
 745     br(LT, CH1_LOOP);
 746 
 747     cmp(cnt1_neg, (u1)8);
 748     mov(cnt1_neg, 0);
 749     br(LT, CH1_LOOP);
 750     b(NOMATCH);
 751 
 752   BIND(HAS_ZERO);
 753     rev(tmp1, tmp1);
 754     clz(tmp1, tmp1);
 755     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
 756     b(MATCH);
 757 
 758   BIND(DO1_SHORT);
 759     mov(result_tmp, cnt1);
 760     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
 761     sub(cnt1_neg, zr, cnt1, LSL, 1);
 762   BIND(DO1_LOOP);
 763     ldrh(ch1, Address(str1, cnt1_neg));
 764     cmpw(ch, ch1);
 765     br(EQ, MATCH);
 766     adds(cnt1_neg, cnt1_neg, 2);
 767     br(LT, DO1_LOOP);
 768   BIND(NOMATCH);
 769     mov(result, -1);
 770     b(DONE);
 771   BIND(MATCH);
 772     add(result, result_tmp, cnt1_neg, ASR, 1);
 773   BIND(DONE);
 774 }
 775 
 776 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
 777                                                 Register ch, Register result,
 778                                                 FloatRegister ztmp1,
 779                                                 FloatRegister ztmp2,
 780                                                 PRegister tmp_pg,
 781                                                 PRegister tmp_pdn, bool isL)
 782 {
 783   // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
 784   assert(tmp_pg->is_governing(),
 785          "this register has to be a governing predicate register");
 786 
 787   Label LOOP, MATCH, DONE, NOMATCH;
 788   Register vec_len = rscratch1;
 789   Register idx = rscratch2;
 790 
 791   SIMD_RegVariant T = (isL == true) ? B : H;
 792 
 793   cbz(cnt1, NOMATCH);
 794 
 795   // Assign the particular char throughout the vector.
 796   sve_dup(ztmp2, T, ch);
 797   if (isL) {
 798     sve_cntb(vec_len);
 799   } else {
 800     sve_cnth(vec_len);
 801   }
 802   mov(idx, 0);
 803 
 804   // Generate a predicate to control the reading of input string.
 805   sve_whilelt(tmp_pg, T, idx, cnt1);
 806 
 807   BIND(LOOP);
 808     // Read a vector of 8- or 16-bit data depending on the string type. Note
 809     // that inactive elements indicated by the predicate register won't cause
 810     // a data read from memory to the destination vector.
 811     if (isL) {
 812       sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
 813     } else {
 814       sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
 815     }
 816     add(idx, idx, vec_len);
 817 
 818     // Perform the comparison. An element of the destination predicate is set
 819     // to active if the particular char is matched.
 820     sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);
 821 
 822     // Branch if the particular char is found.
 823     br(NE, MATCH);
 824 
 825     sve_whilelt(tmp_pg, T, idx, cnt1);
 826 
 827     // Loop back if the particular char not found.
 828     br(MI, LOOP);
 829 
 830   BIND(NOMATCH);
 831     mov(result, -1);
 832     b(DONE);
 833 
 834   BIND(MATCH);
 835     // Undo the index increment.
 836     sub(idx, idx, vec_len);
 837 
 838     // Crop the vector to find its location.
 839     sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
 840     add(result, idx, -1);
 841     sve_incp(result, T, tmp_pdn);
 842   BIND(DONE);
 843 }
 844 
 845 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
 846                                             Register ch, Register result,
 847                                             Register tmp1, Register tmp2, Register tmp3)
 848 {
 849   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
 850   Register cnt1_neg = cnt1;
 851   Register ch1 = rscratch1;
 852   Register result_tmp = rscratch2;
 853 
 854   cbz(cnt1, NOMATCH);
 855 
 856   cmp(cnt1, (u1)8);
 857   br(LT, DO1_SHORT);
 858 
 859   orr(ch, ch, ch, LSL, 8);
 860   orr(ch, ch, ch, LSL, 16);
 861   orr(ch, ch, ch, LSL, 32);
 862 
 863   sub(cnt1, cnt1, 8);
 864   mov(result_tmp, cnt1);
 865   lea(str1, Address(str1, cnt1));
 866   sub(cnt1_neg, zr, cnt1);
 867 
 868   mov(tmp3, 0x0101010101010101);
 869 
 870   BIND(CH1_LOOP);
 871     ldr(ch1, Address(str1, cnt1_neg));
 872     eor(ch1, ch, ch1);
 873     sub(tmp1, ch1, tmp3);
 874     orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
 875     bics(tmp1, tmp1, tmp2);
 876     br(NE, HAS_ZERO);
 877     adds(cnt1_neg, cnt1_neg, 8);
 878     br(LT, CH1_LOOP);
 879 
 880     cmp(cnt1_neg, (u1)8);
 881     mov(cnt1_neg, 0);
 882     br(LT, CH1_LOOP);
 883     b(NOMATCH);
 884 
 885   BIND(HAS_ZERO);
 886     rev(tmp1, tmp1);
 887     clz(tmp1, tmp1);
 888     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
 889     b(MATCH);
 890 
 891   BIND(DO1_SHORT);
 892     mov(result_tmp, cnt1);
 893     lea(str1, Address(str1, cnt1));
 894     sub(cnt1_neg, zr, cnt1);
 895   BIND(DO1_LOOP);
 896     ldrb(ch1, Address(str1, cnt1_neg));
 897     cmp(ch, ch1);
 898     br(EQ, MATCH);
 899     adds(cnt1_neg, cnt1_neg, 1);
 900     br(LT, DO1_LOOP);
 901   BIND(NOMATCH);
 902     mov(result, -1);
 903     b(DONE);
 904   BIND(MATCH);
 905     add(result, result_tmp, cnt1_neg);
 906   BIND(DONE);
 907 }
 908 
 909 // Compare strings.
 910 void C2_MacroAssembler::string_compare(Register str1, Register str2,
 911     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
 912     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
 913     PRegister pgtmp1, PRegister pgtmp2, int ae) {
 914   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
 915       DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
 916       SHORT_LOOP_START, TAIL_CHECK;
 917 
 918   bool isLL = ae == StrIntrinsicNode::LL;
 919   bool isLU = ae == StrIntrinsicNode::LU;
 920   bool isUL = ae == StrIntrinsicNode::UL;
 921 
 922   // The stub threshold for LL strings is: 72 (64 + 8) chars
 923   // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
 924   // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
 925   const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);
 926 
 927   bool str1_isL = isLL || isLU;
 928   bool str2_isL = isLL || isUL;
 929 
 930   int str1_chr_shift = str1_isL ? 0 : 1;
 931   int str2_chr_shift = str2_isL ? 0 : 1;
 932   int str1_chr_size = str1_isL ? 1 : 2;
 933   int str2_chr_size = str2_isL ? 1 : 2;
 934   int minCharsInWord = isLL ? wordSize : wordSize/2;
 935 
 936   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
 937   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
 938                                       (chr_insn)&MacroAssembler::ldrh;
 939   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
 940                                       (chr_insn)&MacroAssembler::ldrh;
 941   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
 942                             (uxt_insn)&MacroAssembler::uxthw;
 943 
 944   BLOCK_COMMENT("string_compare {");
 945 
 946   // Bizzarely, the counts are passed in bytes, regardless of whether they
 947   // are L or U strings, however the result is always in characters.
 948   if (!str1_isL) asrw(cnt1, cnt1, 1);
 949   if (!str2_isL) asrw(cnt2, cnt2, 1);
 950 
 951   // Compute the minimum of the string lengths and save the difference.
 952   subsw(result, cnt1, cnt2);
 953   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
 954 
 955   // A very short string
 956   cmpw(cnt2, minCharsInWord);
 957   br(Assembler::LE, SHORT_STRING);
 958 
 959   // Compare longwords
 960   // load first parts of strings and finish initialization while loading
 961   {
 962     if (str1_isL == str2_isL) { // LL or UU
 963       ldr(tmp1, Address(str1));
 964       cmp(str1, str2);
 965       br(Assembler::EQ, DONE);
 966       ldr(tmp2, Address(str2));
 967       cmp(cnt2, stub_threshold);
 968       br(GE, STUB);
 969       subsw(cnt2, cnt2, minCharsInWord);
 970       br(EQ, TAIL_CHECK);
 971       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
 972       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
 973       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
 974     } else if (isLU) {
 975       ldrs(vtmp, Address(str1));
 976       ldr(tmp2, Address(str2));
 977       cmp(cnt2, stub_threshold);
 978       br(GE, STUB);
 979       subw(cnt2, cnt2, 4);
 980       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
 981       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
 982       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
 983       zip1(vtmp, T8B, vtmp, vtmpZ);
 984       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
 985       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
 986       add(cnt1, cnt1, 4);
 987       fmovd(tmp1, vtmp);
 988     } else { // UL case
 989       ldr(tmp1, Address(str1));
 990       ldrs(vtmp, Address(str2));
 991       cmp(cnt2, stub_threshold);
 992       br(GE, STUB);
 993       subw(cnt2, cnt2, 4);
 994       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
 995       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
 996       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
 997       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
 998       zip1(vtmp, T8B, vtmp, vtmpZ);
 999       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1000       add(cnt1, cnt1, 8);
1001       fmovd(tmp2, vtmp);
1002     }
1003     adds(cnt2, cnt2, isUL ? 4 : 8);
1004     br(GE, TAIL);
1005     eor(rscratch2, tmp1, tmp2);
1006     cbnz(rscratch2, DIFF);
1007     // main loop
1008     bind(NEXT_WORD);
1009     if (str1_isL == str2_isL) {
1010       ldr(tmp1, Address(str1, cnt2));
1011       ldr(tmp2, Address(str2, cnt2));
1012       adds(cnt2, cnt2, 8);
1013     } else if (isLU) {
1014       ldrs(vtmp, Address(str1, cnt1));
1015       ldr(tmp2, Address(str2, cnt2));
1016       add(cnt1, cnt1, 4);
1017       zip1(vtmp, T8B, vtmp, vtmpZ);
1018       fmovd(tmp1, vtmp);
1019       adds(cnt2, cnt2, 8);
1020     } else { // UL
1021       ldrs(vtmp, Address(str2, cnt2));
1022       ldr(tmp1, Address(str1, cnt1));
1023       zip1(vtmp, T8B, vtmp, vtmpZ);
1024       add(cnt1, cnt1, 8);
1025       fmovd(tmp2, vtmp);
1026       adds(cnt2, cnt2, 4);
1027     }
1028     br(GE, TAIL);
1029 
1030     eor(rscratch2, tmp1, tmp2);
1031     cbz(rscratch2, NEXT_WORD);
1032     b(DIFF);
1033     bind(TAIL);
1034     eor(rscratch2, tmp1, tmp2);
1035     cbnz(rscratch2, DIFF);
1036     // Last longword.  In the case where length == 4 we compare the
1037     // same longword twice, but that's still faster than another
1038     // conditional branch.
1039     if (str1_isL == str2_isL) {
1040       ldr(tmp1, Address(str1));
1041       ldr(tmp2, Address(str2));
1042     } else if (isLU) {
1043       ldrs(vtmp, Address(str1));
1044       ldr(tmp2, Address(str2));
1045       zip1(vtmp, T8B, vtmp, vtmpZ);
1046       fmovd(tmp1, vtmp);
1047     } else { // UL
1048       ldrs(vtmp, Address(str2));
1049       ldr(tmp1, Address(str1));
1050       zip1(vtmp, T8B, vtmp, vtmpZ);
1051       fmovd(tmp2, vtmp);
1052     }
1053     bind(TAIL_CHECK);
1054     eor(rscratch2, tmp1, tmp2);
1055     cbz(rscratch2, DONE);
1056 
1057     // Find the first different characters in the longwords and
1058     // compute their difference.
1059     bind(DIFF);
1060     rev(rscratch2, rscratch2);
1061     clz(rscratch2, rscratch2);
1062     andr(rscratch2, rscratch2, isLL ? -8 : -16);
1063     lsrv(tmp1, tmp1, rscratch2);
1064     (this->*ext_chr)(tmp1, tmp1);
1065     lsrv(tmp2, tmp2, rscratch2);
1066     (this->*ext_chr)(tmp2, tmp2);
1067     subw(result, tmp1, tmp2);
1068     b(DONE);
1069   }
1070 
1071   bind(STUB);
1072     RuntimeAddress stub = nullptr;
1073     switch(ae) {
1074       case StrIntrinsicNode::LL:
1075         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
1076         break;
1077       case StrIntrinsicNode::UU:
1078         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
1079         break;
1080       case StrIntrinsicNode::LU:
1081         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
1082         break;
1083       case StrIntrinsicNode::UL:
1084         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
1085         break;
1086       default:
1087         ShouldNotReachHere();
1088      }
1089     assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
1090     address call = trampoline_call(stub);
1091     if (call == nullptr) {
1092       DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
1093       ciEnv::current()->record_failure("CodeCache is full");
1094       return;
1095     }
1096     b(DONE);
1097 
1098   bind(SHORT_STRING);
1099   // Is the minimum length zero?
1100   cbz(cnt2, DONE);
1101   // arrange code to do most branches while loading and loading next characters
1102   // while comparing previous
1103   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1104   subs(cnt2, cnt2, 1);
1105   br(EQ, SHORT_LAST_INIT);
1106   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1107   b(SHORT_LOOP_START);
1108   bind(SHORT_LOOP);
1109   subs(cnt2, cnt2, 1);
1110   br(EQ, SHORT_LAST);
1111   bind(SHORT_LOOP_START);
1112   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
1113   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
1114   cmp(tmp1, cnt1);
1115   br(NE, SHORT_LOOP_TAIL);
1116   subs(cnt2, cnt2, 1);
1117   br(EQ, SHORT_LAST2);
1118   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1119   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1120   cmp(tmp2, rscratch1);
1121   br(EQ, SHORT_LOOP);
1122   sub(result, tmp2, rscratch1);
1123   b(DONE);
1124   bind(SHORT_LOOP_TAIL);
1125   sub(result, tmp1, cnt1);
1126   b(DONE);
1127   bind(SHORT_LAST2);
1128   cmp(tmp2, rscratch1);
1129   br(EQ, DONE);
1130   sub(result, tmp2, rscratch1);
1131 
1132   b(DONE);
1133   bind(SHORT_LAST_INIT);
1134   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1135   bind(SHORT_LAST);
1136   cmp(tmp1, cnt1);
1137   br(EQ, DONE);
1138   sub(result, tmp1, cnt1);
1139 
1140   bind(DONE);
1141 
1142   BLOCK_COMMENT("} string_compare");
1143 }
1144 
1145 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
1146                                      FloatRegister src2, Condition cond, bool isQ) {
1147   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1148   FloatRegister zn = src1, zm = src2;
1149   bool needs_negation = false;
1150   switch (cond) {
1151     case LT: cond = GT; zn = src2; zm = src1; break;
1152     case LE: cond = GE; zn = src2; zm = src1; break;
1153     case LO: cond = HI; zn = src2; zm = src1; break;
1154     case LS: cond = HS; zn = src2; zm = src1; break;
1155     case NE: cond = EQ; needs_negation = true; break;
1156     default:
1157       break;
1158   }
1159 
1160   if (is_floating_point_type(bt)) {
1161     fcm(cond, dst, size, zn, zm);
1162   } else {
1163     cm(cond, dst, size, zn, zm);
1164   }
1165 
1166   if (needs_negation) {
1167     notr(dst, isQ ? T16B : T8B, dst);
1168   }
1169 }
1170 
1171 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1172                                           Condition cond, bool isQ) {
1173   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1174   if (bt == T_FLOAT || bt == T_DOUBLE) {
1175     if (cond == Assembler::NE) {
1176       fcm(Assembler::EQ, dst, size, src);
1177       notr(dst, isQ ? T16B : T8B, dst);
1178     } else {
1179       fcm(cond, dst, size, src);
1180     }
1181   } else {
1182     if (cond == Assembler::NE) {
1183       cm(Assembler::EQ, dst, size, src);
1184       notr(dst, isQ ? T16B : T8B, dst);
1185     } else {
1186       cm(cond, dst, size, src);
1187     }
1188   }
1189 }
1190 
1191 // Compress the least significant bit of each byte to the rightmost and clear
1192 // the higher garbage bits.
1193 void C2_MacroAssembler::bytemask_compress(Register dst) {
1194   // Example input, dst = 0x01 00 00 00 01 01 00 01
1195   // The "??" bytes are garbage.
1196   orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
1197   orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
1198   orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
1199   andr(dst, dst, 0xff);                   // dst = 0x8D
1200 }
1201 
1202 // Pack the lowest-numbered bit of each mask element in src into a long value
1203 // in dst, at most the first 64 lane elements.
1204 // Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM.
1205 void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
1206                                          FloatRegister vtmp1, FloatRegister vtmp2) {
1207   assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1208   assert_different_registers(dst, rscratch1);
1209   assert_different_registers(vtmp1, vtmp2);
1210 
1211   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1212   // Example:   src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
1213   // Expected:  dst = 0x658D
1214 
1215   // Convert the mask into vector with sequential bytes.
1216   // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
1217   sve_cpy(vtmp1, size, src, 1, false);
1218   if (bt != T_BYTE) {
1219     sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
1220   }
1221 
1222   if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
1223     // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
1224     // is to compress each significant bit of the byte in a cross-lane way. Due
1225     // to the lack of a cross-lane bit-compress instruction, we use BEXT
1226     // (bit-compress in each lane) with the biggest lane size (T = D) then
1227     // concatenate the results.
1228 
1229     // The second source input of BEXT, initialized with 0x01 in each byte.
1230     // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1231     sve_dup(vtmp2, B, 1);
1232 
1233     // BEXT vtmp1.D, vtmp1.D, vtmp2.D
1234     // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1235     // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1236     //         ---------------------------------------
1237     // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1238     sve_bext(vtmp1, D, vtmp1, vtmp2);
1239 
1240     // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the
1241     // result to dst.
1242     // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1243     // dst   = 0x658D
1244     if (lane_cnt <= 8) {
1245       // No need to concatenate.
1246       umov(dst, vtmp1, B, 0);
1247     } else if (lane_cnt <= 16) {
1248       ins(vtmp1, B, vtmp1, 1, 8);
1249       umov(dst, vtmp1, H, 0);
1250     } else {
1251       // As the lane count is 64 at most, the final expected value must be in
1252       // the lowest 64 bits after narrowing vtmp1 from D to B.
1253       sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
1254       umov(dst, vtmp1, D, 0);
1255     }
1256   } else if (UseSVE > 0) {
1257     // Compress the lowest 8 bytes.
1258     fmovd(dst, vtmp1);
1259     bytemask_compress(dst);
1260     if (lane_cnt <= 8) return;
1261 
1262     // Repeat on higher bytes and join the results.
1263     // Compress 8 bytes in each iteration.
1264     for (int idx = 1; idx < (lane_cnt / 8); idx++) {
1265       sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
1266       bytemask_compress(rscratch1);
1267       orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
1268     }
1269   } else {
1270     assert(false, "unsupported");
1271     ShouldNotReachHere();
1272   }
1273 }
1274 
1275 // Unpack the mask, a long value in src, into predicate register dst based on the
1276 // corresponding data type. Note that dst can support at most 64 lanes.
1277 // Below example gives the expected dst predicate register in different types, with
1278 // a valid src(0x658D) on a 1024-bit vector size machine.
1279 // BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
1280 // SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
1281 // INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
1282 // LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1283 //
1284 // The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D which
1285 // has 24 significant bits would be an invalid input if dst predicate register refers to
1286 // a LONG type 1024-bit vector, which has at most 16 lanes.
1287 void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
1288                                            FloatRegister vtmp1, FloatRegister vtmp2) {
1289   assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
1290          lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
1291   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1292   // Example:   src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
1293   // Expected:  dst = 0b01101001 10001101
1294 
1295   // Put long value from general purpose register into the first lane of vector.
1296   // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1297   sve_dup(vtmp1, B, 0);
1298   mov(vtmp1, D, 0, src);
1299 
1300   // As sve_cmp generates mask value with the minimum unit in byte, we should
1301   // transform the value in the first lane which is mask in bit now to the
1302   // mask in byte, which can be done by SVE2's BDEP instruction.
1303 
1304   // The first source input of BDEP instruction. Deposite each byte in every 8 bytes.
1305   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1306   if (lane_cnt <= 8) {
1307     // Nothing. As only one byte exsits.
1308   } else if (lane_cnt <= 16) {
1309     ins(vtmp1, B, vtmp1, 8, 1);
1310     mov(vtmp1, B, 1, zr);
1311   } else {
1312     sve_vector_extend(vtmp1, D, vtmp1, B);
1313   }
1314 
1315   // The second source input of BDEP instruction, initialized with 0x01 for each byte.
1316   // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1317   sve_dup(vtmp2, B, 1);
1318 
1319   // BDEP vtmp1.D, vtmp1.D, vtmp2.D
1320   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1321   // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1322   //         ---------------------------------------
1323   // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1324   sve_bdep(vtmp1, D, vtmp1, vtmp2);
1325 
1326   if (bt != T_BYTE) {
1327     sve_vector_extend(vtmp1, size, vtmp1, B);
1328   }
1329   // Generate mask according to the given vector, in which the elements have been
1330   // extended to expected type.
1331   // dst = 0b01101001 10001101
1332   sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
1333 }
1334 
1335 // Clobbers: rflags
1336 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1337                                     FloatRegister zn, FloatRegister zm, Condition cond) {
1338   assert(pg->is_governing(), "This register has to be a governing predicate register");
1339   FloatRegister z1 = zn, z2 = zm;
1340   switch (cond) {
1341     case LE: z1 = zm; z2 = zn; cond = GE; break;
1342     case LT: z1 = zm; z2 = zn; cond = GT; break;
1343     case LO: z1 = zm; z2 = zn; cond = HI; break;
1344     case LS: z1 = zm; z2 = zn; cond = HS; break;
1345     default:
1346       break;
1347   }
1348 
1349   SIMD_RegVariant size = elemType_to_regVariant(bt);
1350   if (is_floating_point_type(bt)) {
1351     sve_fcm(cond, pd, size, pg, z1, z2);
1352   } else {
1353     assert(is_integral_type(bt), "unsupported element type");
1354     sve_cmp(cond, pd, size, pg, z1, z2);
1355   }
1356 }
1357 
1358 // Get index of the last mask lane that is set
1359 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
1360   SIMD_RegVariant size = elemType_to_regVariant(bt);
1361   sve_rev(ptmp, size, src);
1362   sve_brkb(ptmp, ptrue, ptmp, false);
1363   sve_cntp(dst, size, ptrue, ptmp);
1364   movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
1365   subw(dst, rscratch1, dst);
1366 }
1367 
1368 // Extend integer vector src to dst with the same lane count
1369 // but larger element size, e.g. 4B -> 4I
1370 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1371                                            FloatRegister src, BasicType src_bt, bool is_unsigned) {
1372   if (src_bt == T_BYTE) {
1373     if (dst_bt == T_SHORT) {
1374       // 4B/8B to 4S/8S
1375       _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1376     } else {
1377       // 4B to 4I
1378       assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1379       _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1380       _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1381     }
1382   } else if (src_bt == T_SHORT) {
1383     // 4S to 4I
1384     assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1385     _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1386   } else if (src_bt == T_INT) {
1387     // 2I to 2L
1388     assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1389     _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1390   } else {
1391     ShouldNotReachHere();
1392   }
1393 }
1394 
1395 // Narrow integer vector src down to dst with the same lane count
1396 // but smaller element size, e.g. 4I -> 4B
1397 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1398                                            FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1399   if (src_bt == T_SHORT) {
1400     // 4S/8S to 4B/8B
1401     assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1402     assert(dst_bt == T_BYTE, "unsupported");
1403     xtn(dst, T8B, src, T8H);
1404   } else if (src_bt == T_INT) {
1405     // 4I to 4B/4S
1406     assert(src_vlen_in_bytes == 16, "unsupported");
1407     assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1408     xtn(dst, T4H, src, T4S);
1409     if (dst_bt == T_BYTE) {
1410       xtn(dst, T8B, dst, T8H);
1411     }
1412   } else if (src_bt == T_LONG) {
1413     // 2L to 2I
1414     assert(src_vlen_in_bytes == 16, "unsupported");
1415     assert(dst_bt == T_INT, "unsupported");
1416     xtn(dst, T2S, src, T2D);
1417   } else {
1418     ShouldNotReachHere();
1419   }
1420 }
1421 
1422 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1423                                           FloatRegister src, SIMD_RegVariant src_size,
1424                                           bool is_unsigned) {
1425   assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
1426 
1427   if (src_size == B) {
1428     switch (dst_size) {
1429     case H:
1430       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1431       break;
1432     case S:
1433       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1434       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1435       break;
1436     case D:
1437       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1438       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1439       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1440       break;
1441     default:
1442       ShouldNotReachHere();
1443     }
1444   } else if (src_size == H) {
1445     if (dst_size == S) {
1446       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1447     } else { // D
1448       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1449       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1450     }
1451   } else if (src_size == S) {
1452     _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
1453   }
1454 }
1455 
1456 // Vector narrow from src to dst with specified element sizes.
1457 // High part of dst vector will be filled with zero.
1458 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1459                                           FloatRegister src, SIMD_RegVariant src_size,
1460                                           FloatRegister tmp) {
1461   assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1462   assert_different_registers(src, tmp);
1463   sve_dup(tmp, src_size, 0);
1464   if (src_size == D) {
1465     switch (dst_size) {
1466     case S:
1467       sve_uzp1(dst, S, src, tmp);
1468       break;
1469     case H:
1470       assert_different_registers(dst, tmp);
1471       sve_uzp1(dst, S, src, tmp);
1472       sve_uzp1(dst, H, dst, tmp);
1473       break;
1474     case B:
1475       assert_different_registers(dst, tmp);
1476       sve_uzp1(dst, S, src, tmp);
1477       sve_uzp1(dst, H, dst, tmp);
1478       sve_uzp1(dst, B, dst, tmp);
1479       break;
1480     default:
1481       ShouldNotReachHere();
1482     }
1483   } else if (src_size == S) {
1484     if (dst_size == H) {
1485       sve_uzp1(dst, H, src, tmp);
1486     } else { // B
1487       assert_different_registers(dst, tmp);
1488       sve_uzp1(dst, H, src, tmp);
1489       sve_uzp1(dst, B, dst, tmp);
1490     }
1491   } else if (src_size == H) {
1492     sve_uzp1(dst, B, src, tmp);
1493   }
1494 }
1495 
1496 // Extend src predicate to dst predicate with the same lane count but larger
1497 // element size, e.g. 64Byte -> 512Long
1498 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1499                                              uint dst_element_length_in_bytes,
1500                                              uint src_element_length_in_bytes) {
1501   if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1502     sve_punpklo(dst, src);
1503   } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1504     sve_punpklo(dst, src);
1505     sve_punpklo(dst, dst);
1506   } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1507     sve_punpklo(dst, src);
1508     sve_punpklo(dst, dst);
1509     sve_punpklo(dst, dst);
1510   } else {
1511     assert(false, "unsupported");
1512     ShouldNotReachHere();
1513   }
1514 }
1515 
1516 // Narrow src predicate to dst predicate with the same lane count but
1517 // smaller element size, e.g. 512Long -> 64Byte
1518 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1519                                              uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
1520   // The insignificant bits in src predicate are expected to be zero.
1521   // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
1522   // passed as the second argument. An example narrowing operation with a given mask would be -
1523   // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I
1524   // Mask (for 2 Longs) : TF
1525   // Predicate register for the above mask (16 bits) : 00000001 00000000
1526   // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
1527   // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0)
1528   assert_different_registers(src, ptmp);
1529   assert_different_registers(dst, ptmp);
1530   sve_pfalse(ptmp);
1531   if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1532     sve_uzp1(dst, B, src, ptmp);
1533   } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1534     sve_uzp1(dst, H, src, ptmp);
1535     sve_uzp1(dst, B, dst, ptmp);
1536   } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1537     sve_uzp1(dst, S, src, ptmp);
1538     sve_uzp1(dst, H, dst, ptmp);
1539     sve_uzp1(dst, B, dst, ptmp);
1540   } else {
1541     assert(false, "unsupported");
1542     ShouldNotReachHere();
1543   }
1544 }
1545 
1546 // Vector reduction add for integral type with ASIMD instructions.
1547 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1548                                                  Register isrc, FloatRegister vsrc,
1549                                                  unsigned vector_length_in_bytes,
1550                                                  FloatRegister vtmp) {
1551   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1552   assert_different_registers(dst, isrc);
1553   bool isQ = vector_length_in_bytes == 16;
1554 
1555   BLOCK_COMMENT("neon_reduce_add_integral {");
1556     switch(bt) {
1557       case T_BYTE:
1558         addv(vtmp, isQ ? T16B : T8B, vsrc);
1559         smov(dst, vtmp, B, 0);
1560         addw(dst, dst, isrc, ext::sxtb);
1561         break;
1562       case T_SHORT:
1563         addv(vtmp, isQ ? T8H : T4H, vsrc);
1564         smov(dst, vtmp, H, 0);
1565         addw(dst, dst, isrc, ext::sxth);
1566         break;
1567       case T_INT:
1568         isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1569         umov(dst, vtmp, S, 0);
1570         addw(dst, dst, isrc);
1571         break;
1572       case T_LONG:
1573         assert(isQ, "unsupported");
1574         addpd(vtmp, vsrc);
1575         umov(dst, vtmp, D, 0);
1576         add(dst, dst, isrc);
1577         break;
1578       default:
1579         assert(false, "unsupported");
1580         ShouldNotReachHere();
1581     }
1582   BLOCK_COMMENT("} neon_reduce_add_integral");
1583 }
1584 
1585 // Vector reduction multiply for integral type with ASIMD instructions.
1586 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
1587 // Clobbers: rscratch1
1588 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
1589                                                  Register isrc, FloatRegister vsrc,
1590                                                  unsigned vector_length_in_bytes,
1591                                                  FloatRegister vtmp1, FloatRegister vtmp2) {
1592   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1593   bool isQ = vector_length_in_bytes == 16;
1594 
1595   BLOCK_COMMENT("neon_reduce_mul_integral {");
1596     switch(bt) {
1597       case T_BYTE:
1598         if (isQ) {
1599           // Multiply the lower half and higher half of vector iteratively.
1600           // vtmp1 = vsrc[8:15]
1601           ins(vtmp1, D, vsrc, 0, 1);
1602           // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
1603           mulv(vtmp1, T8B, vtmp1, vsrc);
1604           // vtmp2 = vtmp1[4:7]
1605           ins(vtmp2, S, vtmp1, 0, 1);
1606           // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
1607           mulv(vtmp1, T8B, vtmp2, vtmp1);
1608         } else {
1609           ins(vtmp1, S, vsrc, 0, 1);
1610           mulv(vtmp1, T8B, vtmp1, vsrc);
1611         }
1612         // vtmp2 = vtmp1[2:3]
1613         ins(vtmp2, H, vtmp1, 0, 1);
1614         // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
1615         mulv(vtmp2, T8B, vtmp2, vtmp1);
1616         // dst = vtmp2[0] * isrc * vtmp2[1]
1617         umov(rscratch1, vtmp2, B, 0);
1618         mulw(dst, rscratch1, isrc);
1619         sxtb(dst, dst);
1620         umov(rscratch1, vtmp2, B, 1);
1621         mulw(dst, rscratch1, dst);
1622         sxtb(dst, dst);
1623         break;
1624       case T_SHORT:
1625         if (isQ) {
1626           ins(vtmp2, D, vsrc, 0, 1);
1627           mulv(vtmp2, T4H, vtmp2, vsrc);
1628           ins(vtmp1, S, vtmp2, 0, 1);
1629           mulv(vtmp1, T4H, vtmp1, vtmp2);
1630         } else {
1631           ins(vtmp1, S, vsrc, 0, 1);
1632           mulv(vtmp1, T4H, vtmp1, vsrc);
1633         }
1634         umov(rscratch1, vtmp1, H, 0);
1635         mulw(dst, rscratch1, isrc);
1636         sxth(dst, dst);
1637         umov(rscratch1, vtmp1, H, 1);
1638         mulw(dst, rscratch1, dst);
1639         sxth(dst, dst);
1640         break;
1641       case T_INT:
1642         if (isQ) {
1643           ins(vtmp1, D, vsrc, 0, 1);
1644           mulv(vtmp1, T2S, vtmp1, vsrc);
1645         } else {
1646           vtmp1 = vsrc;
1647         }
1648         umov(rscratch1, vtmp1, S, 0);
1649         mul(dst, rscratch1, isrc);
1650         umov(rscratch1, vtmp1, S, 1);
1651         mul(dst, rscratch1, dst);
1652         break;
1653       case T_LONG:
1654         umov(rscratch1, vsrc, D, 0);
1655         mul(dst, isrc, rscratch1);
1656         umov(rscratch1, vsrc, D, 1);
1657         mul(dst, dst, rscratch1);
1658         break;
1659       default:
1660         assert(false, "unsupported");
1661         ShouldNotReachHere();
1662     }
1663   BLOCK_COMMENT("} neon_reduce_mul_integral");
1664 }
1665 
1666 // Vector reduction multiply for floating-point type with ASIMD instructions.
1667 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
1668                                            FloatRegister fsrc, FloatRegister vsrc,
1669                                            unsigned vector_length_in_bytes,
1670                                            FloatRegister vtmp) {
1671   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1672   bool isQ = vector_length_in_bytes == 16;
1673 
1674   BLOCK_COMMENT("neon_reduce_mul_fp {");
1675     switch(bt) {
1676       case T_FLOAT:
1677         fmuls(dst, fsrc, vsrc);
1678         ins(vtmp, S, vsrc, 0, 1);
1679         fmuls(dst, dst, vtmp);
1680         if (isQ) {
1681           ins(vtmp, S, vsrc, 0, 2);
1682           fmuls(dst, dst, vtmp);
1683           ins(vtmp, S, vsrc, 0, 3);
1684           fmuls(dst, dst, vtmp);
1685          }
1686         break;
1687       case T_DOUBLE:
1688         assert(isQ, "unsupported");
1689         fmuld(dst, fsrc, vsrc);
1690         ins(vtmp, D, vsrc, 0, 1);
1691         fmuld(dst, dst, vtmp);
1692         break;
1693       default:
1694         assert(false, "unsupported");
1695         ShouldNotReachHere();
1696     }
1697   BLOCK_COMMENT("} neon_reduce_mul_fp");
1698 }
1699 
1700 // Helper to select logical instruction
1701 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
1702                                                    Register Rn, Register Rm,
1703                                                    enum shift_kind kind, unsigned shift) {
1704   switch(opc) {
1705     case Op_AndReductionV:
1706       is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
1707       break;
1708     case Op_OrReductionV:
1709       is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
1710       break;
1711     case Op_XorReductionV:
1712       is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
1713       break;
1714     default:
1715       assert(false, "unsupported");
1716       ShouldNotReachHere();
1717   }
1718 }
1719 
1720 // Vector reduction logical operations And, Or, Xor
1721 // Clobbers: rscratch1
1722 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
1723                                             Register isrc, FloatRegister vsrc,
1724                                             unsigned vector_length_in_bytes) {
1725   assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
1726          "unsupported");
1727   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1728   assert_different_registers(dst, isrc);
1729   bool isQ = vector_length_in_bytes == 16;
1730 
1731   BLOCK_COMMENT("neon_reduce_logical {");
1732     umov(rscratch1, vsrc, isQ ? D : S, 0);
1733     umov(dst, vsrc, isQ ? D : S, 1);
1734     neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
1735     switch(bt) {
1736       case T_BYTE:
1737         if (isQ) {
1738           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1739         }
1740         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1741         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
1742         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1743         sxtb(dst, dst);
1744         break;
1745       case T_SHORT:
1746         if (isQ) {
1747           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1748         }
1749         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1750         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1751         sxth(dst, dst);
1752         break;
1753       case T_INT:
1754         if (isQ) {
1755           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1756         }
1757         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1758         break;
1759       case T_LONG:
1760         assert(isQ, "unsupported");
1761         neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
1762         break;
1763       default:
1764         assert(false, "unsupported");
1765         ShouldNotReachHere();
1766     }
1767   BLOCK_COMMENT("} neon_reduce_logical");
1768 }
1769 
1770 // Vector reduction min/max for integral type with ASIMD instructions.
1771 // Note: vtmp is not used and expected to be fnoreg for T_LONG case.
1772 // Clobbers: rscratch1, rflags
1773 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
1774                                                     Register isrc, FloatRegister vsrc,
1775                                                     unsigned vector_length_in_bytes,
1776                                                     FloatRegister vtmp) {
1777   assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
1778   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1779   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
1780   assert_different_registers(dst, isrc);
1781   bool isQ = vector_length_in_bytes == 16;
1782   bool is_min = opc == Op_MinReductionV;
1783 
1784   BLOCK_COMMENT("neon_reduce_minmax_integral {");
1785     if (bt == T_LONG) {
1786       assert(vtmp == fnoreg, "should be");
1787       assert(isQ, "should be");
1788       umov(rscratch1, vsrc, D, 0);
1789       cmp(isrc, rscratch1);
1790       csel(dst, isrc, rscratch1, is_min ? LT : GT);
1791       umov(rscratch1, vsrc, D, 1);
1792       cmp(dst, rscratch1);
1793       csel(dst, dst, rscratch1, is_min ? LT : GT);
1794     } else {
1795       SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1796       if (size == T2S) {
1797         is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
1798       } else {
1799         is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
1800       }
1801       if (bt == T_INT) {
1802         umov(dst, vtmp, S, 0);
1803       } else {
1804         smov(dst, vtmp, elemType_to_regVariant(bt), 0);
1805       }
1806       cmpw(dst, isrc);
1807       cselw(dst, dst, isrc, is_min ? LT : GT);
1808     }
1809   BLOCK_COMMENT("} neon_reduce_minmax_integral");
1810 }
1811 
1812 // Vector reduction for integral type with SVE instruction.
1813 // Supported operations are Add, And, Or, Xor, Max, Min.
1814 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
1815 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
1816                                             FloatRegister src2, PRegister pg, FloatRegister tmp) {
1817   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
1818   assert(pg->is_governing(), "This register has to be a governing predicate register");
1819   assert_different_registers(src1, dst);
1820   // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
1821   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1822   switch (opc) {
1823     case Op_AddReductionVI: {
1824       sve_uaddv(tmp, size, pg, src2);
1825       if (bt == T_BYTE) {
1826         smov(dst, tmp, size, 0);
1827         addw(dst, src1, dst, ext::sxtb);
1828       } else if (bt == T_SHORT) {
1829         smov(dst, tmp, size, 0);
1830         addw(dst, src1, dst, ext::sxth);
1831       } else {
1832         umov(dst, tmp, size, 0);
1833         addw(dst, dst, src1);
1834       }
1835       break;
1836     }
1837     case Op_AddReductionVL: {
1838       sve_uaddv(tmp, size, pg, src2);
1839       umov(dst, tmp, size, 0);
1840       add(dst, dst, src1);
1841       break;
1842     }
1843     case Op_AndReductionV: {
1844       sve_andv(tmp, size, pg, src2);
1845       if (bt == T_INT || bt == T_LONG) {
1846         umov(dst, tmp, size, 0);
1847       } else {
1848         smov(dst, tmp, size, 0);
1849       }
1850       if (bt == T_LONG) {
1851         andr(dst, dst, src1);
1852       } else {
1853         andw(dst, dst, src1);
1854       }
1855       break;
1856     }
1857     case Op_OrReductionV: {
1858       sve_orv(tmp, size, pg, src2);
1859       if (bt == T_INT || bt == T_LONG) {
1860         umov(dst, tmp, size, 0);
1861       } else {
1862         smov(dst, tmp, size, 0);
1863       }
1864       if (bt == T_LONG) {
1865         orr(dst, dst, src1);
1866       } else {
1867         orrw(dst, dst, src1);
1868       }
1869       break;
1870     }
1871     case Op_XorReductionV: {
1872       sve_eorv(tmp, size, pg, src2);
1873       if (bt == T_INT || bt == T_LONG) {
1874         umov(dst, tmp, size, 0);
1875       } else {
1876         smov(dst, tmp, size, 0);
1877       }
1878       if (bt == T_LONG) {
1879         eor(dst, dst, src1);
1880       } else {
1881         eorw(dst, dst, src1);
1882       }
1883       break;
1884     }
1885     case Op_MaxReductionV: {
1886       sve_smaxv(tmp, size, pg, src2);
1887       if (bt == T_INT || bt == T_LONG) {
1888         umov(dst, tmp, size, 0);
1889       } else {
1890         smov(dst, tmp, size, 0);
1891       }
1892       if (bt == T_LONG) {
1893         cmp(dst, src1);
1894         csel(dst, dst, src1, Assembler::GT);
1895       } else {
1896         cmpw(dst, src1);
1897         cselw(dst, dst, src1, Assembler::GT);
1898       }
1899       break;
1900     }
1901     case Op_MinReductionV: {
1902       sve_sminv(tmp, size, pg, src2);
1903       if (bt == T_INT || bt == T_LONG) {
1904         umov(dst, tmp, size, 0);
1905       } else {
1906         smov(dst, tmp, size, 0);
1907       }
1908       if (bt == T_LONG) {
1909         cmp(dst, src1);
1910         csel(dst, dst, src1, Assembler::LT);
1911       } else {
1912         cmpw(dst, src1);
1913         cselw(dst, dst, src1, Assembler::LT);
1914       }
1915       break;
1916     }
1917     default:
1918       assert(false, "unsupported");
1919       ShouldNotReachHere();
1920   }
1921 
1922   if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
1923     if (bt == T_BYTE) {
1924       sxtb(dst, dst);
1925     } else if (bt == T_SHORT) {
1926       sxth(dst, dst);
1927     }
1928   }
1929 }
1930 
1931 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
1932 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
1933 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
1934 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
1935   uint32_t max_vector_length = Matcher::max_vector_size(bt);
1936   assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
1937 
1938   // Set all elements to false if the input "lane_cnt" is zero.
1939   if (lane_cnt == 0) {
1940     sve_pfalse(dst);
1941     return;
1942   }
1943 
1944   SIMD_RegVariant size = elemType_to_regVariant(bt);
1945   assert(size != Q, "invalid size");
1946 
1947   // Set all true if "lane_cnt" equals to the max lane count.
1948   if (lane_cnt == max_vector_length) {
1949     sve_ptrue(dst, size, /* ALL */ 0b11111);
1950     return;
1951   }
1952 
1953   // Fixed numbers for "ptrue".
1954   switch(lane_cnt) {
1955   case 1: /* VL1 */
1956   case 2: /* VL2 */
1957   case 3: /* VL3 */
1958   case 4: /* VL4 */
1959   case 5: /* VL5 */
1960   case 6: /* VL6 */
1961   case 7: /* VL7 */
1962   case 8: /* VL8 */
1963     sve_ptrue(dst, size, lane_cnt);
1964     return;
1965   case 16:
1966     sve_ptrue(dst, size, /* VL16 */ 0b01001);
1967     return;
1968   case 32:
1969     sve_ptrue(dst, size, /* VL32 */ 0b01010);
1970     return;
1971   case 64:
1972     sve_ptrue(dst, size, /* VL64 */ 0b01011);
1973     return;
1974   case 128:
1975     sve_ptrue(dst, size, /* VL128 */ 0b01100);
1976     return;
1977   case 256:
1978     sve_ptrue(dst, size, /* VL256 */ 0b01101);
1979     return;
1980   default:
1981     break;
1982   }
1983 
1984   // Special patterns for "ptrue".
1985   if (lane_cnt == round_down_power_of_2(max_vector_length)) {
1986     sve_ptrue(dst, size, /* POW2 */ 0b00000);
1987   } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
1988     sve_ptrue(dst, size, /* MUL4 */ 0b11101);
1989   } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
1990     sve_ptrue(dst, size, /* MUL3 */ 0b11110);
1991   } else {
1992     // Encode to "whileltw" for the remaining cases.
1993     mov(rscratch1, lane_cnt);
1994     sve_whileltw(dst, size, zr, rscratch1);
1995   }
1996 }
1997 
1998 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
1999 // Any remaining elements of dst will be filled with zero.
2000 // Clobbers: rscratch1
2001 // Preserves: src, mask
2002 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
2003                                            FloatRegister vtmp1, FloatRegister vtmp2,
2004                                            PRegister pgtmp) {
2005   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2006   assert_different_registers(dst, src, vtmp1, vtmp2);
2007   assert_different_registers(mask, pgtmp);
2008 
2009   // Example input:   src   = 8888 7777 6666 5555 4444 3333 2222 1111
2010   //                  mask  = 0001 0000 0000 0001 0001 0000 0001 0001
2011   // Expected result: dst   = 0000 0000 0000 8888 5555 4444 2222 1111
2012   sve_dup(vtmp2, H, 0);
2013 
2014   // Extend lowest half to type INT.
2015   // dst = 00004444 00003333 00002222 00001111
2016   sve_uunpklo(dst, S, src);
2017   // pgtmp = 00000001 00000000 00000001 00000001
2018   sve_punpklo(pgtmp, mask);
2019   // Pack the active elements in size of type INT to the right,
2020   // and fill the remainings with zero.
2021   // dst = 00000000 00004444 00002222 00001111
2022   sve_compact(dst, S, dst, pgtmp);
2023   // Narrow the result back to type SHORT.
2024   // dst = 0000 0000 0000 0000 0000 4444 2222 1111
2025   sve_uzp1(dst, H, dst, vtmp2);
2026   // Count the active elements of lowest half.
2027   // rscratch1 = 3
2028   sve_cntp(rscratch1, S, ptrue, pgtmp);
2029 
2030   // Repeat to the highest half.
2031   // pgtmp = 00000001 00000000 00000000 00000001
2032   sve_punpkhi(pgtmp, mask);
2033   // vtmp1 = 00008888 00007777 00006666 00005555
2034   sve_uunpkhi(vtmp1, S, src);
2035   // vtmp1 = 00000000 00000000 00008888 00005555
2036   sve_compact(vtmp1, S, vtmp1, pgtmp);
2037   // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2038   sve_uzp1(vtmp1, H, vtmp1, vtmp2);
2039 
2040   // Compressed low:   dst   = 0000 0000 0000 0000 0000 4444 2222 1111
2041   // Compressed high:  vtmp1 = 0000 0000 0000 0000 0000 0000 8888  5555
2042   // Left shift(cross lane) compressed high with TRUE_CNT lanes,
2043   // TRUE_CNT is the number of active elements in the compressed low.
2044   neg(rscratch1, rscratch1);
2045   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2046   sve_index(vtmp2, H, rscratch1, 1);
2047   // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
2048   sve_tbl(vtmp1, H, vtmp1, vtmp2);
2049 
2050   // Combine the compressed high(after shifted) with the compressed low.
2051   // dst = 0000 0000 0000 8888 5555 4444 2222 1111
2052   sve_orr(dst, dst, vtmp1);
2053 }
2054 
2055 // Clobbers: rscratch1, rscratch2
2056 // Preserves: src, mask
2057 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
2058                                           FloatRegister vtmp1, FloatRegister vtmp2,
2059                                           FloatRegister vtmp3, FloatRegister vtmp4,
2060                                           PRegister ptmp, PRegister pgtmp) {
2061   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2062   assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
2063   assert_different_registers(mask, ptmp, pgtmp);
2064   // Example input:   src   = 88 77 66 55 44 33 22 11
2065   //                  mask  = 01 00 00 01 01 00 01 01
2066   // Expected result: dst   = 00 00 00 88 55 44 22 11
2067 
2068   sve_dup(vtmp4, B, 0);
2069   // Extend lowest half to type SHORT.
2070   // vtmp1 = 0044 0033 0022 0011
2071   sve_uunpklo(vtmp1, H, src);
2072   // ptmp = 0001 0000 0001 0001
2073   sve_punpklo(ptmp, mask);
2074   // Count the active elements of lowest half.
2075   // rscratch2 = 3
2076   sve_cntp(rscratch2, H, ptrue, ptmp);
2077   // Pack the active elements in size of type SHORT to the right,
2078   // and fill the remainings with zero.
2079   // dst = 0000 0044 0022 0011
2080   sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
2081   // Narrow the result back to type BYTE.
2082   // dst = 00 00 00 00 00 44 22 11
2083   sve_uzp1(dst, B, dst, vtmp4);
2084 
2085   // Repeat to the highest half.
2086   // ptmp = 0001 0000 0000 0001
2087   sve_punpkhi(ptmp, mask);
2088   // vtmp1 = 0088 0077 0066 0055
2089   sve_uunpkhi(vtmp2, H, src);
2090   // vtmp1 = 0000 0000 0088 0055
2091   sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);
2092 
2093   sve_dup(vtmp4, B, 0);
2094   // vtmp1 = 00 00 00 00 00 00 88 55
2095   sve_uzp1(vtmp1, B, vtmp1, vtmp4);
2096 
2097   // Compressed low:   dst   = 00 00 00 00 00 44 22 11
2098   // Compressed high:  vtmp1 = 00 00 00 00 00 00 88 55
2099   // Left shift(cross lane) compressed high with TRUE_CNT lanes,
2100   // TRUE_CNT is the number of active elements in the compressed low.
2101   neg(rscratch2, rscratch2);
2102   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2103   sve_index(vtmp2, B, rscratch2, 1);
2104   // vtmp1 = 00 00 00 88 55 00 00 00
2105   sve_tbl(vtmp1, B, vtmp1, vtmp2);
2106   // Combine the compressed high(after shifted) with the compressed low.
2107   // dst = 00 00 00 88 55 44 22 11
2108   sve_orr(dst, dst, vtmp1);
2109 }
2110 
2111 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2112   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2113   SIMD_Arrangement size = isQ ? T16B : T8B;
2114   if (bt == T_BYTE) {
2115     rbit(dst, size, src);
2116   } else {
2117     neon_reverse_bytes(dst, src, bt, isQ);
2118     rbit(dst, size, dst);
2119   }
2120 }
2121 
2122 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2123   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2124   SIMD_Arrangement size = isQ ? T16B : T8B;
2125   switch (bt) {
2126     case T_BYTE:
2127       if (dst != src) {
2128         orr(dst, size, src, src);
2129       }
2130       break;
2131     case T_SHORT:
2132       rev16(dst, size, src);
2133       break;
2134     case T_INT:
2135       rev32(dst, size, src);
2136       break;
2137     case T_LONG:
2138       rev64(dst, size, src);
2139       break;
2140     default:
2141       assert(false, "unsupported");
2142       ShouldNotReachHere();
2143   }
2144 }
2145 
2146 // Extract a scalar element from an sve vector at position 'idx'.
2147 // The input elements in src are expected to be of integral type.
2148 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2149                                              int idx, FloatRegister vtmp) {
2150   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2151   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2152   if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
2153     if (bt == T_INT || bt == T_LONG) {
2154       umov(dst, src, size, idx);
2155     } else {
2156       smov(dst, src, size, idx);
2157     }
2158   } else {
2159     sve_orr(vtmp, src, src);
2160     sve_ext(vtmp, vtmp, idx << size);
2161     if (bt == T_INT || bt == T_LONG) {
2162       umov(dst, vtmp, size, 0);
2163     } else {
2164       smov(dst, vtmp, size, 0);
2165     }
2166   }
2167 }
2168 
2169 // java.lang.Math::round intrinsics
2170 
2171 // Clobbers: rscratch1, rflags
2172 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2173                                           FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
2174   assert_different_registers(tmp1, tmp2, tmp3, src, dst);
2175   switch (T) {
2176     case T2S:
2177     case T4S:
2178       fmovs(tmp1, T, 0.5f);
2179       mov(rscratch1, jint_cast(0x1.0p23f));
2180       break;
2181     case T2D:
2182       fmovd(tmp1, T, 0.5);
2183       mov(rscratch1, julong_cast(0x1.0p52));
2184       break;
2185     default:
2186       assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2187   }
2188   fadd(tmp1, T, tmp1, src);
2189   fcvtms(tmp1, T, tmp1);
2190   // tmp1 = floor(src + 0.5, ties to even)
2191 
2192   fcvtas(dst, T, src);
2193   // dst = round(src), ties to away
2194 
2195   fneg(tmp3, T, src);
2196   dup(tmp2, T, rscratch1);
2197   cm(HS, tmp3, T, tmp3, tmp2);
2198   // tmp3 is now a set of flags
2199 
2200   bif(dst, T16B, tmp1, tmp3);
2201   // result in dst
2202 }
2203 
2204 // Clobbers: rscratch1, rflags
2205 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2206                                          FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
2207   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2208   assert_different_registers(tmp1, tmp2, src, dst);
2209 
2210   switch (T) {
2211     case S:
2212       mov(rscratch1, jint_cast(0x1.0p23f));
2213       break;
2214     case D:
2215       mov(rscratch1, julong_cast(0x1.0p52));
2216       break;
2217     default:
2218       assert(T == S || T == D, "invalid register variant");
2219   }
2220 
2221   sve_frinta(dst, T, ptrue, src);
2222   // dst = round(src), ties to away
2223 
2224   Label none;
2225 
2226   sve_fneg(tmp1, T, ptrue, src);
2227   sve_dup(tmp2, T, rscratch1);
2228   sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
2229   br(EQ, none);
2230   {
2231     sve_cpy(tmp1, T, pgtmp, 0.5);
2232     sve_fadd(tmp1, T, pgtmp, src);
2233     sve_frintm(dst, T, pgtmp, tmp1);
2234     // dst = floor(src + 0.5, ties to even)
2235   }
2236   bind(none);
2237 
2238   sve_fcvtzs(dst, T, ptrue, dst, T);
2239   // result in dst
2240 }
2241 
2242 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2243                                            FloatRegister one, SIMD_Arrangement T) {
2244   assert_different_registers(dst, src, zero, one);
2245   assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2246 
2247   facgt(dst, T, src, zero);
2248   ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
2249   bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2250 }
2251 
2252 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2253                                           FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
2254     assert_different_registers(dst, src, zero, one, vtmp);
2255     assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2256 
2257     sve_orr(vtmp, src, src);
2258     sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pmtp=0 for +-0.0 and NaN. 0x1 otherwise
2259     switch (T) {
2260     case S:
2261       sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
2262       sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
2263                                         // on the sign of the float value
2264       break;
2265     case D:
2266       sve_and(vtmp, T, min_jlong);
2267       sve_orr(vtmp, T, jlong_cast(1.0));
2268       break;
2269     default:
2270       assert(false, "unsupported");
2271       ShouldNotReachHere();
2272     }
2273     sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
2274                                        // Result in dst
2275 }
2276 
2277 bool C2_MacroAssembler::in_scratch_emit_size() {
2278   if (ciEnv::current()->task() != nullptr) {
2279     PhaseOutput* phase_output = Compile::current()->output();
2280     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2281       return true;
2282     }
2283   }
2284   return MacroAssembler::in_scratch_emit_size();
2285 }