/*
 * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/matcher.hpp"
#include "opto/output.hpp"
#include "opto/subnode.hpp"
#include "runtime/stubRoutines.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

void C2_MacroAssembler::entry_barrier() {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
    // Dummy labels for just measuring the code size
    Label dummy_slow_path;
    Label dummy_continuation;
    Label dummy_guard;
    Label* slow_path = &dummy_slow_path;
    Label* continuation = &dummy_continuation;
    Label* guard = &dummy_guard;
    if (!Compile::current()->output()->in_scratch_emit_size()) {
      // Use real labels from actual stub when not emitting code for the purpose of measuring its size
      C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
      Compile::current()->output()->add_stub(stub);
      slow_path = &stub->entry();
      continuation = &stub->continuation();
      guard = &stub->guard();
    }
    // In the C2 code, we move the non-hot part of nmethod entry barriers out-of-line to a stub.
    bs->nmethod_entry_barrier(this, slow_path, continuation, guard);
  }
}

void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg,
                                  Register tmp2Reg, Register tmp3Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;

  assert_different_registers(oop, box, tmp, disp_hdr);

  // Load markWord from object into displaced_header.
  ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmp, oop);
    ldrw(tmp, Address(tmp, Klass::access_flags_offset()));
    tstw(tmp, JVM_ACC_IS_VALUE_BASED_CLASS);
    br(Assembler::NE, cont);
  }

  // Check for existing monitor
  tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else if (LockingMode == LM_LEGACY) {
    // Set tmp to be (markWord of object | UNLOCK_VALUE).
    orr(tmp, disp_hdr, markWord::unlocked_value);

    if (EnableValhalla) {
      // Mask the inline_type bit so that we take the slow path if the object is an inline type
      andr(tmp, tmp, ~((int) markWord::inline_type_bit_in_place));
    }

    // Initialize the box. (Must happen before we update the object mark!)
    str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // Compare object markWord with an unlocked value (tmp) and if
    // equal exchange the stack address of our box with object markWord.
    // On failure disp_hdr contains the possibly locked markWord.
    cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true,
            /*release*/ true, /*weak*/ false, disp_hdr);
    br(Assembler::EQ, cont);

    assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

    // If the compare-and-exchange succeeded, then we found an unlocked
    // object, have now locked it, and will continue at label cont.

    // Check if the owner is self by comparing the value in the
    // markWord of object (disp_hdr) with the stack pointer.
    mov(rscratch1, sp);
    sub(disp_hdr, disp_hdr, rscratch1);
    mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place));
    // If the condition holds we can continue, and hence we can store 0 as the
    // displaced header in the box, which indicates that it is a recursive lock.
    ands(tmp/*==0?*/, disp_hdr, tmp);   // Sets flags for result
    str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
    b(cont);
  } else {
    assert(LockingMode == LM_LIGHTWEIGHT, "must be");
    lightweight_lock(oop, disp_hdr, tmp, tmp3Reg, no_count);
    b(count);
  }

  // Handle existing monitor.
  bind(object_has_monitor);

  // The object's monitor m is unlocked iff m->owner == NULL,
  // otherwise m->owner may contain a thread or a stack address.
  //
  // Try to CAS m->owner from NULL to current thread.
  add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value));
  cmpxchg(tmp, zr, rthread, Assembler::xword, /*acquire*/ true,
          /*release*/ true, /*weak*/ false, tmp3Reg); // Sets flags for result

  if (LockingMode != LM_LIGHTWEIGHT) {
    // Store a non-null value into the box to avoid looking like a re-entrant
    // lock. The fast-path monitor unlock code checks for
    // markWord::monitor_value so use markWord::unused_mark which has the
    // relevant bit set, and also matches ObjectSynchronizer::enter.
    mov(tmp, (address)markWord::unused_mark().value());
    str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
  }
  br(Assembler::EQ, cont); // CAS success means locking succeeded

  cmp(tmp3Reg, rthread);
  br(Assembler::NE, cont); // Check for recursive locking

  // Recursive lock case
  increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1);
  // flag == EQ still from the cmp above, checking if this is a reentrant lock

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  increment(Address(rthread, JavaThread::held_monitor_count_offset()));

  bind(no_count);
}
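
// In outline, the fast_lock sequence above is equivalent to the following
// pseudo code (a sketch only: the real code reports its outcome through the
// flags register, EQ = locked, NE = take the slow path):
//
//    if (mark & monitor_value) {                      // inflated
//      Thread* prev = CAS(&m->owner, NULL, current);
//      locked = (prev == NULL) || (prev == current /* recursive enter */);
//    } else if (LockingMode == LM_LEGACY) {
//      box->displaced_header = mark | unlocked_value;
//      locked = CAS(&obj->mark, mark | unlocked_value, box) succeeded;
//      if (!locked) {
//        // Recursive stack-lock iff the mark points into our own stack.
//        locked = ((mark - sp) & (~(page_size - 1) | lock_mask)) == 0;
//        if (locked) box->displaced_header = 0;
//      }
//    }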

void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg,
                                    Register tmp2Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;

  assert_different_registers(oop, box, tmp, disp_hdr);

  if (LockingMode == LM_LEGACY) {
    // Find the lock address and load the displaced header from the stack.
    ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // If the displaced header is 0, we have a recursive unlock.
    cmp(disp_hdr, zr);
    br(Assembler::EQ, cont);
  }

  // Handle existing monitor.
  ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
  tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else if (LockingMode == LM_LEGACY) {
    // Check if it is still a lightweight lock; this is true if we
    // see the stack address of the basicLock in the markWord of the
    // object.

    cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false,
            /*release*/ true, /*weak*/ false, tmp);
    b(cont);
  } else {
    assert(LockingMode == LM_LIGHTWEIGHT, "must be");
    lightweight_unlock(oop, tmp, box, disp_hdr, no_count);
    b(count);
  }

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // Handle existing monitor.
  bind(object_has_monitor);
  STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
  add(tmp, tmp, -(int)markWord::monitor_value); // monitor

  if (LockingMode == LM_LIGHTWEIGHT) {
    // If the owner is anonymous, we need to fix it -- in an outline stub.
    Register tmp2 = disp_hdr;
    ldr(tmp2, Address(tmp, ObjectMonitor::owner_offset()));
    // We cannot use tbnz here, the target might be too far away and cannot
    // be encoded.
    tst(tmp2, (uint64_t)ObjectMonitor::ANONYMOUS_OWNER);
    C2HandleAnonOMOwnerStub* stub = new (Compile::current()->comp_arena()) C2HandleAnonOMOwnerStub(tmp, tmp2);
    Compile::current()->output()->add_stub(stub);
    br(Assembler::NE, stub->entry());
    bind(stub->continuation());
  }

  ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));

  Label notRecursive;
  cbz(disp_hdr, notRecursive);

  // Recursive lock
  sub(disp_hdr, disp_hdr, 1u);
  str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
  cmp(disp_hdr, disp_hdr); // Sets flags for result
  b(cont);

  bind(notRecursive);
  ldr(rscratch1, Address(tmp, ObjectMonitor::EntryList_offset()));
  ldr(disp_hdr, Address(tmp, ObjectMonitor::cxq_offset()));
  orr(rscratch1, rscratch1, disp_hdr); // Will be 0 if both are 0.
  cmp(rscratch1, zr); // Sets flags for result
  cbnz(rscratch1, cont);
  // need a release store here
  lea(tmp, Address(tmp, ObjectMonitor::owner_offset()));
  stlr(zr, tmp); // set unowned

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  decrement(Address(rthread, JavaThread::held_monitor_count_offset()));

  bind(no_count);
}
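
// Likewise, a sketch of the fast_unlock sequence above (EQ = unlocked,
// NE = take the slow path); illustration only:
//
//    if (LockingMode == LM_LEGACY && box->displaced_header == 0)
//      return success;                              // recursive stack-lock
//    if (obj->mark & monitor_value) {               // inflated
//      if (m->recursions != 0) { m->recursions--; return success; }
//      if (m->EntryList != NULL || m->cxq != NULL) return failure;
//      release_store(&m->owner, NULL);              // no waiters, unlock
//      return success;
//    }
//    // Still a stack-lock: restore the displaced header with a CAS.
//    return CAS(&obj->mark, box, box->displaced_header) ? success : failure;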

// Search for str1 in str2 and return index or -1
// Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on the specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For a larger pattern and source we use a simplified Boyer-Moore algorithm.
  // With a small pattern and source we use a linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use linear scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1); // done in a separate FPU pipeline, almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

// The Boyer-Moore algorithm is based on the description here:
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// This describes an algorithm with two shift rules: the 'Bad Character' rule
// and the 'Good Suffix' rule.
//
// These rules are essentially heuristics for how far we can shift the
// pattern along the search string.
//
// The implementation here uses the 'Bad Character' rule only because of the
// complexity of initialisation for the 'Good Suffix' rule.
//
// This is also known as the Boyer-Moore-Horspool algorithm:
//
// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
//
// This particular implementation has a few Java-specific optimizations.
//
// #define ASIZE 256
//
//    int bm(unsigned char *x, int m, unsigned char *y, int n) {
//       int i, j;
//       unsigned c;
//       unsigned char bc[ASIZE];
//
//       /* Preprocessing */
//       for (i = 0; i < ASIZE; ++i)
//          bc[i] = m;
//       for (i = 0; i < m - 1; ) {
//          c = x[i];
//          ++i;
//          // c < 256 for Latin1 string, so no need for branch
//          #ifdef PATTERN_STRING_IS_LATIN1
//          bc[c] = m - i;
//          #else
//          if (c < ASIZE) bc[c] = m - i;
//          #endif
//       }
//
//       /* Searching */
//       j = 0;
//       while (j <= n - m) {
//          c = y[i+j];
//          if (x[m-1] == c)
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//          if (i < 0) return j;
//          // c < 256 for Latin1 string, so no need for branch
//          #ifdef SOURCE_STRING_IS_LATIN1
//          // LL case: (c < 256) always true. Remove branch
//          j += bc[y[j+m-1]];
//          #endif
//          #ifndef PATTERN_STRING_IS_UTF
//          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += 1;
//          #endif
//          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
//          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += m;
//          #endif
//       }
//    }

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >= 8, so we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed (8 LL or 4 UU symbols) and half a
    // register for the UL case. We'll re-read the last character in the inner
    // pre-loop code to have a single outer pre-loop load.
    const int firstStep = isL ? 7 : 3;

    const int ASIZE = 256;
    const int STORED_BYTES = 32; // number of bytes stored per instruction
    sub(sp, sp, ASIZE);
    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
    mov(ch1, sp);
    BIND(BM_INIT_LOOP);
      stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
      subs(tmp5, tmp5, 1);
      br(GT, BM_INIT_LOOP);

      sub(cnt1tmp, cnt1, 1);
      mov(tmp5, str2);
      add(str2end, str2, result_tmp, LSL, str2_chr_shift);
      sub(ch2, cnt1, 1);
      mov(tmp3, str1);
    BIND(BCLOOP);
      (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
      if (!str1_isL) {
        subs(zr, ch1, ASIZE);
        br(HS, BCSKIP);
      }
      strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
      subs(ch2, ch2, 1);
      br(GT, BCLOOP);

      add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
      if (str1_isL == str2_isL) {
        // load last 8 bytes (8LL/4UU symbols)
        ldr(tmp6, Address(tmp6, -wordSize));
      } else {
        ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes (4 symbols)
        // convert Latin1 to UTF. We'll have to wait until the load completes,
        // but it's still faster than per-character loads+checks
        lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
        ubfx(ch1, tmp6, 8, 8); // str1[N-2]
        ubfx(ch2, tmp6, 16, 8); // str1[N-3]
        andr(tmp6, tmp6, 0xFF); // str1[N-4]
        orr(ch2, ch1, ch2, LSL, 16);
        orr(tmp6, tmp6, tmp3, LSL, 48);
        orr(tmp6, tmp6, ch2, LSL, 16);
      }
    BIND(BMLOOPSTR2);
      (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
      sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
      if (str1_isL == str2_isL) {
        // re-init tmp3. It's free because it's executed in parallel with the
        // load above. The alternative is to initialize it before the loop, but
        // that would affect performance on in-order systems with 2 or more
        // ld/st pipelines
        lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
      }
      if (!isL) { // UU/UL case
        lsl(ch2, cnt1tmp, 1); // offset in bytes
      }
      cmp(tmp3, skipch);
      br(NE, BMSKIP);
      ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
      mov(ch1, tmp6);
      if (isL) {
        b(BMLOOPSTR1_AFTER_LOAD);
      } else {
        sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
        b(BMLOOPSTR1_CMP);
      }
    BIND(BMLOOPSTR1);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    BIND(BMLOOPSTR1_AFTER_LOAD);
      subs(cnt1tmp, cnt1tmp, 1);
      br(LT, BMLOOPSTR1_LASTCMP);
    BIND(BMLOOPSTR1_CMP);
      cmp(ch1, ch2);
      br(EQ, BMLOOPSTR1);
    BIND(BMSKIP);
      if (!isL) {
        // if we've met a UTF symbol while searching for a Latin1 pattern, then
        // we can skip cnt1 symbols
        if (str1_isL != str2_isL) {
          mov(result_tmp, cnt1);
        } else {
          mov(result_tmp, 1);
        }
        subs(zr, skipch, ASIZE);
        br(HS, BMADV);
      }
      ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
      sub(cnt1tmp, cnt1, 1);
      add(str2, str2, result_tmp, LSL, str2_chr_shift);
      cmp(str2, str2end);
      br(LE, BMLOOPSTR2);
      add(sp, sp, ASIZE);
      b(NOMATCH);
    BIND(BMLOOPSTR1_LASTCMP);
      cmp(ch1, ch2);
      br(NE, BMSKIP);
    BIND(BMMATCH);
      sub(result, str2, tmp5);
      if (!str2_isL) lsr(result, result, 1);
      add(sp, sp, ASIZE);
      b(DONE);

    BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
    br(LT, LINEAR_MEDIUM);
    mov(result, zr);
    RuntimeAddress stub = nullptr;
    if (isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
      assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
    } else if (str1_isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
    } else {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
      assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
    }
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);
  }

  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    Register str2tmp = tmp2;
    Register first = tmp3;

    if (icnt1 == -1)
    {
        Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

        cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
        br(LT, DOSHORT);
      BIND(LINEAR_MEDIUM);
        (this->*str1_load_1chr)(first, Address(str1));
        lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
        sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(FIRST_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmp(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        adds(cnt1tmp, cnt1_neg, str1_chr_size);
        add(cnt2tmp, cnt2_neg, str2_chr_size);
        br(GE, MATCH);

      BIND(STR1_NEXT);
        (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        adds(cnt1tmp, cnt1tmp, str1_chr_size);
        add(cnt2tmp, cnt2tmp, str2_chr_size);
        br(LT, STR1_NEXT);
        b(MATCH);

      BIND(DOSHORT);
      if (str1_isL == str2_isL) {
        cmp(cnt1, (u1)2);
        br(LT, DO1);
        br(GT, DO3);
      }
    }

    if (icnt1 == 4) {
      Label CH1_LOOP;

        (this->*load_4chr)(ch1, str1);
        sub(result_tmp, cnt2, 4);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(CH1_LOOP);
        (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
      Label CH1_LOOP;

      BIND(DO2);
        (this->*load_2chr)(ch1, str1);
        if (icnt1 == 2) {
          sub(result_tmp, cnt2, 2);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(CH1_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

      BIND(DO3);
        (this->*load_2chr)(first, str1);
        (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
        if (icnt1 == 3) {
          sub(result_tmp, cnt2, 3);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(FIRST_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmpw(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        b(MATCH);
    }

    if (icnt1 == -1 || icnt1 == 1) {
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;

      BIND(DO1);
        (this->*str1_load_1chr)(ch1, str1);
        cmp(cnt2, (u1)8);
        br(LT, DO1_SHORT);

        sub(result_tmp, cnt2, 8/str2_chr_size);
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
        mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

        if (str2_isL) {
          orr(ch1, ch1, ch1, LSL, 8);
        }
        orr(ch1, ch1, ch1, LSL, 16);
        orr(ch1, ch1, ch1, LSL, 32);
      BIND(CH1_LOOP);
        ldr(ch2, Address(str2, cnt2_neg));
        eor(ch2, ch1, ch2);
        sub(tmp1, ch2, tmp3);
        orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
        bics(tmp1, tmp1, tmp2);
        br(NE, HAS_ZERO);
        adds(cnt2_neg, cnt2_neg, 8);
        br(LT, CH1_LOOP);

        cmp(cnt2_neg, (u1)8);
        mov(cnt2_neg, 0);
        br(LT, CH1_LOOP);
        b(NOMATCH);

      BIND(HAS_ZERO);
        rev(tmp1, tmp1);
        clz(tmp1, tmp1);
        add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
        b(MATCH);

      BIND(DO1_SHORT);
        mov(result_tmp, cnt2);
        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
      BIND(DO1_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmpw(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}
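
// The CH1_LOOP/HAS_ZERO sequences above find a matching character eight
// bytes at a time with the classic SWAR "zero byte" trick. A minimal C
// sketch of the idea for the Latin1 case (illustration only, not part of
// the generated code):
//
//    uint64_t v = word ^ pattern8;            // 0x00 byte wherever chars match
//    uint64_t t = (v - 0x0101010101010101ULL) // borrow reaches bit 7 of 0x00 bytes
//                 & ~v                        // BICS computes (v-k) & ~(v | 0x7f..)
//                 & 0x8080808080808080ULL;    // leaves 0x80 per matching byte
//    if (t != 0)                              // cf. HAS_ZERO:
//      index += count_leading_zeros(byte_reverse(t)) >> 3;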

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);

void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7fff7fff7fff7fff);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1, Address::uxtw(1)));
    sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
    ldrh(ch1, Address(str1, cnt1_neg));
    cmpw(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 2);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}

void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
                                                Register ch, Register result,
                                                FloatRegister ztmp1,
                                                FloatRegister ztmp2,
                                                PRegister tmp_pg,
                                                PRegister tmp_pdn, bool isL)
{
  // Note that `tmp_pdn` should *NOT* be used as a governing predicate register.
  assert(tmp_pg->is_governing(),
         "this register has to be a governing predicate register");

  Label LOOP, MATCH, DONE, NOMATCH;
  Register vec_len = rscratch1;
  Register idx = rscratch2;

  SIMD_RegVariant T = isL ? B : H;

  cbz(cnt1, NOMATCH);

  // Assign the particular char throughout the vector.
  sve_dup(ztmp2, T, ch);
  if (isL) {
    sve_cntb(vec_len);
  } else {
    sve_cnth(vec_len);
  }
  mov(idx, 0);

  // Generate a predicate to control the reading of input string.
  sve_whilelt(tmp_pg, T, idx, cnt1);

  BIND(LOOP);
    // Read a vector of 8- or 16-bit data depending on the string type. Note
    // that inactive elements indicated by the predicate register won't cause
    // a data read from memory to the destination vector.
    if (isL) {
      sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
    } else {
      sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
    }
    add(idx, idx, vec_len);

    // Perform the comparison. An element of the destination predicate is set
    // to active if the particular char is matched.
    sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);

    // Branch if the particular char is found.
    br(NE, MATCH);

    sve_whilelt(tmp_pg, T, idx, cnt1);

    // Loop back if the particular char is not found.
    br(MI, LOOP);

  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);

  BIND(MATCH);
    // Undo the index increment.
    sub(idx, idx, vec_len);

    // Crop the vector to find its location.
    sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
    add(result, idx, -1);
    sve_incp(result, T, tmp_pdn);
  BIND(DONE);
}
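
// A scalar sketch of the predicated SVE loop above (illustration only;
// `vl` stands for the number of lanes per vector register):
//
//    for (idx = 0; whilelt(idx, cnt1) has any active lane; idx += vl) {
//      pg    = active lanes are [idx, min(idx + vl, cnt1));
//      chunk = load(str1 + idx, pg);          // inactive lanes are not read
//      if (any(chunk[lane] == ch, pg))        // sve_cmp sets the flags
//        return idx + first_matching_lane;    // recovered with BRKA + INCP
//    }
//    return -1;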

void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)8);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 8);
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 8);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);

  mov(tmp3, 0x0101010101010101);

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1));
    sub(cnt1_neg, zr, cnt1);
  BIND(DO1_LOOP);
    ldrb(ch1, Address(str1, cnt1_neg));
    cmp(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 1);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg);
  BIND(DONE);
}

// Compare strings.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; however, the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      ldr(tmp1, Address(str1));
      ldrs(vtmp, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFF);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFF);
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
    RuntimeAddress stub = nullptr;
    switch (ae) {
      case StrIntrinsicNode::LL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
        break;
      case StrIntrinsicNode::UU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
        break;
      case StrIntrinsicNode::LU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
        break;
      case StrIntrinsicNode::UL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
        break;
      default:
        ShouldNotReachHere();
    }
    assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // Arrange the code so that most of the branching happens while the next
  // characters load, and the next characters load while the previous ones
  // are being compared.
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}
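
// Semantically, string_compare computes the following (sketch; the generated
// code additionally dispatches to per-encoding stubs for long strings):
//
//    int min = MIN2(cnt1, cnt2);    // counts in characters
//    for (int i = 0; i < min; i++) {
//      if (str1[i] != str2[i])
//        return str1[i] - str2[i];  // first differing character
//    }
//    return cnt1 - cnt2;            // all compared characters equal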

void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
                                     FloatRegister src2, Condition cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
  FloatRegister zn = src1, zm = src2;
  bool needs_negation = false;
  switch (cond) {
    case LT: cond = GT; zn = src2; zm = src1; break;
    case LE: cond = GE; zn = src2; zm = src1; break;
    case LO: cond = HI; zn = src2; zm = src1; break;
    case LS: cond = HS; zn = src2; zm = src1; break;
    case NE: cond = EQ; needs_negation = true; break;
    default:
      break;
  }

  if (is_floating_point_type(bt)) {
    fcm(cond, dst, size, zn, zm);
  } else {
    cm(cond, dst, size, zn, zm);
  }

  if (needs_negation) {
    notr(dst, isQ ? T16B : T8B, dst);
  }
}
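
// The rewrites above rely on these identities (sketch): the element-wise
// compares are only emitted in GT/GE/HI/HS/EQ form, so
//
//    a <  b   ==>   b >  a      (swap operands, use GT; likewise LE/LO/LS)
//    a != b   ==>   ~(a == b)   (compare EQ, then bitwise NOT the mask)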

void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
                                          Condition cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
  if (bt == T_FLOAT || bt == T_DOUBLE) {
    if (cond == Assembler::NE) {
      fcm(Assembler::EQ, dst, size, src);
      notr(dst, isQ ? T16B : T8B, dst);
    } else {
      fcm(cond, dst, size, src);
    }
  } else {
    if (cond == Assembler::NE) {
      cm(Assembler::EQ, dst, size, src);
      notr(dst, isQ ? T16B : T8B, dst);
    } else {
      cm(cond, dst, size, src);
    }
  }
}

// Compress the least significant bit of each byte to the rightmost and clear
// the higher garbage bits.
void C2_MacroAssembler::bytemask_compress(Register dst) {
  // Example input, dst = 0x01 00 00 00 01 01 00 01
  // The "??" bytes are garbage.
  orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
  orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
  orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
  andr(dst, dst, 0xff);                   // dst = 0x8D
}
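
// A C sketch of the compression above (each input byte is 0x00 or 0x01; the
// bytes marked "??" above are garbage and are masked off at the end):
//
//    uint64_t bytemask_compress(uint64_t v) {
//      v |= v >> 7;    // pack bit pairs:    b1:b0, b3:b2, b5:b4, b7:b6
//      v |= v >> 14;   // pack nibbles:      b3..b0, b7..b4
//      v |= v >> 28;   // pack the full byte b7..b0 into the low 8 bits
//      return v & 0xff;
//    }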

// Pack the lowest-numbered bit of each mask element in src into a long value
// in dst, at most the first 64 lane elements.
// Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM.
void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
                                         FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
  assert_different_registers(dst, rscratch1);
  assert_different_registers(vtmp1, vtmp2);

  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example:   src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
  // Expected:  dst = 0x658D

  // Convert the mask into a vector with sequential bytes.
  // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
  sve_cpy(vtmp1, size, src, 1, false);
  if (bt != T_BYTE) {
    sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
  }

  if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
    // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
    // is to compress each significant bit of the byte in a cross-lane way. Due
    // to the lack of a cross-lane bit-compress instruction, we use BEXT
    // (bit-compress in each lane) with the biggest lane size (T = D) then
    // concatenate the results.

    // The second source input of BEXT, initialized with 0x01 in each byte.
    // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
    sve_dup(vtmp2, B, 1);

    // BEXT vtmp1.D, vtmp1.D, vtmp2.D
    // vtmp1 = 0x0001010000010001 | 0x0100000001010001
    // vtmp2 = 0x0101010101010101 | 0x0101010101010101
    //         ---------------------------------------
    // vtmp1 = 0x0000000000000065 | 0x000000000000008D
    sve_bext(vtmp1, D, vtmp1, vtmp2);

    // Concatenate the lowest 8 significant bits of each 8-byte lane, and
    // extract the result to dst.
    // vtmp1 = 0x0000000000000000 | 0x000000000000658D
    // dst   = 0x658D
    if (lane_cnt <= 8) {
      // No need to concatenate.
      umov(dst, vtmp1, B, 0);
    } else if (lane_cnt <= 16) {
      ins(vtmp1, B, vtmp1, 1, 8);
      umov(dst, vtmp1, H, 0);
    } else {
      // As the lane count is 64 at most, the final expected value must be in
      // the lowest 64 bits after narrowing vtmp1 from D to B.
      sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
      umov(dst, vtmp1, D, 0);
    }
  } else if (UseSVE > 0) {
    // Compress the lowest 8 bytes.
    fmovd(dst, vtmp1);
    bytemask_compress(dst);
    if (lane_cnt <= 8) return;

    // Repeat on higher bytes and join the results.
    // Compress 8 bytes in each iteration.
    for (int idx = 1; idx < (lane_cnt / 8); idx++) {
      sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
      bytemask_compress(rscratch1);
      orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
    }
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Unpack the mask, a long value in src, into predicate register dst based on the
// corresponding data type. Note that dst can support at most 64 lanes.
// The example below gives the expected dst predicate register for different types,
// with a valid src (0x658D) on a 1024-bit vector size machine.
// BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
// SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
// INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
// LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
//
// The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D which
// has 24 significant bits would be an invalid input if dst predicate register refers to
// a LONG type 1024-bit vector, which has at most 16 lanes.
void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
                                           FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
         lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example:   src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
  // Expected:  dst = 0b01100101 10001101

  // Put the long value from the general-purpose register into the first lane
  // of the vector.
  // vtmp1 = 0x0000000000000000 | 0x000000000000658D
  sve_dup(vtmp1, B, 0);
  mov(vtmp1, D, 0, src);

  // As sve_cmp generates the mask value with a minimum unit of one byte, we
  // need to transform the value in the first lane, which is currently a mask
  // in bits, into a mask in bytes, which can be done with SVE2's BDEP
  // instruction.

  // The first source input of the BDEP instruction. Deposit each byte into
  // every 8 bytes.
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  if (lane_cnt <= 8) {
    // Nothing to do, as only one byte exists.
  } else if (lane_cnt <= 16) {
    ins(vtmp1, B, vtmp1, 8, 1);
    mov(vtmp1, B, 1, zr);
  } else {
    sve_vector_extend(vtmp1, D, vtmp1, B);
  }

  // The second source input of the BDEP instruction, initialized with 0x01 for each byte.
  // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
  sve_dup(vtmp2, B, 1);

  // BDEP vtmp1.D, vtmp1.D, vtmp2.D
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  // vtmp2 = 0x0101010101010101 | 0x0101010101010101
  //         ---------------------------------------
  // vtmp1 = 0x0001010000010001 | 0x0100000001010001
  sve_bdep(vtmp1, D, vtmp1, vtmp2);

  if (bt != T_BYTE) {
    sve_vector_extend(vtmp1, size, vtmp1, B);
  }
  // Generate the mask according to the given vector, in which the elements
  // have been extended to the expected type.
  // dst = 0b01100101 10001101
  sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
}

// Clobbers: rflags
void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
                                    FloatRegister zn, FloatRegister zm, Condition cond) {
  assert(pg->is_governing(), "This register has to be a governing predicate register");
  FloatRegister z1 = zn, z2 = zm;
  switch (cond) {
    case LE: z1 = zm; z2 = zn; cond = GE; break;
    case LT: z1 = zm; z2 = zn; cond = GT; break;
    case LO: z1 = zm; z2 = zn; cond = HI; break;
    case LS: z1 = zm; z2 = zn; cond = HS; break;
    default:
      break;
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (is_floating_point_type(bt)) {
    sve_fcm(cond, pd, size, pg, z1, z2);
  } else {
    assert(is_integral_type(bt), "unsupported element type");
    sve_cmp(cond, pd, size, pg, z1, z2);
  }
}

// Get the index of the last mask lane that is set
void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
  SIMD_RegVariant size = elemType_to_regVariant(bt);
  sve_rev(ptmp, size, src);
  sve_brkb(ptmp, ptrue, ptmp, false);
  sve_cntp(dst, size, ptrue, ptmp);
  movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
  subw(dst, rscratch1, dst);
}
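
// The computation above is, in effect (sketch): reversing the predicate turns
// the last set lane into the first one, BRKB keeps only the lanes before that
// first set lane, and CNTP counts them, so
//
//    lasttrue(src) = (lane_cnt - 1) - popcount(lanes of rev(src)
//                                              before its first set lane)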
1367 
1368 // Extend integer vector src to dst with the same lane count
1369 // but larger element size, e.g. 4B -> 4I
1370 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1371                                            FloatRegister src, BasicType src_bt) {
1372   if (src_bt == T_BYTE) {
1373     if (dst_bt == T_SHORT) {
1374       // 4B/8B to 4S/8S
1375       assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1376       sxtl(dst, T8H, src, T8B);
1377     } else {
1378       // 4B to 4I
1379       assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1380       sxtl(dst, T8H, src, T8B);
1381       sxtl(dst, T4S, dst, T4H);
1382     }
1383   } else if (src_bt == T_SHORT) {
1384     // 4S to 4I
1385     assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1386     sxtl(dst, T4S, src, T4H);
1387   } else if (src_bt == T_INT) {
1388     // 2I to 2L
1389     assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1390     sxtl(dst, T2D, src, T2S);
1391   } else {
1392     ShouldNotReachHere();
1393   }
1394 }
1395 
1396 // Narrow integer vector src down to dst with the same lane count
1397 // but smaller element size, e.g. 4I -> 4B
1398 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1399                                            FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1400   if (src_bt == T_SHORT) {
1401     // 4S/8S to 4B/8B
1402     assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1403     assert(dst_bt == T_BYTE, "unsupported");
1404     xtn(dst, T8B, src, T8H);
1405   } else if (src_bt == T_INT) {
1406     // 4I to 4B/4S
1407     assert(src_vlen_in_bytes == 16, "unsupported");
1408     assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1409     xtn(dst, T4H, src, T4S);
1410     if (dst_bt == T_BYTE) {
1411       xtn(dst, T8B, dst, T8H);
1412     }
1413   } else if (src_bt == T_LONG) {
1414     // 2L to 2I
1415     assert(src_vlen_in_bytes == 16, "unsupported");
1416     assert(dst_bt == T_INT, "unsupported");
1417     xtn(dst, T2S, src, T2D);
1418   } else {
1419     ShouldNotReachHere();
1420   }
1421 }
1422 
1423 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1424                                           FloatRegister src, SIMD_RegVariant src_size) {
1425   assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
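  // Sign-extend by repeatedly unpacking the low half of the vector,
  // doubling the element size at each step (B -> H -> S -> D).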
1426   if (src_size == B) {
1427     switch (dst_size) {
1428     case H:
1429       sve_sunpklo(dst, H, src);
1430       break;
1431     case S:
1432       sve_sunpklo(dst, H, src);
1433       sve_sunpklo(dst, S, dst);
1434       break;
1435     case D:
1436       sve_sunpklo(dst, H, src);
1437       sve_sunpklo(dst, S, dst);
1438       sve_sunpklo(dst, D, dst);
1439       break;
1440     default:
1441       ShouldNotReachHere();
1442     }
1443   } else if (src_size == H) {
1444     if (dst_size == S) {
1445       sve_sunpklo(dst, S, src);
1446     } else { // D
1447       sve_sunpklo(dst, S, src);
1448       sve_sunpklo(dst, D, dst);
1449     }
1450   } else if (src_size == S) {
1451     sve_sunpklo(dst, D, src);
1452   }
1453 }
1454 
// Vector narrow from src to dst with specified element sizes.
// The high part of the dst vector will be filled with zero.
1457 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1458                                           FloatRegister src, SIMD_RegVariant src_size,
1459                                           FloatRegister tmp) {
1460   assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1461   assert_different_registers(src, tmp);
1462   sve_dup(tmp, src_size, 0);
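  // The all-zero tmp supplies the elements that fill the high part of dst
  // at each uzp1 narrowing step below.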
1463   if (src_size == D) {
1464     switch (dst_size) {
1465     case S:
1466       sve_uzp1(dst, S, src, tmp);
1467       break;
1468     case H:
1469       assert_different_registers(dst, tmp);
1470       sve_uzp1(dst, S, src, tmp);
1471       sve_uzp1(dst, H, dst, tmp);
1472       break;
1473     case B:
1474       assert_different_registers(dst, tmp);
1475       sve_uzp1(dst, S, src, tmp);
1476       sve_uzp1(dst, H, dst, tmp);
1477       sve_uzp1(dst, B, dst, tmp);
1478       break;
1479     default:
1480       ShouldNotReachHere();
1481     }
1482   } else if (src_size == S) {
1483     if (dst_size == H) {
1484       sve_uzp1(dst, H, src, tmp);
1485     } else { // B
1486       assert_different_registers(dst, tmp);
1487       sve_uzp1(dst, H, src, tmp);
1488       sve_uzp1(dst, B, dst, tmp);
1489     }
1490   } else if (src_size == H) {
1491     sve_uzp1(dst, B, src, tmp);
1492   }
1493 }
1494 
// Extend src predicate to dst predicate with the same lane count but larger
// element size, e.g. 64Byte -> 512Long (an 8-lane byte mask widened to an 8-lane long mask)
1497 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1498                                              uint dst_element_length_in_bytes,
1499                                              uint src_element_length_in_bytes) {
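  // Each punpklo widens the low-half predicate lanes to twice their element
  // size, so one, two or three applications handle the 2x, 4x and 8x cases.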
1500   if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1501     sve_punpklo(dst, src);
1502   } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1503     sve_punpklo(dst, src);
1504     sve_punpklo(dst, dst);
1505   } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1506     sve_punpklo(dst, src);
1507     sve_punpklo(dst, dst);
1508     sve_punpklo(dst, dst);
1509   } else {
1510     assert(false, "unsupported");
1511     ShouldNotReachHere();
1512   }
1513 }
1514 
// Narrow src predicate to dst predicate with the same lane count but
// smaller element size, e.g. 512Long -> 64Byte (an 8-lane long mask narrowed to an 8-lane byte mask)
1517 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1518                                              uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
  // The insignificant bits in src predicate are expected to be zero.
  // To ensure the higher order bits of the resulting narrowed vector are 0, an all-zero predicate is
  // passed as the second argument. An example narrowing operation with a given mask:
  // 128Long -> 64Int on a 128-bit machine, i.e. 2L -> 2I
  // Mask (for 2 Longs) : TF
  // Predicate register for the above mask (16 bits) : 00000001 00000000
  // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
  // Which translates to a mask for 2 ints : TF (the lower half is significant while the upper half is 0)
1527   assert_different_registers(src, ptmp);
1528   assert_different_registers(dst, ptmp);
1529   sve_pfalse(ptmp);
1530   if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1531     sve_uzp1(dst, B, src, ptmp);
1532   } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1533     sve_uzp1(dst, H, src, ptmp);
1534     sve_uzp1(dst, B, dst, ptmp);
1535   } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1536     sve_uzp1(dst, S, src, ptmp);
1537     sve_uzp1(dst, H, dst, ptmp);
1538     sve_uzp1(dst, B, dst, ptmp);
1539   } else {
1540     assert(false, "unsupported");
1541     ShouldNotReachHere();
1542   }
1543 }
1544 
1545 // Vector reduction add for integral type with ASIMD instructions.
1546 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1547                                                  Register isrc, FloatRegister vsrc,
1548                                                  unsigned vector_length_in_bytes,
1549                                                  FloatRegister vtmp) {
1550   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1551   assert_different_registers(dst, isrc);
1552   bool isQ = vector_length_in_bytes == 16;
1553 
1554   BLOCK_COMMENT("neon_reduce_add_integral {");
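    // Reduce across the vector lanes first (addv, or a pairwise add for the
    // 2S/2D arrangements), then fold in the scalar input; sub-word results
    // are sign-extended while being accumulated.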
1555     switch(bt) {
1556       case T_BYTE:
1557         addv(vtmp, isQ ? T16B : T8B, vsrc);
1558         smov(dst, vtmp, B, 0);
1559         addw(dst, dst, isrc, ext::sxtb);
1560         break;
1561       case T_SHORT:
1562         addv(vtmp, isQ ? T8H : T4H, vsrc);
1563         smov(dst, vtmp, H, 0);
1564         addw(dst, dst, isrc, ext::sxth);
1565         break;
1566       case T_INT:
1567         isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1568         umov(dst, vtmp, S, 0);
1569         addw(dst, dst, isrc);
1570         break;
1571       case T_LONG:
1572         assert(isQ, "unsupported");
1573         addpd(vtmp, vsrc);
1574         umov(dst, vtmp, D, 0);
1575         add(dst, dst, isrc);
1576         break;
1577       default:
1578         assert(false, "unsupported");
1579         ShouldNotReachHere();
1580     }
1581   BLOCK_COMMENT("} neon_reduce_add_integral");
1582 }
1583 
1584 // Vector reduction multiply for integral type with ASIMD instructions.
1585 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
1586 // Clobbers: rscratch1
1587 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
1588                                                  Register isrc, FloatRegister vsrc,
1589                                                  unsigned vector_length_in_bytes,
1590                                                  FloatRegister vtmp1, FloatRegister vtmp2) {
1591   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1592   bool isQ = vector_length_in_bytes == 16;
1593 
1594   BLOCK_COMMENT("neon_reduce_mul_integral {");
1595     switch(bt) {
1596       case T_BYTE:
1597         if (isQ) {
          // Multiply the lower and upper halves of the vector iteratively.
1599           // vtmp1 = vsrc[8:15]
1600           ins(vtmp1, D, vsrc, 0, 1);
1601           // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
1602           mulv(vtmp1, T8B, vtmp1, vsrc);
1603           // vtmp2 = vtmp1[4:7]
1604           ins(vtmp2, S, vtmp1, 0, 1);
1605           // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
1606           mulv(vtmp1, T8B, vtmp2, vtmp1);
1607         } else {
1608           ins(vtmp1, S, vsrc, 0, 1);
1609           mulv(vtmp1, T8B, vtmp1, vsrc);
1610         }
1611         // vtmp2 = vtmp1[2:3]
1612         ins(vtmp2, H, vtmp1, 0, 1);
1613         // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
1614         mulv(vtmp2, T8B, vtmp2, vtmp1);
1615         // dst = vtmp2[0] * isrc * vtmp2[1]
1616         umov(rscratch1, vtmp2, B, 0);
1617         mulw(dst, rscratch1, isrc);
1618         sxtb(dst, dst);
1619         umov(rscratch1, vtmp2, B, 1);
1620         mulw(dst, rscratch1, dst);
1621         sxtb(dst, dst);
1622         break;
1623       case T_SHORT:
1624         if (isQ) {
1625           ins(vtmp2, D, vsrc, 0, 1);
1626           mulv(vtmp2, T4H, vtmp2, vsrc);
1627           ins(vtmp1, S, vtmp2, 0, 1);
1628           mulv(vtmp1, T4H, vtmp1, vtmp2);
1629         } else {
1630           ins(vtmp1, S, vsrc, 0, 1);
1631           mulv(vtmp1, T4H, vtmp1, vsrc);
1632         }
1633         umov(rscratch1, vtmp1, H, 0);
1634         mulw(dst, rscratch1, isrc);
1635         sxth(dst, dst);
1636         umov(rscratch1, vtmp1, H, 1);
1637         mulw(dst, rscratch1, dst);
1638         sxth(dst, dst);
1639         break;
1640       case T_INT:
1641         if (isQ) {
1642           ins(vtmp1, D, vsrc, 0, 1);
1643           mulv(vtmp1, T2S, vtmp1, vsrc);
1644         } else {
1645           vtmp1 = vsrc;
1646         }
1647         umov(rscratch1, vtmp1, S, 0);
1648         mul(dst, rscratch1, isrc);
1649         umov(rscratch1, vtmp1, S, 1);
1650         mul(dst, rscratch1, dst);
1651         break;
1652       case T_LONG:
1653         umov(rscratch1, vsrc, D, 0);
1654         mul(dst, isrc, rscratch1);
1655         umov(rscratch1, vsrc, D, 1);
1656         mul(dst, dst, rscratch1);
1657         break;
1658       default:
1659         assert(false, "unsupported");
1660         ShouldNotReachHere();
1661     }
1662   BLOCK_COMMENT("} neon_reduce_mul_integral");
1663 }
1664 
1665 // Vector reduction multiply for floating-point type with ASIMD instructions.
1666 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
1667                                            FloatRegister fsrc, FloatRegister vsrc,
1668                                            unsigned vector_length_in_bytes,
1669                                            FloatRegister vtmp) {
1670   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1671   bool isQ = vector_length_in_bytes == 16;
1672 
1673   BLOCK_COMMENT("neon_reduce_mul_fp {");
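    // NEON has no across-lanes floating-point multiply reduction, so the
    // lanes are multiplied into dst sequentially, starting from fsrc, which
    // also keeps the strictly ordered reduction semantics.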
1674     switch(bt) {
1675       case T_FLOAT:
1676         fmuls(dst, fsrc, vsrc);
1677         ins(vtmp, S, vsrc, 0, 1);
1678         fmuls(dst, dst, vtmp);
1679         if (isQ) {
1680           ins(vtmp, S, vsrc, 0, 2);
1681           fmuls(dst, dst, vtmp);
1682           ins(vtmp, S, vsrc, 0, 3);
1683           fmuls(dst, dst, vtmp);
        }
1685         break;
1686       case T_DOUBLE:
1687         assert(isQ, "unsupported");
1688         fmuld(dst, fsrc, vsrc);
1689         ins(vtmp, D, vsrc, 0, 1);
1690         fmuld(dst, dst, vtmp);
1691         break;
1692       default:
1693         assert(false, "unsupported");
1694         ShouldNotReachHere();
1695     }
1696   BLOCK_COMMENT("} neon_reduce_mul_fp");
1697 }
1698 
// Helper to select the scalar logical instruction for a reduction opcode
1700 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
1701                                                    Register Rn, Register Rm,
1702                                                    enum shift_kind kind, unsigned shift) {
1703   switch(opc) {
1704     case Op_AndReductionV:
1705       is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
1706       break;
1707     case Op_OrReductionV:
1708       is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
1709       break;
1710     case Op_XorReductionV:
1711       is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
1712       break;
1713     default:
1714       assert(false, "unsupported");
1715       ShouldNotReachHere();
1716   }
1717 }
1718 
1719 // Vector reduction logical operations And, Or, Xor
1720 // Clobbers: rscratch1
1721 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
1722                                             Register isrc, FloatRegister vsrc,
1723                                             unsigned vector_length_in_bytes) {
1724   assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
1725          "unsupported");
1726   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1727   assert_different_registers(dst, isrc);
1728   bool isQ = vector_length_in_bytes == 16;
1729 
1730   BLOCK_COMMENT("neon_reduce_logical {");
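    // Fold the vector in half repeatedly: first combine the two halves in a
    // GPR, then keep folding with shifted operands until one lane remains,
    // and finally fold in the scalar input.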
1731     umov(rscratch1, vsrc, isQ ? D : S, 0);
1732     umov(dst, vsrc, isQ ? D : S, 1);
1733     neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
1734     switch(bt) {
1735       case T_BYTE:
1736         if (isQ) {
1737           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1738         }
1739         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1740         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
1741         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1742         sxtb(dst, dst);
1743         break;
1744       case T_SHORT:
1745         if (isQ) {
1746           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1747         }
1748         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1749         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1750         sxth(dst, dst);
1751         break;
1752       case T_INT:
1753         if (isQ) {
1754           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1755         }
1756         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1757         break;
1758       case T_LONG:
1759         assert(isQ, "unsupported");
1760         neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
1761         break;
1762       default:
1763         assert(false, "unsupported");
1764         ShouldNotReachHere();
1765     }
1766   BLOCK_COMMENT("} neon_reduce_logical");
1767 }
1768 
1769 // Vector reduction min/max for integral type with ASIMD instructions.
// Note: vtmp is not used and is expected to be fnoreg for the T_LONG case.
1771 // Clobbers: rscratch1, rflags
1772 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
1773                                                     Register isrc, FloatRegister vsrc,
1774                                                     unsigned vector_length_in_bytes,
1775                                                     FloatRegister vtmp) {
1776   assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
1777   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1778   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
1779   assert_different_registers(dst, isrc);
1780   bool isQ = vector_length_in_bytes == 16;
1781   bool is_min = opc == Op_MinReductionV;
1782 
1783   BLOCK_COMMENT("neon_reduce_minmax_integral {");
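    // NEON has no across-lanes min/max instruction for the 2D arrangement,
    // so the T_LONG case is reduced with scalar compare-and-select instead.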
1784     if (bt == T_LONG) {
1785       assert(vtmp == fnoreg, "should be");
1786       assert(isQ, "should be");
1787       umov(rscratch1, vsrc, D, 0);
1788       cmp(isrc, rscratch1);
1789       csel(dst, isrc, rscratch1, is_min ? LT : GT);
1790       umov(rscratch1, vsrc, D, 1);
1791       cmp(dst, rscratch1);
1792       csel(dst, dst, rscratch1, is_min ? LT : GT);
1793     } else {
1794       SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1795       if (size == T2S) {
1796         is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
1797       } else {
1798         is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
1799       }
1800       if (bt == T_INT) {
1801         umov(dst, vtmp, S, 0);
1802       } else {
1803         smov(dst, vtmp, elemType_to_regVariant(bt), 0);
1804       }
1805       cmpw(dst, isrc);
1806       cselw(dst, dst, isrc, is_min ? LT : GT);
1807     }
1808   BLOCK_COMMENT("} neon_reduce_minmax_integral");
1809 }
1810 
1811 // Vector reduction for integral type with SVE instruction.
1812 // Supported operations are Add, And, Or, Xor, Max, Min.
// rflags is clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
1814 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
1815                                             FloatRegister src2, PRegister pg, FloatRegister tmp) {
1816   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
1817   assert(pg->is_governing(), "This register has to be a governing predicate register");
1818   assert_different_registers(src1, dst);
  // Registers "dst" and "tmp" are clobbered, while "src1" and "src2" are preserved.
1820   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
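  // Sub-word results are extracted with smov (sign-extending them to 32
  // bits), while word and long results use umov.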
1821   switch (opc) {
1822     case Op_AddReductionVI: {
1823       sve_uaddv(tmp, size, pg, src2);
1824       if (bt == T_BYTE) {
1825         smov(dst, tmp, size, 0);
1826         addw(dst, src1, dst, ext::sxtb);
1827       } else if (bt == T_SHORT) {
1828         smov(dst, tmp, size, 0);
1829         addw(dst, src1, dst, ext::sxth);
1830       } else {
1831         umov(dst, tmp, size, 0);
1832         addw(dst, dst, src1);
1833       }
1834       break;
1835     }
1836     case Op_AddReductionVL: {
1837       sve_uaddv(tmp, size, pg, src2);
1838       umov(dst, tmp, size, 0);
1839       add(dst, dst, src1);
1840       break;
1841     }
1842     case Op_AndReductionV: {
1843       sve_andv(tmp, size, pg, src2);
1844       if (bt == T_INT || bt == T_LONG) {
1845         umov(dst, tmp, size, 0);
1846       } else {
1847         smov(dst, tmp, size, 0);
1848       }
1849       if (bt == T_LONG) {
1850         andr(dst, dst, src1);
1851       } else {
1852         andw(dst, dst, src1);
1853       }
1854       break;
1855     }
1856     case Op_OrReductionV: {
1857       sve_orv(tmp, size, pg, src2);
1858       if (bt == T_INT || bt == T_LONG) {
1859         umov(dst, tmp, size, 0);
1860       } else {
1861         smov(dst, tmp, size, 0);
1862       }
1863       if (bt == T_LONG) {
1864         orr(dst, dst, src1);
1865       } else {
1866         orrw(dst, dst, src1);
1867       }
1868       break;
1869     }
1870     case Op_XorReductionV: {
1871       sve_eorv(tmp, size, pg, src2);
1872       if (bt == T_INT || bt == T_LONG) {
1873         umov(dst, tmp, size, 0);
1874       } else {
1875         smov(dst, tmp, size, 0);
1876       }
1877       if (bt == T_LONG) {
1878         eor(dst, dst, src1);
1879       } else {
1880         eorw(dst, dst, src1);
1881       }
1882       break;
1883     }
1884     case Op_MaxReductionV: {
1885       sve_smaxv(tmp, size, pg, src2);
1886       if (bt == T_INT || bt == T_LONG) {
1887         umov(dst, tmp, size, 0);
1888       } else {
1889         smov(dst, tmp, size, 0);
1890       }
1891       if (bt == T_LONG) {
1892         cmp(dst, src1);
1893         csel(dst, dst, src1, Assembler::GT);
1894       } else {
1895         cmpw(dst, src1);
1896         cselw(dst, dst, src1, Assembler::GT);
1897       }
1898       break;
1899     }
1900     case Op_MinReductionV: {
1901       sve_sminv(tmp, size, pg, src2);
1902       if (bt == T_INT || bt == T_LONG) {
1903         umov(dst, tmp, size, 0);
1904       } else {
1905         smov(dst, tmp, size, 0);
1906       }
1907       if (bt == T_LONG) {
1908         cmp(dst, src1);
1909         csel(dst, dst, src1, Assembler::LT);
1910       } else {
1911         cmpw(dst, src1);
1912         cselw(dst, dst, src1, Assembler::LT);
1913       }
1914       break;
1915     }
1916     default:
1917       assert(false, "unsupported");
1918       ShouldNotReachHere();
1919   }
1920 
1921   if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
1922     if (bt == T_BYTE) {
1923       sxtb(dst, dst);
1924     } else if (bt == T_SHORT) {
1925       sxth(dst, dst);
1926     }
1927   }
1928 }
1929 
// Set elements of the dst predicate to true for lanes in the range [0, lane_cnt),
// and to false otherwise. The input "lane_cnt" must be less than or equal to the
// supported max vector length of the basic type. Clobbers: rscratch1 and rFlagsReg.
1933 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
1934   uint32_t max_vector_length = Matcher::max_vector_size(bt);
1935   assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
1936 
1937   // Set all elements to false if the input "lane_cnt" is zero.
1938   if (lane_cnt == 0) {
1939     sve_pfalse(dst);
1940     return;
1941   }
1942 
1943   SIMD_RegVariant size = elemType_to_regVariant(bt);
1944   assert(size != Q, "invalid size");
1945 
  // Set all lanes to true if "lane_cnt" equals the max lane count.
1947   if (lane_cnt == max_vector_length) {
1948     sve_ptrue(dst, size, /* ALL */ 0b11111);
1949     return;
1950   }
1951 
  // Fixed patterns for "ptrue": for VL1..VL8 the pattern encoding equals the lane count itself.
1953   switch(lane_cnt) {
1954   case 1: /* VL1 */
1955   case 2: /* VL2 */
1956   case 3: /* VL3 */
1957   case 4: /* VL4 */
1958   case 5: /* VL5 */
1959   case 6: /* VL6 */
1960   case 7: /* VL7 */
1961   case 8: /* VL8 */
1962     sve_ptrue(dst, size, lane_cnt);
1963     return;
1964   case 16:
1965     sve_ptrue(dst, size, /* VL16 */ 0b01001);
1966     return;
1967   case 32:
1968     sve_ptrue(dst, size, /* VL32 */ 0b01010);
1969     return;
1970   case 64:
1971     sve_ptrue(dst, size, /* VL64 */ 0b01011);
1972     return;
1973   case 128:
1974     sve_ptrue(dst, size, /* VL128 */ 0b01100);
1975     return;
1976   case 256:
1977     sve_ptrue(dst, size, /* VL256 */ 0b01101);
1978     return;
1979   default:
1980     break;
1981   }
1982 
1983   // Special patterns for "ptrue".
1984   if (lane_cnt == round_down_power_of_2(max_vector_length)) {
1985     sve_ptrue(dst, size, /* POW2 */ 0b00000);
1986   } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
1987     sve_ptrue(dst, size, /* MUL4 */ 0b11101);
1988   } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
1989     sve_ptrue(dst, size, /* MUL3 */ 0b11110);
1990   } else {
    // Use "whileltw" for the remaining cases.
1992     mov(rscratch1, lane_cnt);
1993     sve_whileltw(dst, size, zr, rscratch1);
1994   }
1995 }
1996 
1997 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
1998 // Any remaining elements of dst will be filled with zero.
1999 // Clobbers: rscratch1
2000 // Preserves: src, mask
2001 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
2002                                            FloatRegister vtmp1, FloatRegister vtmp2,
2003                                            PRegister pgtmp) {
2004   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2005   assert_different_registers(dst, src, vtmp1, vtmp2);
2006   assert_different_registers(mask, pgtmp);
2007 
2008   // Example input:   src   = 8888 7777 6666 5555 4444 3333 2222 1111
2009   //                  mask  = 0001 0000 0000 0001 0001 0000 0001 0001
2010   // Expected result: dst   = 0000 0000 0000 8888 5555 4444 2222 1111
2011   sve_dup(vtmp2, H, 0);
2012 
2013   // Extend lowest half to type INT.
2014   // dst = 00004444 00003333 00002222 00001111
2015   sve_uunpklo(dst, S, src);
2016   // pgtmp = 00000001 00000000 00000001 00000001
2017   sve_punpklo(pgtmp, mask);
2018   // Pack the active elements in size of type INT to the right,
  // and fill the remaining lanes with zero.
2020   // dst = 00000000 00004444 00002222 00001111
2021   sve_compact(dst, S, dst, pgtmp);
2022   // Narrow the result back to type SHORT.
2023   // dst = 0000 0000 0000 0000 0000 4444 2222 1111
2024   sve_uzp1(dst, H, dst, vtmp2);
2025   // Count the active elements of lowest half.
2026   // rscratch1 = 3
2027   sve_cntp(rscratch1, S, ptrue, pgtmp);
2028 
2029   // Repeat to the highest half.
2030   // pgtmp = 00000001 00000000 00000000 00000001
2031   sve_punpkhi(pgtmp, mask);
2032   // vtmp1 = 00008888 00007777 00006666 00005555
2033   sve_uunpkhi(vtmp1, S, src);
2034   // vtmp1 = 00000000 00000000 00008888 00005555
2035   sve_compact(vtmp1, S, vtmp1, pgtmp);
2036   // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2037   sve_uzp1(vtmp1, H, vtmp1, vtmp2);
2038 
  // Compressed low:   dst   = 0000 0000 0000 0000 0000 4444 2222 1111
  // Compressed high:  vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  // Left-shift (cross-lane) the compressed high part by TRUE_CNT lanes, where
  // TRUE_CNT is the number of active elements in the compressed low part.
2043   neg(rscratch1, rscratch1);
2044   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2045   sve_index(vtmp2, H, rscratch1, 1);
2046   // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
2047   sve_tbl(vtmp1, H, vtmp1, vtmp2);
2048 
  // Combine the compressed high part (after shifting) with the compressed low part.
2050   // dst = 0000 0000 0000 8888 5555 4444 2222 1111
2051   sve_orr(dst, dst, vtmp1);
2052 }
2053 
2054 // Clobbers: rscratch1, rscratch2
2055 // Preserves: src, mask
2056 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
2057                                           FloatRegister vtmp1, FloatRegister vtmp2,
2058                                           FloatRegister vtmp3, FloatRegister vtmp4,
2059                                           PRegister ptmp, PRegister pgtmp) {
2060   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2061   assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
2062   assert_different_registers(mask, ptmp, pgtmp);
2063   // Example input:   src   = 88 77 66 55 44 33 22 11
2064   //                  mask  = 01 00 00 01 01 00 01 01
2065   // Expected result: dst   = 00 00 00 88 55 44 22 11
2066 
2067   sve_dup(vtmp4, B, 0);
2068   // Extend lowest half to type SHORT.
2069   // vtmp1 = 0044 0033 0022 0011
2070   sve_uunpklo(vtmp1, H, src);
2071   // ptmp = 0001 0000 0001 0001
2072   sve_punpklo(ptmp, mask);
2073   // Count the active elements of lowest half.
2074   // rscratch2 = 3
2075   sve_cntp(rscratch2, H, ptrue, ptmp);
2076   // Pack the active elements in size of type SHORT to the right,
  // and fill the remaining lanes with zero.
2078   // dst = 0000 0044 0022 0011
2079   sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
2080   // Narrow the result back to type BYTE.
2081   // dst = 00 00 00 00 00 44 22 11
2082   sve_uzp1(dst, B, dst, vtmp4);
2083 
2084   // Repeat to the highest half.
2085   // ptmp = 0001 0000 0000 0001
2086   sve_punpkhi(ptmp, mask);
  // vtmp2 = 0088 0077 0066 0055
2088   sve_uunpkhi(vtmp2, H, src);
2089   // vtmp1 = 0000 0000 0088 0055
2090   sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);
2091 
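  // vtmp4 was clobbered as a temporary by sve_compress_short above, so it
  // has to be zeroed again.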
2092   sve_dup(vtmp4, B, 0);
2093   // vtmp1 = 00 00 00 00 00 00 88 55
2094   sve_uzp1(vtmp1, B, vtmp1, vtmp4);
2095 
2096   // Compressed low:   dst   = 00 00 00 00 00 44 22 11
2097   // Compressed high:  vtmp1 = 00 00 00 00 00 00 88 55
  // Left-shift (cross-lane) the compressed high part by TRUE_CNT lanes, where
  // TRUE_CNT is the number of active elements in the compressed low part.
2100   neg(rscratch2, rscratch2);
2101   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2102   sve_index(vtmp2, B, rscratch2, 1);
2103   // vtmp1 = 00 00 00 88 55 00 00 00
2104   sve_tbl(vtmp1, B, vtmp1, vtmp2);
  // Combine the compressed high part (after shifting) with the compressed low part.
2106   // dst = 00 00 00 88 55 44 22 11
2107   sve_orr(dst, dst, vtmp1);
2108 }
2109 
2110 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2111   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2112   SIMD_Arrangement size = isQ ? T16B : T8B;
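  // rbit only reverses bits within each byte, so for wider element types
  // the byte order within each element has to be reversed first.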
2113   if (bt == T_BYTE) {
2114     rbit(dst, size, src);
2115   } else {
2116     neon_reverse_bytes(dst, src, bt, isQ);
2117     rbit(dst, size, dst);
2118   }
2119 }
2120 
2121 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2122   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2123   SIMD_Arrangement size = isQ ? T16B : T8B;
2124   switch (bt) {
2125     case T_BYTE:
2126       if (dst != src) {
2127         orr(dst, size, src, src);
2128       }
2129       break;
2130     case T_SHORT:
2131       rev16(dst, size, src);
2132       break;
2133     case T_INT:
2134       rev32(dst, size, src);
2135       break;
2136     case T_LONG:
2137       rev64(dst, size, src);
2138       break;
2139     default:
2140       assert(false, "unsupported");
2141       ShouldNotReachHere();
2142   }
2143 }
2144 
// Extract a scalar element from an SVE vector at position 'idx'.
// The input elements in src are expected to be of integral type.
2147 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2148                                              int idx, FloatRegister vtmp) {
2149   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2150   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (regVariant_to_elemBits(size) * idx < 128) { // generate a lower-cost NEON instruction
2152     if (bt == T_INT || bt == T_LONG) {
2153       umov(dst, src, size, idx);
2154     } else {
2155       smov(dst, src, size, idx);
2156     }
2157   } else {
2158     sve_orr(vtmp, src, src);
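    // The ext offset is in bytes: "idx << size" converts the lane index,
    // as "size" is log2 of the element size in bytes.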
2159     sve_ext(vtmp, vtmp, idx << size);
2160     if (bt == T_INT || bt == T_LONG) {
2161       umov(dst, vtmp, size, 0);
2162     } else {
2163       smov(dst, vtmp, size, 0);
2164     }
2165   }
2166 }
2167 
2168 // java.lang.Math::round intrinsics
2169 
2170 // Clobbers: rscratch1, rflags
2171 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2172                                           FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
2173   assert_different_registers(tmp1, tmp2, tmp3, src, dst);
2174   switch (T) {
2175     case T2S:
2176     case T4S:
2177       fmovs(tmp1, T, 0.5f);
2178       mov(rscratch1, jint_cast(0x1.0p23f));
2179       break;
2180     case T2D:
2181       fmovd(tmp1, T, 0.5);
2182       mov(rscratch1, julong_cast(0x1.0p52));
2183       break;
2184     default:
2185       assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2186   }
2187   fadd(tmp1, T, tmp1, src);
2188   fcvtms(tmp1, T, tmp1);
2189   // tmp1 = floor(src + 0.5, ties to even)
2190 
2191   fcvtas(dst, T, src);
2192   // dst = round(src), ties to away
2193 
2194   fneg(tmp3, T, src);
2195   dup(tmp2, T, rscratch1);
2196   cm(HS, tmp3, T, tmp3, tmp2);
  // tmp3 is now a set of flags: all-ones in each lane where the bit pattern
  // of -src is unsigned-higher-or-same than that of 2^23 (2^52 for T2D)
2198 
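  // BIF inserts tmp1 (floor(src + 0.5)) into dst where the flag lanes are
  // clear, and keeps the ties-to-away result where they are set.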
2199   bif(dst, T16B, tmp1, tmp3);
2200   // result in dst
2201 }
2202 
2203 // Clobbers: rscratch1, rflags
2204 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2205                                          FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
2206   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2207   assert_different_registers(tmp1, tmp2, src, dst);
2208 
2209   switch (T) {
2210     case S:
2211       mov(rscratch1, jint_cast(0x1.0p23f));
2212       break;
2213     case D:
2214       mov(rscratch1, julong_cast(0x1.0p52));
2215       break;
2216     default:
2217       assert(T == S || T == D, "invalid register variant");
2218   }
2219 
2220   sve_frinta(dst, T, ptrue, src);
2221   // dst = round(src), ties to away
2222 
2223   Label none;
2224 
2225   sve_fneg(tmp1, T, ptrue, src);
2226   sve_dup(tmp2, T, rscratch1);
2227   sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
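  // EQ after an SVE compare means no active lane matched (the NONE
  // condition), so the floor-based fixup below can be skipped entirely.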
2228   br(EQ, none);
2229   {
2230     sve_cpy(tmp1, T, pgtmp, 0.5);
2231     sve_fadd(tmp1, T, pgtmp, src);
2232     sve_frintm(dst, T, pgtmp, tmp1);
2233     // dst = floor(src + 0.5, ties to even)
2234   }
2235   bind(none);
2236 
2237   sve_fcvtzs(dst, T, ptrue, dst, T);
2238   // result in dst
2239 }
2240 
2241 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2242                                            FloatRegister one, SIMD_Arrangement T) {
2243   assert_different_registers(dst, src, zero, one);
2244   assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2245 
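  // facgt sets all bits in lanes where |src| > 0.0; ushr then clears the
  // sign bit, so bsl merges the magnitude of "one" with the sign of "src",
  // producing +/-1.0 while +/-0.0 and NaN lanes fall through from src.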
2246   facgt(dst, T, src, zero);
2247   ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
2248   bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2249 }
2250 
2251 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2252                                           FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
  assert_different_registers(dst, src, zero, one, vtmp);
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");

  sve_orr(vtmp, src, src);
  sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise
  switch (T) {
  case S:
    sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
    sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
                                      // on the sign of the float value
    break;
  case D:
    sve_and(vtmp, T, min_jlong);
    sve_orr(vtmp, T, jlong_cast(1.0));
    break;
  default:
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
  sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
                                     // Result in dst
2274 }
2275 
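// Returns true when code is being emitted only to measure its size: C2 emits
// into a scratch buffer for that purpose, in addition to the
// MacroAssembler's own notion of scratch emission.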
2276 bool C2_MacroAssembler::in_scratch_emit_size() {
2277   if (ciEnv::current()->task() != nullptr) {
2278     PhaseOutput* phase_output = Compile::current()->output();
2279     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2280       return true;
2281     }
2282   }
2283   return MacroAssembler::in_scratch_emit_size();
2284 }