1 /* 2 * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "precompiled.hpp" 26 #include "asm/assembler.hpp" 27 #include "asm/assembler.inline.hpp" 28 #include "opto/c2_MacroAssembler.hpp" 29 #include "opto/compile.hpp" 30 #include "opto/intrinsicnode.hpp" 31 #include "opto/matcher.hpp" 32 #include "opto/output.hpp" 33 #include "opto/subnode.hpp" 34 #include "runtime/stubRoutines.hpp" 35 36 #ifdef PRODUCT 37 #define BLOCK_COMMENT(str) /* nothing */ 38 #define STOP(error) stop(error) 39 #else 40 #define BLOCK_COMMENT(str) block_comment(str) 41 #define STOP(error) block_comment(error); stop(error) 42 #endif 43 44 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 45 46 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 47 48 void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg, 49 Register tmp2Reg, Register tmp3Reg) { 50 Register oop = objectReg; 51 Register box = boxReg; 52 Register disp_hdr = tmpReg; 53 Register tmp = tmp2Reg; 54 Label cont; 55 Label object_has_monitor; 56 Label count, no_count; 57 58 assert_different_registers(oop, box, tmp, disp_hdr); 59 60 // Load markWord from object into displaced_header. 61 ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes())); 62 63 if (DiagnoseSyncOnValueBasedClasses != 0) { 64 load_klass(tmp, oop); 65 ldrw(tmp, Address(tmp, Klass::access_flags_offset())); 66 tstw(tmp, JVM_ACC_IS_VALUE_BASED_CLASS); 67 br(Assembler::NE, cont); 68 } 69 70 // Check for existing monitor 71 tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor); 72 73 if (LockingMode == LM_MONITOR) { 74 tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0. 75 b(cont); 76 } else if (LockingMode == LM_LEGACY) { 77 // Set tmp to be (markWord of object | UNLOCK_VALUE). 78 orr(tmp, disp_hdr, markWord::unlocked_value); 79 80 // Initialize the box. (Must happen before we update the object mark!) 81 str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes())); 82 83 // Compare object markWord with an unlocked value (tmp) and if 84 // equal exchange the stack address of our box with object markWord. 85 // On failure disp_hdr contains the possibly locked markWord. 
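  // A rough C-style sketch of this stack-lock fast path (illustrative only,
  // not the generated code; atomic_cas() and current_sp() are hypothetical
  // helpers):
  //
  //   bool legacy_fast_lock(uintptr_t* mark_addr, uintptr_t* box) {
  //     uintptr_t unlocked = *mark_addr | markWord::unlocked_value;
  //     box[0] = unlocked;                              // displaced header
  //     if (atomic_cas(mark_addr, unlocked, (uintptr_t)box)) {
  //       return true;                                  // we now own the stack lock
  //     }
  //     uintptr_t observed = *mark_addr;                // possibly locked markWord
  //     // Recursive case: the mark is a stack address within our own page
  //     // and the lock bits are clear.
  //     uintptr_t rec = (observed - current_sp()) &
  //                     (~(os::vm_page_size() - 1) | markWord::lock_mask_in_place);
  //     box[0] = rec;                                   // 0 means recursive lock
  //     return rec == 0;                                // otherwise take the slow path
  //   }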
86 cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true, 87 /*release*/ true, /*weak*/ false, disp_hdr); 88 br(Assembler::EQ, cont); 89 90 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 91 92 // If the compare-and-exchange succeeded, then we found an unlocked 93 // object, will have now locked it will continue at label cont 94 95 // Check if the owner is self by comparing the value in the 96 // markWord of object (disp_hdr) with the stack pointer. 97 mov(rscratch1, sp); 98 sub(disp_hdr, disp_hdr, rscratch1); 99 mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place)); 100 // If condition is true we are cont and hence we can store 0 as the 101 // displaced header in the box, which indicates that it is a recursive lock. 102 ands(tmp/*==0?*/, disp_hdr, tmp); // Sets flags for result 103 str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes())); 104 b(cont); 105 } else { 106 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 107 lightweight_lock(oop, disp_hdr, tmp, tmp3Reg, no_count); 108 b(count); 109 } 110 111 // Handle existing monitor. 112 bind(object_has_monitor); 113 114 // The object's monitor m is unlocked iff m->owner == NULL, 115 // otherwise m->owner may contain a thread or a stack address. 116 // 117 // Try to CAS m->owner from NULL to current thread. 118 add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value)); 119 cmpxchg(tmp, zr, rthread, Assembler::xword, /*acquire*/ true, 120 /*release*/ true, /*weak*/ false, rscratch1); // Sets flags for result 121 122 if (LockingMode != LM_LIGHTWEIGHT) { 123 // Store a non-null value into the box to avoid looking like a re-entrant 124 // lock. The fast-path monitor unlock code checks for 125 // markWord::monitor_value so use markWord::unused_mark which has the 126 // relevant bit set, and also matches ObjectSynchronizer::enter. 127 mov(tmp, (address)markWord::unused_mark().value()); 128 str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes())); 129 } 130 br(Assembler::EQ, cont); // CAS success means locking succeeded 131 132 cmp(rscratch1, rthread); 133 br(Assembler::NE, cont); // Check for recursive locking 134 135 // Recursive lock case 136 increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1); 137 // flag == EQ still from the cmp above, checking if this is a reentrant lock 138 139 bind(cont); 140 // flag == EQ indicates success 141 // flag == NE indicates failure 142 br(Assembler::NE, no_count); 143 144 bind(count); 145 increment(Address(rthread, JavaThread::held_monitor_count_offset())); 146 147 bind(no_count); 148 } 149 150 void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg, 151 Register tmp2Reg) { 152 Register oop = objectReg; 153 Register box = boxReg; 154 Register disp_hdr = tmpReg; 155 Register tmp = tmp2Reg; 156 Label cont; 157 Label object_has_monitor; 158 Label count, no_count; 159 160 assert_different_registers(oop, box, tmp, disp_hdr); 161 162 if (LockingMode == LM_LEGACY) { 163 // Find the lock address and load the displaced header from the stack. 164 ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes())); 165 166 // If the displaced header is 0, we have a recursive unlock. 167 cmp(disp_hdr, zr); 168 br(Assembler::EQ, cont); 169 } 170 171 // Handle existing monitor. 
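  // In C-like terms (an illustrative sketch only), the check below is:
  //
  //   uintptr_t mark = *mark_addr;
  //   if (mark & markWord::monitor_value) {   // lowest bits 0b10: inflated
  //     ObjectMonitor* m = (ObjectMonitor*)(mark - markWord::monitor_value);
  //     ...                                   // unlock via the ObjectMonitor
  //   }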
  ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
  tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else if (LockingMode == LM_LEGACY) {
    // Check if it is still a lightweight lock; this is true if we
    // see the stack address of the basicLock in the markWord of the
    // object.

    cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false,
            /*release*/ true, /*weak*/ false, tmp);
    b(cont);
  } else {
    assert(LockingMode == LM_LIGHTWEIGHT, "must be");
    lightweight_unlock(oop, tmp, box, disp_hdr, no_count);
    b(count);
  }

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // Handle existing monitor.
  bind(object_has_monitor);
  STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
  add(tmp, tmp, -(int)markWord::monitor_value); // monitor

  if (LockingMode == LM_LIGHTWEIGHT) {
    // If the owner is anonymous, we need to fix it -- in an outline stub.
    Register tmp2 = disp_hdr;
    ldr(tmp2, Address(tmp, ObjectMonitor::owner_offset()));
    // We cannot use tbnz here, the target might be too far away and cannot
    // be encoded.
    tst(tmp2, (uint64_t)ObjectMonitor::ANONYMOUS_OWNER);
    C2HandleAnonOMOwnerStub* stub = new (Compile::current()->comp_arena()) C2HandleAnonOMOwnerStub(tmp, tmp2);
    Compile::current()->output()->add_stub(stub);
    br(Assembler::NE, stub->entry());
    bind(stub->continuation());
  }

  ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));

  Label notRecursive;
  cbz(disp_hdr, notRecursive);

  // Recursive lock
  sub(disp_hdr, disp_hdr, 1u);
  str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
  cmp(disp_hdr, disp_hdr); // Sets flags for result
  b(cont);

  bind(notRecursive);
  ldr(rscratch1, Address(tmp, ObjectMonitor::EntryList_offset()));
  ldr(disp_hdr, Address(tmp, ObjectMonitor::cxq_offset()));
  orr(rscratch1, rscratch1, disp_hdr); // Will be 0 if both are 0.
  cmp(rscratch1, zr); // Sets flags for result
  cbnz(rscratch1, cont);
  // Need a release store here.
  lea(tmp, Address(tmp, ObjectMonitor::owner_offset()));
  stlr(zr, tmp); // set unowned

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  decrement(Address(rthread, JavaThread::held_monitor_count_offset()));

  bind(no_count);
}

// Search for str1 in str2 and return index or -1
// Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
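// Reference semantics (an illustrative sketch only, not the generated code):
// the intrinsic returns the same index as a naive scan over the decoded
// characters, e.g. for the UU case:
//
//   int index_of(const jchar* src, int n, const jchar* pat, int m) {
//     for (int i = 0; i + m <= n; i++) {
//       int j = 0;
//       while (j < m && src[i + j] == pat[j]) j++;
//       if (j == m) return i;
//     }
//     return -1;
//   }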
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer-Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

  // The Boyer-Moore algorithm is based on the description here:
  //
  // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  //
  // This describes an algorithm with two shift rules: the 'Bad Character'
  // rule and the 'Good Suffix' rule.
  //
  // These rules are essentially heuristics for how far we can shift the
  // pattern along the search string.
  //
  // The implementation here uses the 'Bad Character' rule only because of the
  // complexity of initialisation for the 'Good Suffix' rule.
  //
  // This is also known as the Boyer-Moore-Horspool algorithm:
  //
  // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
  //
  // This particular implementation has a few Java-specific optimizations.
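  // Worked example of the bad-character table (illustrative): for the Latin1
  // pattern "abcab" (m = 5), every bc[] entry starts out as 5 and the
  // preprocessing loop below leaves bc['a'] = 1, bc['b'] = 3, bc['c'] = 2,
  // i.e. the distance from each character's last occurrence (excluding the
  // final pattern position) to the end of the pattern. On a mismatch the
  // window is advanced by bc[c], where c is the source character aligned
  // with the last pattern position.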
317 // 318 // #define ASIZE 256 319 // 320 // int bm(unsigned char *x, int m, unsigned char *y, int n) { 321 // int i, j; 322 // unsigned c; 323 // unsigned char bc[ASIZE]; 324 // 325 // /* Preprocessing */ 326 // for (i = 0; i < ASIZE; ++i) 327 // bc[i] = m; 328 // for (i = 0; i < m - 1; ) { 329 // c = x[i]; 330 // ++i; 331 // // c < 256 for Latin1 string, so, no need for branch 332 // #ifdef PATTERN_STRING_IS_LATIN1 333 // bc[c] = m - i; 334 // #else 335 // if (c < ASIZE) bc[c] = m - i; 336 // #endif 337 // } 338 // 339 // /* Searching */ 340 // j = 0; 341 // while (j <= n - m) { 342 // c = y[i+j]; 343 // if (x[m-1] == c) 344 // for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i); 345 // if (i < 0) return j; 346 // // c < 256 for Latin1 string, so, no need for branch 347 // #ifdef SOURCE_STRING_IS_LATIN1 348 // // LL case: (c< 256) always true. Remove branch 349 // j += bc[y[j+m-1]]; 350 // #endif 351 // #ifndef PATTERN_STRING_IS_UTF 352 // // UU case: need if (c<ASIZE) check. Skip 1 character if not. 353 // if (c < ASIZE) 354 // j += bc[y[j+m-1]]; 355 // else 356 // j += 1 357 // #endif 358 // #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF 359 // // UL case: need if (c<ASIZE) check. Skip <pattern length> if not. 360 // if (c < ASIZE) 361 // j += bc[y[j+m-1]]; 362 // else 363 // j += m 364 // #endif 365 // } 366 // } 367 368 if (icnt1 == -1) { 369 Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH, 370 BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP; 371 Register cnt1end = tmp2; 372 Register str2end = cnt2; 373 Register skipch = tmp2; 374 375 // str1 length is >=8, so, we can read at least 1 register for cases when 376 // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for 377 // UL case. We'll re-read last character in inner pre-loop code to have 378 // single outer pre-loop load 379 const int firstStep = isL ? 7 : 3; 380 381 const int ASIZE = 256; 382 const int STORED_BYTES = 32; // amount of bytes stored per instruction 383 sub(sp, sp, ASIZE); 384 mov(tmp5, ASIZE/STORED_BYTES); // loop iterations 385 mov(ch1, sp); 386 BIND(BM_INIT_LOOP); 387 stpq(v0, v0, Address(post(ch1, STORED_BYTES))); 388 subs(tmp5, tmp5, 1); 389 br(GT, BM_INIT_LOOP); 390 391 sub(cnt1tmp, cnt1, 1); 392 mov(tmp5, str2); 393 add(str2end, str2, result_tmp, LSL, str2_chr_shift); 394 sub(ch2, cnt1, 1); 395 mov(tmp3, str1); 396 BIND(BCLOOP); 397 (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size))); 398 if (!str1_isL) { 399 subs(zr, ch1, ASIZE); 400 br(HS, BCSKIP); 401 } 402 strb(ch2, Address(sp, ch1)); 403 BIND(BCSKIP); 404 subs(ch2, ch2, 1); 405 br(GT, BCLOOP); 406 407 add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1 408 if (str1_isL == str2_isL) { 409 // load last 8 bytes (8LL/4UU symbols) 410 ldr(tmp6, Address(tmp6, -wordSize)); 411 } else { 412 ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols) 413 // convert Latin1 to UTF. 
We'll have to wait until load completed, but 414 // it's still faster than per-character loads+checks 415 lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1] 416 ubfx(ch1, tmp6, 8, 8); // str1[N-2] 417 ubfx(ch2, tmp6, 16, 8); // str1[N-3] 418 andr(tmp6, tmp6, 0xFF); // str1[N-4] 419 orr(ch2, ch1, ch2, LSL, 16); 420 orr(tmp6, tmp6, tmp3, LSL, 48); 421 orr(tmp6, tmp6, ch2, LSL, 16); 422 } 423 BIND(BMLOOPSTR2); 424 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 425 sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8 426 if (str1_isL == str2_isL) { 427 // re-init tmp3. It's for free because it's executed in parallel with 428 // load above. Alternative is to initialize it before loop, but it'll 429 // affect performance on in-order systems with 2 or more ld/st pipelines 430 lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size)); 431 } 432 if (!isL) { // UU/UL case 433 lsl(ch2, cnt1tmp, 1); // offset in bytes 434 } 435 cmp(tmp3, skipch); 436 br(NE, BMSKIP); 437 ldr(ch2, Address(str2, isL ? cnt1tmp : ch2)); 438 mov(ch1, tmp6); 439 if (isL) { 440 b(BMLOOPSTR1_AFTER_LOAD); 441 } else { 442 sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8 443 b(BMLOOPSTR1_CMP); 444 } 445 BIND(BMLOOPSTR1); 446 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift))); 447 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 448 BIND(BMLOOPSTR1_AFTER_LOAD); 449 subs(cnt1tmp, cnt1tmp, 1); 450 br(LT, BMLOOPSTR1_LASTCMP); 451 BIND(BMLOOPSTR1_CMP); 452 cmp(ch1, ch2); 453 br(EQ, BMLOOPSTR1); 454 BIND(BMSKIP); 455 if (!isL) { 456 // if we've met UTF symbol while searching Latin1 pattern, then we can 457 // skip cnt1 symbols 458 if (str1_isL != str2_isL) { 459 mov(result_tmp, cnt1); 460 } else { 461 mov(result_tmp, 1); 462 } 463 subs(zr, skipch, ASIZE); 464 br(HS, BMADV); 465 } 466 ldrb(result_tmp, Address(sp, skipch)); // load skip distance 467 BIND(BMADV); 468 sub(cnt1tmp, cnt1, 1); 469 add(str2, str2, result_tmp, LSL, str2_chr_shift); 470 cmp(str2, str2end); 471 br(LE, BMLOOPSTR2); 472 add(sp, sp, ASIZE); 473 b(NOMATCH); 474 BIND(BMLOOPSTR1_LASTCMP); 475 cmp(ch1, ch2); 476 br(NE, BMSKIP); 477 BIND(BMMATCH); 478 sub(result, str2, tmp5); 479 if (!str2_isL) lsr(result, result, 1); 480 add(sp, sp, ASIZE); 481 b(DONE); 482 483 BIND(LINEARSTUB); 484 cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm 485 br(LT, LINEAR_MEDIUM); 486 mov(result, zr); 487 RuntimeAddress stub = nullptr; 488 if (isL) { 489 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll()); 490 assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated"); 491 } else if (str1_isL) { 492 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul()); 493 assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated"); 494 } else { 495 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu()); 496 assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated"); 497 } 498 address call = trampoline_call(stub); 499 if (call == nullptr) { 500 DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH)); 501 ciEnv::current()->record_failure("CodeCache is full"); 502 return; 503 } 504 b(DONE); 505 } 506 507 BIND(LINEARSEARCH); 508 { 509 Label DO1, DO2, DO3; 510 511 Register str2tmp = tmp2; 512 Register first = tmp3; 513 514 if (icnt1 == 
-1) 515 { 516 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT; 517 518 cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2)); 519 br(LT, DOSHORT); 520 BIND(LINEAR_MEDIUM); 521 (this->*str1_load_1chr)(first, Address(str1)); 522 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift))); 523 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift); 524 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 525 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 526 527 BIND(FIRST_LOOP); 528 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 529 cmp(first, ch2); 530 br(EQ, STR1_LOOP); 531 BIND(STR2_NEXT); 532 adds(cnt2_neg, cnt2_neg, str2_chr_size); 533 br(LE, FIRST_LOOP); 534 b(NOMATCH); 535 536 BIND(STR1_LOOP); 537 adds(cnt1tmp, cnt1_neg, str1_chr_size); 538 add(cnt2tmp, cnt2_neg, str2_chr_size); 539 br(GE, MATCH); 540 541 BIND(STR1_NEXT); 542 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp)); 543 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 544 cmp(ch1, ch2); 545 br(NE, STR2_NEXT); 546 adds(cnt1tmp, cnt1tmp, str1_chr_size); 547 add(cnt2tmp, cnt2tmp, str2_chr_size); 548 br(LT, STR1_NEXT); 549 b(MATCH); 550 551 BIND(DOSHORT); 552 if (str1_isL == str2_isL) { 553 cmp(cnt1, (u1)2); 554 br(LT, DO1); 555 br(GT, DO3); 556 } 557 } 558 559 if (icnt1 == 4) { 560 Label CH1_LOOP; 561 562 (this->*load_4chr)(ch1, str1); 563 sub(result_tmp, cnt2, 4); 564 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 565 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 566 567 BIND(CH1_LOOP); 568 (this->*load_4chr)(ch2, Address(str2, cnt2_neg)); 569 cmp(ch1, ch2); 570 br(EQ, MATCH); 571 adds(cnt2_neg, cnt2_neg, str2_chr_size); 572 br(LE, CH1_LOOP); 573 b(NOMATCH); 574 } 575 576 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) { 577 Label CH1_LOOP; 578 579 BIND(DO2); 580 (this->*load_2chr)(ch1, str1); 581 if (icnt1 == 2) { 582 sub(result_tmp, cnt2, 2); 583 } 584 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 585 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 586 BIND(CH1_LOOP); 587 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 588 cmp(ch1, ch2); 589 br(EQ, MATCH); 590 adds(cnt2_neg, cnt2_neg, str2_chr_size); 591 br(LE, CH1_LOOP); 592 b(NOMATCH); 593 } 594 595 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) { 596 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP; 597 598 BIND(DO3); 599 (this->*load_2chr)(first, str1); 600 (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size)); 601 if (icnt1 == 3) { 602 sub(result_tmp, cnt2, 3); 603 } 604 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 605 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 606 BIND(FIRST_LOOP); 607 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 608 cmpw(first, ch2); 609 br(EQ, STR1_LOOP); 610 BIND(STR2_NEXT); 611 adds(cnt2_neg, cnt2_neg, str2_chr_size); 612 br(LE, FIRST_LOOP); 613 b(NOMATCH); 614 615 BIND(STR1_LOOP); 616 add(cnt2tmp, cnt2_neg, 2*str2_chr_size); 617 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 618 cmp(ch1, ch2); 619 br(NE, STR2_NEXT); 620 b(MATCH); 621 } 622 623 if (icnt1 == -1 || icnt1 == 1) { 624 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP; 625 626 BIND(DO1); 627 (this->*str1_load_1chr)(ch1, str1); 628 cmp(cnt2, (u1)8); 629 br(LT, DO1_SHORT); 630 631 sub(result_tmp, cnt2, 8/str2_chr_size); 632 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 633 mov(tmp3, str2_isL ? 
0x0101010101010101 : 0x0001000100010001); 634 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 635 636 if (str2_isL) { 637 orr(ch1, ch1, ch1, LSL, 8); 638 } 639 orr(ch1, ch1, ch1, LSL, 16); 640 orr(ch1, ch1, ch1, LSL, 32); 641 BIND(CH1_LOOP); 642 ldr(ch2, Address(str2, cnt2_neg)); 643 eor(ch2, ch1, ch2); 644 sub(tmp1, ch2, tmp3); 645 orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 646 bics(tmp1, tmp1, tmp2); 647 br(NE, HAS_ZERO); 648 adds(cnt2_neg, cnt2_neg, 8); 649 br(LT, CH1_LOOP); 650 651 cmp(cnt2_neg, (u1)8); 652 mov(cnt2_neg, 0); 653 br(LT, CH1_LOOP); 654 b(NOMATCH); 655 656 BIND(HAS_ZERO); 657 rev(tmp1, tmp1); 658 clz(tmp1, tmp1); 659 add(cnt2_neg, cnt2_neg, tmp1, LSR, 3); 660 b(MATCH); 661 662 BIND(DO1_SHORT); 663 mov(result_tmp, cnt2); 664 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift))); 665 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift); 666 BIND(DO1_LOOP); 667 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 668 cmpw(ch1, ch2); 669 br(EQ, MATCH); 670 adds(cnt2_neg, cnt2_neg, str2_chr_size); 671 br(LT, DO1_LOOP); 672 } 673 } 674 BIND(NOMATCH); 675 mov(result, -1); 676 b(DONE); 677 BIND(MATCH); 678 add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift); 679 BIND(DONE); 680 } 681 682 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 683 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn); 684 685 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, 686 Register ch, Register result, 687 Register tmp1, Register tmp2, Register tmp3) 688 { 689 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 690 Register cnt1_neg = cnt1; 691 Register ch1 = rscratch1; 692 Register result_tmp = rscratch2; 693 694 cbz(cnt1, NOMATCH); 695 696 cmp(cnt1, (u1)4); 697 br(LT, DO1_SHORT); 698 699 orr(ch, ch, ch, LSL, 16); 700 orr(ch, ch, ch, LSL, 32); 701 702 sub(cnt1, cnt1, 4); 703 mov(result_tmp, cnt1); 704 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 705 sub(cnt1_neg, zr, cnt1, LSL, 1); 706 707 mov(tmp3, 0x0001000100010001); 708 709 BIND(CH1_LOOP); 710 ldr(ch1, Address(str1, cnt1_neg)); 711 eor(ch1, ch, ch1); 712 sub(tmp1, ch1, tmp3); 713 orr(tmp2, ch1, 0x7fff7fff7fff7fff); 714 bics(tmp1, tmp1, tmp2); 715 br(NE, HAS_ZERO); 716 adds(cnt1_neg, cnt1_neg, 8); 717 br(LT, CH1_LOOP); 718 719 cmp(cnt1_neg, (u1)8); 720 mov(cnt1_neg, 0); 721 br(LT, CH1_LOOP); 722 b(NOMATCH); 723 724 BIND(HAS_ZERO); 725 rev(tmp1, tmp1); 726 clz(tmp1, tmp1); 727 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 728 b(MATCH); 729 730 BIND(DO1_SHORT); 731 mov(result_tmp, cnt1); 732 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 733 sub(cnt1_neg, zr, cnt1, LSL, 1); 734 BIND(DO1_LOOP); 735 ldrh(ch1, Address(str1, cnt1_neg)); 736 cmpw(ch, ch1); 737 br(EQ, MATCH); 738 adds(cnt1_neg, cnt1_neg, 2); 739 br(LT, DO1_LOOP); 740 BIND(NOMATCH); 741 mov(result, -1); 742 b(DONE); 743 BIND(MATCH); 744 add(result, result_tmp, cnt1_neg, ASR, 1); 745 BIND(DONE); 746 } 747 748 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1, 749 Register ch, Register result, 750 FloatRegister ztmp1, 751 FloatRegister ztmp2, 752 PRegister tmp_pg, 753 PRegister tmp_pdn, bool isL) 754 { 755 // Note that `tmp_pdn` should *NOT* be used as governing predicate register. 756 assert(tmp_pg->is_governing(), 757 "this register has to be a governing predicate register"); 758 759 Label LOOP, MATCH, DONE, NOMATCH; 760 Register vec_len = rscratch1; 761 Register idx = rscratch2; 762 763 SIMD_RegVariant T = (isL == true) ? 
B : H; 764 765 cbz(cnt1, NOMATCH); 766 767 // Assign the particular char throughout the vector. 768 sve_dup(ztmp2, T, ch); 769 if (isL) { 770 sve_cntb(vec_len); 771 } else { 772 sve_cnth(vec_len); 773 } 774 mov(idx, 0); 775 776 // Generate a predicate to control the reading of input string. 777 sve_whilelt(tmp_pg, T, idx, cnt1); 778 779 BIND(LOOP); 780 // Read a vector of 8- or 16-bit data depending on the string type. Note 781 // that inactive elements indicated by the predicate register won't cause 782 // a data read from memory to the destination vector. 783 if (isL) { 784 sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx)); 785 } else { 786 sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1))); 787 } 788 add(idx, idx, vec_len); 789 790 // Perform the comparison. An element of the destination predicate is set 791 // to active if the particular char is matched. 792 sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2); 793 794 // Branch if the particular char is found. 795 br(NE, MATCH); 796 797 sve_whilelt(tmp_pg, T, idx, cnt1); 798 799 // Loop back if the particular char not found. 800 br(MI, LOOP); 801 802 BIND(NOMATCH); 803 mov(result, -1); 804 b(DONE); 805 806 BIND(MATCH); 807 // Undo the index increment. 808 sub(idx, idx, vec_len); 809 810 // Crop the vector to find its location. 811 sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */); 812 add(result, idx, -1); 813 sve_incp(result, T, tmp_pdn); 814 BIND(DONE); 815 } 816 817 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, 818 Register ch, Register result, 819 Register tmp1, Register tmp2, Register tmp3) 820 { 821 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 822 Register cnt1_neg = cnt1; 823 Register ch1 = rscratch1; 824 Register result_tmp = rscratch2; 825 826 cbz(cnt1, NOMATCH); 827 828 cmp(cnt1, (u1)8); 829 br(LT, DO1_SHORT); 830 831 orr(ch, ch, ch, LSL, 8); 832 orr(ch, ch, ch, LSL, 16); 833 orr(ch, ch, ch, LSL, 32); 834 835 sub(cnt1, cnt1, 8); 836 mov(result_tmp, cnt1); 837 lea(str1, Address(str1, cnt1)); 838 sub(cnt1_neg, zr, cnt1); 839 840 mov(tmp3, 0x0101010101010101); 841 842 BIND(CH1_LOOP); 843 ldr(ch1, Address(str1, cnt1_neg)); 844 eor(ch1, ch, ch1); 845 sub(tmp1, ch1, tmp3); 846 orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f); 847 bics(tmp1, tmp1, tmp2); 848 br(NE, HAS_ZERO); 849 adds(cnt1_neg, cnt1_neg, 8); 850 br(LT, CH1_LOOP); 851 852 cmp(cnt1_neg, (u1)8); 853 mov(cnt1_neg, 0); 854 br(LT, CH1_LOOP); 855 b(NOMATCH); 856 857 BIND(HAS_ZERO); 858 rev(tmp1, tmp1); 859 clz(tmp1, tmp1); 860 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 861 b(MATCH); 862 863 BIND(DO1_SHORT); 864 mov(result_tmp, cnt1); 865 lea(str1, Address(str1, cnt1)); 866 sub(cnt1_neg, zr, cnt1); 867 BIND(DO1_LOOP); 868 ldrb(ch1, Address(str1, cnt1_neg)); 869 cmp(ch, ch1); 870 br(EQ, MATCH); 871 adds(cnt1_neg, cnt1_neg, 1); 872 br(LT, DO1_LOOP); 873 BIND(NOMATCH); 874 mov(result, -1); 875 b(DONE); 876 BIND(MATCH); 877 add(result, result_tmp, cnt1_neg); 878 BIND(DONE); 879 } 880 881 // Compare strings. 
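// Reference semantics (an illustrative sketch only, not the generated code):
//
//   int compare(const jchar* a, int la, const jchar* b, int lb) {
//     int n = la < lb ? la : lb;
//     for (int i = 0; i < n; i++) {
//       if (a[i] != b[i]) return (int)a[i] - (int)b[i];
//     }
//     return la - lb;
//   }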
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; however, the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
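  // In scalar terms (illustrative): result = cnt1 - cnt2 and cnt2 = min(cnt1, cnt2);
  // 'result' survives to the end only if the first min(cnt1, cnt2) characters
  // compare equal, matching the reference sketch above.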
924 subsw(result, cnt1, cnt2); 925 cselw(cnt2, cnt1, cnt2, Assembler::LE); // min 926 927 // A very short string 928 cmpw(cnt2, minCharsInWord); 929 br(Assembler::LE, SHORT_STRING); 930 931 // Compare longwords 932 // load first parts of strings and finish initialization while loading 933 { 934 if (str1_isL == str2_isL) { // LL or UU 935 ldr(tmp1, Address(str1)); 936 cmp(str1, str2); 937 br(Assembler::EQ, DONE); 938 ldr(tmp2, Address(str2)); 939 cmp(cnt2, stub_threshold); 940 br(GE, STUB); 941 subsw(cnt2, cnt2, minCharsInWord); 942 br(EQ, TAIL_CHECK); 943 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 944 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 945 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 946 } else if (isLU) { 947 ldrs(vtmp, Address(str1)); 948 ldr(tmp2, Address(str2)); 949 cmp(cnt2, stub_threshold); 950 br(GE, STUB); 951 subw(cnt2, cnt2, 4); 952 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 953 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 954 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 955 zip1(vtmp, T8B, vtmp, vtmpZ); 956 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 957 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 958 add(cnt1, cnt1, 4); 959 fmovd(tmp1, vtmp); 960 } else { // UL case 961 ldr(tmp1, Address(str1)); 962 ldrs(vtmp, Address(str2)); 963 cmp(cnt2, stub_threshold); 964 br(GE, STUB); 965 subw(cnt2, cnt2, 4); 966 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 967 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 968 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 969 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 970 zip1(vtmp, T8B, vtmp, vtmpZ); 971 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 972 add(cnt1, cnt1, 8); 973 fmovd(tmp2, vtmp); 974 } 975 adds(cnt2, cnt2, isUL ? 4 : 8); 976 br(GE, TAIL); 977 eor(rscratch2, tmp1, tmp2); 978 cbnz(rscratch2, DIFF); 979 // main loop 980 bind(NEXT_WORD); 981 if (str1_isL == str2_isL) { 982 ldr(tmp1, Address(str1, cnt2)); 983 ldr(tmp2, Address(str2, cnt2)); 984 adds(cnt2, cnt2, 8); 985 } else if (isLU) { 986 ldrs(vtmp, Address(str1, cnt1)); 987 ldr(tmp2, Address(str2, cnt2)); 988 add(cnt1, cnt1, 4); 989 zip1(vtmp, T8B, vtmp, vtmpZ); 990 fmovd(tmp1, vtmp); 991 adds(cnt2, cnt2, 8); 992 } else { // UL 993 ldrs(vtmp, Address(str2, cnt2)); 994 ldr(tmp1, Address(str1, cnt1)); 995 zip1(vtmp, T8B, vtmp, vtmpZ); 996 add(cnt1, cnt1, 8); 997 fmovd(tmp2, vtmp); 998 adds(cnt2, cnt2, 4); 999 } 1000 br(GE, TAIL); 1001 1002 eor(rscratch2, tmp1, tmp2); 1003 cbz(rscratch2, NEXT_WORD); 1004 b(DIFF); 1005 bind(TAIL); 1006 eor(rscratch2, tmp1, tmp2); 1007 cbnz(rscratch2, DIFF); 1008 // Last longword. In the case where length == 4 we compare the 1009 // same longword twice, but that's still faster than another 1010 // conditional branch. 1011 if (str1_isL == str2_isL) { 1012 ldr(tmp1, Address(str1)); 1013 ldr(tmp2, Address(str2)); 1014 } else if (isLU) { 1015 ldrs(vtmp, Address(str1)); 1016 ldr(tmp2, Address(str2)); 1017 zip1(vtmp, T8B, vtmp, vtmpZ); 1018 fmovd(tmp1, vtmp); 1019 } else { // UL 1020 ldrs(vtmp, Address(str2)); 1021 ldr(tmp1, Address(str1)); 1022 zip1(vtmp, T8B, vtmp, vtmpZ); 1023 fmovd(tmp2, vtmp); 1024 } 1025 bind(TAIL_CHECK); 1026 eor(rscratch2, tmp1, tmp2); 1027 cbz(rscratch2, DONE); 1028 1029 // Find the first different characters in the longwords and 1030 // compute their difference. 1031 bind(DIFF); 1032 rev(rscratch2, rscratch2); 1033 clz(rscratch2, rscratch2); 1034 andr(rscratch2, rscratch2, isLL ? 
-8 : -16); 1035 lsrv(tmp1, tmp1, rscratch2); 1036 (this->*ext_chr)(tmp1, tmp1); 1037 lsrv(tmp2, tmp2, rscratch2); 1038 (this->*ext_chr)(tmp2, tmp2); 1039 subw(result, tmp1, tmp2); 1040 b(DONE); 1041 } 1042 1043 bind(STUB); 1044 RuntimeAddress stub = nullptr; 1045 switch(ae) { 1046 case StrIntrinsicNode::LL: 1047 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL()); 1048 break; 1049 case StrIntrinsicNode::UU: 1050 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU()); 1051 break; 1052 case StrIntrinsicNode::LU: 1053 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU()); 1054 break; 1055 case StrIntrinsicNode::UL: 1056 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL()); 1057 break; 1058 default: 1059 ShouldNotReachHere(); 1060 } 1061 assert(stub.target() != nullptr, "compare_long_string stub has not been generated"); 1062 address call = trampoline_call(stub); 1063 if (call == nullptr) { 1064 DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START)); 1065 ciEnv::current()->record_failure("CodeCache is full"); 1066 return; 1067 } 1068 b(DONE); 1069 1070 bind(SHORT_STRING); 1071 // Is the minimum length zero? 1072 cbz(cnt2, DONE); 1073 // arrange code to do most branches while loading and loading next characters 1074 // while comparing previous 1075 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 1076 subs(cnt2, cnt2, 1); 1077 br(EQ, SHORT_LAST_INIT); 1078 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1079 b(SHORT_LOOP_START); 1080 bind(SHORT_LOOP); 1081 subs(cnt2, cnt2, 1); 1082 br(EQ, SHORT_LAST); 1083 bind(SHORT_LOOP_START); 1084 (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size))); 1085 (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size))); 1086 cmp(tmp1, cnt1); 1087 br(NE, SHORT_LOOP_TAIL); 1088 subs(cnt2, cnt2, 1); 1089 br(EQ, SHORT_LAST2); 1090 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 1091 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1092 cmp(tmp2, rscratch1); 1093 br(EQ, SHORT_LOOP); 1094 sub(result, tmp2, rscratch1); 1095 b(DONE); 1096 bind(SHORT_LOOP_TAIL); 1097 sub(result, tmp1, cnt1); 1098 b(DONE); 1099 bind(SHORT_LAST2); 1100 cmp(tmp2, rscratch1); 1101 br(EQ, DONE); 1102 sub(result, tmp2, rscratch1); 1103 1104 b(DONE); 1105 bind(SHORT_LAST_INIT); 1106 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1107 bind(SHORT_LAST); 1108 cmp(tmp1, cnt1); 1109 br(EQ, DONE); 1110 sub(result, tmp1, cnt1); 1111 1112 bind(DONE); 1113 1114 BLOCK_COMMENT("} string_compare"); 1115 } 1116 1117 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1, 1118 FloatRegister src2, Condition cond, bool isQ) { 1119 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 1120 FloatRegister zn = src1, zm = src2; 1121 bool needs_negation = false; 1122 switch (cond) { 1123 case LT: cond = GT; zn = src2; zm = src1; break; 1124 case LE: cond = GE; zn = src2; zm = src1; break; 1125 case LO: cond = HI; zn = src2; zm = src1; break; 1126 case LS: cond = HS; zn = src2; zm = src1; break; 1127 case NE: cond = EQ; needs_negation = true; break; 1128 default: 1129 break; 1130 } 1131 1132 if (is_floating_point_type(bt)) { 1133 fcm(cond, dst, size, zn, zm); 1134 } else { 1135 cm(cond, dst, size, zn, zm); 1136 } 1137 1138 if (needs_negation) { 1139 notr(dst, isQ ? 
T16B : T8B, dst); 1140 } 1141 } 1142 1143 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src, 1144 Condition cond, bool isQ) { 1145 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 1146 if (bt == T_FLOAT || bt == T_DOUBLE) { 1147 if (cond == Assembler::NE) { 1148 fcm(Assembler::EQ, dst, size, src); 1149 notr(dst, isQ ? T16B : T8B, dst); 1150 } else { 1151 fcm(cond, dst, size, src); 1152 } 1153 } else { 1154 if (cond == Assembler::NE) { 1155 cm(Assembler::EQ, dst, size, src); 1156 notr(dst, isQ ? T16B : T8B, dst); 1157 } else { 1158 cm(cond, dst, size, src); 1159 } 1160 } 1161 } 1162 1163 // Compress the least significant bit of each byte to the rightmost and clear 1164 // the higher garbage bits. 1165 void C2_MacroAssembler::bytemask_compress(Register dst) { 1166 // Example input, dst = 0x01 00 00 00 01 01 00 01 1167 // The "??" bytes are garbage. 1168 orr(dst, dst, dst, Assembler::LSR, 7); // dst = 0x?? 02 ?? 00 ?? 03 ?? 01 1169 orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D 1170 orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D 1171 andr(dst, dst, 0xff); // dst = 0x8D 1172 } 1173 1174 // Pack the lowest-numbered bit of each mask element in src into a long value 1175 // in dst, at most the first 64 lane elements. 1176 // Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM. 1177 void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt, 1178 FloatRegister vtmp1, FloatRegister vtmp2) { 1179 assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count"); 1180 assert_different_registers(dst, rscratch1); 1181 assert_different_registers(vtmp1, vtmp2); 1182 1183 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 1184 // Example: src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16 1185 // Expected: dst = 0x658D 1186 1187 // Convert the mask into vector with sequential bytes. 1188 // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001 1189 sve_cpy(vtmp1, size, src, 1, false); 1190 if (bt != T_BYTE) { 1191 sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2); 1192 } 1193 1194 if (UseSVE > 1 && VM_Version::supports_svebitperm()) { 1195 // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea 1196 // is to compress each significant bit of the byte in a cross-lane way. Due 1197 // to the lack of a cross-lane bit-compress instruction, we use BEXT 1198 // (bit-compress in each lane) with the biggest lane size (T = D) then 1199 // concatenate the results. 1200 1201 // The second source input of BEXT, initialized with 0x01 in each byte. 1202 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101 1203 sve_dup(vtmp2, B, 1); 1204 1205 // BEXT vtmp1.D, vtmp1.D, vtmp2.D 1206 // vtmp1 = 0x0001010000010001 | 0x0100000001010001 1207 // vtmp2 = 0x0101010101010101 | 0x0101010101010101 1208 // --------------------------------------- 1209 // vtmp1 = 0x0000000000000065 | 0x000000000000008D 1210 sve_bext(vtmp1, D, vtmp1, vtmp2); 1211 1212 // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the 1213 // result to dst. 1214 // vtmp1 = 0x0000000000000000 | 0x000000000000658D 1215 // dst = 0x658D 1216 if (lane_cnt <= 8) { 1217 // No need to concatenate. 
      umov(dst, vtmp1, B, 0);
    } else if (lane_cnt <= 16) {
      ins(vtmp1, B, vtmp1, 1, 8);
      umov(dst, vtmp1, H, 0);
    } else {
      // As the lane count is 64 at most, the final expected value must be in
      // the lowest 64 bits after narrowing vtmp1 from D to B.
      sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
      umov(dst, vtmp1, D, 0);
    }
  } else if (UseSVE > 0) {
    // Compress the lowest 8 bytes.
    fmovd(dst, vtmp1);
    bytemask_compress(dst);
    if (lane_cnt <= 8) return;

    // Repeat on higher bytes and join the results.
    // Compress 8 bytes in each iteration.
    for (int idx = 1; idx < (lane_cnt / 8); idx++) {
      sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
      bytemask_compress(rscratch1);
      orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
    }
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Unpack the mask, a long value in src, into predicate register dst based on the
// corresponding data type. Note that dst can support at most 64 lanes.
// The example below gives the expected dst predicate register in different types,
// with a valid src(0x658D) on a 1024-bit vector size machine.
//   BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
//   SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
//   INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
//   LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
//
// The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D which
// has 24 significant bits would be an invalid input if dst predicate register refers to
// a LONG type 1024-bit vector, which has at most 16 lanes.
void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
                                           FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
         lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example: src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
  // Expected: dst = 0b01101001 10001101

  // Put long value from general purpose register into the first lane of vector.
  // vtmp1 = 0x0000000000000000 | 0x000000000000658D
  sve_dup(vtmp1, B, 0);
  mov(vtmp1, D, 0, src);

  // As sve_cmp generates the mask at byte granularity, we need to transform
  // the value in the first lane from a bit mask into a byte mask, which can
  // be done with SVE2's BDEP instruction.

  // The first source input of BDEP instruction. Deposit each byte in every 8 bytes.
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  if (lane_cnt <= 8) {
    // Nothing to do, as only one byte exists.
  } else if (lane_cnt <= 16) {
    ins(vtmp1, B, vtmp1, 8, 1);
    mov(vtmp1, B, 1, zr);
  } else {
    sve_vector_extend(vtmp1, D, vtmp1, B);
  }

  // The second source input of BDEP instruction, initialized with 0x01 for each byte.
1288 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101 1289 sve_dup(vtmp2, B, 1); 1290 1291 // BDEP vtmp1.D, vtmp1.D, vtmp2.D 1292 // vtmp1 = 0x0000000000000065 | 0x000000000000008D 1293 // vtmp2 = 0x0101010101010101 | 0x0101010101010101 1294 // --------------------------------------- 1295 // vtmp1 = 0x0001010000010001 | 0x0100000001010001 1296 sve_bdep(vtmp1, D, vtmp1, vtmp2); 1297 1298 if (bt != T_BYTE) { 1299 sve_vector_extend(vtmp1, size, vtmp1, B); 1300 } 1301 // Generate mask according to the given vector, in which the elements have been 1302 // extended to expected type. 1303 // dst = 0b01101001 10001101 1304 sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0); 1305 } 1306 1307 // Clobbers: rflags 1308 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg, 1309 FloatRegister zn, FloatRegister zm, Condition cond) { 1310 assert(pg->is_governing(), "This register has to be a governing predicate register"); 1311 FloatRegister z1 = zn, z2 = zm; 1312 switch (cond) { 1313 case LE: z1 = zm; z2 = zn; cond = GE; break; 1314 case LT: z1 = zm; z2 = zn; cond = GT; break; 1315 case LO: z1 = zm; z2 = zn; cond = HI; break; 1316 case LS: z1 = zm; z2 = zn; cond = HS; break; 1317 default: 1318 break; 1319 } 1320 1321 SIMD_RegVariant size = elemType_to_regVariant(bt); 1322 if (is_floating_point_type(bt)) { 1323 sve_fcm(cond, pd, size, pg, z1, z2); 1324 } else { 1325 assert(is_integral_type(bt), "unsupported element type"); 1326 sve_cmp(cond, pd, size, pg, z1, z2); 1327 } 1328 } 1329 1330 // Get index of the last mask lane that is set 1331 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) { 1332 SIMD_RegVariant size = elemType_to_regVariant(bt); 1333 sve_rev(ptmp, size, src); 1334 sve_brkb(ptmp, ptrue, ptmp, false); 1335 sve_cntp(dst, size, ptrue, ptmp); 1336 movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1); 1337 subw(dst, rscratch1, dst); 1338 } 1339 1340 // Extend integer vector src to dst with the same lane count 1341 // but larger element size, e.g. 4B -> 4I 1342 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes, 1343 FloatRegister src, BasicType src_bt) { 1344 if (src_bt == T_BYTE) { 1345 if (dst_bt == T_SHORT) { 1346 // 4B/8B to 4S/8S 1347 assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported"); 1348 sxtl(dst, T8H, src, T8B); 1349 } else { 1350 // 4B to 4I 1351 assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported"); 1352 sxtl(dst, T8H, src, T8B); 1353 sxtl(dst, T4S, dst, T4H); 1354 } 1355 } else if (src_bt == T_SHORT) { 1356 // 4S to 4I 1357 assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported"); 1358 sxtl(dst, T4S, src, T4H); 1359 } else if (src_bt == T_INT) { 1360 // 2I to 2L 1361 assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported"); 1362 sxtl(dst, T2D, src, T2S); 1363 } else { 1364 ShouldNotReachHere(); 1365 } 1366 } 1367 1368 // Narrow integer vector src down to dst with the same lane count 1369 // but smaller element size, e.g. 
4I -> 4B 1370 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt, 1371 FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) { 1372 if (src_bt == T_SHORT) { 1373 // 4S/8S to 4B/8B 1374 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported"); 1375 assert(dst_bt == T_BYTE, "unsupported"); 1376 xtn(dst, T8B, src, T8H); 1377 } else if (src_bt == T_INT) { 1378 // 4I to 4B/4S 1379 assert(src_vlen_in_bytes == 16, "unsupported"); 1380 assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported"); 1381 xtn(dst, T4H, src, T4S); 1382 if (dst_bt == T_BYTE) { 1383 xtn(dst, T8B, dst, T8H); 1384 } 1385 } else if (src_bt == T_LONG) { 1386 // 2L to 2I 1387 assert(src_vlen_in_bytes == 16, "unsupported"); 1388 assert(dst_bt == T_INT, "unsupported"); 1389 xtn(dst, T2S, src, T2D); 1390 } else { 1391 ShouldNotReachHere(); 1392 } 1393 } 1394 1395 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size, 1396 FloatRegister src, SIMD_RegVariant src_size) { 1397 assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size"); 1398 if (src_size == B) { 1399 switch (dst_size) { 1400 case H: 1401 sve_sunpklo(dst, H, src); 1402 break; 1403 case S: 1404 sve_sunpklo(dst, H, src); 1405 sve_sunpklo(dst, S, dst); 1406 break; 1407 case D: 1408 sve_sunpklo(dst, H, src); 1409 sve_sunpklo(dst, S, dst); 1410 sve_sunpklo(dst, D, dst); 1411 break; 1412 default: 1413 ShouldNotReachHere(); 1414 } 1415 } else if (src_size == H) { 1416 if (dst_size == S) { 1417 sve_sunpklo(dst, S, src); 1418 } else { // D 1419 sve_sunpklo(dst, S, src); 1420 sve_sunpklo(dst, D, dst); 1421 } 1422 } else if (src_size == S) { 1423 sve_sunpklo(dst, D, src); 1424 } 1425 } 1426 1427 // Vector narrow from src to dst with specified element sizes. 1428 // High part of dst vector will be filled with zero. 1429 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size, 1430 FloatRegister src, SIMD_RegVariant src_size, 1431 FloatRegister tmp) { 1432 assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size"); 1433 assert_different_registers(src, tmp); 1434 sve_dup(tmp, src_size, 0); 1435 if (src_size == D) { 1436 switch (dst_size) { 1437 case S: 1438 sve_uzp1(dst, S, src, tmp); 1439 break; 1440 case H: 1441 assert_different_registers(dst, tmp); 1442 sve_uzp1(dst, S, src, tmp); 1443 sve_uzp1(dst, H, dst, tmp); 1444 break; 1445 case B: 1446 assert_different_registers(dst, tmp); 1447 sve_uzp1(dst, S, src, tmp); 1448 sve_uzp1(dst, H, dst, tmp); 1449 sve_uzp1(dst, B, dst, tmp); 1450 break; 1451 default: 1452 ShouldNotReachHere(); 1453 } 1454 } else if (src_size == S) { 1455 if (dst_size == H) { 1456 sve_uzp1(dst, H, src, tmp); 1457 } else { // B 1458 assert_different_registers(dst, tmp); 1459 sve_uzp1(dst, H, src, tmp); 1460 sve_uzp1(dst, B, dst, tmp); 1461 } 1462 } else if (src_size == H) { 1463 sve_uzp1(dst, B, src, tmp); 1464 } 1465 } 1466 1467 // Extend src predicate to dst predicate with the same lane count but larger 1468 // element size, e.g. 
64Byte -> 512Long 1469 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src, 1470 uint dst_element_length_in_bytes, 1471 uint src_element_length_in_bytes) { 1472 if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) { 1473 sve_punpklo(dst, src); 1474 } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) { 1475 sve_punpklo(dst, src); 1476 sve_punpklo(dst, dst); 1477 } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) { 1478 sve_punpklo(dst, src); 1479 sve_punpklo(dst, dst); 1480 sve_punpklo(dst, dst); 1481 } else { 1482 assert(false, "unsupported"); 1483 ShouldNotReachHere(); 1484 } 1485 } 1486 1487 // Narrow src predicate to dst predicate with the same lane count but 1488 // smaller element size, e.g. 512Long -> 64Byte 1489 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp, 1490 uint dst_element_length_in_bytes, uint src_element_length_in_bytes) { 1491 // The insignificant bits in src predicate are expected to be zero. 1492 // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is 1493 // passed as the second argument. An example narrowing operation with a given mask would be - 1494 // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I 1495 // Mask (for 2 Longs) : TF 1496 // Predicate register for the above mask (16 bits) : 00000001 00000000 1497 // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000 1498 // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0) 1499 assert_different_registers(src, ptmp); 1500 assert_different_registers(dst, ptmp); 1501 sve_pfalse(ptmp); 1502 if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) { 1503 sve_uzp1(dst, B, src, ptmp); 1504 } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) { 1505 sve_uzp1(dst, H, src, ptmp); 1506 sve_uzp1(dst, B, dst, ptmp); 1507 } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) { 1508 sve_uzp1(dst, S, src, ptmp); 1509 sve_uzp1(dst, H, dst, ptmp); 1510 sve_uzp1(dst, B, dst, ptmp); 1511 } else { 1512 assert(false, "unsupported"); 1513 ShouldNotReachHere(); 1514 } 1515 } 1516 1517 // Vector reduction add for integral type with ASIMD instructions. 1518 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt, 1519 Register isrc, FloatRegister vsrc, 1520 unsigned vector_length_in_bytes, 1521 FloatRegister vtmp) { 1522 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1523 assert_different_registers(dst, isrc); 1524 bool isQ = vector_length_in_bytes == 16; 1525 1526 BLOCK_COMMENT("neon_reduce_add_integral {"); 1527 switch(bt) { 1528 case T_BYTE: 1529 addv(vtmp, isQ ? T16B : T8B, vsrc); 1530 smov(dst, vtmp, B, 0); 1531 addw(dst, dst, isrc, ext::sxtb); 1532 break; 1533 case T_SHORT: 1534 addv(vtmp, isQ ? T8H : T4H, vsrc); 1535 smov(dst, vtmp, H, 0); 1536 addw(dst, dst, isrc, ext::sxth); 1537 break; 1538 case T_INT: 1539 isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc); 1540 umov(dst, vtmp, S, 0); 1541 addw(dst, dst, isrc); 1542 break; 1543 case T_LONG: 1544 assert(isQ, "unsupported"); 1545 addpd(vtmp, vsrc); 1546 umov(dst, vtmp, D, 0); 1547 add(dst, dst, isrc); 1548 break; 1549 default: 1550 assert(false, "unsupported"); 1551 ShouldNotReachHere(); 1552 } 1553 BLOCK_COMMENT("} neon_reduce_add_integral"); 1554 } 1555 1556 // Vector reduction multiply for integral type with ASIMD instructions. 
1557 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases. 1558 // Clobbers: rscratch1 1559 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt, 1560 Register isrc, FloatRegister vsrc, 1561 unsigned vector_length_in_bytes, 1562 FloatRegister vtmp1, FloatRegister vtmp2) { 1563 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1564 bool isQ = vector_length_in_bytes == 16; 1565 1566 BLOCK_COMMENT("neon_reduce_mul_integral {"); 1567 switch(bt) { 1568 case T_BYTE: 1569 if (isQ) { 1570 // Multiply the lower half and higher half of vector iteratively. 1571 // vtmp1 = vsrc[8:15] 1572 ins(vtmp1, D, vsrc, 0, 1); 1573 // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7] 1574 mulv(vtmp1, T8B, vtmp1, vsrc); 1575 // vtmp2 = vtmp1[4:7] 1576 ins(vtmp2, S, vtmp1, 0, 1); 1577 // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3] 1578 mulv(vtmp1, T8B, vtmp2, vtmp1); 1579 } else { 1580 ins(vtmp1, S, vsrc, 0, 1); 1581 mulv(vtmp1, T8B, vtmp1, vsrc); 1582 } 1583 // vtmp2 = vtmp1[2:3] 1584 ins(vtmp2, H, vtmp1, 0, 1); 1585 // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1] 1586 mulv(vtmp2, T8B, vtmp2, vtmp1); 1587 // dst = vtmp2[0] * isrc * vtmp2[1] 1588 umov(rscratch1, vtmp2, B, 0); 1589 mulw(dst, rscratch1, isrc); 1590 sxtb(dst, dst); 1591 umov(rscratch1, vtmp2, B, 1); 1592 mulw(dst, rscratch1, dst); 1593 sxtb(dst, dst); 1594 break; 1595 case T_SHORT: 1596 if (isQ) { 1597 ins(vtmp2, D, vsrc, 0, 1); 1598 mulv(vtmp2, T4H, vtmp2, vsrc); 1599 ins(vtmp1, S, vtmp2, 0, 1); 1600 mulv(vtmp1, T4H, vtmp1, vtmp2); 1601 } else { 1602 ins(vtmp1, S, vsrc, 0, 1); 1603 mulv(vtmp1, T4H, vtmp1, vsrc); 1604 } 1605 umov(rscratch1, vtmp1, H, 0); 1606 mulw(dst, rscratch1, isrc); 1607 sxth(dst, dst); 1608 umov(rscratch1, vtmp1, H, 1); 1609 mulw(dst, rscratch1, dst); 1610 sxth(dst, dst); 1611 break; 1612 case T_INT: 1613 if (isQ) { 1614 ins(vtmp1, D, vsrc, 0, 1); 1615 mulv(vtmp1, T2S, vtmp1, vsrc); 1616 } else { 1617 vtmp1 = vsrc; 1618 } 1619 umov(rscratch1, vtmp1, S, 0); 1620 mul(dst, rscratch1, isrc); 1621 umov(rscratch1, vtmp1, S, 1); 1622 mul(dst, rscratch1, dst); 1623 break; 1624 case T_LONG: 1625 umov(rscratch1, vsrc, D, 0); 1626 mul(dst, isrc, rscratch1); 1627 umov(rscratch1, vsrc, D, 1); 1628 mul(dst, dst, rscratch1); 1629 break; 1630 default: 1631 assert(false, "unsupported"); 1632 ShouldNotReachHere(); 1633 } 1634 BLOCK_COMMENT("} neon_reduce_mul_integral"); 1635 } 1636 1637 // Vector reduction multiply for floating-point type with ASIMD instructions. 
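// Scalar reference for the reduction below (an illustrative sketch only):
//
//   float reduce_mul_float(float fsrc, const float* v, int lanes /* 2 or 4 */) {
//     float r = fsrc;
//     for (int i = 0; i < lanes; i++) {
//       r *= v[i];   // strictly in lane order, as FP multiplication is not associative
//     }
//     return r;
//   }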
1638 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt, 1639 FloatRegister fsrc, FloatRegister vsrc, 1640 unsigned vector_length_in_bytes, 1641 FloatRegister vtmp) { 1642 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1643 bool isQ = vector_length_in_bytes == 16; 1644 1645 BLOCK_COMMENT("neon_reduce_mul_fp {"); 1646 switch(bt) { 1647 case T_FLOAT: 1648 fmuls(dst, fsrc, vsrc); 1649 ins(vtmp, S, vsrc, 0, 1); 1650 fmuls(dst, dst, vtmp); 1651 if (isQ) { 1652 ins(vtmp, S, vsrc, 0, 2); 1653 fmuls(dst, dst, vtmp); 1654 ins(vtmp, S, vsrc, 0, 3); 1655 fmuls(dst, dst, vtmp); 1656 } 1657 break; 1658 case T_DOUBLE: 1659 assert(isQ, "unsupported"); 1660 fmuld(dst, fsrc, vsrc); 1661 ins(vtmp, D, vsrc, 0, 1); 1662 fmuld(dst, dst, vtmp); 1663 break; 1664 default: 1665 assert(false, "unsupported"); 1666 ShouldNotReachHere(); 1667 } 1668 BLOCK_COMMENT("} neon_reduce_mul_fp"); 1669 } 1670 1671 // Helper to select logical instruction 1672 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd, 1673 Register Rn, Register Rm, 1674 enum shift_kind kind, unsigned shift) { 1675 switch(opc) { 1676 case Op_AndReductionV: 1677 is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift); 1678 break; 1679 case Op_OrReductionV: 1680 is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift); 1681 break; 1682 case Op_XorReductionV: 1683 is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift); 1684 break; 1685 default: 1686 assert(false, "unsupported"); 1687 ShouldNotReachHere(); 1688 } 1689 } 1690 1691 // Vector reduction logical operations And, Or, Xor 1692 // Clobbers: rscratch1 1693 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt, 1694 Register isrc, FloatRegister vsrc, 1695 unsigned vector_length_in_bytes) { 1696 assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV, 1697 "unsupported"); 1698 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1699 assert_different_registers(dst, isrc); 1700 bool isQ = vector_length_in_bytes == 16; 1701 1702 BLOCK_COMMENT("neon_reduce_logical {"); 1703 umov(rscratch1, vsrc, isQ ? D : S, 0); 1704 umov(dst, vsrc, isQ ? 

// Helper to select logical instruction
void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
                                                   Register Rn, Register Rm,
                                                   enum shift_kind kind, unsigned shift) {
  switch(opc) {
    case Op_AndReductionV:
      is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
      break;
    case Op_OrReductionV:
      is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
      break;
    case Op_XorReductionV:
      is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
}

// Vector reduction logical operations And, Or, Xor
// Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
                                            Register isrc, FloatRegister vsrc,
                                            unsigned vector_length_in_bytes) {
  assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
         "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_logical {");
  umov(rscratch1, vsrc, isQ ? D : S, 0);
  umov(dst, vsrc, isQ ? D : S, 1);
  neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
  switch(bt) {
    case T_BYTE:
      if (isQ) {
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
      }
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
      sxtb(dst, dst);
      break;
    case T_SHORT:
      if (isQ) {
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
      }
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
      sxth(dst, dst);
      break;
    case T_INT:
      if (isQ) {
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
      }
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
      break;
    case T_LONG:
      assert(isQ, "unsupported");
      neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_logical");
}

// Vector reduction min/max for integral type with ASIMD instructions.
// Note: vtmp is not used and expected to be fnoreg for T_LONG case.
// Clobbers: rscratch1, rflags
void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
                                                    Register isrc, FloatRegister vsrc,
                                                    unsigned vector_length_in_bytes,
                                                    FloatRegister vtmp) {
  assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;
  bool is_min = opc == Op_MinReductionV;

  BLOCK_COMMENT("neon_reduce_minmax_integral {");
  if (bt == T_LONG) {
    assert(vtmp == fnoreg, "should be");
    assert(isQ, "should be");
    umov(rscratch1, vsrc, D, 0);
    cmp(isrc, rscratch1);
    csel(dst, isrc, rscratch1, is_min ? LT : GT);
    umov(rscratch1, vsrc, D, 1);
    cmp(dst, rscratch1);
    csel(dst, dst, rscratch1, is_min ? LT : GT);
  } else {
    SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
    if (size == T2S) {
      is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
    } else {
      is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
    }
    if (bt == T_INT) {
      umov(dst, vtmp, S, 0);
    } else {
      smov(dst, vtmp, elemType_to_regVariant(bt), 0);
    }
    cmpw(dst, isrc);
    cselw(dst, dst, isrc, is_min ? LT : GT);
  }
  BLOCK_COMMENT("} neon_reduce_minmax_integral");
}
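
// Editorial note: an illustrative scalar sketch (not part of the VM sources) of the
// folding trick used by neon_reduce_logical above. After both 64-bit halves of the
// vector have been moved to general-purpose registers and combined, repeatedly
// combining the register with itself shifted right by half of the remaining width
// collapses all lanes into the lowest element. For the XOR/byte case:
//
//   static jbyte xor_reduce_bytes_ref(uint64_t v) {  // v holds eight byte lanes
//     v ^= v >> 32;
//     v ^= v >> 16;
//     v ^= v >> 8;
//     return (jbyte)v;  // the low byte now holds the XOR of all eight lanes
//   }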

// Vector reduction for integral type with SVE instruction.
// Supported operations are Add, And, Or, Xor, Max, Min.
// rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
                                            FloatRegister src2, PRegister pg, FloatRegister tmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  assert(pg->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(src1, dst);
  // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  switch (opc) {
    case Op_AddReductionVI: {
      sve_uaddv(tmp, size, pg, src2);
      if (bt == T_BYTE) {
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxtb);
      } else if (bt == T_SHORT) {
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxth);
      } else {
        umov(dst, tmp, size, 0);
        addw(dst, dst, src1);
      }
      break;
    }
    case Op_AddReductionVL: {
      sve_uaddv(tmp, size, pg, src2);
      umov(dst, tmp, size, 0);
      add(dst, dst, src1);
      break;
    }
    case Op_AndReductionV: {
      sve_andv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        andr(dst, dst, src1);
      } else {
        andw(dst, dst, src1);
      }
      break;
    }
    case Op_OrReductionV: {
      sve_orv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        orr(dst, dst, src1);
      } else {
        orrw(dst, dst, src1);
      }
      break;
    }
    case Op_XorReductionV: {
      sve_eorv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        eor(dst, dst, src1);
      } else {
        eorw(dst, dst, src1);
      }
      break;
    }
    case Op_MaxReductionV: {
      sve_smaxv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        cmp(dst, src1);
        csel(dst, dst, src1, Assembler::GT);
      } else {
        cmpw(dst, src1);
        cselw(dst, dst, src1, Assembler::GT);
      }
      break;
    }
    case Op_MinReductionV: {
      sve_sminv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        cmp(dst, src1);
        csel(dst, dst, src1, Assembler::LT);
      } else {
        cmpw(dst, src1);
        cselw(dst, dst, src1, Assembler::LT);
      }
      break;
    }
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }

  if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
    if (bt == T_BYTE) {
      sxtb(dst, dst);
    } else if (bt == T_SHORT) {
      sxth(dst, dst);
    }
  }
}
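
// Editorial note: a minimal scalar sketch (not part of the VM sources) of the min/max
// reductions generated above (neon_reduce_minmax_integral and the Min/Max cases of
// sve_reduce_integral): the vector is reduced to a single lane value, which is then
// combined with the incoming scalar. The helper name and array form are hypothetical.
//
//   static jint min_reduce_ref(jint isrc, const jint* lanes, int lane_cnt) {
//     jint acc = isrc;
//     for (int i = 0; i < lane_cnt; i++) {
//       acc = MIN2(acc, lanes[i]);  // the Max reduction uses MAX2 instead
//     }
//     return acc;
//   }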

// Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
// to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
// max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
  uint32_t max_vector_length = Matcher::max_vector_size(bt);
  assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");

  // Set all elements to false if the input "lane_cnt" is zero.
  if (lane_cnt == 0) {
    sve_pfalse(dst);
    return;
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  assert(size != Q, "invalid size");

  // Set all elements to true if "lane_cnt" equals the max lane count.
  if (lane_cnt == max_vector_length) {
    sve_ptrue(dst, size, /* ALL */ 0b11111);
    return;
  }

  // Fixed numbers for "ptrue".
  switch(lane_cnt) {
    case 1: /* VL1 */
    case 2: /* VL2 */
    case 3: /* VL3 */
    case 4: /* VL4 */
    case 5: /* VL5 */
    case 6: /* VL6 */
    case 7: /* VL7 */
    case 8: /* VL8 */
      sve_ptrue(dst, size, lane_cnt);
      return;
    case 16:
      sve_ptrue(dst, size, /* VL16 */ 0b01001);
      return;
    case 32:
      sve_ptrue(dst, size, /* VL32 */ 0b01010);
      return;
    case 64:
      sve_ptrue(dst, size, /* VL64 */ 0b01011);
      return;
    case 128:
      sve_ptrue(dst, size, /* VL128 */ 0b01100);
      return;
    case 256:
      sve_ptrue(dst, size, /* VL256 */ 0b01101);
      return;
    default:
      break;
  }

  // Special patterns for "ptrue".
  if (lane_cnt == round_down_power_of_2(max_vector_length)) {
    sve_ptrue(dst, size, /* POW2 */ 0b00000);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
    sve_ptrue(dst, size, /* MUL4 */ 0b11101);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
    sve_ptrue(dst, size, /* MUL3 */ 0b11110);
  } else {
    // Encode to "whileltw" for the remaining cases.
    mov(rscratch1, lane_cnt);
    sve_whileltw(dst, size, zr, rscratch1);
  }
}

// Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
// Any remaining elements of dst will be filled with zero.
// Clobbers: rscratch1
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
                                           FloatRegister vtmp1, FloatRegister vtmp2,
                                           PRegister pgtmp) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2);
  assert_different_registers(mask, pgtmp);

  // Example input:   src  = 8888 7777 6666 5555 4444 3333 2222 1111
  //                  mask = 0001 0000 0000 0001 0001 0000 0001 0001
  // Expected result: dst  = 0000 0000 0000 8888 5555 4444 2222 1111
  sve_dup(vtmp2, H, 0);

  // Extend lowest half to type INT.
  // dst = 00004444 00003333 00002222 00001111
  sve_uunpklo(dst, S, src);
  // pgtmp = 00000001 00000000 00000001 00000001
  sve_punpklo(pgtmp, mask);
  // Pack the active elements in size of type INT to the right,
  // and fill the remaining elements with zero.
  // dst = 00000000 00004444 00002222 00001111
  sve_compact(dst, S, dst, pgtmp);
  // Narrow the result back to type SHORT.
  // dst = 0000 0000 0000 0000 0000 4444 2222 1111
  sve_uzp1(dst, H, dst, vtmp2);
  // Count the active elements of lowest half.
  // rscratch1 = 3
  sve_cntp(rscratch1, S, ptrue, pgtmp);

  // Repeat to the highest half.
  // pgtmp = 00000001 00000000 00000000 00000001
  sve_punpkhi(pgtmp, mask);
  // vtmp1 = 00008888 00007777 00006666 00005555
  sve_uunpkhi(vtmp1, S, src);
  // vtmp1 = 00000000 00000000 00008888 00005555
  sve_compact(vtmp1, S, vtmp1, pgtmp);
  // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  sve_uzp1(vtmp1, H, vtmp1, vtmp2);

  // Compressed low:  dst   = 0000 0000 0000 0000 0000 4444 2222 1111
  // Compressed high: vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  // Shift the compressed high part left (across lanes) by TRUE_CNT lanes, where
  // TRUE_CNT is the number of active elements in the compressed low part.
  neg(rscratch1, rscratch1);
  // vtmp2 = {4 3 2 1 0 -1 -2 -3}
  sve_index(vtmp2, H, rscratch1, 1);
  // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
  sve_tbl(vtmp1, H, vtmp1, vtmp2);

  // Combine the compressed high (after shifting) with the compressed low.
  // dst = 0000 0000 0000 8888 5555 4444 2222 1111
  sve_orr(dst, dst, vtmp1);
}

// Clobbers: rscratch1, rscratch2
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
                                          FloatRegister vtmp1, FloatRegister vtmp2,
                                          FloatRegister vtmp3, FloatRegister vtmp4,
                                          PRegister ptmp, PRegister pgtmp) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
  assert_different_registers(mask, ptmp, pgtmp);
  // Example input:   src  = 88 77 66 55 44 33 22 11
  //                  mask = 01 00 00 01 01 00 01 01
  // Expected result: dst  = 00 00 00 88 55 44 22 11

  sve_dup(vtmp4, B, 0);
  // Extend lowest half to type SHORT.
  // vtmp1 = 0044 0033 0022 0011
  sve_uunpklo(vtmp1, H, src);
  // ptmp = 0001 0000 0001 0001
  sve_punpklo(ptmp, mask);
  // Count the active elements of lowest half.
  // rscratch2 = 3
  sve_cntp(rscratch2, H, ptrue, ptmp);
  // Pack the active elements in size of type SHORT to the right,
  // and fill the remaining elements with zero.
  // dst = 0000 0044 0022 0011
  sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
  // Narrow the result back to type BYTE.
  // dst = 00 00 00 00 00 44 22 11
  sve_uzp1(dst, B, dst, vtmp4);

  // Repeat to the highest half.
  // ptmp = 0001 0000 0000 0001
  sve_punpkhi(ptmp, mask);
  // vtmp2 = 0088 0077 0066 0055
  sve_uunpkhi(vtmp2, H, src);
  // vtmp1 = 0000 0000 0088 0055
  sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);

  sve_dup(vtmp4, B, 0);
  // vtmp1 = 00 00 00 00 00 00 88 55
  sve_uzp1(vtmp1, B, vtmp1, vtmp4);

  // Compressed low:  dst   = 00 00 00 00 00 44 22 11
  // Compressed high: vtmp1 = 00 00 00 00 00 00 88 55
  // Shift the compressed high part left (across lanes) by TRUE_CNT lanes, where
  // TRUE_CNT is the number of active elements in the compressed low part.
  neg(rscratch2, rscratch2);
  // vtmp2 = {4 3 2 1 0 -1 -2 -3}
  sve_index(vtmp2, B, rscratch2, 1);
  // vtmp1 = 00 00 00 88 55 00 00 00
  sve_tbl(vtmp1, B, vtmp1, vtmp2);
  // Combine the compressed high (after shifting) with the compressed low.
  // dst = 00 00 00 88 55 44 22 11
  sve_orr(dst, dst, vtmp1);
}

void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
  SIMD_Arrangement size = isQ ? T16B : T8B;
  if (bt == T_BYTE) {
    rbit(dst, size, src);
  } else {
    neon_reverse_bytes(dst, src, bt, isQ);
    rbit(dst, size, dst);
  }
}

void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
  SIMD_Arrangement size = isQ ? T16B : T8B;
  switch (bt) {
    case T_BYTE:
      if (dst != src) {
        orr(dst, size, src, src);
      }
      break;
    case T_SHORT:
      rev16(dst, size, src);
      break;
    case T_INT:
      rev32(dst, size, src);
      break;
    case T_LONG:
      rev64(dst, size, src);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
}

// Extract a scalar element from an sve vector at position 'idx'.
// The input elements in src are expected to be of integral type.
void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
                                             int idx, FloatRegister vtmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, src, size, idx);
    } else {
      smov(dst, src, size, idx);
    }
  } else {
    sve_orr(vtmp, src, src);
    sve_ext(vtmp, vtmp, idx << size);
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, vtmp, size, 0);
    } else {
      smov(dst, vtmp, size, 0);
    }
  }
}

// java.lang.Math::round intrinsics

// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                          FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
  assert_different_registers(tmp1, tmp2, tmp3, src, dst);
  switch (T) {
    case T2S:
    case T4S:
      fmovs(tmp1, T, 0.5f);
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case T2D:
      fmovd(tmp1, T, 0.5);
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
  }
  fadd(tmp1, T, tmp1, src);
  fcvtms(tmp1, T, tmp1);
  // tmp1 = floor(src + 0.5, ties to even)

  fcvtas(dst, T, src);
  // dst = round(src), ties to away

  fneg(tmp3, T, src);
  dup(tmp2, T, rscratch1);
  cm(HS, tmp3, T, tmp3, tmp2);
  // tmp3 is now a set of flags

  bif(dst, T16B, tmp1, tmp3);
  // result in dst
}

// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                         FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(tmp1, tmp2, src, dst);

  switch (T) {
    case S:
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case D:
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == S || T == D, "invalid register variant");
  }

  sve_frinta(dst, T, ptrue, src);
  // dst = round(src), ties to away

  Label none;

  sve_fneg(tmp1, T, ptrue, src);
  sve_dup(tmp2, T, rscratch1);
  sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
  br(EQ, none);
  {
    sve_cpy(tmp1, T, pgtmp, 0.5);
    sve_fadd(tmp1, T, pgtmp, src);
    sve_frintm(dst, T, pgtmp, tmp1);
    // dst = floor(src + 0.5, ties to even)
  }
  bind(none);

  sve_fcvtzs(dst, T, ptrue, dst, T);
  // result in dst
}

void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                           FloatRegister one, SIMD_Arrangement T) {
  assert_different_registers(dst, src, zero, one);
  assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");

  facgt(dst, T, src, zero);
  ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
  bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
}

void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                          FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
  assert_different_registers(dst, src, zero, one, vtmp);
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");

  sve_orr(vtmp, src, src);
  sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise
  switch (T) {
    case S:
      sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
      sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
                                        // on the sign of the float value
      break;
    case D:
      sve_and(vtmp, T, min_jlong);
      sve_orr(vtmp, T, jlong_cast(1.0));
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
                                     // Result in dst
}

bool C2_MacroAssembler::in_scratch_emit_size() {
  if (ciEnv::current()->task() != nullptr) {
    PhaseOutput* phase_output = Compile::current()->output();
    if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
      return true;
    }
  }
  return MacroAssembler::in_scratch_emit_size();
}
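
// Editorial note: a minimal scalar sketch (not part of the VM sources) of the selection
// performed by vector_signum_neon/vector_signum_sve above: lanes that are +-0.0 or NaN
// keep the source value, every other lane becomes +-1.0 with the sign of the source.
// The helper name is hypothetical.
//
//   static float signum_ref(float x) {
//     if (x != x || x == 0.0f) {  // NaN or +-0.0: return the argument unchanged
//       return x;
//     }
//     return x > 0.0f ? 1.0f : -1.0f;
//   }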