/*
 * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/matcher.hpp"
#include "opto/output.hpp"
#include "opto/subnode.hpp"
#include "runtime/stubRoutines.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

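// Member-function pointer used to select the load instruction matching the
// string encoding: ldrb for Latin1 (one byte per char), ldrh for UTF-16.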
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

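// Fast path of a monitorenter. Falls through with the condition flags set to
// EQ on success and NE on failure; the caller branches to the slow path on NE.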
void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg,
                                  Register tmp2Reg, Register tmp3Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;

  assert_different_registers(oop, box, tmp, disp_hdr);

  // Load markWord from object into displaced_header.
  ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmp, oop);
    ldrw(tmp, Address(tmp, Klass::access_flags_offset()));
    tstw(tmp, JVM_ACC_IS_VALUE_BASED_CLASS);
    br(Assembler::NE, cont);
  }

  // Check for existing monitor
  tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else if (LockingMode == LM_LEGACY) {
    // Set tmp to be (markWord of object | UNLOCK_VALUE).
    orr(tmp, disp_hdr, markWord::unlocked_value);

    // Initialize the box. (Must happen before we update the object mark!)
    str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // Compare object markWord with an unlocked value (tmp) and if
    // equal exchange the stack address of our box with object markWord.
    // On failure disp_hdr contains the possibly locked markWord.
    cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true,
            /*release*/ true, /*weak*/ false, disp_hdr);
    br(Assembler::EQ, cont);

    assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

    // If the compare-and-exchange succeeded, then we found an unlocked
    // object, have now locked it and will continue at label cont.

    // Check if the owner is self by comparing the value in the
    // markWord of object (disp_hdr) with the stack pointer.
    mov(rscratch1, sp);
    sub(disp_hdr, disp_hdr, rscratch1);
    mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place));
    // If the condition is true, we locked the object recursively on this
    // thread's stack, and hence we can store 0 as the displaced header in
    // the box, which indicates that it is a recursive lock.
    ands(tmp/*==0?*/, disp_hdr, tmp);   // Sets flags for result
    str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
    b(cont);
  } else {
    assert(LockingMode == LM_LIGHTWEIGHT, "must be");
    lightweight_lock(oop, disp_hdr, tmp, tmp3Reg, no_count);
    b(count);
  }

  // Handle existing monitor.
  bind(object_has_monitor);

  // The object's monitor m is unlocked iff m->owner == NULL,
  // otherwise m->owner may contain a thread or a stack address.
  //
  // Try to CAS m->owner from NULL to current thread.
  add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value));
  cmpxchg(tmp, zr, rthread, Assembler::xword, /*acquire*/ true,
          /*release*/ true, /*weak*/ false, tmp3Reg); // Sets flags for result

  if (LockingMode != LM_LIGHTWEIGHT) {
    // Store a non-null value into the box to avoid looking like a re-entrant
    // lock. The fast-path monitor unlock code checks for
    // markWord::monitor_value so use markWord::unused_mark which has the
    // relevant bit set, and also matches ObjectSynchronizer::enter.
    mov(tmp, (address)markWord::unused_mark().value());
    str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
  }
  br(Assembler::EQ, cont); // CAS success means locking succeeded

  // Check for recursive locking: the CAS failed, so the monitor is owned.
  // If the owner is not the current thread, take the slow path (flag == NE).
  cmp(tmp3Reg, rthread);
  br(Assembler::NE, cont);

  // Recursive lock case
  increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1);
  // flag == EQ still from the cmp above, indicating that this is a reentrant lock

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  increment(Address(rthread, JavaThread::held_monitor_count_offset()));

  bind(no_count);
}

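// Fast path of a monitorexit. Falls through with the condition flags set to
// EQ on success and NE on failure; the caller branches to the slow path on NE.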
void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg,
                                    Register tmp2Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;

  assert_different_registers(oop, box, tmp, disp_hdr);

  if (LockingMode == LM_LEGACY) {
    // Find the lock address and load the displaced header from the stack.
    ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // If the displaced header is 0, we have a recursive unlock.
    cmp(disp_hdr, zr);
    br(Assembler::EQ, cont);
  }

  // Handle existing monitor.
  ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
  tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else if (LockingMode == LM_LEGACY) {
    // Check if it is still a lightweight lock: this is true if we see
    // the stack address of the basicLock in the markWord of the object.

    cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false,
            /*release*/ true, /*weak*/ false, tmp);
    b(cont);
  } else {
    assert(LockingMode == LM_LIGHTWEIGHT, "must be");
    lightweight_unlock(oop, tmp, box, disp_hdr, no_count);
    b(count);
  }

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // Handle existing monitor.
  bind(object_has_monitor);
  STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
  add(tmp, tmp, -(int)markWord::monitor_value); // monitor

  if (LockingMode == LM_LIGHTWEIGHT) {
    // If the owner is anonymous, we need to fix it -- in an outline stub.
    Register tmp2 = disp_hdr;
    ldr(tmp2, Address(tmp, ObjectMonitor::owner_offset()));
    // We cannot use tbnz here, the target might be too far away and cannot
    // be encoded.
    tst(tmp2, (uint64_t)ObjectMonitor::ANONYMOUS_OWNER);
    C2HandleAnonOMOwnerStub* stub = new (Compile::current()->comp_arena()) C2HandleAnonOMOwnerStub(tmp, tmp2);
    Compile::current()->output()->add_stub(stub);
    br(Assembler::NE, stub->entry());
    bind(stub->continuation());
  }

  ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));

  Label notRecursive;
  cbz(disp_hdr, notRecursive);

  // Recursive lock
  sub(disp_hdr, disp_hdr, 1u);
  str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
  cmp(disp_hdr, disp_hdr); // Set flag == EQ (success): a register always equals itself
  b(cont);

  bind(notRecursive);
  ldr(rscratch1, Address(tmp, ObjectMonitor::EntryList_offset()));
  ldr(disp_hdr, Address(tmp, ObjectMonitor::cxq_offset()));
  orr(rscratch1, rscratch1, disp_hdr); // Will be 0 if both are 0.
  cmp(rscratch1, zr); // Sets flags for result
  cbnz(rscratch1, cont);
  // need a release store here
  lea(tmp, Address(tmp, ObjectMonitor::owner_offset()));
  stlr(zr, tmp); // set unowned

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  decrement(Address(rthread, JavaThread::held_monitor_count_offset()));

  bind(no_count);
}

// Search for str1 in str2 and return index or -1
// Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For a larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
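    // Heuristic for choosing Boyer-Moore vs the linear-scan stub: the subs
    // sets the flags for cnt1 - 256; when cnt1 < 256 (LT), the ccmp compares
    // cnt1 with cnt2/4, otherwise it forces nzcv = 0b0000, which satisfies
    // GE. The GE branch below is therefore taken when cnt1 >= 256 or when
    // the source is shorter than 4x the pattern.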
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

// The Boyer Moore algorithm is based on the description here:
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// This describes an algorithm with 2 shift rules: the 'Bad Character' rule
// and the 'Good Suffix' rule.
//
// These rules are essentially heuristics for how far we can shift the
// pattern along the search string.
//
// The implementation here uses the 'Bad Character' rule only because of the
// complexity of initialisation for the 'Good Suffix' rule.
//
// This is also known as the Boyer-Moore-Horspool algorithm:
//
// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
//
// This particular implementation has few java-specific optimizations.
//
// #define ASIZE 256
//
//    int bm(unsigned char *x, int m, unsigned char *y, int n) {
//       int i, j;
//       unsigned c;
//       unsigned char bc[ASIZE];
//
//       /* Preprocessing */
//       for (i = 0; i < ASIZE; ++i)
//          bc[i] = m;
//       for (i = 0; i < m - 1; ) {
//          c = x[i];
//          ++i;
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef PATTERN_STRING_IS_LATIN1
//          bc[c] = m - i;
//          #else
//          if (c < ASIZE) bc[c] = m - i;
//          #endif
//       }
//
//       /* Searching */
//       j = 0;
//       while (j <= n - m) {
//          c = y[j + m - 1];
//          if (x[m-1] == c)
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//          if (i < 0) return j;
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef SOURCE_STRING_IS_LATIN1
//          // LL case: (c < 256) always true. Remove branch
//          j += bc[y[j+m-1]];
//          #endif
//          #ifndef PATTERN_STRING_IS_UTF
//          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += 1;
//          #endif
//          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
//          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += m;
//          #endif
//       }
//       return -1;
//    }

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >= 8, so we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed (8 LL or 4 UU symbols) and half a
    // register for the UL case. We'll re-read the last character in the inner
    // pre-loop code to have a single outer pre-loop load.
    const int firstStep = isL ? 7 : 3;

    const int ASIZE = 256;
    const int STORED_BYTES = 32; // amount of bytes stored per instruction
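    // Initialize the 256-byte bad-character table on the stack with the
    // default shift, i.e. the pattern length: v0 was pre-filled with cnt1
    // bytes above, and each stpq below stores 32 such bytes.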
    sub(sp, sp, ASIZE);
    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
    mov(ch1, sp);
    BIND(BM_INIT_LOOP);
      stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
      subs(tmp5, tmp5, 1);
      br(GT, BM_INIT_LOOP);

      sub(cnt1tmp, cnt1, 1);
      mov(tmp5, str2);
      add(str2end, str2, result_tmp, LSL, str2_chr_shift);
      sub(ch2, cnt1, 1);
      mov(tmp3, str1);
    BIND(BCLOOP);
      (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
      if (!str1_isL) {
        subs(zr, ch1, ASIZE);
        br(HS, BCSKIP);
      }
      strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
      subs(ch2, ch2, 1);
      br(GT, BCLOOP);

      add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
      if (str1_isL == str2_isL) {
        // load last 8 bytes (8LL/4UU symbols)
        ldr(tmp6, Address(tmp6, -wordSize));
      } else {
        ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes (4 symbols)
        // convert Latin1 to UTF. We'll have to wait until load completed, but
        // it's still faster than per-character loads+checks
        lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
        ubfx(ch1, tmp6, 8, 8); // str1[N-2]
        ubfx(ch2, tmp6, 16, 8); // str1[N-3]
        andr(tmp6, tmp6, 0xFF); // str1[N-4]
        orr(ch2, ch1, ch2, LSL, 16);
        orr(tmp6, tmp6, tmp3, LSL, 48);
        orr(tmp6, tmp6, ch2, LSL, 16);
      }
    BIND(BMLOOPSTR2);
      (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
      sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
      if (str1_isL == str2_isL) {
        // re-init tmp3. It's for free because it's executed in parallel with
        // load above. Alternative is to initialize it before loop, but it'll
        // affect performance on in-order systems with 2 or more ld/st pipelines
        lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
      }
      if (!isL) { // UU/UL case
        lsl(ch2, cnt1tmp, 1); // offset in bytes
      }
      cmp(tmp3, skipch);
      br(NE, BMSKIP);
      ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
      mov(ch1, tmp6);
      if (isL) {
        b(BMLOOPSTR1_AFTER_LOAD);
      } else {
        sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
        b(BMLOOPSTR1_CMP);
      }
    BIND(BMLOOPSTR1);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    BIND(BMLOOPSTR1_AFTER_LOAD);
      subs(cnt1tmp, cnt1tmp, 1);
      br(LT, BMLOOPSTR1_LASTCMP);
    BIND(BMLOOPSTR1_CMP);
      cmp(ch1, ch2);
      br(EQ, BMLOOPSTR1);
    BIND(BMSKIP);
      if (!isL) {
        // if we've met a UTF symbol while searching a Latin1 pattern, then we
        // can skip cnt1 symbols
        if (str1_isL != str2_isL) {
          mov(result_tmp, cnt1);
        } else {
          mov(result_tmp, 1);
        }
        subs(zr, skipch, ASIZE);
        br(HS, BMADV);
      }
      ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
      sub(cnt1tmp, cnt1, 1);
      add(str2, str2, result_tmp, LSL, str2_chr_shift);
      cmp(str2, str2end);
      br(LE, BMLOOPSTR2);
      add(sp, sp, ASIZE);
      b(NOMATCH);
    BIND(BMLOOPSTR1_LASTCMP);
      cmp(ch1, ch2);
      br(NE, BMSKIP);
    BIND(BMMATCH);
      sub(result, str2, tmp5);
      if (!str2_isL) lsr(result, result, 1);
      add(sp, sp, ASIZE);
      b(DONE);

    BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns should still be handled by simple algorithm
    br(LT, LINEAR_MEDIUM);
    mov(result, zr);
    RuntimeAddress stub = nullptr;
    if (isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
      assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
    } else if (str1_isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
    } else {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
      assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
    }
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);
  }

  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    Register str2tmp = tmp2;
    Register first = tmp3;

    if (icnt1 == -1)
    {
        Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

        cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
        br(LT, DOSHORT);
      BIND(LINEAR_MEDIUM);
        (this->*str1_load_1chr)(first, Address(str1));
        lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
        sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(FIRST_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmp(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        adds(cnt1tmp, cnt1_neg, str1_chr_size);
        add(cnt2tmp, cnt2_neg, str2_chr_size);
        br(GE, MATCH);

      BIND(STR1_NEXT);
        (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        adds(cnt1tmp, cnt1tmp, str1_chr_size);
        add(cnt2tmp, cnt2tmp, str2_chr_size);
        br(LT, STR1_NEXT);
        b(MATCH);

      BIND(DOSHORT);
      if (str1_isL == str2_isL) {
        cmp(cnt1, (u1)2);
        br(LT, DO1);
        br(GT, DO3);
      }
    }

    if (icnt1 == 4) {
      Label CH1_LOOP;

        (this->*load_4chr)(ch1, str1);
        sub(result_tmp, cnt2, 4);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(CH1_LOOP);
        (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
      Label CH1_LOOP;

      BIND(DO2);
        (this->*load_2chr)(ch1, str1);
        if (icnt1 == 2) {
          sub(result_tmp, cnt2, 2);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(CH1_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

      BIND(DO3);
        (this->*load_2chr)(first, str1);
        (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
        if (icnt1 == 3) {
          sub(result_tmp, cnt2, 3);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(FIRST_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmpw(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        b(MATCH);
    }

    if (icnt1 == -1 || icnt1 == 1) {
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;

      BIND(DO1);
        (this->*str1_load_1chr)(ch1, str1);
        cmp(cnt2, (u1)8);
        br(LT, DO1_SHORT);

        sub(result_tmp, cnt2, 8/str2_chr_size);
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
        mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

        if (str2_isL) {
          orr(ch1, ch1, ch1, LSL, 8);
        }
        orr(ch1, ch1, ch1, LSL, 16);
        orr(ch1, ch1, ch1, LSL, 32);
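        // SWAR zero-detection: after the eor below a matching character
        // shows up as an all-zero element of ch2. (ch2 - tmp3) & ~(ch2 |
        // 0x7f..7f) is non-zero iff some element of ch2 is zero; the bics
        // computes exactly that and sets the flags.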
      BIND(CH1_LOOP);
        ldr(ch2, Address(str2, cnt2_neg));
        eor(ch2, ch1, ch2);
        sub(tmp1, ch2, tmp3);
        orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
        bics(tmp1, tmp1, tmp2);
        br(NE, HAS_ZERO);
        adds(cnt2_neg, cnt2_neg, 8);
        br(LT, CH1_LOOP);

        cmp(cnt2_neg, (u1)8);
        mov(cnt2_neg, 0);
        br(LT, CH1_LOOP);
        b(NOMATCH);

      BIND(HAS_ZERO);
        rev(tmp1, tmp1);
        clz(tmp1, tmp1);
        add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
        b(MATCH);

      BIND(DO1_SHORT);
        mov(result_tmp, cnt2);
        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
      BIND(DO1_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmpw(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);

void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);

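  // Same SWAR zero-detection as in string_indexof above, on halfwords.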
  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7fff7fff7fff7fff);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1, Address::uxtw(1)));
    sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
    ldrh(ch1, Address(str1, cnt1_neg));
    cmpw(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 2);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}

void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
                                                Register ch, Register result,
                                                FloatRegister ztmp1,
                                                FloatRegister ztmp2,
                                                PRegister tmp_pg,
                                                PRegister tmp_pdn, bool isL)
{
  // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
  assert(tmp_pg->is_governing(),
         "this register has to be a governing predicate register");

  Label LOOP, MATCH, DONE, NOMATCH;
  Register vec_len = rscratch1;
  Register idx = rscratch2;

  SIMD_RegVariant T = isL ? B : H;

  cbz(cnt1, NOMATCH);

  // Assign the particular char throughout the vector.
  sve_dup(ztmp2, T, ch);
  if (isL) {
    sve_cntb(vec_len);
  } else {
    sve_cnth(vec_len);
  }
  mov(idx, 0);

  // Generate a predicate to control the reading of input string.
  sve_whilelt(tmp_pg, T, idx, cnt1);

  BIND(LOOP);
    // Read a vector of 8- or 16-bit data depending on the string type. Note
    // that inactive elements indicated by the predicate register won't cause
    // a data read from memory to the destination vector.
    if (isL) {
      sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
    } else {
      sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
    }
    add(idx, idx, vec_len);

    // Perform the comparison. An element of the destination predicate is set
    // to active if the particular char is matched.
    sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);

    // Branch if the particular char is found.
    br(NE, MATCH);

    sve_whilelt(tmp_pg, T, idx, cnt1);

    // Loop back if the particular char is not found.
    br(MI, LOOP);

  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);

  BIND(MATCH);
    // Undo the index increment.
    sub(idx, idx, vec_len);

    // Crop the vector to find its location.
    sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
    add(result, idx, -1);
    sve_incp(result, T, tmp_pdn);
  BIND(DONE);
}

void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
                                             Register ch, Register result,
                                             Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)8);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 8);
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 8);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);

  mov(tmp3, 0x0101010101010101);

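  // Same SWAR zero-detection as in string_indexof above, on bytes.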
  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1));
    sub(cnt1_neg, zr, cnt1);
  BIND(DO1_LOOP);
    ldrb(ch1, Address(str1, cnt1_neg));
    cmp(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 1);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg);
  BIND(DONE);
}

// Compare strings.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      ldr(tmp1, Address(str1));
      ldrs(vtmp, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFF);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFF);
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
    RuntimeAddress stub = nullptr;
    switch(ae) {
      case StrIntrinsicNode::LL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
        break;
      case StrIntrinsicNode::UU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
        break;
      case StrIntrinsicNode::LU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
        break;
      case StrIntrinsicNode::UL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
        break;
      default:
        ShouldNotReachHere();
    }
    assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // Arrange the code to do most branches while loading, and to load the next
  // characters while comparing the previous ones
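  // Note: from here on cnt1 is reused as a scratch register holding the
  // current character loaded from str2 (alternating with rscratch1).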
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}

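// NEON provides register-register vector compares in only one direction
// (EQ, GT, GE, HI, HS), so the inverted conditions are implemented by
// swapping the operands, and NE by negating an EQ compare.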
void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
                                     FloatRegister src2, Condition cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
  FloatRegister zn = src1, zm = src2;
  bool needs_negation = false;
  switch (cond) {
    case LT: cond = GT; zn = src2; zm = src1; break;
    case LE: cond = GE; zn = src2; zm = src1; break;
    case LO: cond = HI; zn = src2; zm = src1; break;
    case LS: cond = HS; zn = src2; zm = src1; break;
    case NE: cond = EQ; needs_negation = true; break;
    default:
      break;
  }

  if (is_floating_point_type(bt)) {
    fcm(cond, dst, size, zn, zm);
  } else {
    cm(cond, dst, size, zn, zm);
  }

  if (needs_negation) {
    notr(dst, isQ ? T16B : T8B, dst);
  }
}

void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
                                          Condition cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
  if (bt == T_FLOAT || bt == T_DOUBLE) {
    if (cond == Assembler::NE) {
      fcm(Assembler::EQ, dst, size, src);
      notr(dst, isQ ? T16B : T8B, dst);
    } else {
      fcm(cond, dst, size, src);
    }
  } else {
    if (cond == Assembler::NE) {
      cm(Assembler::EQ, dst, size, src);
      notr(dst, isQ ? T16B : T8B, dst);
    } else {
      cm(cond, dst, size, src);
    }
  }
}

// Compress the least significant bit of each byte to the rightmost and clear
// the higher garbage bits.
void C2_MacroAssembler::bytemask_compress(Register dst) {
  // Example input, dst = 0x01 00 00 00 01 01 00 01
  // The "??" bytes are garbage.
  orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
  orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
  orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
  andr(dst, dst, 0xff);                   // dst = 0x8D
}
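
// For reference, a minimal C sketch of the same OR-fold (a hypothetical
// helper, not part of this file), assuming the input packs one 0x00/0x01
// flag per byte as in the example above:
//
//    uint64_t bytemask_compress_ref(uint64_t x) {
//       x |= x >> 7;   // fold neighbouring flag bits together
//       x |= x >> 14;  // gather groups of four flags
//       x |= x >> 28;  // gather all eight flags into the low byte
//       return x & 0xff;
//    }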

// Pack the lowest-numbered bit of each mask element in src into a long value
// in dst, at most the first 64 lane elements.
// Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM.
void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
                                         FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
  assert_different_registers(dst, rscratch1);
  assert_different_registers(vtmp1, vtmp2);

  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example:   src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
  // Expected:  dst = 0x658D

  // Convert the mask into a vector with sequential bytes.
  // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
  sve_cpy(vtmp1, size, src, 1, false);
  if (bt != T_BYTE) {
    sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
  }

  if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
    // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
    // is to compress each significant bit of the byte in a cross-lane way. Due
    // to the lack of a cross-lane bit-compress instruction, we use BEXT
    // (bit-compress in each lane) with the biggest lane size (T = D) then
    // concatenate the results.

    // The second source input of BEXT, initialized with 0x01 in each byte.
    // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
    sve_dup(vtmp2, B, 1);

    // BEXT vtmp1.D, vtmp1.D, vtmp2.D
    // vtmp1 = 0x0001010000010001 | 0x0100000001010001
    // vtmp2 = 0x0101010101010101 | 0x0101010101010101
    //         ---------------------------------------
    // vtmp1 = 0x0000000000000065 | 0x000000000000008D
    sve_bext(vtmp1, D, vtmp1, vtmp2);

    // Concatenate the least significant 8 bits of each 8 bytes, and extract
    // the result to dst.
    // vtmp1 = 0x0000000000000000 | 0x000000000000658D
    // dst   = 0x658D
    if (lane_cnt <= 8) {
      // No need to concatenate.
      umov(dst, vtmp1, B, 0);
    } else if (lane_cnt <= 16) {
      ins(vtmp1, B, vtmp1, 1, 8);
      umov(dst, vtmp1, H, 0);
    } else {
      // As the lane count is 64 at most, the final expected value must be in
      // the lowest 64 bits after narrowing vtmp1 from D to B.
      sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
      umov(dst, vtmp1, D, 0);
    }
  } else if (UseSVE > 0) {
    // Compress the lowest 8 bytes.
    fmovd(dst, vtmp1);
    bytemask_compress(dst);
    if (lane_cnt <= 8) return;

    // Repeat on higher bytes and join the results.
    // Compress 8 bytes in each iteration.
    for (int idx = 1; idx < (lane_cnt / 8); idx++) {
      sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
      bytemask_compress(rscratch1);
      orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
    }
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Unpack the mask, a long value in src, into predicate register dst based on the
// corresponding data type. Note that dst can support at most 64 lanes.
// The example below gives the expected dst predicate register for different types,
// with a valid src (0x658D) on a 1024-bit vector size machine.
// BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
// SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
// INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
// LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
//
// The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D which
// has 24 significant bits would be an invalid input if dst predicate register refers to
// a LONG type 1024-bit vector, which has at most 16 lanes.
void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
                                           FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
         lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example:   src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
  // Expected:  dst = 0b01100101 10001101

  // Put the long value from the general purpose register into the first lane
  // of the vector.
  // vtmp1 = 0x0000000000000000 | 0x000000000000658D
  sve_dup(vtmp1, B, 0);
  mov(vtmp1, D, 0, src);

  // As sve_cmp generates the mask with a minimum unit of one byte, we have to
  // transform the value in the first lane, which is a mask in bits now, into
  // a mask in bytes, which can be done with SVE2's BDEP instruction.

  // The first source input of the BDEP instruction. Deposit each byte in every 8 bytes.
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  if (lane_cnt <= 8) {
    // Nothing to do, as only one byte exists.
  } else if (lane_cnt <= 16) {
    ins(vtmp1, B, vtmp1, 8, 1);
    mov(vtmp1, B, 1, zr);
  } else {
    sve_vector_extend(vtmp1, D, vtmp1, B);
  }

  // The second source input of the BDEP instruction, initialized with 0x01 for each byte.
  // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
  sve_dup(vtmp2, B, 1);

  // BDEP vtmp1.D, vtmp1.D, vtmp2.D
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  // vtmp2 = 0x0101010101010101 | 0x0101010101010101
  //         ---------------------------------------
  // vtmp1 = 0x0001010000010001 | 0x0100000001010001
  sve_bdep(vtmp1, D, vtmp1, vtmp2);

  if (bt != T_BYTE) {
    sve_vector_extend(vtmp1, size, vtmp1, B);
  }
  // Generate the mask according to the given vector, in which the elements
  // have been extended to the expected type.
  // dst = 0b01100101 10001101
  sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
}

// Clobbers: rflags
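// As with neon_compare above, the inverted conditions LE/LT/LO/LS are
// implemented by swapping the operands of the one-direction SVE compares.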
void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
                                    FloatRegister zn, FloatRegister zm, Condition cond) {
  assert(pg->is_governing(), "This register has to be a governing predicate register");
  FloatRegister z1 = zn, z2 = zm;
  switch (cond) {
    case LE: z1 = zm; z2 = zn; cond = GE; break;
    case LT: z1 = zm; z2 = zn; cond = GT; break;
    case LO: z1 = zm; z2 = zn; cond = HI; break;
    case LS: z1 = zm; z2 = zn; cond = HS; break;
    default:
      break;
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (is_floating_point_type(bt)) {
    sve_fcm(cond, pd, size, pg, z1, z2);
  } else {
    assert(is_integral_type(bt), "unsupported element type");
    sve_cmp(cond, pd, size, pg, z1, z2);
  }
}

// Get index of the last mask lane that is set
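// (Reverse the mask, use BRKB + CNTP to count the lanes before the first set
// lane -- i.e. the lanes after the last set lane in the original order -- and
// subtract that count from lane_cnt - 1.)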
1331 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
1332   SIMD_RegVariant size = elemType_to_regVariant(bt);
1333   sve_rev(ptmp, size, src);
1334   sve_brkb(ptmp, ptrue, ptmp, false);
1335   sve_cntp(dst, size, ptrue, ptmp);
1336   movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
1337   subw(dst, rscratch1, dst);
1338 }
1339 
1340 // Extend integer vector src to dst with the same lane count
1341 // but larger element size, e.g. 4B -> 4I
1342 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1343                                            FloatRegister src, BasicType src_bt, bool is_unsigned) {
1344   if (src_bt == T_BYTE) {
1345     if (dst_bt == T_SHORT) {
1346       // 4B/8B to 4S/8S
1347       _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1348     } else {
1349       // 4B to 4I
1350       assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1351       _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1352       _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1353     }
1354   } else if (src_bt == T_SHORT) {
1355     // 4S to 4I
1356     assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1357     _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1358   } else if (src_bt == T_INT) {
1359     // 2I to 2L
1360     assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1361     _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1362   } else {
1363     ShouldNotReachHere();
1364   }
1365 }
1366 
1367 // Narrow integer vector src down to dst with the same lane count
1368 // but smaller element size, e.g. 4I -> 4B
1369 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1370                                            FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1371   if (src_bt == T_SHORT) {
1372     // 4S/8S to 4B/8B
1373     assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1374     assert(dst_bt == T_BYTE, "unsupported");
1375     xtn(dst, T8B, src, T8H);
1376   } else if (src_bt == T_INT) {
1377     // 4I to 4B/4S
1378     assert(src_vlen_in_bytes == 16, "unsupported");
1379     assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1380     xtn(dst, T4H, src, T4S);
1381     if (dst_bt == T_BYTE) {
1382       xtn(dst, T8B, dst, T8H);
1383     }
1384   } else if (src_bt == T_LONG) {
1385     // 2L to 2I
1386     assert(src_vlen_in_bytes == 16, "unsupported");
1387     assert(dst_bt == T_INT, "unsupported");
1388     xtn(dst, T2S, src, T2D);
1389   } else {
1390     ShouldNotReachHere();
1391   }
1392 }
1393 
1394 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1395                                           FloatRegister src, SIMD_RegVariant src_size,
1396                                           bool is_unsigned) {
1397   assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
1398 
1399   if (src_size == B) {
1400     switch (dst_size) {
1401     case H:
1402       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1403       break;
1404     case S:
1405       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1406       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1407       break;
1408     case D:
1409       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1410       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1411       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1412       break;
1413     default:
1414       ShouldNotReachHere();
1415     }
1416   } else if (src_size == H) {
1417     if (dst_size == S) {
1418       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1419     } else { // D
1420       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1421       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1422     }
1423   } else if (src_size == S) {
1424     _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
1425   }
1426 }
1427 
1428 // Vector narrow from src to dst with specified element sizes.
1429 // High part of dst vector will be filled with zero.
1430 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1431                                           FloatRegister src, SIMD_RegVariant src_size,
1432                                           FloatRegister tmp) {
1433   assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1434   assert_different_registers(src, tmp);
1435   sve_dup(tmp, src_size, 0);
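  // At the destination element size, uzp1 concatenates the even-numbered
  // elements of src (the narrowed values) with the all-zero elements of tmp,
  // so the upper half of dst is filled with zero.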
1436   if (src_size == D) {
1437     switch (dst_size) {
1438     case S:
1439       sve_uzp1(dst, S, src, tmp);
1440       break;
1441     case H:
1442       assert_different_registers(dst, tmp);
1443       sve_uzp1(dst, S, src, tmp);
1444       sve_uzp1(dst, H, dst, tmp);
1445       break;
1446     case B:
1447       assert_different_registers(dst, tmp);
1448       sve_uzp1(dst, S, src, tmp);
1449       sve_uzp1(dst, H, dst, tmp);
1450       sve_uzp1(dst, B, dst, tmp);
1451       break;
1452     default:
1453       ShouldNotReachHere();
1454     }
1455   } else if (src_size == S) {
1456     if (dst_size == H) {
1457       sve_uzp1(dst, H, src, tmp);
1458     } else { // B
1459       assert_different_registers(dst, tmp);
1460       sve_uzp1(dst, H, src, tmp);
1461       sve_uzp1(dst, B, dst, tmp);
1462     }
1463   } else if (src_size == H) {
1464     sve_uzp1(dst, B, src, tmp);
1465   }
1466 }
1467 
// Extend src predicate to dst predicate with the same lane count but larger
// element size, e.g. 64Byte -> 512Long (an 8-lane byte mask to an 8-lane long mask)
1470 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1471                                              uint dst_element_length_in_bytes,
1472                                              uint src_element_length_in_bytes) {
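  // Each punpklo doubles the predicate element size while keeping the lanes
  // from the low half, so it is applied once per doubling of the element size.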
1473   if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1474     sve_punpklo(dst, src);
1475   } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1476     sve_punpklo(dst, src);
1477     sve_punpklo(dst, dst);
1478   } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1479     sve_punpklo(dst, src);
1480     sve_punpklo(dst, dst);
1481     sve_punpklo(dst, dst);
1482   } else {
1483     assert(false, "unsupported");
1484     ShouldNotReachHere();
1485   }
1486 }
1487 
// Narrow src predicate to dst predicate with the same lane count but
// smaller element size, e.g. 512Long -> 64Byte (an 8-lane long mask to an 8-lane byte mask)
1490 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1491                                              uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
  // The insignificant bits in the src predicate are expected to be zero.
  // To ensure the higher-order bits of the narrowed result are zero, an all-false
  // predicate is passed as the second argument of uzp1. An example narrowing with
  // a given mask: 128Long -> 64Int on a 128-bit machine, i.e. 2L -> 2I.
  // Mask (for 2 longs)                              : TF
  // Predicate register for the above mask (16 bits) : 00000001 00000000
  // After narrowing (uzp1 dst.b, src.b, ptmp.b)     : 0000 0000 0001 0000
  // which is the mask for 2 ints: TF (the lower half holds the mask, the upper half is zero)
1500   assert_different_registers(src, ptmp);
1501   assert_different_registers(dst, ptmp);
1502   sve_pfalse(ptmp);
1503   if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1504     sve_uzp1(dst, B, src, ptmp);
1505   } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1506     sve_uzp1(dst, H, src, ptmp);
1507     sve_uzp1(dst, B, dst, ptmp);
1508   } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1509     sve_uzp1(dst, S, src, ptmp);
1510     sve_uzp1(dst, H, dst, ptmp);
1511     sve_uzp1(dst, B, dst, ptmp);
1512   } else {
1513     assert(false, "unsupported");
1514     ShouldNotReachHere();
1515   }
1516 }
1517 
1518 // Vector reduction add for integral type with ASIMD instructions.
1519 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1520                                                  Register isrc, FloatRegister vsrc,
1521                                                  unsigned vector_length_in_bytes,
1522                                                  FloatRegister vtmp) {
1523   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1524   assert_different_registers(dst, isrc);
1525   bool isQ = vector_length_in_bytes == 16;
1526 
1527   BLOCK_COMMENT("neon_reduce_add_integral {");
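    // Reduce the vector to a single lane with addv/addp, move the scalar to a
    // GPR, then add the incoming scalar isrc, sign-extending sub-word elements.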
1528     switch(bt) {
1529       case T_BYTE:
1530         addv(vtmp, isQ ? T16B : T8B, vsrc);
1531         smov(dst, vtmp, B, 0);
1532         addw(dst, dst, isrc, ext::sxtb);
1533         break;
1534       case T_SHORT:
1535         addv(vtmp, isQ ? T8H : T4H, vsrc);
1536         smov(dst, vtmp, H, 0);
1537         addw(dst, dst, isrc, ext::sxth);
1538         break;
1539       case T_INT:
1540         isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1541         umov(dst, vtmp, S, 0);
1542         addw(dst, dst, isrc);
1543         break;
1544       case T_LONG:
1545         assert(isQ, "unsupported");
1546         addpd(vtmp, vsrc);
1547         umov(dst, vtmp, D, 0);
1548         add(dst, dst, isrc);
1549         break;
1550       default:
1551         assert(false, "unsupported");
1552         ShouldNotReachHere();
1553     }
1554   BLOCK_COMMENT("} neon_reduce_add_integral");
1555 }
1556 
1557 // Vector reduction multiply for integral type with ASIMD instructions.
1558 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
1559 // Clobbers: rscratch1
1560 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
1561                                                  Register isrc, FloatRegister vsrc,
1562                                                  unsigned vector_length_in_bytes,
1563                                                  FloatRegister vtmp1, FloatRegister vtmp2) {
1564   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1565   bool isQ = vector_length_in_bytes == 16;
1566 
1567   BLOCK_COMMENT("neon_reduce_mul_integral {");
1568     switch(bt) {
1569       case T_BYTE:
1570         if (isQ) {
          // Multiply the lower half and upper half of the vector iteratively.
1572           // vtmp1 = vsrc[8:15]
1573           ins(vtmp1, D, vsrc, 0, 1);
1574           // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
1575           mulv(vtmp1, T8B, vtmp1, vsrc);
1576           // vtmp2 = vtmp1[4:7]
1577           ins(vtmp2, S, vtmp1, 0, 1);
1578           // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
1579           mulv(vtmp1, T8B, vtmp2, vtmp1);
1580         } else {
1581           ins(vtmp1, S, vsrc, 0, 1);
1582           mulv(vtmp1, T8B, vtmp1, vsrc);
1583         }
1584         // vtmp2 = vtmp1[2:3]
1585         ins(vtmp2, H, vtmp1, 0, 1);
1586         // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
1587         mulv(vtmp2, T8B, vtmp2, vtmp1);
1588         // dst = vtmp2[0] * isrc * vtmp2[1]
1589         umov(rscratch1, vtmp2, B, 0);
1590         mulw(dst, rscratch1, isrc);
1591         sxtb(dst, dst);
1592         umov(rscratch1, vtmp2, B, 1);
1593         mulw(dst, rscratch1, dst);
1594         sxtb(dst, dst);
1595         break;
1596       case T_SHORT:
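        // Same iterative halving as the T_BYTE case, starting from 4H lanes;
        // the last two lanes are combined in GPRs below.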
1597         if (isQ) {
1598           ins(vtmp2, D, vsrc, 0, 1);
1599           mulv(vtmp2, T4H, vtmp2, vsrc);
1600           ins(vtmp1, S, vtmp2, 0, 1);
1601           mulv(vtmp1, T4H, vtmp1, vtmp2);
1602         } else {
1603           ins(vtmp1, S, vsrc, 0, 1);
1604           mulv(vtmp1, T4H, vtmp1, vsrc);
1605         }
1606         umov(rscratch1, vtmp1, H, 0);
1607         mulw(dst, rscratch1, isrc);
1608         sxth(dst, dst);
1609         umov(rscratch1, vtmp1, H, 1);
1610         mulw(dst, rscratch1, dst);
1611         sxth(dst, dst);
1612         break;
1613       case T_INT:
1614         if (isQ) {
1615           ins(vtmp1, D, vsrc, 0, 1);
1616           mulv(vtmp1, T2S, vtmp1, vsrc);
1617         } else {
1618           vtmp1 = vsrc;
1619         }
1620         umov(rscratch1, vtmp1, S, 0);
1621         mul(dst, rscratch1, isrc);
1622         umov(rscratch1, vtmp1, S, 1);
1623         mul(dst, rscratch1, dst);
1624         break;
1625       case T_LONG:
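        // There is no ASIMD multiply for 64-bit lanes, so both multiplications
        // are done in GPRs.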
1626         umov(rscratch1, vsrc, D, 0);
1627         mul(dst, isrc, rscratch1);
1628         umov(rscratch1, vsrc, D, 1);
1629         mul(dst, dst, rscratch1);
1630         break;
1631       default:
1632         assert(false, "unsupported");
1633         ShouldNotReachHere();
1634     }
1635   BLOCK_COMMENT("} neon_reduce_mul_integral");
1636 }
1637 
1638 // Vector reduction multiply for floating-point type with ASIMD instructions.
1639 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
1640                                            FloatRegister fsrc, FloatRegister vsrc,
1641                                            unsigned vector_length_in_bytes,
1642                                            FloatRegister vtmp) {
1643   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1644   bool isQ = vector_length_in_bytes == 16;
1645 
1646   BLOCK_COMMENT("neon_reduce_mul_fp {");
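    // There is no single ASIMD instruction for a floating-point multiply
    // reduction, so combine the scalar input with each vector lane in turn.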
1647     switch(bt) {
1648       case T_FLOAT:
1649         fmuls(dst, fsrc, vsrc);
1650         ins(vtmp, S, vsrc, 0, 1);
1651         fmuls(dst, dst, vtmp);
1652         if (isQ) {
1653           ins(vtmp, S, vsrc, 0, 2);
1654           fmuls(dst, dst, vtmp);
1655           ins(vtmp, S, vsrc, 0, 3);
1656           fmuls(dst, dst, vtmp);
1657          }
1658         break;
1659       case T_DOUBLE:
1660         assert(isQ, "unsupported");
1661         fmuld(dst, fsrc, vsrc);
1662         ins(vtmp, D, vsrc, 0, 1);
1663         fmuld(dst, dst, vtmp);
1664         break;
1665       default:
1666         assert(false, "unsupported");
1667         ShouldNotReachHere();
1668     }
1669   BLOCK_COMMENT("} neon_reduce_mul_fp");
1670 }
1671 
1672 // Helper to select logical instruction
1673 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
1674                                                    Register Rn, Register Rm,
1675                                                    enum shift_kind kind, unsigned shift) {
1676   switch(opc) {
1677     case Op_AndReductionV:
1678       is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
1679       break;
1680     case Op_OrReductionV:
1681       is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
1682       break;
1683     case Op_XorReductionV:
1684       is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
1685       break;
1686     default:
1687       assert(false, "unsupported");
1688       ShouldNotReachHere();
1689   }
1690 }
1691 
1692 // Vector reduction logical operations And, Or, Xor
1693 // Clobbers: rscratch1
1694 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
1695                                             Register isrc, FloatRegister vsrc,
1696                                             unsigned vector_length_in_bytes) {
1697   assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
1698          "unsupported");
1699   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1700   assert_different_registers(dst, isrc);
1701   bool isQ = vector_length_in_bytes == 16;
1702 
1703   BLOCK_COMMENT("neon_reduce_logical {");
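    // Move both halves of the vector into GPRs and combine them, then keep
    // folding the 64-bit value against shifted copies of itself until the
    // element width is reached, and finally combine with isrc.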
1704     umov(rscratch1, vsrc, isQ ? D : S, 0);
1705     umov(dst, vsrc, isQ ? D : S, 1);
1706     neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
1707     switch(bt) {
1708       case T_BYTE:
1709         if (isQ) {
1710           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1711         }
1712         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1713         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
1714         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1715         sxtb(dst, dst);
1716         break;
1717       case T_SHORT:
1718         if (isQ) {
1719           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1720         }
1721         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1722         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1723         sxth(dst, dst);
1724         break;
1725       case T_INT:
1726         if (isQ) {
1727           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1728         }
1729         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1730         break;
1731       case T_LONG:
1732         assert(isQ, "unsupported");
1733         neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
1734         break;
1735       default:
1736         assert(false, "unsupported");
1737         ShouldNotReachHere();
1738     }
1739   BLOCK_COMMENT("} neon_reduce_logical");
1740 }
1741 
1742 // Vector reduction min/max for integral type with ASIMD instructions.
// Note: vtmp is not used and expected to be fnoreg for the T_LONG case.
1744 // Clobbers: rscratch1, rflags
1745 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
1746                                                     Register isrc, FloatRegister vsrc,
1747                                                     unsigned vector_length_in_bytes,
1748                                                     FloatRegister vtmp) {
1749   assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
1750   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1751   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
1752   assert_different_registers(dst, isrc);
1753   bool isQ = vector_length_in_bytes == 16;
1754   bool is_min = opc == Op_MinReductionV;
1755 
1756   BLOCK_COMMENT("neon_reduce_minmax_integral {");
1757     if (bt == T_LONG) {
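      // There is no ASIMD min/max reduction across 64-bit lanes, so compare
      // the two lanes with scalar instructions instead.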
1758       assert(vtmp == fnoreg, "should be");
1759       assert(isQ, "should be");
1760       umov(rscratch1, vsrc, D, 0);
1761       cmp(isrc, rscratch1);
1762       csel(dst, isrc, rscratch1, is_min ? LT : GT);
1763       umov(rscratch1, vsrc, D, 1);
1764       cmp(dst, rscratch1);
1765       csel(dst, dst, rscratch1, is_min ? LT : GT);
1766     } else {
1767       SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
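      // sminv/smaxv does not accept the 2S arrangement, so use a pairwise
      // min/max for 64-bit INT vectors.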
1768       if (size == T2S) {
1769         is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
1770       } else {
1771         is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
1772       }
1773       if (bt == T_INT) {
1774         umov(dst, vtmp, S, 0);
1775       } else {
1776         smov(dst, vtmp, elemType_to_regVariant(bt), 0);
1777       }
1778       cmpw(dst, isrc);
1779       cselw(dst, dst, isrc, is_min ? LT : GT);
1780     }
1781   BLOCK_COMMENT("} neon_reduce_minmax_integral");
1782 }
1783 
// Vector reduction for integral type with SVE instructions.
// Supported operations are Add, And, Or, Xor, Max, Min.
// rflags is clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
1787 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
1788                                             FloatRegister src2, PRegister pg, FloatRegister tmp) {
1789   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
1790   assert(pg->is_governing(), "This register has to be a governing predicate register");
1791   assert_different_registers(src1, dst);
  // Registers "dst" and "tmp" are clobbered; "src1" and "src2" are preserved.
1793   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1794   switch (opc) {
1795     case Op_AddReductionVI: {
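      // uaddv reduces all active lanes into lane 0 of tmp; move the low bits
      // to a GPR and add src1, sign-extending sub-word results.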
1796       sve_uaddv(tmp, size, pg, src2);
1797       if (bt == T_BYTE) {
1798         smov(dst, tmp, size, 0);
1799         addw(dst, src1, dst, ext::sxtb);
1800       } else if (bt == T_SHORT) {
1801         smov(dst, tmp, size, 0);
1802         addw(dst, src1, dst, ext::sxth);
1803       } else {
1804         umov(dst, tmp, size, 0);
1805         addw(dst, dst, src1);
1806       }
1807       break;
1808     }
1809     case Op_AddReductionVL: {
1810       sve_uaddv(tmp, size, pg, src2);
1811       umov(dst, tmp, size, 0);
1812       add(dst, dst, src1);
1813       break;
1814     }
1815     case Op_AndReductionV: {
1816       sve_andv(tmp, size, pg, src2);
1817       if (bt == T_INT || bt == T_LONG) {
1818         umov(dst, tmp, size, 0);
1819       } else {
1820         smov(dst, tmp, size, 0);
1821       }
1822       if (bt == T_LONG) {
1823         andr(dst, dst, src1);
1824       } else {
1825         andw(dst, dst, src1);
1826       }
1827       break;
1828     }
1829     case Op_OrReductionV: {
1830       sve_orv(tmp, size, pg, src2);
1831       if (bt == T_INT || bt == T_LONG) {
1832         umov(dst, tmp, size, 0);
1833       } else {
1834         smov(dst, tmp, size, 0);
1835       }
1836       if (bt == T_LONG) {
1837         orr(dst, dst, src1);
1838       } else {
1839         orrw(dst, dst, src1);
1840       }
1841       break;
1842     }
1843     case Op_XorReductionV: {
1844       sve_eorv(tmp, size, pg, src2);
1845       if (bt == T_INT || bt == T_LONG) {
1846         umov(dst, tmp, size, 0);
1847       } else {
1848         smov(dst, tmp, size, 0);
1849       }
1850       if (bt == T_LONG) {
1851         eor(dst, dst, src1);
1852       } else {
1853         eorw(dst, dst, src1);
1854       }
1855       break;
1856     }
1857     case Op_MaxReductionV: {
1858       sve_smaxv(tmp, size, pg, src2);
1859       if (bt == T_INT || bt == T_LONG) {
1860         umov(dst, tmp, size, 0);
1861       } else {
1862         smov(dst, tmp, size, 0);
1863       }
1864       if (bt == T_LONG) {
1865         cmp(dst, src1);
1866         csel(dst, dst, src1, Assembler::GT);
1867       } else {
1868         cmpw(dst, src1);
1869         cselw(dst, dst, src1, Assembler::GT);
1870       }
1871       break;
1872     }
1873     case Op_MinReductionV: {
1874       sve_sminv(tmp, size, pg, src2);
1875       if (bt == T_INT || bt == T_LONG) {
1876         umov(dst, tmp, size, 0);
1877       } else {
1878         smov(dst, tmp, size, 0);
1879       }
1880       if (bt == T_LONG) {
1881         cmp(dst, src1);
1882         csel(dst, dst, src1, Assembler::LT);
1883       } else {
1884         cmpw(dst, src1);
1885         cselw(dst, dst, src1, Assembler::LT);
1886       }
1887       break;
1888     }
1889     default:
1890       assert(false, "unsupported");
1891       ShouldNotReachHere();
1892   }
1893 
1894   if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
1895     if (bt == T_BYTE) {
1896       sxtb(dst, dst);
1897     } else if (bt == T_SHORT) {
1898       sxth(dst, dst);
1899     }
1900   }
1901 }
1902 
// Set elements of the dst predicate to true for lanes in the range [0, lane_cnt),
// and to false otherwise. The input "lane_cnt" must be less than or equal to the
// max vector length supported for the basic type. Clobbers: rscratch1, rflags.
1906 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
1907   uint32_t max_vector_length = Matcher::max_vector_size(bt);
1908   assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
1909 
1910   // Set all elements to false if the input "lane_cnt" is zero.
1911   if (lane_cnt == 0) {
1912     sve_pfalse(dst);
1913     return;
1914   }
1915 
1916   SIMD_RegVariant size = elemType_to_regVariant(bt);
1917   assert(size != Q, "invalid size");
1918 
  // Set all elements to true if "lane_cnt" equals the max lane count.
1920   if (lane_cnt == max_vector_length) {
1921     sve_ptrue(dst, size, /* ALL */ 0b11111);
1922     return;
1923   }
1924 
1925   // Fixed numbers for "ptrue".
1926   switch(lane_cnt) {
1927   case 1: /* VL1 */
1928   case 2: /* VL2 */
1929   case 3: /* VL3 */
1930   case 4: /* VL4 */
1931   case 5: /* VL5 */
1932   case 6: /* VL6 */
1933   case 7: /* VL7 */
1934   case 8: /* VL8 */
1935     sve_ptrue(dst, size, lane_cnt);
1936     return;
1937   case 16:
1938     sve_ptrue(dst, size, /* VL16 */ 0b01001);
1939     return;
1940   case 32:
1941     sve_ptrue(dst, size, /* VL32 */ 0b01010);
1942     return;
1943   case 64:
1944     sve_ptrue(dst, size, /* VL64 */ 0b01011);
1945     return;
1946   case 128:
1947     sve_ptrue(dst, size, /* VL128 */ 0b01100);
1948     return;
1949   case 256:
1950     sve_ptrue(dst, size, /* VL256 */ 0b01101);
1951     return;
1952   default:
1953     break;
1954   }
1955 
1956   // Special patterns for "ptrue".
1957   if (lane_cnt == round_down_power_of_2(max_vector_length)) {
1958     sve_ptrue(dst, size, /* POW2 */ 0b00000);
1959   } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
1960     sve_ptrue(dst, size, /* MUL4 */ 0b11101);
1961   } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
1962     sve_ptrue(dst, size, /* MUL3 */ 0b11110);
1963   } else {
    // Fall back to "whileltw" for the remaining cases.
1965     mov(rscratch1, lane_cnt);
1966     sve_whileltw(dst, size, zr, rscratch1);
1967   }
1968 }
1969 
1970 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
1971 // Any remaining elements of dst will be filled with zero.
1972 // Clobbers: rscratch1
1973 // Preserves: src, mask
1974 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
1975                                            FloatRegister vtmp1, FloatRegister vtmp2,
1976                                            PRegister pgtmp) {
1977   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
1978   assert_different_registers(dst, src, vtmp1, vtmp2);
1979   assert_different_registers(mask, pgtmp);
1980 
1981   // Example input:   src   = 8888 7777 6666 5555 4444 3333 2222 1111
1982   //                  mask  = 0001 0000 0000 0001 0001 0000 0001 0001
1983   // Expected result: dst   = 0000 0000 0000 8888 5555 4444 2222 1111
1984   sve_dup(vtmp2, H, 0);
1985 
1986   // Extend lowest half to type INT.
1987   // dst = 00004444 00003333 00002222 00001111
1988   sve_uunpklo(dst, S, src);
1989   // pgtmp = 00000001 00000000 00000001 00000001
1990   sve_punpklo(pgtmp, mask);
  // Pack the active INT-sized elements to the lowest-numbered lanes,
  // and fill the remaining lanes with zero.
1993   // dst = 00000000 00004444 00002222 00001111
1994   sve_compact(dst, S, dst, pgtmp);
1995   // Narrow the result back to type SHORT.
1996   // dst = 0000 0000 0000 0000 0000 4444 2222 1111
1997   sve_uzp1(dst, H, dst, vtmp2);
  // Count the active elements of the lowest half.
1999   // rscratch1 = 3
2000   sve_cntp(rscratch1, S, ptrue, pgtmp);
2001 
  // Repeat for the highest half.
2003   // pgtmp = 00000001 00000000 00000000 00000001
2004   sve_punpkhi(pgtmp, mask);
2005   // vtmp1 = 00008888 00007777 00006666 00005555
2006   sve_uunpkhi(vtmp1, S, src);
2007   // vtmp1 = 00000000 00000000 00008888 00005555
2008   sve_compact(vtmp1, S, vtmp1, pgtmp);
2009   // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2010   sve_uzp1(vtmp1, H, vtmp1, vtmp2);
2011 
2012   // Compressed low:   dst   = 0000 0000 0000 0000 0000 4444 2222 1111
  // Compressed high:  vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  // Shift the compressed high part left (across lanes) by TRUE_CNT lanes, where
  // TRUE_CNT is the number of active elements in the compressed low part.
2016   neg(rscratch1, rscratch1);
2017   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2018   sve_index(vtmp2, H, rscratch1, 1);
2019   // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
2020   sve_tbl(vtmp1, H, vtmp1, vtmp2);
2021 
  // Combine the shifted compressed high part with the compressed low part.
2023   // dst = 0000 0000 0000 8888 5555 4444 2222 1111
2024   sve_orr(dst, dst, vtmp1);
2025 }
2026 
2027 // Clobbers: rscratch1, rscratch2
2028 // Preserves: src, mask
2029 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
2030                                           FloatRegister vtmp1, FloatRegister vtmp2,
2031                                           FloatRegister vtmp3, FloatRegister vtmp4,
2032                                           PRegister ptmp, PRegister pgtmp) {
2033   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2034   assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
2035   assert_different_registers(mask, ptmp, pgtmp);
2036   // Example input:   src   = 88 77 66 55 44 33 22 11
2037   //                  mask  = 01 00 00 01 01 00 01 01
2038   // Expected result: dst   = 00 00 00 88 55 44 22 11
2039 
2040   sve_dup(vtmp4, B, 0);
2041   // Extend lowest half to type SHORT.
2042   // vtmp1 = 0044 0033 0022 0011
2043   sve_uunpklo(vtmp1, H, src);
2044   // ptmp = 0001 0000 0001 0001
2045   sve_punpklo(ptmp, mask);
  // Count the active elements of the lowest half.
2047   // rscratch2 = 3
2048   sve_cntp(rscratch2, H, ptrue, ptmp);
  // Pack the active SHORT-sized elements to the lowest-numbered lanes,
  // and fill the remaining lanes with zero.
2051   // dst = 0000 0044 0022 0011
2052   sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
2053   // Narrow the result back to type BYTE.
2054   // dst = 00 00 00 00 00 44 22 11
2055   sve_uzp1(dst, B, dst, vtmp4);
2056 
  // Repeat for the highest half.
2058   // ptmp = 0001 0000 0000 0001
2059   sve_punpkhi(ptmp, mask);
  // vtmp2 = 0088 0077 0066 0055
2061   sve_uunpkhi(vtmp2, H, src);
2062   // vtmp1 = 0000 0000 0088 0055
2063   sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);
2064 
2065   sve_dup(vtmp4, B, 0);
2066   // vtmp1 = 00 00 00 00 00 00 88 55
2067   sve_uzp1(vtmp1, B, vtmp1, vtmp4);
2068 
2069   // Compressed low:   dst   = 00 00 00 00 00 44 22 11
2070   // Compressed high:  vtmp1 = 00 00 00 00 00 00 88 55
  // Shift the compressed high part left (across lanes) by TRUE_CNT lanes, where
  // TRUE_CNT is the number of active elements in the compressed low part.
2073   neg(rscratch2, rscratch2);
2074   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2075   sve_index(vtmp2, B, rscratch2, 1);
2076   // vtmp1 = 00 00 00 88 55 00 00 00
2077   sve_tbl(vtmp1, B, vtmp1, vtmp2);
  // Combine the shifted compressed high part with the compressed low part.
2079   // dst = 00 00 00 88 55 44 22 11
2080   sve_orr(dst, dst, vtmp1);
2081 }
2082 
2083 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2084   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2085   SIMD_Arrangement size = isQ ? T16B : T8B;
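  // rbit reverses the bits within each byte. For wider element types, reverse
  // the bytes within each element first, so that each element as a whole is
  // bit-reversed.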
2086   if (bt == T_BYTE) {
2087     rbit(dst, size, src);
2088   } else {
2089     neon_reverse_bytes(dst, src, bt, isQ);
2090     rbit(dst, size, dst);
2091   }
2092 }
2093 
2094 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2095   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2096   SIMD_Arrangement size = isQ ? T16B : T8B;
2097   switch (bt) {
2098     case T_BYTE:
2099       if (dst != src) {
2100         orr(dst, size, src, src);
2101       }
2102       break;
2103     case T_SHORT:
2104       rev16(dst, size, src);
2105       break;
2106     case T_INT:
2107       rev32(dst, size, src);
2108       break;
2109     case T_LONG:
2110       rev64(dst, size, src);
2111       break;
2112     default:
2113       assert(false, "unsupported");
2114       ShouldNotReachHere();
2115   }
2116 }
2117 
// Extract a scalar element from an SVE vector at lane position 'idx'.
// The input elements in src are expected to be of integral type.
2120 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2121                                              int idx, FloatRegister vtmp) {
2122   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2123   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (regVariant_to_elemBits(size) * idx < 128) { // generate a lower-cost NEON instruction
2125     if (bt == T_INT || bt == T_LONG) {
2126       umov(dst, src, size, idx);
2127     } else {
2128       smov(dst, src, size, idx);
2129     }
2130   } else {
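    // The element lies beyond the lower 128 bits. Use EXT to shift it down to
    // lane 0 of a temporary, then move it out from there.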
2131     sve_orr(vtmp, src, src);
2132     sve_ext(vtmp, vtmp, idx << size);
2133     if (bt == T_INT || bt == T_LONG) {
2134       umov(dst, vtmp, size, 0);
2135     } else {
2136       smov(dst, vtmp, size, 0);
2137     }
2138   }
2139 }
2140 
2141 // java.lang.Math::round intrinsics
2142 
2143 // Clobbers: rscratch1, rflags
2144 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2145                                           FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
2146   assert_different_registers(tmp1, tmp2, tmp3, src, dst);
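  // A sketch of the selection logic below: fcvtas (round to nearest, ties away
  // from zero) matches Math.round for non-negative inputs and for negative
  // inputs whose magnitude is too large to have a fractional part; the
  // remaining lanes (small-magnitude negative values) are patched with
  // floor(src + 0.5) instead.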
2147   switch (T) {
2148     case T2S:
2149     case T4S:
2150       fmovs(tmp1, T, 0.5f);
2151       mov(rscratch1, jint_cast(0x1.0p23f));
2152       break;
2153     case T2D:
2154       fmovd(tmp1, T, 0.5);
2155       mov(rscratch1, julong_cast(0x1.0p52));
2156       break;
2157     default:
2158       assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2159   }
2160   fadd(tmp1, T, tmp1, src);
2161   fcvtms(tmp1, T, tmp1);
  // tmp1 = floor(src + 0.5), where the addition rounds ties to even
2163 
2164   fcvtas(dst, T, src);
  // dst = round(src), ties away from zero
2166 
2167   fneg(tmp3, T, src);
2168   dup(tmp2, T, rscratch1);
2169   cm(HS, tmp3, T, tmp3, tmp2);
  // tmp3 is now a per-lane selection mask
2171 
2172   bif(dst, T16B, tmp1, tmp3);
2173   // result in dst
2174 }
2175 
2176 // Clobbers: rscratch1, rflags
2177 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2178                                          FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
2179   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2180   assert_different_registers(tmp1, tmp2, src, dst);
2181 
2182   switch (T) {
2183     case S:
2184       mov(rscratch1, jint_cast(0x1.0p23f));
2185       break;
2186     case D:
2187       mov(rscratch1, julong_cast(0x1.0p52));
2188       break;
2189     default:
2190       assert(T == S || T == D, "invalid register variant");
2191   }
2192 
2193   sve_frinta(dst, T, ptrue, src);
  // dst = round(src), ties away from zero
2195 
2196   Label none;
2197 
2198   sve_fneg(tmp1, T, ptrue, src);
2199   sve_dup(tmp2, T, rscratch1);
2200   sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
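  // pgtmp now selects the lanes that need the floor(src + 0.5) fixup (negative
  // values of small magnitude); skip the fixup entirely if no lane is active.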
2201   br(EQ, none);
2202   {
2203     sve_cpy(tmp1, T, pgtmp, 0.5);
2204     sve_fadd(tmp1, T, pgtmp, src);
2205     sve_frintm(dst, T, pgtmp, tmp1);
    // dst = floor(src + 0.5), where the addition rounds ties to even
2207   }
2208   bind(none);
2209 
2210   sve_fcvtzs(dst, T, ptrue, dst, T);
2211   // result in dst
2212 }
2213 
2214 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2215                                            FloatRegister one, SIMD_Arrangement T) {
2216   assert_different_registers(dst, src, zero, one);
2217   assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2218 
2219   facgt(dst, T, src, zero);
2220   ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
2221   bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2222 }
2223 
2224 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2225                                           FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
  assert_different_registers(dst, src, zero, one, vtmp);
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");

  sve_orr(vtmp, src, src);
  sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN, 1 otherwise
  switch (T) {
  case S:
    sve_and(vtmp, T, min_jint); // Extract the sign bit of the float value in every lane of src
    sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
                                      // on the sign of the float value
    break;
  case D:
    sve_and(vtmp, T, min_jlong);
    sve_orr(vtmp, T, jlong_cast(1.0));
    break;
  default:
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
  sve_sel(dst, T, pgtmp, vtmp, src); // Select vtmp (+1/-1) for lanes where pgtmp is set,
                                     // otherwise src (+-0.0 or NaN). Result in dst.
2247 }
2248 
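// C2 emits code into a scratch buffer first to measure its size; report that
// state from the current PhaseOutput as well when inside a compilation task.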
2249 bool C2_MacroAssembler::in_scratch_emit_size() {
2250   if (ciEnv::current()->task() != nullptr) {
2251     PhaseOutput* phase_output = Compile::current()->output();
2252     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2253       return true;
2254     }
2255   }
2256   return MacroAssembler::in_scratch_emit_size();
2257 }