/*
 * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/matcher.hpp"
#include "opto/output.hpp"
#include "opto/subnode.hpp"
#include "runtime/stubRoutines.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

// Search for str1 in str2 and return index or -1
// Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }
// The Boyer Moore algorithm is based on the description here:
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// It describes an algorithm with two shift rules: the 'Bad Character' rule
// and the 'Good Suffix' rule.
//
// These rules are essentially heuristics for how far we can shift the
// pattern along the search string.
//
// The implementation here uses only the 'Bad Character' rule, because of the
// complexity of initialisation for the 'Good Suffix' rule.
//
// This is also known as the Boyer-Moore-Horspool algorithm:
//
// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
//
// This particular implementation has a few Java-specific optimizations.
//
// #define ASIZE 256
//
//    int bm(unsigned char *x, int m, unsigned char *y, int n) {
//       int i, j;
//       unsigned c;
//       unsigned char bc[ASIZE];
//
//       /* Preprocessing */
//       for (i = 0; i < ASIZE; ++i)
//          bc[i] = m;
//       for (i = 0; i < m - 1; ) {
//          c = x[i];
//          ++i;
//          // c < 256 for a Latin1 string, so no need for a branch
//          #ifdef PATTERN_STRING_IS_LATIN1
//          bc[c] = m - i;
//          #else
//          if (c < ASIZE) bc[c] = m - i;
//          #endif
//       }
//
//       /* Searching */
//       j = 0;
//       while (j <= n - m) {
//          c = y[j+m-1];
//          if (x[m-1] == c) {
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//            if (i < 0) return j;
//          }
//          // c < 256 for a Latin1 string, so no need for a branch
//          #ifdef SOURCE_STRING_IS_LATIN1
//          // LL case: (c < 256) is always true. Remove the branch.
//          j += bc[y[j+m-1]];
//          #endif
//          #ifdef SOURCE_IS_UTF_AND_PATTERN_IS_UTF
//          // UU case: need the if (c < ASIZE) check. Skip 1 character if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += 1;
//          #endif
//          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
//          // UL case: need the if (c < ASIZE) check. Skip <pattern length> if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += m;
//          #endif
//       }
//       return -1;
//    }
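//
// Worked example of the bad-character shift (illustrative, not from the
// sources): for pattern x = "abc" (m = 3) the preprocessing above yields
// bc['a'] = 2, bc['b'] = 1 and bc[c] = 3 for every other c. Searching
// y = "xxabc": at j = 0 the source char aligned with the pattern's last
// position is y[2] = 'a', which differs from x[2] = 'c', so j advances by
// bc['a'] = 2 and the next alignment at j = 2 is the match.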

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >= 8, so we can read at least one register (8 chars in
    // the LL case, 4 in the UU case) when no Latin1->UTF conversion is
    // needed, and half a register (4 chars) in the UL case. We re-read the
    // last character in the inner pre-loop code so that the outer pre-loop
    // needs only a single load.
    const int firstStep = isL ? 7 : 3;

    const int ASIZE = 256;
    const int STORED_BYTES = 32; // number of bytes stored per instruction
    sub(sp, sp, ASIZE);
    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
    mov(ch1, sp);
    BIND(BM_INIT_LOOP);
      stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
      subs(tmp5, tmp5, 1);
      br(GT, BM_INIT_LOOP);

      sub(cnt1tmp, cnt1, 1);
      mov(tmp5, str2);
      add(str2end, str2, result_tmp, LSL, str2_chr_shift);
      sub(ch2, cnt1, 1);
      mov(tmp3, str1);
    BIND(BCLOOP);
      (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
      if (!str1_isL) {
        subs(zr, ch1, ASIZE);
        br(HS, BCSKIP);
      }
      strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
      subs(ch2, ch2, 1);
      br(GT, BCLOOP);

      add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
      if (str1_isL == str2_isL) {
        // load last 8 bytes (8LL/4UU symbols)
        ldr(tmp6, Address(tmp6, -wordSize));
      } else {
        ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes (4 symbols)
        // convert Latin1 to UTF. We'll have to wait until the load completes,
        // but it's still faster than per-character loads+checks
        lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
        ubfx(ch1, tmp6, 8, 8); // str1[N-3]
        ubfx(ch2, tmp6, 16, 8); // str1[N-2]
        andr(tmp6, tmp6, 0xFF); // str1[N-4]
        orr(ch2, ch1, ch2, LSL, 16);
        orr(tmp6, tmp6, tmp3, LSL, 48);
        orr(tmp6, tmp6, ch2, LSL, 16);
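        // Worked example (illustrative): if the last four Latin1 chars are
        // "abcd", the little-endian 4-byte load yields 0x64636261 and the
        // shuffle above repacks it as 0x0064006300620061, i.e. "abcd" in
        // UTF-16 order.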
      }
    BIND(BMLOOPSTR2);
      (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
      sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
      if (str1_isL == str2_isL) {
        // re-init tmp3. It's free because it executes in parallel with the
        // load above. The alternative is to initialize it before the loop,
        // but that would hurt performance on in-order systems with 2 or more
        // ld/st pipelines
        lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
      }
      if (!isL) { // UU/UL case
        lsl(ch2, cnt1tmp, 1); // offset in bytes
      }
      cmp(tmp3, skipch);
      br(NE, BMSKIP);
      ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
      mov(ch1, tmp6);
      if (isL) {
        b(BMLOOPSTR1_AFTER_LOAD);
      } else {
        sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
        b(BMLOOPSTR1_CMP);
      }
    BIND(BMLOOPSTR1);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    BIND(BMLOOPSTR1_AFTER_LOAD);
      subs(cnt1tmp, cnt1tmp, 1);
      br(LT, BMLOOPSTR1_LASTCMP);
    BIND(BMLOOPSTR1_CMP);
      cmp(ch1, ch2);
      br(EQ, BMLOOPSTR1);
    BIND(BMSKIP);
      if (!isL) {
        // if we've met a UTF symbol while searching for a Latin1 pattern,
        // then we can skip cnt1 symbols
        if (str1_isL != str2_isL) {
          mov(result_tmp, cnt1);
        } else {
          mov(result_tmp, 1);
        }
        subs(zr, skipch, ASIZE);
        br(HS, BMADV);
      }
      ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
      sub(cnt1tmp, cnt1, 1);
      add(str2, str2, result_tmp, LSL, str2_chr_shift);
      cmp(str2, str2end);
      br(LE, BMLOOPSTR2);
      add(sp, sp, ASIZE);
      b(NOMATCH);
    BIND(BMLOOPSTR1_LASTCMP);
      cmp(ch1, ch2);
      br(NE, BMSKIP);
    BIND(BMMATCH);
      sub(result, str2, tmp5);
      if (!str2_isL) lsr(result, result, 1);
      add(sp, sp, ASIZE);
      b(DONE);

    BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
    br(LT, LINEAR_MEDIUM);
    mov(result, zr);
    RuntimeAddress stub = nullptr;
    if (isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
      assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
    } else if (str1_isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
    } else {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
      assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
    }
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);
  }

  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    Register str2tmp = tmp2;
    Register first = tmp3;

    if (icnt1 == -1)
    {
        Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

        cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
        br(LT, DOSHORT);
      BIND(LINEAR_MEDIUM);
        (this->*str1_load_1chr)(first, Address(str1));
        lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
        sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(FIRST_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmp(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        adds(cnt1tmp, cnt1_neg, str1_chr_size);
        add(cnt2tmp, cnt2_neg, str2_chr_size);
        br(GE, MATCH);

      BIND(STR1_NEXT);
        (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        adds(cnt1tmp, cnt1tmp, str1_chr_size);
        add(cnt2tmp, cnt2tmp, str2_chr_size);
        br(LT, STR1_NEXT);
        b(MATCH);

      BIND(DOSHORT);
      if (str1_isL == str2_isL) {
        cmp(cnt1, (u1)2);
        br(LT, DO1);
        br(GT, DO3);
      }
    }

    if (icnt1 == 4) {
      Label CH1_LOOP;

        (this->*load_4chr)(ch1, str1);
        sub(result_tmp, cnt2, 4);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(CH1_LOOP);
        (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
      Label CH1_LOOP;

      BIND(DO2);
        (this->*load_2chr)(ch1, str1);
        if (icnt1 == 2) {
          sub(result_tmp, cnt2, 2);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(CH1_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

      BIND(DO3);
        (this->*load_2chr)(first, str1);
        (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
        if (icnt1 == 3) {
          sub(result_tmp, cnt2, 3);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(FIRST_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmpw(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        b(MATCH);
    }

    if (icnt1 == -1 || icnt1 == 1) {
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;

      BIND(DO1);
        (this->*str1_load_1chr)(ch1, str1);
        cmp(cnt2, (u1)8);
        br(LT, DO1_SHORT);

        sub(result_tmp, cnt2, 8/str2_chr_size);
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
        mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

        if (str2_isL) {
          orr(ch1, ch1, ch1, LSL, 8);
        }
        orr(ch1, ch1, ch1, LSL, 16);
        orr(ch1, ch1, ch1, LSL, 32);
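        // The loop below uses the classic SWAR zero-lane test. A minimal C
        // sketch of the idea for byte lanes (variable names illustrative):
        //   uint64_t x = loaded ^ pattern;  // lanes that match ch become zero
        //   uint64_t t = (x - 0x0101010101010101) & ~(x | 0x7f7f7f7f7f7f7f7f);
        //   // t is non-zero iff some lane of x is zero, i.e. some char matched
        // For UTF-16 data the same trick is applied with 16-bit lanes and the
        // 0x0001.../0x7fff... constants loaded above.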
      BIND(CH1_LOOP);
        ldr(ch2, Address(str2, cnt2_neg));
        eor(ch2, ch1, ch2);
        sub(tmp1, ch2, tmp3);
        orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
        bics(tmp1, tmp1, tmp2);
        br(NE, HAS_ZERO);
        adds(cnt2_neg, cnt2_neg, 8);
        br(LT, CH1_LOOP);

        cmp(cnt2_neg, (u1)8);
        mov(cnt2_neg, 0);
        br(LT, CH1_LOOP);
        b(NOMATCH);

      BIND(HAS_ZERO);
        rev(tmp1, tmp1);
        clz(tmp1, tmp1);
        add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
        b(MATCH);

      BIND(DO1_SHORT);
        mov(result_tmp, cnt2);
        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
      BIND(DO1_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmpw(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);

void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7fff7fff7fff7fff);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1, Address::uxtw(1)));
    sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
    ldrh(ch1, Address(str1, cnt1_neg));
    cmpw(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 2);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}

void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
                                                Register ch, Register result,
                                                FloatRegister ztmp1,
                                                FloatRegister ztmp2,
                                                PRegister tmp_pg,
                                                PRegister tmp_pdn, bool isL)
{
  // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
  assert(tmp_pg->is_governing(),
         "this register has to be a governing predicate register");

  Label LOOP, MATCH, DONE, NOMATCH;
  Register vec_len = rscratch1;
  Register idx = rscratch2;

  SIMD_RegVariant T = isL ? B : H;

  cbz(cnt1, NOMATCH);

  // Assign the particular char throughout the vector.
  sve_dup(ztmp2, T, ch);
  if (isL) {
    sve_cntb(vec_len);
  } else {
    sve_cnth(vec_len);
  }
  mov(idx, 0);

  // Generate a predicate to control the reading of input string.
  sve_whilelt(tmp_pg, T, idx, cnt1);

  BIND(LOOP);
    // Read a vector of 8- or 16-bit data depending on the string type. Note
    // that inactive elements indicated by the predicate register won't cause
    // a data read from memory to the destination vector.
    if (isL) {
      sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
    } else {
      sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
    }
    add(idx, idx, vec_len);

    // Perform the comparison. An element of the destination predicate is set
    // to active if the particular char is matched.
    sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);

    // Branch if the particular char is found.
    br(NE, MATCH);

    sve_whilelt(tmp_pg, T, idx, cnt1);

    // Loop back if the particular char is not found.
    br(MI, LOOP);

  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);

  BIND(MATCH);
    // Undo the index increment.
    sub(idx, idx, vec_len);

    // Crop the match predicate at the first active element so that its
    // location can be counted.
    sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
    add(result, idx, -1);
    sve_incp(result, T, tmp_pdn);
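    // e.g. with byte lanes and the first match in lane 3 of the current
    // chunk, BRKA leaves 4 active lanes, so INCP adds 4 and
    // result = (idx - 1) + 4 = idx + 3.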
  BIND(DONE);
}

void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)8);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 8);
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 8);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);

  mov(tmp3, 0x0101010101010101);

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1));
    sub(cnt1_neg, zr, cnt1);
  BIND(DO1_LOOP);
    ldrb(ch1, Address(str1, cnt1_neg));
    cmp(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 1);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg);
  BIND(DONE);
}

// Compare strings.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; the result, however, is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      ldr(tmp1, Address(str1));
      ldrs(vtmp, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFF);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
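    // Sketch of the idea in C (names illustrative): with d = tmp1 ^ tmp2,
    //   bit = clz(byte_reverse(d)) & (isLL ? -8 : -16);
    // is the bit offset of the lowest differing character, because the words
    // were loaded little-endian; shifting both words right by 'bit' brings
    // that character to bit 0, ready for a zero-extend and scalar subtract.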
    bind(DIFF);
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
    RuntimeAddress stub = nullptr;
    switch(ae) {
      case StrIntrinsicNode::LL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
        break;
      case StrIntrinsicNode::UU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
        break;
      case StrIntrinsicNode::LU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
        break;
      case StrIntrinsicNode::UL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
        break;
      default:
        ShouldNotReachHere();
    }
    assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // Arrange the code so that most branches are taken while loads are in
  // flight: load the next characters while comparing the previous ones.
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}

void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
                                     FloatRegister src2, Condition cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
  FloatRegister zn = src1, zm = src2;
  bool needs_negation = false;
  switch (cond) {
    case LT: cond = GT; zn = src2; zm = src1; break;
    case LE: cond = GE; zn = src2; zm = src1; break;
    case LO: cond = HI; zn = src2; zm = src1; break;
    case LS: cond = HS; zn = src2; zm = src1; break;
    case NE: cond = EQ; needs_negation = true; break;
    default:
      break;
  }

  if (is_floating_point_type(bt)) {
    fcm(cond, dst, size, zn, zm);
  } else {
    cm(cond, dst, size, zn, zm);
  }

  if (needs_negation) {
    notr(dst, isQ ? T16B : T8B, dst);
  }
}

void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
                                          Condition cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
  if (bt == T_FLOAT || bt == T_DOUBLE) {
    if (cond == Assembler::NE) {
      fcm(Assembler::EQ, dst, size, src);
      notr(dst, isQ ? T16B : T8B, dst);
    } else {
      fcm(cond, dst, size, src);
    }
  } else {
    if (cond == Assembler::NE) {
      cm(Assembler::EQ, dst, size, src);
      notr(dst, isQ ? T16B : T8B, dst);
    } else {
      cm(cond, dst, size, src);
    }
  }
}

// Compress the least significant bit of each byte to the rightmost and clear
// the higher garbage bits.
void C2_MacroAssembler::bytemask_compress(Register dst) {
  // Example input, dst = 0x01 00 00 00 01 01 00 01
  // The "??" bytes are garbage.
  orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
  orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x?? ?? ?? 08 ?? ?? ?? 0D
  orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x?? ?? ?? ?? ?? ?? ?? 8D
  andr(dst, dst, 0xff);                   // dst = 0x8D
}
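
// A minimal C sketch of the same folding (illustrative; v is a uint64_t whose
// bytes are each 0x00 or 0x01):
//   v |= v >> 7;   // pairs of flag bits collect in the even bytes
//   v |= v >> 14;  // nibbles of flag bits collect in bytes 0 and 4
//   v |= v >> 28;  // all 8 flag bits collect in byte 0
//   v &= 0xff;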

// Pack the lowest-numbered bit of each mask element in src into a long value
// in dst, at most the first 64 lane elements.
// Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM.
void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
                                         FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
  assert_different_registers(dst, rscratch1);
  assert_different_registers(vtmp1, vtmp2);

  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example:   src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
  // Expected:  dst = 0x658D

  // Convert the mask into a vector with sequential bytes.
  // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
  sve_cpy(vtmp1, size, src, 1, false);
  if (bt != T_BYTE) {
    sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
  }

  if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
    // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
    // is to compress each significant bit of the byte in a cross-lane way. Due
    // to the lack of a cross-lane bit-compress instruction, we use BEXT
    // (bit-compress in each lane) with the biggest lane size (T = D) then
    // concatenate the results.

    // The second source input of BEXT, initialized with 0x01 in each byte.
    // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
    sve_dup(vtmp2, B, 1);

    // BEXT vtmp1.D, vtmp1.D, vtmp2.D
    // vtmp1 = 0x0001010000010001 | 0x0100000001010001
    // vtmp2 = 0x0101010101010101 | 0x0101010101010101
    //         ---------------------------------------
    // vtmp1 = 0x0000000000000065 | 0x000000000000008D
    sve_bext(vtmp1, D, vtmp1, vtmp2);

    // Concatenate the least significant 8 bits of each 8-byte lane, and
    // extract the result to dst.
    // vtmp1 = 0x0000000000000000 | 0x000000000000658D
    // dst   = 0x658D
    if (lane_cnt <= 8) {
      // No need to concatenate.
      umov(dst, vtmp1, B, 0);
    } else if (lane_cnt <= 16) {
      ins(vtmp1, B, vtmp1, 1, 8);
      umov(dst, vtmp1, H, 0);
    } else {
      // As the lane count is 64 at most, the final expected value must be in
      // the lowest 64 bits after narrowing vtmp1 from D to B.
      sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
      umov(dst, vtmp1, D, 0);
    }
  } else if (UseSVE > 0) {
    // Compress the lowest 8 bytes.
    fmovd(dst, vtmp1);
    bytemask_compress(dst);
    if (lane_cnt <= 8) return;

    // Repeat on higher bytes and join the results.
    // Compress 8 bytes in each iteration.
    for (int idx = 1; idx < (lane_cnt / 8); idx++) {
      sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
      bytemask_compress(rscratch1);
      orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
    }
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Unpack the mask, a long value in src, into predicate register dst based on the
// corresponding data type. Note that dst can support at most 64 lanes.
// The example below gives the expected dst predicate register for different
// types, with a valid src (0x658D) on a machine with a 1024-bit vector size.
// BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
// SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
// INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
// LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
//
// The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D,
// which has 24 significant bits, would be an invalid input if the dst predicate
// register refers to a LONG-type 1024-bit vector, which has at most 16 lanes.
void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
                                           FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
         lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example:   src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
  // Expected:  dst = 0b01100101 10001101

  // Put the long value from the general purpose register into the first lane
  // of the vector.
  // vtmp1 = 0x0000000000000000 | 0x000000000000658D
  sve_dup(vtmp1, B, 0);
  mov(vtmp1, D, 0, src);

  // As sve_cmp generates the mask with a minimum granularity of one byte, we
  // must transform the value in the first lane, which is a mask in bits, into
  // a mask in bytes; this can be done with SVE2's BDEP instruction.

  // The first source input of the BDEP instruction. Deposit each byte in every 8 bytes.
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  if (lane_cnt <= 8) {
    // Nothing to do, as only one byte exists.
  } else if (lane_cnt <= 16) {
    ins(vtmp1, B, vtmp1, 8, 1);
    mov(vtmp1, B, 1, zr);
  } else {
    sve_vector_extend(vtmp1, D, vtmp1, B);
  }

  // The second source input of the BDEP instruction, initialized with 0x01 for each byte.
  // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
  sve_dup(vtmp2, B, 1);

  // BDEP vtmp1.D, vtmp1.D, vtmp2.D
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  // vtmp2 = 0x0101010101010101 | 0x0101010101010101
  //         ---------------------------------------
  // vtmp1 = 0x0001010000010001 | 0x0100000001010001
  sve_bdep(vtmp1, D, vtmp1, vtmp2);

  if (bt != T_BYTE) {
    sve_vector_extend(vtmp1, size, vtmp1, B);
  }
  // Generate the mask according to the given vector, in which the elements
  // have been extended to the expected type.
  // dst = 0b01100101 10001101
  sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
}

// Clobbers: rflags
void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
                                    FloatRegister zn, FloatRegister zm, Condition cond) {
  assert(pg->is_governing(), "This register has to be a governing predicate register");
  FloatRegister z1 = zn, z2 = zm;
  switch (cond) {
    case LE: z1 = zm; z2 = zn; cond = GE; break;
    case LT: z1 = zm; z2 = zn; cond = GT; break;
    case LO: z1 = zm; z2 = zn; cond = HI; break;
    case LS: z1 = zm; z2 = zn; cond = HS; break;
    default:
      break;
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (is_floating_point_type(bt)) {
    sve_fcm(cond, pd, size, pg, z1, z2);
  } else {
    assert(is_integral_type(bt), "unsupported element type");
    sve_cmp(cond, pd, size, pg, z1, z2);
  }
}

// Get index of the last mask lane that is set
void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
  SIMD_RegVariant size = elemType_to_regVariant(bt);
  sve_rev(ptmp, size, src);
  sve_brkb(ptmp, ptrue, ptmp, false);
  sve_cntp(dst, size, ptrue, ptmp);
  movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
  subw(dst, rscratch1, dst);
}
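
// Worked example (illustrative, byte lanes on a 16-byte vector): if the last
// set lane of src is index 5, sve_rev moves its first set lane to index
// 16 - 1 - 5 = 10; sve_brkb activates the 10 lanes before it, sve_cntp
// returns 10, and dst = (16 - 1) - 10 = 5.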

// Extend integer vector src to dst with the same lane count
// but larger element size, e.g. 4B -> 4I
void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
                                           FloatRegister src, BasicType src_bt) {
  if (src_bt == T_BYTE) {
    if (dst_bt == T_SHORT) {
      // 4B/8B to 4S/8S
      assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
      sxtl(dst, T8H, src, T8B);
    } else {
      // 4B to 4I
      assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
      sxtl(dst, T8H, src, T8B);
      sxtl(dst, T4S, dst, T4H);
    }
  } else if (src_bt == T_SHORT) {
    // 4S to 4I
    assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
    sxtl(dst, T4S, src, T4H);
  } else if (src_bt == T_INT) {
    // 2I to 2L
    assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
    sxtl(dst, T2D, src, T2S);
  } else {
    ShouldNotReachHere();
  }
}

// Narrow integer vector src down to dst with the same lane count
// but smaller element size, e.g. 4I -> 4B
void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
                                           FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
  if (src_bt == T_SHORT) {
    // 4S/8S to 4B/8B
    assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_BYTE, "unsupported");
    xtn(dst, T8B, src, T8H);
  } else if (src_bt == T_INT) {
    // 4I to 4B/4S
    assert(src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
    xtn(dst, T4H, src, T4S);
    if (dst_bt == T_BYTE) {
      xtn(dst, T8B, dst, T8H);
    }
  } else if (src_bt == T_LONG) {
    // 2L to 2I
    assert(src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_INT, "unsupported");
    xtn(dst, T2S, src, T2D);
  } else {
    ShouldNotReachHere();
  }
}

void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
                                          FloatRegister src, SIMD_RegVariant src_size) {
  assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
  if (src_size == B) {
    switch (dst_size) {
    case H:
      sve_sunpklo(dst, H, src);
      break;
    case S:
      sve_sunpklo(dst, H, src);
      sve_sunpklo(dst, S, dst);
      break;
    case D:
      sve_sunpklo(dst, H, src);
      sve_sunpklo(dst, S, dst);
      sve_sunpklo(dst, D, dst);
      break;
    default:
      ShouldNotReachHere();
    }
  } else if (src_size == H) {
    if (dst_size == S) {
      sve_sunpklo(dst, S, src);
    } else { // D
      sve_sunpklo(dst, S, src);
      sve_sunpklo(dst, D, dst);
    }
  } else if (src_size == S) {
    sve_sunpklo(dst, D, src);
  }
}

// Vector narrow from src to dst with specified element sizes.
// High part of dst vector will be filled with zero.
void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
                                          FloatRegister src, SIMD_RegVariant src_size,
                                          FloatRegister tmp) {
  assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
  assert_different_registers(src, tmp);
  sve_dup(tmp, src_size, 0);
  if (src_size == D) {
    switch (dst_size) {
    case S:
      sve_uzp1(dst, S, src, tmp);
      break;
    case H:
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, S, src, tmp);
      sve_uzp1(dst, H, dst, tmp);
      break;
    case B:
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, S, src, tmp);
      sve_uzp1(dst, H, dst, tmp);
      sve_uzp1(dst, B, dst, tmp);
      break;
    default:
      ShouldNotReachHere();
    }
  } else if (src_size == S) {
    if (dst_size == H) {
      sve_uzp1(dst, H, src, tmp);
    } else { // B
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, H, src, tmp);
      sve_uzp1(dst, B, dst, tmp);
    }
  } else if (src_size == H) {
    sve_uzp1(dst, B, src, tmp);
  }
}

// Extend src predicate to dst predicate with the same lane count but larger
// element size, e.g. 64Byte -> 512Long
void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
                                             uint dst_element_length_in_bytes,
                                             uint src_element_length_in_bytes) {
  if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
    sve_punpklo(dst, src);
  } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
    sve_punpklo(dst, src);
    sve_punpklo(dst, dst);
  } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
    sve_punpklo(dst, src);
    sve_punpklo(dst, dst);
    sve_punpklo(dst, dst);
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Narrow src predicate to dst predicate with the same lane count but
// smaller element size, e.g. 512Long -> 64Byte
void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
                                             uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
  // The insignificant bits in the src predicate are expected to be zero.
  // To ensure the higher order bits of the resulting narrowed vector are 0, an
  // all-zero predicate is passed as the second argument. An example narrowing
  // operation with a given mask would be:
  // 128Long -> 64Int on a 128-bit machine, i.e. 2L -> 2I
  // Mask (for 2 Longs) : TF
  // Predicate register for the above mask (16 bits) : 00000001 00000000
  // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
  // Which translates to a mask for 2 integers as : TF (lower half is considered while upper half is 0)
  assert_different_registers(src, ptmp);
  assert_different_registers(dst, ptmp);
  sve_pfalse(ptmp);
  if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
    sve_uzp1(dst, B, src, ptmp);
  } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
    sve_uzp1(dst, H, src, ptmp);
    sve_uzp1(dst, B, dst, ptmp);
  } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
    sve_uzp1(dst, S, src, ptmp);
    sve_uzp1(dst, H, dst, ptmp);
    sve_uzp1(dst, B, dst, ptmp);
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}
1320 
1321 // Vector reduction add for integral type with ASIMD instructions.
1322 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1323                                                  Register isrc, FloatRegister vsrc,
1324                                                  unsigned vector_length_in_bytes,
1325                                                  FloatRegister vtmp) {
1326   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1327   assert_different_registers(dst, isrc);
1328   bool isQ = vector_length_in_bytes == 16;
1329 
1330   BLOCK_COMMENT("neon_reduce_add_integral {");
1331     switch(bt) {
1332       case T_BYTE:
1333         addv(vtmp, isQ ? T16B : T8B, vsrc);
1334         smov(dst, vtmp, B, 0);
1335         addw(dst, dst, isrc, ext::sxtb);
1336         break;
1337       case T_SHORT:
1338         addv(vtmp, isQ ? T8H : T4H, vsrc);
1339         smov(dst, vtmp, H, 0);
1340         addw(dst, dst, isrc, ext::sxth);
1341         break;
1342       case T_INT:
1343         isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1344         umov(dst, vtmp, S, 0);
1345         addw(dst, dst, isrc);
1346         break;
1347       case T_LONG:
1348         assert(isQ, "unsupported");
1349         addpd(vtmp, vsrc);
1350         umov(dst, vtmp, D, 0);
1351         add(dst, dst, isrc);
1352         break;
1353       default:
1354         assert(false, "unsupported");
1355         ShouldNotReachHere();
1356     }
1357   BLOCK_COMMENT("} neon_reduce_add_integral");
1358 }
1359 
1360 // Vector reduction multiply for integral type with ASIMD instructions.
1361 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
1362 // Clobbers: rscratch1
1363 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
1364                                                  Register isrc, FloatRegister vsrc,
1365                                                  unsigned vector_length_in_bytes,
1366                                                  FloatRegister vtmp1, FloatRegister vtmp2) {
1367   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1368   bool isQ = vector_length_in_bytes == 16;
1369 
1370   BLOCK_COMMENT("neon_reduce_mul_integral {");
1371     switch(bt) {
1372       case T_BYTE:
1373         if (isQ) {
1374           // Multiply the lower half and higher half of vector iteratively.
1375           // vtmp1 = vsrc[8:15]
1376           ins(vtmp1, D, vsrc, 0, 1);
1377           // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
1378           mulv(vtmp1, T8B, vtmp1, vsrc);
1379           // vtmp2 = vtmp1[4:7]
1380           ins(vtmp2, S, vtmp1, 0, 1);
1381           // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
1382           mulv(vtmp1, T8B, vtmp2, vtmp1);
1383         } else {
1384           ins(vtmp1, S, vsrc, 0, 1);
1385           mulv(vtmp1, T8B, vtmp1, vsrc);
1386         }
1387         // vtmp2 = vtmp1[2:3]
1388         ins(vtmp2, H, vtmp1, 0, 1);
1389         // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
1390         mulv(vtmp2, T8B, vtmp2, vtmp1);
1391         // dst = vtmp2[0] * isrc * vtmp2[1]
1392         umov(rscratch1, vtmp2, B, 0);
1393         mulw(dst, rscratch1, isrc);
1394         sxtb(dst, dst);
1395         umov(rscratch1, vtmp2, B, 1);
1396         mulw(dst, rscratch1, dst);
1397         sxtb(dst, dst);
1398         break;
1399       case T_SHORT:
1400         if (isQ) {
1401           ins(vtmp2, D, vsrc, 0, 1);
1402           mulv(vtmp2, T4H, vtmp2, vsrc);
1403           ins(vtmp1, S, vtmp2, 0, 1);
1404           mulv(vtmp1, T4H, vtmp1, vtmp2);
1405         } else {
1406           ins(vtmp1, S, vsrc, 0, 1);
1407           mulv(vtmp1, T4H, vtmp1, vsrc);
1408         }
1409         umov(rscratch1, vtmp1, H, 0);
1410         mulw(dst, rscratch1, isrc);
1411         sxth(dst, dst);
1412         umov(rscratch1, vtmp1, H, 1);
1413         mulw(dst, rscratch1, dst);
1414         sxth(dst, dst);
1415         break;
1416       case T_INT:
1417         if (isQ) {
1418           ins(vtmp1, D, vsrc, 0, 1);
1419           mulv(vtmp1, T2S, vtmp1, vsrc);
1420         } else {
1421           vtmp1 = vsrc;
1422         }
1423         umov(rscratch1, vtmp1, S, 0);
1424         mul(dst, rscratch1, isrc);
1425         umov(rscratch1, vtmp1, S, 1);
1426         mul(dst, rscratch1, dst);
1427         break;
1428       case T_LONG:
        umov(rscratch1, vsrc, D, 0);
        mul(dst, isrc, rscratch1);
        umov(rscratch1, vsrc, D, 1);
        mul(dst, dst, rscratch1);
        break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  BLOCK_COMMENT("} neon_reduce_mul_integral");
}

// Vector reduction multiply for floating-point type with ASIMD instructions.
void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
                                           FloatRegister fsrc, FloatRegister vsrc,
                                           unsigned vector_length_in_bytes,
                                           FloatRegister vtmp) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_mul_fp {");
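    // Floating-point multiplication is not associative, so multiply the
    // lanes strictly in order rather than pairwise.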
    switch(bt) {
      case T_FLOAT:
        fmuls(dst, fsrc, vsrc);
        ins(vtmp, S, vsrc, 0, 1);
        fmuls(dst, dst, vtmp);
        if (isQ) {
          ins(vtmp, S, vsrc, 0, 2);
          fmuls(dst, dst, vtmp);
          ins(vtmp, S, vsrc, 0, 3);
          fmuls(dst, dst, vtmp);
        }
        break;
      case T_DOUBLE:
        assert(isQ, "unsupported");
        fmuld(dst, fsrc, vsrc);
        ins(vtmp, D, vsrc, 0, 1);
        fmuld(dst, dst, vtmp);
        break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  BLOCK_COMMENT("} neon_reduce_mul_fp");
}

// Helper to select the logical instruction for the given reduction opcode.
void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
                                                   Register Rn, Register Rm,
                                                   enum shift_kind kind, unsigned shift) {
  switch(opc) {
    case Op_AndReductionV:
      is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
      break;
    case Op_OrReductionV:
      is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
      break;
    case Op_XorReductionV:
      is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
}

// Vector reduction logical operations And, Or, Xor
// Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
                                            Register isrc, FloatRegister vsrc,
                                            unsigned vector_length_in_bytes) {
  assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
         "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_logical {");
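    // Combine the two halves of the vector in general registers, then fold
    // the scalar in half repeatedly until a single element remains.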
    umov(rscratch1, vsrc, isQ ? D : S, 0);
    umov(dst, vsrc, isQ ? D : S, 1);
    neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
    switch(bt) {
      case T_BYTE:
        if (isQ) {
          neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
        }
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
        sxtb(dst, dst);
        break;
      case T_SHORT:
        if (isQ) {
          neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
        }
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
        sxth(dst, dst);
        break;
      case T_INT:
        if (isQ) {
          neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
        }
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
        break;
      case T_LONG:
        assert(isQ, "unsupported");
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
        break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  BLOCK_COMMENT("} neon_reduce_logical");
}

// Vector reduction min/max for integral type with ASIMD instructions.
// Note: vtmp is not used and is expected to be fnoreg for the T_LONG case.
// Clobbers: rscratch1, rflags
void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
                                                    Register isrc, FloatRegister vsrc,
                                                    unsigned vector_length_in_bytes,
                                                    FloatRegister vtmp) {
  assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;
  bool is_min = opc == Op_MinReductionV;

  BLOCK_COMMENT("neon_reduce_minmax_integral {");
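    // There is no ASIMD across-lane min/max for 64-bit elements, so the
    // T_LONG case is handled entirely in general registers.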
    if (bt == T_LONG) {
      assert(vtmp == fnoreg, "should be");
      assert(isQ, "should be");
      umov(rscratch1, vsrc, D, 0);
      cmp(isrc, rscratch1);
      csel(dst, isrc, rscratch1, is_min ? LT : GT);
      umov(rscratch1, vsrc, D, 1);
      cmp(dst, rscratch1);
      csel(dst, dst, rscratch1, is_min ? LT : GT);
    } else {
      SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
      if (size == T2S) {
        is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
      } else {
        is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
      }
      if (bt == T_INT) {
        umov(dst, vtmp, S, 0);
      } else {
        smov(dst, vtmp, elemType_to_regVariant(bt), 0);
      }
      cmpw(dst, isrc);
      cselw(dst, dst, isrc, is_min ? LT : GT);
    }
  BLOCK_COMMENT("} neon_reduce_minmax_integral");
}

// Vector reduction for integral type with SVE instructions.
// Supported operations are Add, And, Or, Xor, Max, Min.
// rflags is clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
                                            FloatRegister src2, PRegister pg, FloatRegister tmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  assert(pg->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(src1, dst);
  // Registers "dst" and "tmp" are clobbered; "src1" and "src2" are preserved.
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  switch (opc) {
    case Op_AddReductionVI: {
      sve_uaddv(tmp, size, pg, src2);
      if (bt == T_BYTE) {
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxtb);
      } else if (bt == T_SHORT) {
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxth);
      } else {
        umov(dst, tmp, size, 0);
        addw(dst, dst, src1);
      }
      break;
    }
    case Op_AddReductionVL: {
      sve_uaddv(tmp, size, pg, src2);
      umov(dst, tmp, size, 0);
      add(dst, dst, src1);
      break;
    }
    case Op_AndReductionV: {
      sve_andv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        andr(dst, dst, src1);
      } else {
        andw(dst, dst, src1);
      }
      break;
    }
    case Op_OrReductionV: {
      sve_orv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        orr(dst, dst, src1);
      } else {
        orrw(dst, dst, src1);
      }
      break;
    }
    case Op_XorReductionV: {
      sve_eorv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        eor(dst, dst, src1);
      } else {
        eorw(dst, dst, src1);
      }
      break;
    }
    case Op_MaxReductionV: {
      sve_smaxv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        cmp(dst, src1);
        csel(dst, dst, src1, Assembler::GT);
      } else {
        cmpw(dst, src1);
        cselw(dst, dst, src1, Assembler::GT);
      }
      break;
    }
    case Op_MinReductionV: {
      sve_sminv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        cmp(dst, src1);
        csel(dst, dst, src1, Assembler::LT);
      } else {
        cmpw(dst, src1);
        cselw(dst, dst, src1, Assembler::LT);
      }
      break;
    }
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }

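  // For sub-word types, canonicalize the scalar result by sign-extending
  // the low 8/16 bits to a full int.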
  if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
    if (bt == T_BYTE) {
      sxtb(dst, dst);
    } else if (bt == T_SHORT) {
      sxth(dst, dst);
    }
  }
}

// Set elements of the dst predicate to true for lanes in the range [0, lane_cnt),
// and to false otherwise. The input "lane_cnt" must be smaller than or equal to the
// max vector length supported for the basic type. Clobbers: rscratch1, rflags.
void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
  uint32_t max_vector_length = Matcher::max_vector_size(bt);
  assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");

  // Set all elements to false if the input "lane_cnt" is zero.
  if (lane_cnt == 0) {
    sve_pfalse(dst);
    return;
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  assert(size != Q, "invalid size");

  // Set all elements to true if "lane_cnt" equals the max lane count.
  if (lane_cnt == max_vector_length) {
    sve_ptrue(dst, size, /* ALL */ 0b11111);
    return;
  }

  // Fixed lane-count patterns for "ptrue".
  switch(lane_cnt) {
  case 1: /* VL1 */
  case 2: /* VL2 */
  case 3: /* VL3 */
  case 4: /* VL4 */
  case 5: /* VL5 */
  case 6: /* VL6 */
  case 7: /* VL7 */
  case 8: /* VL8 */
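    // The pattern encodings VL1..VL8 (0b00001..0b01000) equal the lane count.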
    sve_ptrue(dst, size, lane_cnt);
    return;
  case 16:
    sve_ptrue(dst, size, /* VL16 */ 0b01001);
    return;
  case 32:
    sve_ptrue(dst, size, /* VL32 */ 0b01010);
    return;
  case 64:
    sve_ptrue(dst, size, /* VL64 */ 0b01011);
    return;
  case 128:
    sve_ptrue(dst, size, /* VL128 */ 0b01100);
    return;
  case 256:
    sve_ptrue(dst, size, /* VL256 */ 0b01101);
    return;
  default:
    break;
  }

  // Special patterns for "ptrue".
  if (lane_cnt == round_down_power_of_2(max_vector_length)) {
    sve_ptrue(dst, size, /* POW2 */ 0b00000);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
    sve_ptrue(dst, size, /* MUL4 */ 0b11101);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
    sve_ptrue(dst, size, /* MUL3 */ 0b11110);
  } else {
    // Fall back to "whileltw" for the remaining cases.
    mov(rscratch1, lane_cnt);
    sve_whileltw(dst, size, zr, rscratch1);
  }
}

// Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
// Any remaining elements of dst will be filled with zero.
// Clobbers: rscratch1
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
                                           FloatRegister vtmp1, FloatRegister vtmp2,
                                           PRegister pgtmp) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2);
  assert_different_registers(mask, pgtmp);

  // Example input:   src   = 8888 7777 6666 5555 4444 3333 2222 1111
  //                  mask  = 0001 0000 0000 0001 0001 0000 0001 0001
  // Expected result: dst   = 0000 0000 0000 8888 5555 4444 2222 1111
  sve_dup(vtmp2, H, 0);

  // Extend the lowest half to type INT.
  // dst = 00004444 00003333 00002222 00001111
  sve_uunpklo(dst, S, src);
  // pgtmp = 00000001 00000000 00000001 00000001
  sve_punpklo(pgtmp, mask);
  // Pack the active INT-sized elements to the right,
  // and fill the remaining elements with zero.
  // dst = 00000000 00004444 00002222 00001111
  sve_compact(dst, S, dst, pgtmp);
  // Narrow the result back to type SHORT.
  // dst = 0000 0000 0000 0000 0000 4444 2222 1111
  sve_uzp1(dst, H, dst, vtmp2);
  // Count the active elements of the lowest half.
  // rscratch1 = 3
  sve_cntp(rscratch1, S, ptrue, pgtmp);

  // Repeat for the highest half.
  // pgtmp = 00000001 00000000 00000000 00000001
  sve_punpkhi(pgtmp, mask);
  // vtmp1 = 00008888 00007777 00006666 00005555
  sve_uunpkhi(vtmp1, S, src);
  // vtmp1 = 00000000 00000000 00008888 00005555
  sve_compact(vtmp1, S, vtmp1, pgtmp);
  // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  sve_uzp1(vtmp1, H, vtmp1, vtmp2);

  // Compressed low:   dst   = 0000 0000 0000 0000 0000 4444 2222 1111
  // Compressed high:  vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  // Shift the compressed high part left (across lanes) by TRUE_CNT elements,
  // where TRUE_CNT is the number of active elements in the compressed low part.
  neg(rscratch1, rscratch1);
  // vtmp2 = {4 3 2 1 0 -1 -2 -3}
  sve_index(vtmp2, H, rscratch1, 1);
  // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
  sve_tbl(vtmp1, H, vtmp1, vtmp2);

  // Combine the shifted compressed high part with the compressed low part.
  // dst = 0000 0000 0000 8888 5555 4444 2222 1111
  sve_orr(dst, dst, vtmp1);
}

// Clobbers: rscratch1, rscratch2
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
                                          FloatRegister vtmp1, FloatRegister vtmp2,
                                          FloatRegister vtmp3, FloatRegister vtmp4,
                                          PRegister ptmp, PRegister pgtmp) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
  assert_different_registers(mask, ptmp, pgtmp);
  // Example input:   src   = 88 77 66 55 44 33 22 11
  //                  mask  = 01 00 00 01 01 00 01 01
  // Expected result: dst   = 00 00 00 88 55 44 22 11

  sve_dup(vtmp4, B, 0);
  // Extend the lowest half to type SHORT.
  // vtmp1 = 0044 0033 0022 0011
  sve_uunpklo(vtmp1, H, src);
  // ptmp = 0001 0000 0001 0001
  sve_punpklo(ptmp, mask);
  // Count the active elements of the lowest half.
  // rscratch2 = 3
  sve_cntp(rscratch2, H, ptrue, ptmp);
  // Pack the active SHORT-sized elements to the right,
  // and fill the remaining elements with zero.
  // dst = 0000 0044 0022 0011
  sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
  // Narrow the result back to type BYTE.
  // dst = 00 00 00 00 00 44 22 11
  sve_uzp1(dst, B, dst, vtmp4);

  // Repeat for the highest half.
  // ptmp = 0001 0000 0000 0001
  sve_punpkhi(ptmp, mask);
  // vtmp2 = 0088 0077 0066 0055
  sve_uunpkhi(vtmp2, H, src);
  // vtmp1 = 0000 0000 0088 0055
  sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);

  sve_dup(vtmp4, B, 0);
  // vtmp1 = 00 00 00 00 00 00 88 55
  sve_uzp1(vtmp1, B, vtmp1, vtmp4);

  // Compressed low:   dst   = 00 00 00 00 00 44 22 11
  // Compressed high:  vtmp1 = 00 00 00 00 00 00 88 55
  // Shift the compressed high part left (across lanes) by TRUE_CNT elements,
  // where TRUE_CNT is the number of active elements in the compressed low part.
  neg(rscratch2, rscratch2);
  // vtmp2 = {4 3 2 1 0 -1 -2 -3}
  sve_index(vtmp2, B, rscratch2, 1);
  // vtmp1 = 00 00 00 88 55 00 00 00
  sve_tbl(vtmp1, B, vtmp1, vtmp2);
  // Combine the shifted compressed high part with the compressed low part.
  // dst = 00 00 00 88 55 44 22 11
  sve_orr(dst, dst, vtmp1);
}

void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
  SIMD_Arrangement size = isQ ? T16B : T8B;
  if (bt == T_BYTE) {
    rbit(dst, size, src);
  } else {
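    // rbit reverses the bits within each byte only, so reverse the bytes of
    // each element first; together this reverses the bits of the whole element.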
    neon_reverse_bytes(dst, src, bt, isQ);
    rbit(dst, size, dst);
  }
}

void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
  SIMD_Arrangement size = isQ ? T16B : T8B;
  switch (bt) {
    case T_BYTE:
      if (dst != src) {
        orr(dst, size, src, src);
      }
      break;
    case T_SHORT:
      rev16(dst, size, src);
      break;
    case T_INT:
      rev32(dst, size, src);
      break;
    case T_LONG:
      rev64(dst, size, src);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
}

// Extract a scalar element from an SVE vector at position 'idx'.
// The input elements in src are expected to be of integral type.
void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
                                             int idx, FloatRegister vtmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (regVariant_to_elemBits(size) * idx < 128) { // generate lower-cost NEON instruction
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, src, size, idx);
    } else {
      smov(dst, src, size, idx);
    }
  } else {
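    // Shift the wanted element down to lane 0 with EXT; the byte offset is
    // idx * element_size, i.e. idx << size.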
    sve_orr(vtmp, src, src);
    sve_ext(vtmp, vtmp, idx << size);
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, vtmp, size, 0);
    } else {
      smov(dst, vtmp, size, 0);
    }
  }
}

// java.lang.Math::round intrinsics

// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                          FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
  assert_different_registers(tmp1, tmp2, tmp3, src, dst);
  switch (T) {
    case T2S:
    case T4S:
      fmovs(tmp1, T, 0.5f);
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case T2D:
      fmovd(tmp1, T, 0.5);
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
  }
  fadd(tmp1, T, tmp1, src);
  fcvtms(tmp1, T, tmp1);
  // tmp1 = floor(src + 0.5, ties to even)

  fcvtas(dst, T, src);
  // dst = round(src), ties to away

  fneg(tmp3, T, src);
  dup(tmp2, T, rscratch1);
  cm(HS, tmp3, T, tmp3, tmp2);
  // tmp3 is now a per-lane mask: all-ones lanes keep the ties-to-away result,
  // all-zeros lanes take the floor(src + 0.5) result instead

  bif(dst, T16B, tmp1, tmp3);
  // result in dst
}

// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                         FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(tmp1, tmp2, src, dst);

  switch (T) {
    case S:
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case D:
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == S || T == D, "invalid register variant");
  }

  sve_frinta(dst, T, ptrue, src);
  // dst = round(src), ties to away

  Label none;

  sve_fneg(tmp1, T, ptrue, src);
  sve_dup(tmp2, T, rscratch1);
  sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
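  // EQ means the predicate is all-false, i.e. no lane needs the
  // floor(src + 0.5) adjustment.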
  br(EQ, none);
  {
    sve_cpy(tmp1, T, pgtmp, 0.5);
    sve_fadd(tmp1, T, pgtmp, src);
    sve_frintm(dst, T, pgtmp, tmp1);
    // dst = floor(src + 0.5, ties to even)
  }
  bind(none);

  sve_fcvtzs(dst, T, ptrue, dst, T);
  // result in dst
}

void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                           FloatRegister one, SIMD_Arrangement T) {
  assert_different_registers(dst, src, zero, one);
  assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");

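  // After the shift, lanes with |src| > 0.0 hold 0x7FF..F, so bsl merges
  // the magnitude of "one" with the sign bit of "src" to give +-1.0, while
  // +-0.0 and NaN lanes keep the original src value.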
  facgt(dst, T, src, zero);
  ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
  bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
}

void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                          FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
  assert_different_registers(dst, src, zero, one, vtmp);
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");

  sve_orr(vtmp, src, src);
  sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise
  switch (T) {
  case S:
    sve_and(vtmp, T, min_jint);       // Extract the sign bit of the float value in every lane of src
    sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1.0 to make the final result +1.0 or -1.0
                                      // depending on the sign of the float value
    break;
  case D:
    sve_and(vtmp, T, min_jlong);
    sve_orr(vtmp, T, jlong_cast(1.0));
    break;
  default:
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
  sve_sel(dst, T, pgtmp, vtmp, src); // Select from vtmp or src based on the predicate register pgtmp
                                     // Result in dst
}

bool C2_MacroAssembler::in_scratch_emit_size() {
  if (ciEnv::current()->task() != nullptr) {
    PhaseOutput* phase_output = Compile::current()->output();
    if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
      return true;
    }
  }
  return MacroAssembler::in_scratch_emit_size();
}