/*
 * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/matcher.hpp"
#include "opto/output.hpp"
#include "opto/subnode.hpp"
#include "runtime/stubRoutines.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

// Search for str1 in str2 and return index or -1
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

// The Boyer Moore algorithm is based on the description here:
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// This describes an algorithm with 2 shift rules: the 'Bad Character' rule
// and the 'Good Suffix' rule.
//
// These rules are essentially heuristics for how far we can shift the
// pattern along the search string.
//
// The implementation here uses the 'Bad Character' rule only because of the
// complexity of initialisation for the 'Good Suffix' rule.
//
// This is also known as the Boyer-Moore-Horspool algorithm:
//
// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
//
// This particular implementation has a few Java-specific optimizations.
//
// #define ASIZE 256
//
//    int bm(unsigned char *x, int m, unsigned char *y, int n) {
//       int i, j;
//       unsigned c;
//       unsigned char bc[ASIZE];
//
//       /* Preprocessing */
//       for (i = 0; i < ASIZE; ++i)
//          bc[i] = m;
//       for (i = 0; i < m - 1; ) {
//          c = x[i];
//          ++i;
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef PATTERN_STRING_IS_LATIN1
//          bc[c] = m - i;
//          #else
//          if (c < ASIZE) bc[c] = m - i;
//          #endif
//       }
//
//       /* Searching */
//       j = 0;
//       while (j <= n - m) {
//          c = y[j + m - 1];
//          if (x[m-1] == c)
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//          if (i < 0) return j;
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef SOURCE_STRING_IS_LATIN1
//          // LL case: (c < 256) always true. Remove branch
//          j += bc[y[j+m-1]];
//          #endif
//          #ifndef PATTERN_STRING_IS_UTF
//          // UU case: need if (c < ASIZE) check. Skip 1 character if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += 1;
//          #endif
//          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
//          // UL case: need if (c < ASIZE) check. Skip <pattern length> if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += m;
//          #endif
//       }
//    }

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >= 8, so we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed (8 LL or 4 UU symbols) and half a
    // register for the UL case. We'll re-read the last character in the inner
    // pre-loop code to keep a single load in the outer pre-loop.
    const int firstStep = isL ? 7 : 3;

    const int ASIZE = 256;
    const int STORED_BYTES = 32; // number of bytes stored per instruction
    sub(sp, sp, ASIZE);
    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
    mov(ch1, sp);
    BIND(BM_INIT_LOOP);
      stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
      subs(tmp5, tmp5, 1);
      br(GT, BM_INIT_LOOP);

      sub(cnt1tmp, cnt1, 1);
      mov(tmp5, str2);
      add(str2end, str2, result_tmp, LSL, str2_chr_shift);
      sub(ch2, cnt1, 1);
      mov(tmp3, str1);
    BIND(BCLOOP);
      (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
      if (!str1_isL) {
        subs(zr, ch1, ASIZE);
        br(HS, BCSKIP);
      }
      strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
      subs(ch2, ch2, 1);
      br(GT, BCLOOP);

      add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
      if (str1_isL == str2_isL) {
        // load last 8 bytes (8LL/4UU symbols)
        ldr(tmp6, Address(tmp6, -wordSize));
      } else {
        ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes (4 symbols)
        // convert Latin1 to UTF. We'll have to wait until the load completes,
        // but it's still faster than per-character loads+checks
        lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
        ubfx(ch1, tmp6, 8, 8); // str1[N-2]
        ubfx(ch2, tmp6, 16, 8); // str1[N-3]
        andr(tmp6, tmp6, 0xFF); // str1[N-4]
        orr(ch2, ch1, ch2, LSL, 16);
        orr(tmp6, tmp6, tmp3, LSL, 48);
        orr(tmp6, tmp6, ch2, LSL, 16);
      }
    BIND(BMLOOPSTR2);
      (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
      sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
      if (str1_isL == str2_isL) {
        // re-init tmp3. It's free because it's executed in parallel with
        // the load above. The alternative is to initialize it before the
        // loop, but that would hurt performance on in-order systems with
        // 2 or more ld/st pipelines
        lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
      }
      if (!isL) { // UU/UL case
        lsl(ch2, cnt1tmp, 1); // offset in bytes
      }
      cmp(tmp3, skipch);
      br(NE, BMSKIP);
      ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
      mov(ch1, tmp6);
      if (isL) {
        b(BMLOOPSTR1_AFTER_LOAD);
      } else {
        sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
        b(BMLOOPSTR1_CMP);
      }
    BIND(BMLOOPSTR1);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    BIND(BMLOOPSTR1_AFTER_LOAD);
      subs(cnt1tmp, cnt1tmp, 1);
      br(LT, BMLOOPSTR1_LASTCMP);
    BIND(BMLOOPSTR1_CMP);
      cmp(ch1, ch2);
      br(EQ, BMLOOPSTR1);
    BIND(BMSKIP);
      if (!isL) {
        // if we've met a UTF symbol while searching for a Latin1 pattern,
        // then we can skip cnt1 symbols
        if (str1_isL != str2_isL) {
          mov(result_tmp, cnt1);
        } else {
          mov(result_tmp, 1);
        }
        subs(zr, skipch, ASIZE);
        br(HS, BMADV);
      }
      ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
      sub(cnt1tmp, cnt1, 1);
      add(str2, str2, result_tmp, LSL, str2_chr_shift);
      cmp(str2, str2end);
      br(LE, BMLOOPSTR2);
      add(sp, sp, ASIZE);
      b(NOMATCH);
    BIND(BMLOOPSTR1_LASTCMP);
      cmp(ch1, ch2);
      br(NE, BMSKIP);
    BIND(BMMATCH);
      sub(result, str2, tmp5);
      if (!str2_isL) lsr(result, result, 1);
      add(sp, sp, ASIZE);
      b(DONE);

    BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
    br(LT, LINEAR_MEDIUM);
    mov(result, zr);
    RuntimeAddress stub = nullptr;
    if (isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
      assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
    } else if (str1_isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
    } else {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
      assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
    }
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);
  }

  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    Register str2tmp = tmp2;
    Register first = tmp3;

    if (icnt1 == -1)
    {
        Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

        cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
        br(LT, DOSHORT);
      BIND(LINEAR_MEDIUM);
        (this->*str1_load_1chr)(first, Address(str1));
        lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
        sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(FIRST_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmp(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        adds(cnt1tmp, cnt1_neg, str1_chr_size);
        add(cnt2tmp, cnt2_neg, str2_chr_size);
        br(GE, MATCH);

      BIND(STR1_NEXT);
        (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        adds(cnt1tmp, cnt1tmp, str1_chr_size);
        add(cnt2tmp, cnt2tmp, str2_chr_size);
        br(LT, STR1_NEXT);
        b(MATCH);

      BIND(DOSHORT);
      if (str1_isL == str2_isL) {
        cmp(cnt1, (u1)2);
        br(LT, DO1);
        br(GT, DO3);
      }
    }

    if (icnt1 == 4) {
      Label CH1_LOOP;

        (this->*load_4chr)(ch1, str1);
        sub(result_tmp, cnt2, 4);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(CH1_LOOP);
        (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
      Label CH1_LOOP;

      BIND(DO2);
        (this->*load_2chr)(ch1, str1);
        if (icnt1 == 2) {
          sub(result_tmp, cnt2, 2);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(CH1_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

      BIND(DO3);
        (this->*load_2chr)(first, str1);
        (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
        if (icnt1 == 3) {
          sub(result_tmp, cnt2, 3);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(FIRST_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmpw(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        b(MATCH);
    }

    if (icnt1 == -1 || icnt1 == 1) {
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;

      BIND(DO1);
        (this->*str1_load_1chr)(ch1, str1);
        cmp(cnt2, (u1)8);
        br(LT, DO1_SHORT);

        sub(result_tmp, cnt2, 8/str2_chr_size);
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
        mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

        if (str2_isL) {
          orr(ch1, ch1, ch1, LSL, 8);
        }
        orr(ch1, ch1, ch1, LSL, 16);
        orr(ch1, ch1, ch1, LSL, 32);
      BIND(CH1_LOOP);
        ldr(ch2, Address(str2, cnt2_neg));
        eor(ch2, ch1, ch2);
        sub(tmp1, ch2, tmp3);
        orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
        bics(tmp1, tmp1, tmp2);
        br(NE, HAS_ZERO);
        adds(cnt2_neg, cnt2_neg, 8);
        br(LT, CH1_LOOP);

        cmp(cnt2_neg, (u1)8);
        mov(cnt2_neg, 0);
        br(LT, CH1_LOOP);
        b(NOMATCH);

      BIND(HAS_ZERO);
        rev(tmp1, tmp1);
        clz(tmp1, tmp1);
        add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
        b(MATCH);

      BIND(DO1_SHORT);
        mov(result_tmp, cnt2);
        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
      BIND(DO1_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmpw(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}
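
// The DO1 path above finds a single character in a 64-bit word at a time
// (8 Latin1 bytes or 4 UTF-16 halfwords) using a SWAR trick. A scalar C
// sketch of the idea (illustrative only, not part of the VM; the helper
// name is made up):
//
//    // ch2 above holds 8 source bytes XOR-ed with the pattern character
//    // replicated into every byte, so a match becomes a zero byte. The
//    // sub/orr/bics sequence is Mycroft's has-zero-byte test:
//    uint64_t zero_markers(uint64_t v) {
//      return (v - 0x0101010101010101ULL) & ~v & 0x8080808080808080ULL;
//    }
//    // Bit 7 of each zero byte is set in the result. HAS_ZERO then recovers
//    // the byte index of the first match in memory order (the least
//    // significant marker after a little-endian load) with rev + clz:
//    //   byte_index = clz(rev(markers)) >> 3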

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);

void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7fff7fff7fff7fff);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1, Address::uxtw(1)));
    sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
    ldrh(ch1, Address(str1, cnt1_neg));
    cmpw(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 2);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}

void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
                                                Register ch, Register result,
                                                FloatRegister ztmp1,
                                                FloatRegister ztmp2,
                                                PRegister tmp_pg,
                                                PRegister tmp_pdn, bool isL)
{
  // Note that `tmp_pdn` should *NOT* be used as a governing predicate register.
  assert(tmp_pg->is_governing(),
         "this register has to be a governing predicate register");

  Label LOOP, MATCH, DONE, NOMATCH;
  Register vec_len = rscratch1;
  Register idx = rscratch2;

  SIMD_RegVariant T = isL ? B : H;

  cbz(cnt1, NOMATCH);

  // Assign the particular char throughout the vector.
  sve_dup(ztmp2, T, ch);
  if (isL) {
    sve_cntb(vec_len);
  } else {
    sve_cnth(vec_len);
  }
  mov(idx, 0);

  // Generate a predicate to control the reading of input string.
  sve_whilelt(tmp_pg, T, idx, cnt1);

  BIND(LOOP);
    // Read a vector of 8- or 16-bit data depending on the string type. Note
    // that inactive elements indicated by the predicate register won't cause
    // a data read from memory to the destination vector.
    if (isL) {
      sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
    } else {
      sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
    }
    add(idx, idx, vec_len);

    // Perform the comparison. An element of the destination predicate is set
    // to active if the particular char is matched.
    sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);

    // Branch if the particular char is found.
    br(NE, MATCH);

    sve_whilelt(tmp_pg, T, idx, cnt1);

    // Loop back if the particular char is not found.
    br(MI, LOOP);

  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);

  BIND(MATCH);
    // Undo the index increment.
    sub(idx, idx, vec_len);

    // Crop the vector to find its location.
    sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
    add(result, idx, -1);
    sve_incp(result, T, tmp_pdn);
  BIND(DONE);
}
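
// For intuition, the predicated SVE loop above behaves like this scalar
// sketch (illustrative only; vec_len stands for the hardware vector length
// in elements, and the whilelt predicate masks off lanes beyond cnt1):
//
//    int indexof_char_sve(const uint16_t* s, int cnt1, uint16_t ch,
//                         int vec_len) {
//      for (int idx = 0; idx < cnt1; idx += vec_len) {
//        int active = (cnt1 - idx < vec_len) ? cnt1 - idx : vec_len;
//        for (int lane = 0; lane < active; lane++) {  // one sve_cmp
//          if (s[idx + lane] == ch) return idx + lane;
//        }
//      }
//      return -1;
//    }
//
// The real code finds the matching lane with BRKA + INCP instead of a scalar
// inner loop: BRKA keeps only the lanes up to and including the first match,
// and INCP adds the count of those active lanes to (idx - 1).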

void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
                                             Register ch, Register result,
                                             Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)8);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 8);
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 8);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);

  mov(tmp3, 0x0101010101010101);

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1));
    sub(cnt1_neg, zr, cnt1);
  BIND(DO1_LOOP);
    ldrb(ch1, Address(str1, cnt1_neg));
    cmp(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 1);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg);
  BIND(DONE);
}

// Compare strings.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      ldr(tmp1, Address(str1));
      ldrs(vtmp, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFF);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFF);
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
    RuntimeAddress stub = nullptr;
    switch(ae) {
      case StrIntrinsicNode::LL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
        break;
      case StrIntrinsicNode::UU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
        break;
      case StrIntrinsicNode::LU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
        break;
      case StrIntrinsicNode::UL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
        break;
      default:
        ShouldNotReachHere();
    }
    assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // Arrange the code so that most branches are taken while loading, and the
  // next characters are loaded while the previous ones are compared.
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}
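
// The DIFF block above locates the first differing character without a byte
// loop. A rough scalar sketch of the idea (illustrative only; rev64 stands
// for a full byte reverse, and the helper name is made up):
//
//    // diff = tmp1 ^ tmp2 is nonzero; its least significant nonzero byte
//    // marks the first mismatch in memory order (little-endian load).
//    int first_diff_bit(uint64_t diff, bool isLL) {
//      int bit = __builtin_clzll(rev64(diff)); // rev + clz above
//      return bit & (isLL ? -8 : -16);         // round down to a char boundary
//    }
//    // Shifting both longwords right by that bit count and zero-extending
//    // one character (ext_chr) yields the two chars whose difference is
//    // the result.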

void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
                                     FloatRegister src2, int cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
  if (bt == T_FLOAT || bt == T_DOUBLE) {
    switch (cond) {
      case BoolTest::eq: fcmeq(dst, size, src1, src2); break;
      case BoolTest::ne: {
        fcmeq(dst, size, src1, src2);
        notr(dst, T16B, dst);
        break;
      }
      case BoolTest::ge: fcmge(dst, size, src1, src2); break;
      case BoolTest::gt: fcmgt(dst, size, src1, src2); break;
      case BoolTest::le: fcmge(dst, size, src2, src1); break;
      case BoolTest::lt: fcmgt(dst, size, src2, src1); break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  } else {
    switch (cond) {
      case BoolTest::eq: cmeq(dst, size, src1, src2); break;
      case BoolTest::ne: {
        cmeq(dst, size, src1, src2);
        notr(dst, T16B, dst);
        break;
      }
      case BoolTest::ge: cmge(dst, size, src1, src2); break;
      case BoolTest::gt: cmgt(dst, size, src1, src2); break;
      case BoolTest::le: cmge(dst, size, src2, src1); break;
      case BoolTest::lt: cmgt(dst, size, src2, src1); break;
      case BoolTest::uge: cmhs(dst, size, src1, src2); break;
      case BoolTest::ugt: cmhi(dst, size, src1, src2); break;
      case BoolTest::ult: cmhi(dst, size, src2, src1); break;
      case BoolTest::ule: cmhs(dst, size, src2, src1); break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  }
}
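
// Only EQ, GE, GT (and unsigned HS/HI) have direct NEON compare encodings;
// the remaining conditions above are synthesized from them. The mapping, as
// a sketch (no code is emitted beyond what the switch shows):
//
//    ne(a, b)  == not(eq(a, b))  -> cmeq/fcmeq + notr
//    le(a, b)  == ge(b, a)       -> cmge/fcmge with operands swapped
//    lt(a, b)  == gt(b, a)       -> cmgt/fcmgt with operands swapped
//    ule(a, b) == uge(b, a)      -> cmhs with operands swapped
//    ult(a, b) == ugt(b, a)      -> cmhi with operands swapped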

// Compress the least significant bit of each byte to the rightmost and clear
// the higher garbage bits.
void C2_MacroAssembler::bytemask_compress(Register dst) {
  // Example input, dst = 0x01 00 00 00 01 01 00 01
  // The "??" bytes are garbage.
  orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
  orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
  orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
  andr(dst, dst, 0xff);                   // dst = 0x8D
}
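
// As a scalar C sketch (illustrative only), the three shifted ORs fold the
// low bit of each byte into the low byte of the word:
//
//    uint64_t bytemask_compress(uint64_t v) {
//      v |= v >> 7;    // pair up neighbouring bytes' low bits
//      v |= v >> 14;   // fold the pairs into nibbles
//      v |= v >> 28;   // fold the nibbles into one byte
//      return v & 0xff;
//    }
//
// With the example input 0x0100000001010001 this returns 0x8D (0b10001101).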

// Pack the lowest-numbered bit of each mask element in src into a long value
// in dst, at most the first 64 lane elements.
// Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM.
void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
                                         FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
  assert_different_registers(dst, rscratch1);
  assert_different_registers(vtmp1, vtmp2);

  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example:   src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
  // Expected:  dst = 0x658D

  // Convert the mask into a vector with sequential bytes.
  // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
  sve_cpy(vtmp1, size, src, 1, false);
  if (bt != T_BYTE) {
    sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
  }

  if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
    // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
    // is to compress each significant bit of the byte in a cross-lane way. Due
    // to the lack of a cross-lane bit-compress instruction, we use BEXT
    // (bit-compress in each lane) with the biggest lane size (T = D) then
    // concatenate the results.

    // The second source input of BEXT, initialized with 0x01 in each byte.
    // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
    sve_dup(vtmp2, B, 1);

    // BEXT vtmp1.D, vtmp1.D, vtmp2.D
    // vtmp1 = 0x0001010000010001 | 0x0100000001010001
    // vtmp2 = 0x0101010101010101 | 0x0101010101010101
    //         ---------------------------------------
    // vtmp1 = 0x0000000000000065 | 0x000000000000008D
    sve_bext(vtmp1, D, vtmp1, vtmp2);

    // Concatenate the lowest significant 8 bits in each 8 bytes, and extract
    // the result to dst.
    // vtmp1 = 0x0000000000000000 | 0x000000000000658D
    // dst   = 0x658D
    if (lane_cnt <= 8) {
      // No need to concatenate.
      umov(dst, vtmp1, B, 0);
    } else if (lane_cnt <= 16) {
      ins(vtmp1, B, vtmp1, 1, 8);
      umov(dst, vtmp1, H, 0);
    } else {
      // As the lane count is 64 at most, the final expected value must be in
      // the lowest 64 bits after narrowing vtmp1 from D to B.
      sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
      umov(dst, vtmp1, D, 0);
    }
  } else if (UseSVE > 0) {
    // Compress the lowest 8 bytes.
    fmovd(dst, vtmp1);
    bytemask_compress(dst);
    if (lane_cnt <= 8) return;

    // Repeat on higher bytes and join the results.
    // Compress 8 bytes in each iteration.
    for (int idx = 1; idx < (lane_cnt / 8); idx++) {
      sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
      bytemask_compress(rscratch1);
      orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
    }
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}
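
// A scalar sketch of the UseSVE == 1 fallback above (illustrative only;
// names are made up): each group of 8 mask bytes (0x00/0x01) is folded into
// 8 bits with bytemask_compress(), then the per-group bytes are OR-ed into
// their final position:
//
//    uint64_t vmask_tolong(const uint64_t* lanes, int lane_cnt) {
//      uint64_t dst = bytemask_compress(lanes[0]);
//      for (int idx = 1; idx < lane_cnt / 8; idx++) {
//        dst |= bytemask_compress(lanes[idx]) << (idx * 8);
//      }
//      return dst;
//    }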

// Unpack the mask, a long value in src, into predicate register dst based on the
// corresponding data type. Note that dst can support at most 64 lanes.
// The example below gives the expected dst predicate register in different types,
// with a valid src (0x658D) on a 1024-bit vector size machine.
// BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
// SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
// INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
// LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
//
// The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D,
// which has 24 significant bits, would be an invalid input if the dst predicate
// register refers to a LONG type 1024-bit vector, which has at most 16 lanes.
void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
                                           FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
         lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example:   src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
  // Expected:  dst = 0b01100101 10001101

  // Put the long value from the general purpose register into the first lane
  // of the vector.
  // vtmp1 = 0x0000000000000000 | 0x000000000000658D
  sve_dup(vtmp1, B, 0);
  mov(vtmp1, D, 0, src);

  // As sve_cmp generates the mask with a minimum granularity of one byte, we
  // need to transform the bit mask in the first lane into a byte mask, which
  // can be done with SVE2's BDEP instruction.

  // The first source input of the BDEP instruction. Deposit each byte into
  // every 8 bytes.
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  if (lane_cnt <= 8) {
    // Nothing to do, as only one byte exists.
  } else if (lane_cnt <= 16) {
    ins(vtmp1, B, vtmp1, 8, 1);
    mov(vtmp1, B, 1, zr);
  } else {
    sve_vector_extend(vtmp1, D, vtmp1, B);
  }

  // The second source input of the BDEP instruction, initialized with 0x01
  // in each byte.
  // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
  sve_dup(vtmp2, B, 1);

  // BDEP vtmp1.D, vtmp1.D, vtmp2.D
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  // vtmp2 = 0x0101010101010101 | 0x0101010101010101
  //         ---------------------------------------
  // vtmp1 = 0x0001010000010001 | 0x0100000001010001
  sve_bdep(vtmp1, D, vtmp1, vtmp2);

  if (bt != T_BYTE) {
    sve_vector_extend(vtmp1, size, vtmp1, B);
  }
  // Generate the mask according to the given vector, in which the elements
  // have been extended to the expected type.
  // dst = 0b01100101 10001101
  sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
}
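
// A scalar sketch of the BDEP step above (illustrative only; the helper name
// is made up): each bit i of a byte-sized piece of the mask word becomes
// byte i of the vector, holding 0 or 1:
//
//    void bdep_bytes(uint8_t* dst, uint64_t src_byte) {
//      for (int i = 0; i < 8; i++) {
//        dst[i] = (src_byte >> i) & 1;   // deposit bit i into byte i
//      }
//    }
//
// sve_cmp(NE, ..., 0) then turns those 0x00/0x01 bytes back into a predicate.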

// Clobbers: rflags
void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
                                    FloatRegister zn, FloatRegister zm, int cond) {
  assert(pg->is_governing(), "This register has to be a governing predicate register");
  FloatRegister z1 = zn, z2 = zm;
  // Convert the original BoolTest condition to Assembler::condition.
  Condition condition;
  switch (cond) {
    case BoolTest::eq: condition = Assembler::EQ; break;
    case BoolTest::ne: condition = Assembler::NE; break;
    case BoolTest::le: z1 = zm; z2 = zn; condition = Assembler::GE; break;
    case BoolTest::ge: condition = Assembler::GE; break;
    case BoolTest::lt: z1 = zm; z2 = zn; condition = Assembler::GT; break;
    case BoolTest::gt: condition = Assembler::GT; break;
    default:
      assert(false, "unsupported compare condition");
      ShouldNotReachHere();
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (bt == T_FLOAT || bt == T_DOUBLE) {
    sve_fcm(condition, pd, size, pg, z1, z2);
  } else {
    assert(is_integral_type(bt), "unsupported element type");
    sve_cmp(condition, pd, size, pg, z1, z2);
  }
}

// Get index of the last mask lane that is set
void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
  SIMD_RegVariant size = elemType_to_regVariant(bt);
  sve_rev(ptmp, size, src);
  sve_brkb(ptmp, ptrue, ptmp, false);
  sve_cntp(dst, size, ptrue, ptmp);
  movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
  subw(dst, rscratch1, dst);
}
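
// A scalar sketch of the computation above (illustrative only): reversing the
// predicate turns "last set lane" into "first set lane", BRKB keeps only the
// lanes before that one, and CNTP counts them:
//
//    int vmask_lasttrue(const bool* mask, int lane_cnt) {
//      int trailing_inactive = 0;              // lanes after the last set one
//      while (trailing_inactive < lane_cnt &&
//             !mask[lane_cnt - 1 - trailing_inactive]) {
//        trailing_inactive++;
//      }
//      return (lane_cnt - 1) - trailing_inactive;  // -1 if no lane is set
//    }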

// Extend integer vector src to dst with the same lane count
// but larger element size, e.g. 4B -> 4I
void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
                                           FloatRegister src, BasicType src_bt) {
  if (src_bt == T_BYTE) {
    if (dst_bt == T_SHORT) {
      // 4B/8B to 4S/8S
      assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
      sxtl(dst, T8H, src, T8B);
    } else {
      // 4B to 4I
      assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
      sxtl(dst, T8H, src, T8B);
      sxtl(dst, T4S, dst, T4H);
    }
  } else if (src_bt == T_SHORT) {
    // 4S to 4I
    assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
    sxtl(dst, T4S, src, T4H);
  } else if (src_bt == T_INT) {
    // 2I to 2L
    assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
    sxtl(dst, T2D, src, T2S);
  } else {
    ShouldNotReachHere();
  }
}

// Narrow integer vector src down to dst with the same lane count
// but smaller element size, e.g. 4I -> 4B
void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
                                           FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
  if (src_bt == T_SHORT) {
    // 4S/8S to 4B/8B
    assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_BYTE, "unsupported");
    xtn(dst, T8B, src, T8H);
  } else if (src_bt == T_INT) {
    // 4I to 4B/4S
    assert(src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
    xtn(dst, T4H, src, T4S);
    if (dst_bt == T_BYTE) {
      xtn(dst, T8B, dst, T8H);
    }
  } else if (src_bt == T_LONG) {
    // 2L to 2I
    assert(src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_INT, "unsupported");
    xtn(dst, T2S, src, T2D);
  } else {
    ShouldNotReachHere();
  }
}

void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
                                          FloatRegister src, SIMD_RegVariant src_size) {
  assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
  if (src_size == B) {
    switch (dst_size) {
    case H:
      sve_sunpklo(dst, H, src);
      break;
    case S:
      sve_sunpklo(dst, H, src);
      sve_sunpklo(dst, S, dst);
      break;
    case D:
      sve_sunpklo(dst, H, src);
      sve_sunpklo(dst, S, dst);
      sve_sunpklo(dst, D, dst);
      break;
    default:
      ShouldNotReachHere();
    }
  } else if (src_size == H) {
    if (dst_size == S) {
      sve_sunpklo(dst, S, src);
    } else { // D
      sve_sunpklo(dst, S, src);
      sve_sunpklo(dst, D, dst);
    }
  } else if (src_size == S) {
    sve_sunpklo(dst, D, src);
  }
}

// Vector narrow from src to dst with specified element sizes.
// High part of dst vector will be filled with zero.
void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
                                          FloatRegister src, SIMD_RegVariant src_size,
                                          FloatRegister tmp) {
  assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
  assert_different_registers(src, tmp);
  sve_dup(tmp, src_size, 0);
  if (src_size == D) {
    switch (dst_size) {
    case S:
      sve_uzp1(dst, S, src, tmp);
      break;
    case H:
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, S, src, tmp);
      sve_uzp1(dst, H, dst, tmp);
      break;
    case B:
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, S, src, tmp);
      sve_uzp1(dst, H, dst, tmp);
      sve_uzp1(dst, B, dst, tmp);
      break;
    default:
      ShouldNotReachHere();
    }
  } else if (src_size == S) {
    if (dst_size == H) {
      sve_uzp1(dst, H, src, tmp);
    } else { // B
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, H, src, tmp);
      sve_uzp1(dst, B, dst, tmp);
    }
  } else if (src_size == H) {
    sve_uzp1(dst, B, src, tmp);
  }
}

// Extend src predicate to dst predicate with the same lane count but larger
// element size, e.g. 64Byte -> 512Long
void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
                                             uint dst_element_length_in_bytes,
                                             uint src_element_length_in_bytes) {
  if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
    sve_punpklo(dst, src);
  } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
    sve_punpklo(dst, src);
    sve_punpklo(dst, dst);
  } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
    sve_punpklo(dst, src);
    sve_punpklo(dst, dst);
    sve_punpklo(dst, dst);
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Narrow src predicate to dst predicate with the same lane count but
// smaller element size, e.g. 512Long -> 64Byte
void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src,
                                             uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
  // The insignificant bits in src predicate are expected to be zero.
  if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
    sve_uzp1(dst, B, src, src);
  } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
    sve_uzp1(dst, H, src, src);
    sve_uzp1(dst, B, dst, dst);
  } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
    sve_uzp1(dst, S, src, src);
    sve_uzp1(dst, H, dst, dst);
    sve_uzp1(dst, B, dst, dst);
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}
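
// For intuition (illustrative only): PUNPKLO widens the low-half lanes of a
// predicate and UZP1 keeps the even-numbered lanes, so each call doubles or
// halves the predicate's element size while preserving lane order, e.g. for
// a byte-lane predicate b7..b0 being converted to/from halfword lanes:
//
//    extend one step:  dst(H) = { b3, b2, b1, b0 }    // punpklo
//    narrow one step:  dst(B) = even lanes of src(H)  // uzp1
//
// That is why the 4x and 8x conversions above simply chain two and three of
// the same instruction.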

// Vector reduction add for integral type with ASIMD instructions.
void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
                                                 Register isrc, FloatRegister vsrc,
                                                 unsigned vector_length_in_bytes,
                                                 FloatRegister vtmp) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_add_integral {");
    switch(bt) {
      case T_BYTE:
        addv(vtmp, isQ ? T16B : T8B, vsrc);
        smov(dst, vtmp, B, 0);
        addw(dst, dst, isrc, ext::sxtb);
        break;
      case T_SHORT:
        addv(vtmp, isQ ? T8H : T4H, vsrc);
        smov(dst, vtmp, H, 0);
        addw(dst, dst, isrc, ext::sxth);
        break;
      case T_INT:
        isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
        umov(dst, vtmp, S, 0);
        addw(dst, dst, isrc);
        break;
      case T_LONG:
        assert(isQ, "unsupported");
        addpd(vtmp, vsrc);
        umov(dst, vtmp, D, 0);
        add(dst, dst, isrc);
        break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  BLOCK_COMMENT("} neon_reduce_add_integral");
}

// Vector reduction multiply for integral type with ASIMD instructions.
// Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
// Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
                                                 Register isrc, FloatRegister vsrc,
                                                 unsigned vector_length_in_bytes,
                                                 FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_mul_integral {");
    switch(bt) {
      case T_BYTE:
        if (isQ) {
          // Multiply the lower half and higher half of the vector iteratively.
          // vtmp1 = vsrc[8:15]
          ins(vtmp1, D, vsrc, 0, 1);
          // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
          mulv(vtmp1, T8B, vtmp1, vsrc);
          // vtmp2 = vtmp1[4:7]
          ins(vtmp2, S, vtmp1, 0, 1);
          // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
          mulv(vtmp1, T8B, vtmp2, vtmp1);
        } else {
          ins(vtmp1, S, vsrc, 0, 1);
          mulv(vtmp1, T8B, vtmp1, vsrc);
        }
        // vtmp2 = vtmp1[2:3]
        ins(vtmp2, H, vtmp1, 0, 1);
        // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
        mulv(vtmp2, T8B, vtmp2, vtmp1);
        // dst = vtmp2[0] * isrc * vtmp2[1]
        umov(rscratch1, vtmp2, B, 0);
        mulw(dst, rscratch1, isrc);
        sxtb(dst, dst);
        umov(rscratch1, vtmp2, B, 1);
        mulw(dst, rscratch1, dst);
        sxtb(dst, dst);
        break;
      case T_SHORT:
        if (isQ) {
          ins(vtmp2, D, vsrc, 0, 1);
          mulv(vtmp2, T4H, vtmp2, vsrc);
          ins(vtmp1, S, vtmp2, 0, 1);
          mulv(vtmp1, T4H, vtmp1, vtmp2);
        } else {
          ins(vtmp1, S, vsrc, 0, 1);
          mulv(vtmp1, T4H, vtmp1, vsrc);
        }
        umov(rscratch1, vtmp1, H, 0);
        mulw(dst, rscratch1, isrc);
        sxth(dst, dst);
        umov(rscratch1, vtmp1, H, 1);
        mulw(dst, rscratch1, dst);
        sxth(dst, dst);
        break;
      case T_INT:
        if (isQ) {
          ins(vtmp1, D, vsrc, 0, 1);
          mulv(vtmp1, T2S, vtmp1, vsrc);
        } else {
          vtmp1 = vsrc;
        }
        umov(rscratch1, vtmp1, S, 0);
        mul(dst, rscratch1, isrc);
        umov(rscratch1, vtmp1, S, 1);
        mul(dst, rscratch1, dst);
        break;
      case T_LONG:
        umov(rscratch1, vsrc, D, 0);
        mul(dst, isrc, rscratch1);
        umov(rscratch1, vsrc, D, 1);
        mul(dst, dst, rscratch1);
        break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  BLOCK_COMMENT("} neon_reduce_mul_integral");
}
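
// The T_BYTE/T_SHORT/T_INT cases above implement a log2(N) pairwise
// reduction: each step multiplies the low half of the remaining lanes by the
// high half. A scalar sketch of the idea (illustrative only; the real code
// finishes the last two lanes with scalar multiplies):
//
//    int reduce_mul(int* v, int n, int isrc) {   // n is a power of two
//      for (int len = n / 2; len >= 1; len /= 2) {
//        for (int i = 0; i < len; i++) {
//          v[i] = v[i] * v[i + len];   // one mulv on the halved vector
//        }
//      }
//      return v[0] * isrc;   // fold in the scalar input
//    }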
1430 
1431 // Vector reduction multiply for floating-point type with ASIMD instructions.
void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
                                           FloatRegister fsrc, FloatRegister vsrc,
                                           unsigned vector_length_in_bytes,
                                           FloatRegister vtmp) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_mul_fp {");
    switch(bt) {
      case T_FLOAT:
        fmuls(dst, fsrc, vsrc);
        ins(vtmp, S, vsrc, 0, 1);
        fmuls(dst, dst, vtmp);
        if (isQ) {
          ins(vtmp, S, vsrc, 0, 2);
          fmuls(dst, dst, vtmp);
          ins(vtmp, S, vsrc, 0, 3);
          fmuls(dst, dst, vtmp);
        }
        break;
      case T_DOUBLE:
        assert(isQ, "unsupported");
        fmuld(dst, fsrc, vsrc);
        ins(vtmp, D, vsrc, 0, 1);
        fmuld(dst, dst, vtmp);
        break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  BLOCK_COMMENT("} neon_reduce_mul_fp");
}

// Helper to select logical instruction
void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
                                                   Register Rn, Register Rm,
                                                   enum shift_kind kind, unsigned shift) {
  switch(opc) {
    case Op_AndReductionV:
      is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
      break;
    case Op_OrReductionV:
      is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
      break;
    case Op_XorReductionV:
      is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
}

// Vector reduction logical operations And, Or, Xor
// Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
                                            Register isrc, FloatRegister vsrc,
                                            unsigned vector_length_in_bytes) {
  assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
         "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_logical {");
    umov(rscratch1, vsrc, isQ ? D : S, 0);
    umov(dst, vsrc, isQ ? D : S, 1);
    neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
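    // dst now holds the two vector halves combined into one scalar. Each fold below
    // halves the live width; e.g. for T_BYTE on a 128-bit vector, the LSR #32, #16
    // and #8 steps leave the logical op of all 16 bytes in dst[7:0]. isrc is folded
    // in last, and sub-word results are sign-extended.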
    switch(bt) {
      case T_BYTE:
        if (isQ) {
          neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
        }
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
        sxtb(dst, dst);
        break;
      case T_SHORT:
        if (isQ) {
          neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
        }
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
        sxth(dst, dst);
        break;
      case T_INT:
        if (isQ) {
          neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
        }
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
        break;
      case T_LONG:
        assert(isQ, "unsupported");
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
        break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  BLOCK_COMMENT("} neon_reduce_logical");
}

// Vector reduction min/max for integral type with ASIMD instructions.
// Note: vtmp is not used and is expected to be fnoreg for the T_LONG case.
// Clobbers: rscratch1, rflags
void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
                                                    Register isrc, FloatRegister vsrc,
                                                    unsigned vector_length_in_bytes,
                                                    FloatRegister vtmp) {
  assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;
  bool is_min = opc == Op_MinReductionV;

  BLOCK_COMMENT("neon_reduce_minmax_integral {");
    if (bt == T_LONG) {
      assert(vtmp == fnoreg, "should be");
      assert(isQ, "should be");
      umov(rscratch1, vsrc, D, 0);
      cmp(isrc, rscratch1);
      csel(dst, isrc, rscratch1, is_min ? LT : GT);
      umov(rscratch1, vsrc, D, 1);
      cmp(dst, rscratch1);
      csel(dst, dst, rscratch1, is_min ? LT : GT);
    } else {
      SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
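      // SMINV/SMAXV do not support the 2S arrangement, so for a 64-bit vector of ints
      // the two lanes are folded with a pairwise min/max instead.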
      if (size == T2S) {
        is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
      } else {
        is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
      }
      if (bt == T_INT) {
        umov(dst, vtmp, S, 0);
      } else {
        smov(dst, vtmp, elemType_to_regVariant(bt), 0);
      }
      cmpw(dst, isrc);
      cselw(dst, dst, isrc, is_min ? LT : GT);
    }
  BLOCK_COMMENT("} neon_reduce_minmax_integral");
}

// Vector reduction for integral type with SVE instructions.
// Supported operations are Add, And, Or, Xor, Max, Min.
// rflags is clobbered when opc is Op_MaxReductionV or Op_MinReductionV.
void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
                                            FloatRegister src2, PRegister pg, FloatRegister tmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  assert(pg->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(src1, dst);
  // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  switch (opc) {
    case Op_AddReductionVI: {
      sve_uaddv(tmp, size, pg, src2);
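      // UADDV sums all active lanes into a single 64-bit scalar. Reading the result
      // back at the element size truncates it to the lane width, matching Java's
      // wrap-around semantics; smov plus the sign-extending add restores the sign
      // for sub-word types.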
      if (bt == T_BYTE) {
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxtb);
      } else if (bt == T_SHORT) {
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxth);
      } else {
        umov(dst, tmp, size, 0);
        addw(dst, dst, src1);
      }
      break;
    }
    case Op_AddReductionVL: {
      sve_uaddv(tmp, size, pg, src2);
      umov(dst, tmp, size, 0);
      add(dst, dst, src1);
      break;
    }
    case Op_AndReductionV: {
      sve_andv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        andr(dst, dst, src1);
      } else {
        andw(dst, dst, src1);
      }
      break;
    }
    case Op_OrReductionV: {
      sve_orv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        orr(dst, dst, src1);
      } else {
        orrw(dst, dst, src1);
      }
      break;
    }
    case Op_XorReductionV: {
      sve_eorv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        eor(dst, dst, src1);
      } else {
        eorw(dst, dst, src1);
      }
      break;
    }
    case Op_MaxReductionV: {
      sve_smaxv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        cmp(dst, src1);
        csel(dst, dst, src1, Assembler::GT);
      } else {
        cmpw(dst, src1);
        cselw(dst, dst, src1, Assembler::GT);
      }
      break;
    }
    case Op_MinReductionV: {
      sve_sminv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        cmp(dst, src1);
        csel(dst, dst, src1, Assembler::LT);
      } else {
        cmpw(dst, src1);
        cselw(dst, dst, src1, Assembler::LT);
      }
      break;
    }
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }

  if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
    if (bt == T_BYTE) {
      sxtb(dst, dst);
    } else if (bt == T_SHORT) {
      sxth(dst, dst);
    }
  }
}

// Set elements of the dst predicate to true for lanes in the range [0, lane_cnt), and
// to false otherwise. The input "lane_cnt" must be smaller than or equal to the supported
// max vector length of the basic type. Clobbers: rscratch1, rflags.
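// For example, with bt == T_SHORT on a 256-bit SVE machine, max_vector_length is 16:
// lane_cnt == 8 maps to the fixed VL8 pattern, lane_cnt == 15 matches the MUL3
// pattern (16 - 16 % 3), and a count such as 11 falls back to "whilelow".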
void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
  uint32_t max_vector_length = Matcher::max_vector_size(bt);
  assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");

  // Set all elements to false if the input "lane_cnt" is zero.
  if (lane_cnt == 0) {
    sve_pfalse(dst);
    return;
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  assert(size != Q, "invalid size");

  // Set all elements to true if "lane_cnt" equals the max lane count.
  if (lane_cnt == max_vector_length) {
    sve_ptrue(dst, size, /* ALL */ 0b11111);
    return;
  }

  // Fixed numbers for "ptrue".
  switch(lane_cnt) {
  case 1: /* VL1 */
  case 2: /* VL2 */
  case 3: /* VL3 */
  case 4: /* VL4 */
  case 5: /* VL5 */
  case 6: /* VL6 */
  case 7: /* VL7 */
  case 8: /* VL8 */
    sve_ptrue(dst, size, lane_cnt);
    return;
  case 16:
    sve_ptrue(dst, size, /* VL16 */ 0b01001);
    return;
  case 32:
    sve_ptrue(dst, size, /* VL32 */ 0b01010);
    return;
  case 64:
    sve_ptrue(dst, size, /* VL64 */ 0b01011);
    return;
  case 128:
    sve_ptrue(dst, size, /* VL128 */ 0b01100);
    return;
  case 256:
    sve_ptrue(dst, size, /* VL256 */ 0b01101);
    return;
  default:
    break;
  }

  // Special patterns for "ptrue".
  if (lane_cnt == round_down_power_of_2(max_vector_length)) {
    sve_ptrue(dst, size, /* POW2 */ 0b00000);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
    sve_ptrue(dst, size, /* MUL4 */ 0b11101);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
    sve_ptrue(dst, size, /* MUL3 */ 0b11110);
  } else {
    // Encode to "whilelow" for the remaining cases.
    mov(rscratch1, lane_cnt);
    sve_whilelow(dst, size, zr, rscratch1);
  }
}

// Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
// Any remaining elements of dst will be filled with zero.
// Clobbers: rscratch1
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
                                           FloatRegister vtmp1, FloatRegister vtmp2,
                                           PRegister pgtmp) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2);
  assert_different_registers(mask, pgtmp);

  // Example input:   src   = 8888 7777 6666 5555 4444 3333 2222 1111
  //                  mask  = 0001 0000 0000 0001 0001 0000 0001 0001
  // Expected result: dst   = 0000 0000 0000 8888 5555 4444 2222 1111
  sve_dup(vtmp2, H, 0);

  // Extend the lowest half to type INT.
  // dst = 00004444 00003333 00002222 00001111
  sve_uunpklo(dst, S, src);
  // pgtmp = 00000001 00000000 00000001 00000001
  sve_punpklo(pgtmp, mask);
  // Pack the active elements, as INT-sized lanes, into the lowest-numbered lanes
  // and fill the remaining lanes with zero.
  // dst = 00000000 00004444 00002222 00001111
  sve_compact(dst, S, dst, pgtmp);
  // Narrow the result back to type SHORT.
  // dst = 0000 0000 0000 0000 0000 4444 2222 1111
  sve_uzp1(dst, H, dst, vtmp2);
  // Count the active elements of the lowest half.
  // rscratch1 = 3
  sve_cntp(rscratch1, S, ptrue, pgtmp);

  // Repeat for the highest half.
  // pgtmp = 00000001 00000000 00000000 00000001
  sve_punpkhi(pgtmp, mask);
  // vtmp1 = 00008888 00007777 00006666 00005555
  sve_uunpkhi(vtmp1, S, src);
  // vtmp1 = 00000000 00000000 00008888 00005555
  sve_compact(vtmp1, S, vtmp1, pgtmp);
  // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  sve_uzp1(vtmp1, H, vtmp1, vtmp2);

  // Compressed low:   dst   = 0000 0000 0000 0000 0000 4444 2222 1111
  // Compressed high:  vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  // Shift the compressed high part left across lanes by TRUE_CNT lanes, where
  // TRUE_CNT is the number of active elements in the compressed low part.
  neg(rscratch1, rscratch1);
  // vtmp2 = {4 3 2 1 0 -1 -2 -3}
  sve_index(vtmp2, H, rscratch1, 1);
  // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
  sve_tbl(vtmp1, H, vtmp1, vtmp2);
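  // (sve_tbl reads lanes whose index is out of range as zero, so the lowest TRUE_CNT
  // lanes of the shifted vector are cleared and can simply be OR-ed into dst.)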

  // Combine the shifted compressed high part with the compressed low part.
  // dst = 0000 0000 0000 8888 5555 4444 2222 1111
  sve_orr(dst, dst, vtmp1);
}

// Clobbers: rscratch1, rscratch2
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
                                          FloatRegister vtmp1, FloatRegister vtmp2,
                                          FloatRegister vtmp3, FloatRegister vtmp4,
                                          PRegister ptmp, PRegister pgtmp) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
  assert_different_registers(mask, ptmp, pgtmp);
  // Example input:   src   = 88 77 66 55 44 33 22 11
  //                  mask  = 01 00 00 01 01 00 01 01
  // Expected result: dst   = 00 00 00 88 55 44 22 11

  sve_dup(vtmp4, B, 0);
  // Extend the lowest half to type SHORT.
  // vtmp1 = 0044 0033 0022 0011
  sve_uunpklo(vtmp1, H, src);
  // ptmp = 0001 0000 0001 0001
  sve_punpklo(ptmp, mask);
  // Count the active elements of the lowest half.
  // rscratch2 = 3
  sve_cntp(rscratch2, H, ptrue, ptmp);
  // Pack the active elements, as SHORT-sized lanes, into the lowest-numbered lanes
  // and fill the remaining lanes with zero.
  // dst = 0000 0044 0022 0011
  sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
  // Narrow the result back to type BYTE.
  // dst = 00 00 00 00 00 44 22 11
  sve_uzp1(dst, B, dst, vtmp4);

  // Repeat for the highest half.
  // ptmp = 0001 0000 0000 0001
  sve_punpkhi(ptmp, mask);
  // vtmp2 = 0088 0077 0066 0055
  sve_uunpkhi(vtmp2, H, src);
  // vtmp1 = 0000 0000 0088 0055
  sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);

  sve_dup(vtmp4, B, 0);
  // vtmp1 = 00 00 00 00 00 00 88 55
  sve_uzp1(vtmp1, B, vtmp1, vtmp4);

  // Compressed low:   dst   = 00 00 00 00 00 44 22 11
  // Compressed high:  vtmp1 = 00 00 00 00 00 00 88 55
  // Shift the compressed high part left across lanes by TRUE_CNT lanes, where
  // TRUE_CNT is the number of active elements in the compressed low part.
  neg(rscratch2, rscratch2);
  // vtmp2 = {4 3 2 1 0 -1 -2 -3}
  sve_index(vtmp2, B, rscratch2, 1);
  // vtmp1 = 00 00 00 88 55 00 00 00
  sve_tbl(vtmp1, B, vtmp1, vtmp2);
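  // (As in sve_compress_short, out-of-range tbl indices read as zero, clearing the
  // lowest TRUE_CNT lanes of the shifted vector.)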
  // Combine the shifted compressed high part with the compressed low part.
  // dst = 00 00 00 88 55 44 22 11
  sve_orr(dst, dst, vtmp1);
}

void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
  SIMD_Arrangement size = isQ ? T16B : T8B;
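  // rbit reverses the bits within each byte only, so for lanes wider than a byte the
  // byte order has to be reversed as well to get a full per-element bit reversal.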
  if (bt == T_BYTE) {
    rbit(dst, size, src);
  } else {
    neon_reverse_bytes(dst, src, bt, isQ);
    rbit(dst, size, dst);
  }
}

void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
  SIMD_Arrangement size = isQ ? T16B : T8B;
  switch (bt) {
    case T_BYTE:
      if (dst != src) {
        orr(dst, size, src, src);
      }
      break;
    case T_SHORT:
      rev16(dst, size, src);
      break;
    case T_INT:
      rev32(dst, size, src);
      break;
    case T_LONG:
      rev64(dst, size, src);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
}

// Extract a scalar element from an SVE vector at position 'idx'.
// The input elements in src are expected to be of integral type.
void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
                                             int idx, FloatRegister vtmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
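  // When the element lies entirely within the low 128 bits, it can be addressed
  // directly with a NEON umov/smov. Otherwise sve_ext first shifts it down to lane 0;
  // "idx << size" converts the lane index into the byte offset sve_ext expects.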
  if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, src, size, idx);
    } else {
      smov(dst, src, size, idx);
    }
  } else {
    sve_orr(vtmp, src, src);
    sve_ext(vtmp, vtmp, idx << size);
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, vtmp, size, 0);
    } else {
      smov(dst, vtmp, size, 0);
    }
  }
}

// java.lang.Math::round intrinsics

// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                          FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
  assert_different_registers(tmp1, tmp2, tmp3, src, dst);
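  // Strategy: compute both candidate results and select per lane. fcvtas (round to
  // nearest, ties away from zero) already matches Math.round for positive inputs,
  // NaNs (which become 0) and negatives of large magnitude (already integral), while
  // floor(src + 0.5) via fadd + fcvtms is needed for the remaining small-magnitude
  // negatives, where ties must round up: Math.round(-2.5f) == -2, but ties-to-away
  // gives -3. The unsigned cmhs of -src against the bit pattern of 2^23 (2^52 for
  // doubles) computes exactly the lanes whose fcvtas result is kept by the bif below.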
  switch (T) {
    case T2S:
    case T4S:
      fmovs(tmp1, T, 0.5f);
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case T2D:
      fmovd(tmp1, T, 0.5);
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
  }
  fadd(tmp1, T, tmp1, src);
  fcvtms(tmp1, T, tmp1);
  // tmp1 = floor(src + 0.5, ties to even)

  fcvtas(dst, T, src);
  // dst = round(src), ties to away

  fneg(tmp3, T, src);
  dup(tmp2, T, rscratch1);
  cmhs(tmp3, T, tmp3, tmp2);
  // tmp3 is now a per-lane mask, all-ones where the fcvtas result should be kept

  bif(dst, T16B, tmp1, tmp3);
  // result in dst
}

// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                         FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(tmp1, tmp2, src, dst);

  switch (T) {
    case S:
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case D:
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == S || T == D, "invalid register variant");
  }

  sve_frinta(dst, T, ptrue, src);
  // dst = round(src), ties to away

  Label none;

  sve_fneg(tmp1, T, ptrue, src);
  sve_dup(tmp2, T, rscratch1);
  sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
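  // SVE predicate-generating compares set the NZCV flags; EQ means no active lane
  // matched, i.e. no lane needs the floor(src + 0.5) correction, so skip it entirely.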
  br(EQ, none);
  {
    sve_cpy(tmp1, T, pgtmp, 0.5);
    sve_fadd(tmp1, T, pgtmp, src);
    sve_frintm(dst, T, pgtmp, tmp1);
    // dst = floor(src + 0.5, ties to even)
  }
  bind(none);

  sve_fcvtzs(dst, T, ptrue, dst, T);
  // result in dst
}

void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                           FloatRegister one, SIMD_Arrangement T) {
  assert_different_registers(dst, src, zero, one);
  assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");

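  // facgt sets a lane to all-ones when |src| > 0, i.e. false for +/-0.0 and NaN. The
  // ushr turns that into a 0x7FF..F mask, so the bsl below takes exponent and mantissa
  // bits from "one" and the sign bit from src, producing +/-1.0 with the sign of src,
  // while +/-0.0 and NaN lanes fall through unchanged from src.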
  facgt(dst, T, src, zero);
  ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
  bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
}

void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                          FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
    assert_different_registers(dst, src, zero, one, vtmp);
    assert(pgtmp->is_governing(), "This register has to be a governing predicate register");

    sve_orr(vtmp, src, src);
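    // Work on a copy of src: the immediate forms of sve_and/sve_orr below operate
    // in place, and src is still needed by the sve_sel at the end.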
    sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN, 1 otherwise
    switch (T) {
    case S:
      sve_and(vtmp, T, min_jint); // Extract the sign bit of the float value in every lane of src
      sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
                                        // on the sign of the float value
      break;
    case D:
      sve_and(vtmp, T, min_jlong);
      sve_orr(vtmp, T, jlong_cast(1.0));
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
    }
    sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
                                       // Result in dst
}

bool C2_MacroAssembler::in_scratch_emit_size() {
  if (ciEnv::current()->task() != NULL) {
    PhaseOutput* phase_output = Compile::current()->output();
    if (phase_output != NULL && phase_output->in_scratch_emit_size()) {
      return true;
    }
  }
  return MacroAssembler::in_scratch_emit_size();
}