1 /*
   2  * Copyright (c) 2020, 2021, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "opto/c2_MacroAssembler.hpp"
  29 #include "opto/intrinsicnode.hpp"
  30 #include "opto/subnode.hpp"
  31 #include "runtime/stubRoutines.hpp"
  32 
  33 #ifdef PRODUCT
     // PRODUCT builds: BLOCK_COMMENT expands to nothing and STOP only calls stop().
  34 #define BLOCK_COMMENT(str) /* nothing */
  35 #define STOP(error) stop(error)
  36 #else
     // Non-PRODUCT builds: the text is additionally passed to block_comment().
     // NOTE(review): STOP expands to two statements here, so it is only safe
     // where a full statement is expected (not as the body of an unbraced if).
  37 #define BLOCK_COMMENT(str) block_comment(str)
  38 #define STOP(error) block_comment(error); stop(error)
  39 #endif
  40 
     // Binds `label` and then hands the stringified label name followed by ':'
     // to BLOCK_COMMENT. Expands to two statements. Because the name is
     // stringified, renaming a label changes the text given to block_comment().
  41 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  42 
     // Pointer to a MacroAssembler member taking (Rt, address). In this file it
     // is used to select between the ldrb/ldrh/ldrw/ldr emitters according to
     // the character width.
  43 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
  44 
  45 // Search for str1 in str2 and return index or -1
     //
     // str2/cnt2 describe the source string and str1/cnt1 the pattern (see the
     // comment in the body); `result` receives the index of the first match in
     // characters, or -1.
     //
     // ae selects the encodings. As computed by str1_isL/str2_isL below, the
     // pattern (str1) is 1 byte per char for LL and UL, and the source (str2) is
     // 1 byte per char for LL and LU; otherwise characters are 2 bytes wide.
     //
     // icnt1 == -1 selects the general code, which reads the pattern length from
     // cnt1. The values 1, 2, 3 and 4 select specialised loops that never read
     // cnt1 (presumably icnt1 is then the compile-time constant pattern length --
     // confirm against the caller).
     //
     // Depending on the path taken, str1, str2, cnt1, cnt2, tmp1-tmp4, rscratch1,
     // rscratch2 and the condition flags are overwritten. The general code also
     // writes v0 once cnt1 >= 8, and its Boyer-Moore path uses tmp5, tmp6 and
     // ASIZE (256) bytes of stack that are released again before leaving.
  46 void C2_MacroAssembler::string_indexof(Register str2, Register str1,
  47                                        Register cnt2, Register cnt1,
  48                                        Register tmp1, Register tmp2,
  49                                        Register tmp3, Register tmp4,
  50                                        Register tmp5, Register tmp6,
  51                                        int icnt1, Register result, int ae) {
  52   // NOTE: tmp5, tmp6 can be zr depending on specific method version
  53   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
  54 
       // Register aliases. cnt1_neg/cnt2_neg share a register with cnt1/cnt2: the
       // linear-scan code replaces the counts with negative byte offsets.
  55   Register ch1 = rscratch1;
  56   Register ch2 = rscratch2;
  57   Register cnt1tmp = tmp1;
  58   Register cnt2tmp = tmp2;
  59   Register cnt1_neg = cnt1;
  60   Register cnt2_neg = cnt2;
  61   Register result_tmp = tmp4;
  62 
  63   bool isL = ae == StrIntrinsicNode::LL;
  64 
  65   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  66   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
       // log2 of the character size (used as shift amount) and the size in bytes
  67   int str1_chr_shift = str1_isL ? 0:1;
  68   int str2_chr_shift = str2_isL ? 0:1;
  69   int str1_chr_size = str1_isL ? 1:2;
  70   int str2_chr_size = str2_isL ? 1:2;
       // Load emitters for one character of each string, and for 2 or 4
       // characters at once (the latter two only distinguish LL from the rest).
  71   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
  72                                       (chr_insn)&MacroAssembler::ldrh;
  73   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
  74                                       (chr_insn)&MacroAssembler::ldrh;
  75   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  76   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
  77 
  78   // Note, inline_string_indexOf() generates checks:
  79   // if (substr.count > string.count) return -1;
  80   // if (substr.count == 0) return 0;
  81 
  82   // We have two strings, a source string in str2, cnt2 and a pattern string
  83   // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
  84 
  85   // For larger pattern and source we use a simplified Boyer Moore algorithm.
  86   // With a small pattern and source we use linear scan.
  87 
  88   if (icnt1 == -1) {
         // result_tmp = cnt2 - cnt1: the last index at which the pattern can start
  89     sub(result_tmp, cnt2, cnt1);
  90     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
  91     br(LT, LINEARSEARCH);
         // v0 = low byte of cnt1 in all 16 lanes; BM_INIT_LOOP below stores it to
         // fill the bad-character table with the pattern length.
  92     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
         // If cnt1 < 256 the ccmp compares cnt1 with cnt2 / 4; otherwise it sets
         // NZCV to 0000, for which GE holds. Net effect: take LINEARSTUB when
         // cnt1 >= 256 || cnt1 >= cnt2 / 4, else fall into the Boyer-Moore code.
  93     subs(zr, cnt1, 256);
  94     lsr(tmp1, cnt2, 2);
  95     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
  96     br(GE, LINEARSTUB);
  97   }
  98 
  99 // The Boyer Moore algorithm is based on the description here:-
 100 //
 101 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
 102 //
 103 // This describes an algorithm with 2 shift rules. The 'Bad Character' rule
 104 // and the 'Good Suffix' rule.
 105 //
 106 // These rules are essentially heuristics for how far we can shift the
 107 // pattern along the search string.
 108 //
 109 // The implementation here uses the 'Bad Character' rule only because of the
 110 // complexity of initialisation for the 'Good Suffix' rule.
 111 //
 112 // This is also known as the Boyer-Moore-Horspool algorithm:-
 113 //
 114 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
 115 //
 116 // This particular implementation has a few Java-specific optimizations.
 117 //
 118 // #define ASIZE 256
 119 //
 120 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
 121 //       int i, j;
 122 //       unsigned c;
 123 //       unsigned char bc[ASIZE];
 124 //
 125 //       /* Preprocessing */
 126 //       for (i = 0; i < ASIZE; ++i)
 127 //          bc[i] = m;
 128 //       for (i = 0; i < m - 1; ) {
 129 //          c = x[i];
 130 //          ++i;
 131 //          // c < 256 for Latin1 string, so, no need for branch
 132 //          #ifdef PATTERN_STRING_IS_LATIN1
 133 //          bc[c] = m - i;
 134 //          #else
 135 //          if (c < ASIZE) bc[c] = m - i;
 136 //          #endif
 137 //       }
 138 //
 139 //       /* Searching */
 140 //       j = 0;
 141 //       while (j <= n - m) {
 142 //          c = y[i+j];
 143 //          if (x[m-1] == c)
 144 //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
 145 //          if (i < 0) return j;
 146 //          // c < 256 for Latin1 string, so, no need for branch
 147 //          #ifdef SOURCE_STRING_IS_LATIN1
 148 //          // LL case: (c< 256) always true. Remove branch
 149 //          j += bc[y[j+m-1]];
 150 //          #endif
 151 //          #ifndef PATTERN_STRING_IS_UTF
 152 //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
 153 //          if (c < ASIZE)
 154 //            j += bc[y[j+m-1]];
 155 //          else
 156 //            j += 1
 157 //          #endif
 158 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
 159 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
 160 //          if (c < ASIZE)
 161 //            j += bc[y[j+m-1]];
 162 //          else
 163 //            j += m
 164 //          #endif
 165 //       }
 166 //    }
 167 
 168   if (icnt1 == -1) {
 169     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
 170         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
         // NOTE: cnt1end and skipch both alias tmp2 (as does cnt2tmp above), and
         // str2end aliases cnt2.
 171     Register cnt1end = tmp2;
 172     Register str2end = cnt2;
 173     Register skipch = tmp2;
 174 
 175     // str1 length is >=8, so, we can read at least 1 register for cases when
 176     // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
 177     // UL case. We'll re-read last character in inner pre-loop code to have
 178     // single outer pre-loop load
 179     const int firstStep = isL ? 7 : 3;
 180 
 181     const int ASIZE = 256;
 182     const int STORED_BYTES = 32; // amount of bytes stored per instruction
         // Reserve the ASIZE-byte bad-character table on the stack and fill it
         // with v0, STORED_BYTES per iteration. ch1 is the write pointer and
         // tmp5 the iteration counter.
 183     sub(sp, sp, ASIZE);
 184     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
 185     mov(ch1, sp);
 186     BIND(BM_INIT_LOOP);
 187       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
 188       subs(tmp5, tmp5, 1);
 189       br(GT, BM_INIT_LOOP);
 190 
           // cnt1tmp = cnt1 - 1 (index of the last pattern char)
           // tmp5    = original str2, needed at BMMATCH to compute the index
           // str2end = str2 + (cnt2 - cnt1) chars, the last window position
           // ch2     = cnt1 - 1, both loop counter and the value stored by BCLOOP
           // tmp3    = read pointer walking over the pattern
 191       sub(cnt1tmp, cnt1, 1);
 192       mov(tmp5, str2);
 193       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
 194       sub(ch2, cnt1, 1);
 195       mov(tmp3, str1);
           // For each of the first cnt1 - 1 pattern chars store the remaining
           // distance ch2 at table[char]. For a 2-byte pattern, chars >= ASIZE
           // are skipped because they have no table slot.
 196     BIND(BCLOOP);
 197       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
 198       if (!str1_isL) {
 199         subs(zr, ch1, ASIZE);
 200         br(HS, BCSKIP);
 201       }
 202       strb(ch2, Address(sp, ch1));
 203     BIND(BCSKIP);
 204       subs(ch2, ch2, 1);
 205       br(GT, BCLOOP);
 206 
           // Pre-load the tail of the pattern into tmp6 so that the search loop
           // can compare it against the source with a single 8-byte load.
 207       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
 208       if (str1_isL == str2_isL) {
 209         // load last 8 bytes (8LL/4UU symbols)
 210         ldr(tmp6, Address(tmp6, -wordSize));
 211       } else {
 212         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
 213         // convert Latin1 to UTF. We'll have to wait until load completed, but
 214         // it's still faster than per-character loads+checks
 215         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
 216         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
 217         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
 218         andr(tmp6, tmp6, 0xFF); // str1[N-4]
 219         orr(ch2, ch1, ch2, LSL, 16);
 220         orr(tmp6, tmp6, tmp3, LSL, 48);
 221         orr(tmp6, tmp6, ch2, LSL, 16);
 222       }
           // Outer search loop. skipch is the source char aligned with the last
           // pattern char; tmp3 holds the last pattern char. On a mismatch go
           // straight to the skip computation.
 223     BIND(BMLOOPSTR2);
 224       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 225       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
 226       if (str1_isL == str2_isL) {
 227         // re-init tmp3. It's for free because it's executed in parallel with
 228         // load above. Alternative is to initialize it before loop, but it'll
 229         // affect performance on in-order systems with 2 or more ld/st pipelines
 230         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
 231       }
 232       if (!isL) { // UU/UL case
 233         lsl(ch2, cnt1tmp, 1); // offset in bytes
 234       }
 235       cmp(tmp3, skipch);
 236       br(NE, BMSKIP);
           // Last chars match: compare the 8 source bytes at the end of the
           // window (ch2) against the pre-loaded pattern tail (ch1 = tmp6).
 237       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
 238       mov(ch1, tmp6);
 239       if (isL) {
 240         b(BMLOOPSTR1_AFTER_LOAD);
 241       } else {
 242         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
 243         b(BMLOOPSTR1_CMP);
 244       }
           // Inner loop: compare the remaining chars one by one, walking
           // backwards with cnt1tmp as index; it going negative means the
           // comparison at BMLOOPSTR1_LASTCMP is the final one.
 245     BIND(BMLOOPSTR1);
 246       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
 247       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 248     BIND(BMLOOPSTR1_AFTER_LOAD);
 249       subs(cnt1tmp, cnt1tmp, 1);
 250       br(LT, BMLOOPSTR1_LASTCMP);
 251     BIND(BMLOOPSTR1_CMP);
 252       cmp(ch1, ch2);
 253       br(EQ, BMLOOPSTR1);
           // Mismatch: result_tmp becomes the shift distance in chars. It is read
           // from the table unless skipch >= ASIZE (possible only when !isL), in
           // which case the default chosen below is kept.
 254     BIND(BMSKIP);
 255       if (!isL) {
 256         // if we've met UTF symbol while searching Latin1 pattern, then we can
 257         // skip cnt1 symbols
 258         if (str1_isL != str2_isL) {
 259           mov(result_tmp, cnt1);
 260         } else {
 261           mov(result_tmp, 1);
 262         }
 263         subs(zr, skipch, ASIZE);
 264         br(HS, BMADV);
 265       }
 266       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
           // Advance the window by result_tmp chars, reset cnt1tmp and continue
           // while str2 <= str2end; otherwise release the table and report no match.
 267     BIND(BMADV);
 268       sub(cnt1tmp, cnt1, 1);
 269       add(str2, str2, result_tmp, LSL, str2_chr_shift);
 270       cmp(str2, str2end);
 271       br(LE, BMLOOPSTR2);
 272       add(sp, sp, ASIZE);
 273       b(NOMATCH);
 274     BIND(BMLOOPSTR1_LASTCMP);
 275       cmp(ch1, ch2);
 276       br(NE, BMSKIP);
           // Full match: result = distance from the original str2 (saved in tmp5),
           // converted from bytes to chars for a 2-byte source; release the table.
 277     BIND(BMMATCH);
 278       sub(result, str2, tmp5);
 279       if (!str2_isL) lsr(result, result, 1);
 280       add(sp, sp, ASIZE);
 281       b(DONE);
 282 
         // Reached from the dispatch above when Boyer-Moore is not used. Patterns
         // shorter than 16 chars go to the inline LINEAR_MEDIUM loop, longer ones
         // call the out-of-line stub matching the encoding pair.
 283     BIND(LINEARSTUB);
 284     cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
 285     br(LT, LINEAR_MEDIUM);
 286     mov(result, zr);
 287     RuntimeAddress stub = NULL;
 288     if (isL) {
 289       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
 290       assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
 291     } else if (str1_isL) {
 292       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
 293        assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
 294     } else {
 295       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
 296       assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
 297     }
 298     trampoline_call(stub);
 299     b(DONE);
 300   }
 301 
       // Linear scan. All loops below use the same scheme: str2 is moved to the
       // last candidate position (result_tmp chars in) and cnt2_neg holds a
       // negative byte offset that counts up towards zero.
 302   BIND(LINEARSEARCH);
 303   {
 304     Label DO1, DO2, DO3;
 305 
 306     Register str2tmp = tmp2;
 307     Register first = tmp3;
 308 
 309     if (icnt1 == -1)
 310     {
 311         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
 312 
             // Patterns shorter than 4 chars (same encoding) or 2 chars (mixed
             // encodings) are handled by the DO1/DO2/DO3 code further down.
 313         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
 314         br(LT, DOSHORT);
           // first = first pattern char; str1 is moved to the end of the pattern
           // and cnt1_neg becomes the negative pattern length in bytes.
 315       BIND(LINEAR_MEDIUM);
 316         (this->*str1_load_1chr)(first, Address(str1));
 317         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
 318         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
 319         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 320         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 321 
           // Scan the source for the first pattern char.
 322       BIND(FIRST_LOOP);
 323         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 324         cmp(first, ch2);
 325         br(EQ, STR1_LOOP);
 326       BIND(STR2_NEXT);
 327         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 328         br(LE, FIRST_LOOP);
 329         b(NOMATCH);
 330 
           // First char matched: compare the rest of the pattern, starting at its
           // second char. The branches test the flags set by the adds on cnt1tmp
           // (the intervening add does not set flags).
 331       BIND(STR1_LOOP);
 332         adds(cnt1tmp, cnt1_neg, str1_chr_size);
 333         add(cnt2tmp, cnt2_neg, str2_chr_size);
 334         br(GE, MATCH);
 335 
 336       BIND(STR1_NEXT);
 337         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
 338         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 339         cmp(ch1, ch2);
 340         br(NE, STR2_NEXT);
 341         adds(cnt1tmp, cnt1tmp, str1_chr_size);
 342         add(cnt2tmp, cnt2tmp, str2_chr_size);
 343         br(LT, STR1_NEXT);
 344         b(MATCH);
 345 
           // Same encodings: cnt1 < 2 goes to DO1, cnt1 > 2 to DO3, and cnt1 == 2
           // falls through into DO2 (the icnt1 == 4 code is not emitted here).
           // Mixed encodings: no dispatch is emitted, DO2/DO3 are not generated,
           // and execution falls through into DO1.
 346       BIND(DOSHORT);
 347       if (str1_isL == str2_isL) {
 348         cmp(cnt1, (u1)2);
 349         br(LT, DO1);
 350         br(GT, DO3);
 351       }
 352     }
 353 
         // Four-char pattern: the whole pattern is loaded once with load_4chr and
         // compared against a 4-char load at every source position.
 354     if (icnt1 == 4) {
 355       Label CH1_LOOP;
 356 
 357         (this->*load_4chr)(ch1, str1);
 358         sub(result_tmp, cnt2, 4);
 359         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 360         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 361 
 362       BIND(CH1_LOOP);
 363         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
 364         cmp(ch1, ch2);
 365         br(EQ, MATCH);
 366         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 367         br(LE, CH1_LOOP);
 368         b(NOMATCH);
 369       }
 370 
         // Two-char pattern: same scheme with load_2chr. In the general code
         // result_tmp already holds cnt2 - cnt1, so it is only set for icnt1 == 2.
 371     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
 372       Label CH1_LOOP;
 373 
 374       BIND(DO2);
 375         (this->*load_2chr)(ch1, str1);
 376         if (icnt1 == 2) {
 377           sub(result_tmp, cnt2, 2);
 378         }
 379         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 380         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 381       BIND(CH1_LOOP);
 382         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 383         cmp(ch1, ch2);
 384         br(EQ, MATCH);
 385         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 386         br(LE, CH1_LOOP);
 387         b(NOMATCH);
 388     }
 389 
         // Three-char pattern: the first two chars (first) are compared with one
         // load_2chr load, then the third char (ch1) separately.
 390     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
 391       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
 392 
 393       BIND(DO3);
 394         (this->*load_2chr)(first, str1);
 395         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
 396         if (icnt1 == 3) {
 397           sub(result_tmp, cnt2, 3);
 398         }
 399         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 400         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 401       BIND(FIRST_LOOP);
 402         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 403         cmpw(first, ch2);
 404         br(EQ, STR1_LOOP);
 405       BIND(STR2_NEXT);
 406         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 407         br(LE, FIRST_LOOP);
 408         b(NOMATCH);
 409 
 410       BIND(STR1_LOOP);
 411         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
 412         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 413         cmp(ch1, ch2);
 414         br(NE, STR2_NEXT);
 415         b(MATCH);
 416     }
 417 
         // One-char pattern. Sources of at least 8 chars are scanned 8 bytes at a
         // time; shorter ones use the per-char loop at DO1_SHORT.
 418     if (icnt1 == -1 || icnt1 == 1) {
 419       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
 420 
 421       BIND(DO1);
 422         (this->*str1_load_1chr)(ch1, str1);
 423         cmp(cnt2, (u1)8);
 424         br(LT, DO1_SHORT);
 425 
             // result_tmp = index of the last full 8-byte group; tmp3 = constant
             // with the lowest bit of every char lane set.
 426         sub(result_tmp, cnt2, 8/str2_chr_size);
 427         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 428         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 429         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 430 
             // replicate the searched char into every lane of ch1
 431         if (str2_isL) {
 432           orr(ch1, ch1, ch1, LSL, 8);
 433         }
 434         orr(ch1, ch1, ch1, LSL, 16);
 435         orr(ch1, ch1, ch1, LSL, 32);
           // After the eor a lane is zero exactly where the source char matches.
           // (x - tmp3) & ~(x | 0x7f..7f) is non-zero iff some lane of x is zero.
 436       BIND(CH1_LOOP);
 437         ldr(ch2, Address(str2, cnt2_neg));
 438         eor(ch2, ch1, ch2);
 439         sub(tmp1, ch2, tmp3);
 440         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 441         bics(tmp1, tmp1, tmp2);
 442         br(NE, HAS_ZERO);
 443         adds(cnt2_neg, cnt2_neg, 8);
 444         br(LT, CH1_LOOP);
 445 
             // Offset reached or passed zero. If it is still below 8, run one more
             // iteration at offset 0 (mov does not change the flags set by cmp).
 446         cmp(cnt2_neg, (u1)8);
 447         mov(cnt2_neg, 0);
 448         br(LT, CH1_LOOP);
 449         b(NOMATCH);
 450 
           // Byte-reverse so the lowest-addressed lane becomes most significant;
           // clz then yields the bit position of the first flagged lane and
           // LSR 3 turns that into a byte offset added to cnt2_neg.
 451       BIND(HAS_ZERO);
 452         rev(tmp1, tmp1);
 453         clz(tmp1, tmp1);
 454         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
 455         b(MATCH);
 456 
           // Short source: compare one char at a time; falls through to NOMATCH
           // when the offset reaches zero.
 457       BIND(DO1_SHORT);
 458         mov(result_tmp, cnt2);
 459         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
 460         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
 461       BIND(DO1_LOOP);
 462         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 463         cmpw(ch1, ch2);
 464         br(EQ, MATCH);
 465         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 466         br(LT, DO1_LOOP);
 467     }
 468   }
 469   BIND(NOMATCH);
 470     mov(result, -1);
 471     b(DONE);
       // result = result_tmp + (negative byte offset converted to chars)
 472   BIND(MATCH);
 473     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
 474   BIND(DONE);
 475 }
 476 
 477 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
     // NOTE(review): the chr_insn typedef above repeats, token for token, the
     // one declared near the top of this file.
 478 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
     // uxt_insn: pointer to a MacroAssembler member taking (Rd, Rn);
     // string_compare uses it to select between uxtbw and uxthw.
 479 
 480 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
 481                                             Register ch, Register result,
 482                                             Register tmp1, Register tmp2, Register tmp3)
 483 {
       // Emits code that searches the string at str1, cnt1 chars of 2 bytes each,
       // for the char in ch. result receives the index of the first match, or -1
       // if there is none (including cnt1 == 0).
       //
       // str1, cnt1, tmp1-tmp3, rscratch1, rscratch2 and the flags are
       // overwritten; ch is overwritten as well when cnt1 >= 4.
 484   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
       // cnt1_neg shares cnt1's register: the count is replaced by a negative
       // byte offset that runs up towards zero.
 485   Register cnt1_neg = cnt1;
 486   Register ch1 = rscratch1;
 487   Register result_tmp = rscratch2;
 488 
 489   cbz(cnt1, NOMATCH);
 490 
       // fewer than 4 chars (one 8-byte word): use the per-char loop
 491   cmp(cnt1, (u1)4);
 492   br(LT, DO1_SHORT);
 493 
       // replicate the searched char into all four 16-bit lanes of ch
 494   orr(ch, ch, ch, LSL, 16);
 495   orr(ch, ch, ch, LSL, 32);
 496 
       // result_tmp = cnt1 - 4, the index of the last full 4-char group; str1 is
       // moved there and cnt1_neg becomes the matching negative byte offset.
 497   sub(cnt1, cnt1, 4);
 498   mov(result_tmp, cnt1);
 499   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
 500   sub(cnt1_neg, zr, cnt1, LSL, 1);
 501 
 502   mov(tmp3, 0x0001000100010001);
 503 
       // After the eor a lane is zero exactly where the string char equals ch.
       // (x - tmp3) & ~(x | 0x7fff..7fff) is non-zero iff some lane of x is zero.
 504   BIND(CH1_LOOP);
 505     ldr(ch1, Address(str1, cnt1_neg));
 506     eor(ch1, ch, ch1);
 507     sub(tmp1, ch1, tmp3);
 508     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
 509     bics(tmp1, tmp1, tmp2);
 510     br(NE, HAS_ZERO);
 511     adds(cnt1_neg, cnt1_neg, 8);
 512     br(LT, CH1_LOOP);
 513 
         // Offset reached or passed zero. If it is still below 8, run one more
         // iteration at offset 0 (mov does not change the flags set by cmp).
 514     cmp(cnt1_neg, (u1)8);
 515     mov(cnt1_neg, 0);
 516     br(LT, CH1_LOOP);
 517     b(NOMATCH);
 518 
       // Byte-reverse so the lowest-addressed lane becomes most significant; clz
       // then gives the bit position of the first flagged lane and LSR 3 turns it
       // into a byte offset.
 519   BIND(HAS_ZERO);
 520     rev(tmp1, tmp1);
 521     clz(tmp1, tmp1);
 522     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
 523     b(MATCH);
 524 
       // Short string: compare one char at a time; falls through to NOMATCH when
       // the offset reaches zero.
 525   BIND(DO1_SHORT);
 526     mov(result_tmp, cnt1);
 527     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
 528     sub(cnt1_neg, zr, cnt1, LSL, 1);
 529   BIND(DO1_LOOP);
 530     ldrh(ch1, Address(str1, cnt1_neg));
 531     cmpw(ch, ch1);
 532     br(EQ, MATCH);
 533     adds(cnt1_neg, cnt1_neg, 2);
 534     br(LT, DO1_LOOP);
 535   BIND(NOMATCH);
 536     mov(result, -1);
 537     b(DONE);
       // result = result_tmp + (negative byte offset converted to chars)
 538   BIND(MATCH);
 539     add(result, result_tmp, cnt1_neg, ASR, 1);
 540   BIND(DONE);
 541 }
 542 
 543 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
 544                                                 Register ch, Register result,
 545                                                 FloatRegister ztmp1,
 546                                                 FloatRegister ztmp2,
 547                                                 PRegister tmp_pg,
 548                                                 PRegister tmp_pdn, bool isL)
 549 {
       // Emits an SVE loop that searches the cnt1-element string at str1 for the
       // char in ch; result receives the index of the first match, or -1 if
       // there is none (including cnt1 == 0). isL selects byte (B) elements,
       // otherwise halfword (H) elements are used.
       //
       // Writes ztmp1, ztmp2, tmp_pg, tmp_pdn, rscratch1, rscratch2 and the flags.
       // str1, cnt1 and ch are only read.
       //
 550   // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
 551   assert(tmp_pg->is_governing(),
 552          "this register has to be a governing predicate register");
 553 
 554   Label LOOP, MATCH, DONE, NOMATCH;
       // vec_len: step added to idx per iteration (set by sve_cntb/sve_cnth below)
       // idx: element index of the vector currently being processed
 555   Register vec_len = rscratch1;
 556   Register idx = rscratch2;
 557 
 558   SIMD_RegVariant T = (isL == true) ? B : H;
 559 
 560   cbz(cnt1, NOMATCH);
 561 
 562   // Assign the particular char throughout the vector.
 563   sve_dup(ztmp2, T, ch);
 564   if (isL) {
 565     sve_cntb(vec_len);
 566   } else {
 567     sve_cnth(vec_len);
 568   }
 569   mov(idx, 0);
 570 
 571   // Generate a predicate to control the reading of input string.
 572   sve_whilelt(tmp_pg, T, idx, cnt1);
 573 
 574   BIND(LOOP);
 575     // Read a vector of 8- or 16-bit data depending on the string type. Note
 576     // that inactive elements indicated by the predicate register won't cause
 577     // a data read from memory to the destination vector.
 578     if (isL) {
 579       sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
 580     } else {
 581       sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
 582     }
 583     add(idx, idx, vec_len);
 584 
 585     // Perform the comparison. An element of the destination predicate is set
 586     // to active if the particular char is matched.
 587     sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);
 588 
 589     // Branch if the particular char is found.
 590     br(NE, MATCH);
 591 
         // Build the predicate for the next vector; the branch below tests the
         // flags set by this instruction.
 592     sve_whilelt(tmp_pg, T, idx, cnt1);
 593 
 594     // Loop back if the particular char not found.
 595     br(MI, LOOP);
 596 
 597   BIND(NOMATCH);
 598     mov(result, -1);
 599     b(DONE);
 600 
 601   BIND(MATCH);
 602     // Undo the index increment.
 603     sub(idx, idx, vec_len);
 604 
 605     // Crop the vector to find its location.
         // NOTE(review): relies on the architectural BRKA/INCP semantics, which
         // are not visible in this file: BRKA keeps the elements up to and
         // including the first match active, and INCP adds the number of active
         // elements, so result = idx - 1 + (position in vector + 1).
 606     sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
 607     add(result, idx, -1);
 608     sve_incp(result, T, tmp_pdn);
 609   BIND(DONE);
 610 }
 611 
 612 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
 613                                             Register ch, Register result,
 614                                             Register tmp1, Register tmp2, Register tmp3)
 615 {
       // Emits code that searches the string at str1, cnt1 chars of 1 byte each,
       // for the char in ch. result receives the index of the first match, or -1
       // if there is none (including cnt1 == 0).
       //
       // str1, cnt1, tmp1-tmp3, rscratch1, rscratch2 and the flags are
       // overwritten; ch is overwritten as well when cnt1 >= 8.
 616   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
       // cnt1_neg shares cnt1's register: the count is replaced by a negative
       // byte offset that runs up towards zero.
 617   Register cnt1_neg = cnt1;
 618   Register ch1 = rscratch1;
 619   Register result_tmp = rscratch2;
 620 
 621   cbz(cnt1, NOMATCH);
 622 
       // fewer than 8 chars (one 8-byte word): use the per-char loop
 623   cmp(cnt1, (u1)8);
 624   br(LT, DO1_SHORT);
 625 
       // replicate the searched char into all eight byte lanes of ch
 626   orr(ch, ch, ch, LSL, 8);
 627   orr(ch, ch, ch, LSL, 16);
 628   orr(ch, ch, ch, LSL, 32);
 629 
       // result_tmp = cnt1 - 8, the index of the last full 8-char group; str1 is
       // moved there and cnt1_neg becomes the matching negative offset.
 630   sub(cnt1, cnt1, 8);
 631   mov(result_tmp, cnt1);
 632   lea(str1, Address(str1, cnt1));
 633   sub(cnt1_neg, zr, cnt1);
 634 
 635   mov(tmp3, 0x0101010101010101);
 636 
       // After the eor a lane is zero exactly where the string char equals ch.
       // (x - tmp3) & ~(x | 0x7f..7f) is non-zero iff some lane of x is zero.
 637   BIND(CH1_LOOP);
 638     ldr(ch1, Address(str1, cnt1_neg));
 639     eor(ch1, ch, ch1);
 640     sub(tmp1, ch1, tmp3);
 641     orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
 642     bics(tmp1, tmp1, tmp2);
 643     br(NE, HAS_ZERO);
 644     adds(cnt1_neg, cnt1_neg, 8);
 645     br(LT, CH1_LOOP);
 646 
         // Offset reached or passed zero. If it is still below 8, run one more
         // iteration at offset 0 (mov does not change the flags set by cmp).
 647     cmp(cnt1_neg, (u1)8);
 648     mov(cnt1_neg, 0);
 649     br(LT, CH1_LOOP);
 650     b(NOMATCH);
 651 
       // Byte-reverse so the lowest-addressed lane becomes most significant; clz
       // then gives the bit position of the first flagged lane and LSR 3 turns it
       // into a byte offset.
 652   BIND(HAS_ZERO);
 653     rev(tmp1, tmp1);
 654     clz(tmp1, tmp1);
 655     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
 656     b(MATCH);
 657 
       // Short string: compare one char at a time; falls through to NOMATCH when
       // the offset reaches zero.
 658   BIND(DO1_SHORT);
 659     mov(result_tmp, cnt1);
 660     lea(str1, Address(str1, cnt1));
 661     sub(cnt1_neg, zr, cnt1);
 662   BIND(DO1_LOOP);
 663     ldrb(ch1, Address(str1, cnt1_neg));
         // NOTE(review): this is a full-width cmp, whereas the 2-byte variant
         // string_indexof_char uses cmpw at the same place -- confirm that the
         // upper half of ch is guaranteed to be clear here.
 664     cmp(ch, ch1);
 665     br(EQ, MATCH);
 666     adds(cnt1_neg, cnt1_neg, 1);
 667     br(LT, DO1_LOOP);
 668   BIND(NOMATCH);
 669     mov(result, -1);
 670     b(DONE);
       // result = result_tmp + negative offset (1 byte per char, so no scaling)
 671   BIND(MATCH);
 672     add(result, result_tmp, cnt1_neg);
 673   BIND(DONE);
 674 }
 675 
 676 // Compare strings.
 677 void C2_MacroAssembler::string_compare(Register str1, Register str2,
 678     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
 679     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
 680   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
 681       DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
 682       SHORT_LOOP_START, TAIL_CHECK;
 683 
 684   bool isLL = ae == StrIntrinsicNode::LL;
 685   bool isLU = ae == StrIntrinsicNode::LU;
 686   bool isUL = ae == StrIntrinsicNode::UL;
 687 
 688   // The stub threshold for LL strings is: 72 (64 + 8) chars
 689   // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
 690   // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
 691   const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);
 692 
 693   bool str1_isL = isLL || isLU;
 694   bool str2_isL = isLL || isUL;
 695 
 696   int str1_chr_shift = str1_isL ? 0 : 1;
 697   int str2_chr_shift = str2_isL ? 0 : 1;
 698   int str1_chr_size = str1_isL ? 1 : 2;
 699   int str2_chr_size = str2_isL ? 1 : 2;
 700   int minCharsInWord = isLL ? wordSize : wordSize/2;
 701 
 702   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
 703   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
 704                                       (chr_insn)&MacroAssembler::ldrh;
 705   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
 706                                       (chr_insn)&MacroAssembler::ldrh;
 707   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
 708                             (uxt_insn)&MacroAssembler::uxthw;
 709 
 710   BLOCK_COMMENT("string_compare {");
 711 
 712   // Bizarrely, the counts are passed in bytes, regardless of whether they
 713   // are L or U strings; however, the result is always in characters.
 714   if (!str1_isL) asrw(cnt1, cnt1, 1);
 715   if (!str2_isL) asrw(cnt2, cnt2, 1);
 716 
 717   // Compute the minimum of the string lengths and save the difference.
 718   subsw(result, cnt1, cnt2);
 719   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
 720 
 721   // A very short string
 722   cmpw(cnt2, minCharsInWord);
 723   br(Assembler::LE, SHORT_STRING);
 724 
 725   // Compare longwords
 726   // load first parts of strings and finish initialization while loading
 727   {
 728     if (str1_isL == str2_isL) { // LL or UU
 729       ldr(tmp1, Address(str1));
 730       cmp(str1, str2);
 731       br(Assembler::EQ, DONE);
 732       ldr(tmp2, Address(str2));
 733       cmp(cnt2, stub_threshold);
 734       br(GE, STUB);
 735       subsw(cnt2, cnt2, minCharsInWord);
 736       br(EQ, TAIL_CHECK);
 737       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
 738       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
 739       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
 740     } else if (isLU) {
 741       ldrs(vtmp, Address(str1));
 742       ldr(tmp2, Address(str2));
 743       cmp(cnt2, stub_threshold);
 744       br(GE, STUB);
 745       subw(cnt2, cnt2, 4);
 746       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
 747       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
 748       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
 749       zip1(vtmp, T8B, vtmp, vtmpZ);
 750       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
 751       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
 752       add(cnt1, cnt1, 4);
 753       fmovd(tmp1, vtmp);
 754     } else { // UL case
 755       ldr(tmp1, Address(str1));
 756       ldrs(vtmp, Address(str2));
 757       cmp(cnt2, stub_threshold);
 758       br(GE, STUB);
 759       subw(cnt2, cnt2, 4);
 760       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
 761       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
 762       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
 763       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
 764       zip1(vtmp, T8B, vtmp, vtmpZ);
 765       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
 766       add(cnt1, cnt1, 8);
 767       fmovd(tmp2, vtmp);
 768     }
 769     adds(cnt2, cnt2, isUL ? 4 : 8);
 770     br(GE, TAIL);
 771     eor(rscratch2, tmp1, tmp2);
 772     cbnz(rscratch2, DIFF);
 773     // main loop
 774     bind(NEXT_WORD);
 775     if (str1_isL == str2_isL) {
 776       ldr(tmp1, Address(str1, cnt2));
 777       ldr(tmp2, Address(str2, cnt2));
 778       adds(cnt2, cnt2, 8);
 779     } else if (isLU) {
 780       ldrs(vtmp, Address(str1, cnt1));
 781       ldr(tmp2, Address(str2, cnt2));
 782       add(cnt1, cnt1, 4);
 783       zip1(vtmp, T8B, vtmp, vtmpZ);
 784       fmovd(tmp1, vtmp);
 785       adds(cnt2, cnt2, 8);
 786     } else { // UL
 787       ldrs(vtmp, Address(str2, cnt2));
 788       ldr(tmp1, Address(str1, cnt1));
 789       zip1(vtmp, T8B, vtmp, vtmpZ);
 790       add(cnt1, cnt1, 8);
 791       fmovd(tmp2, vtmp);
 792       adds(cnt2, cnt2, 4);
 793     }
 794     br(GE, TAIL);
 795 
 796     eor(rscratch2, tmp1, tmp2);
 797     cbz(rscratch2, NEXT_WORD);
 798     b(DIFF);
 799     bind(TAIL);
 800     eor(rscratch2, tmp1, tmp2);
 801     cbnz(rscratch2, DIFF);
 802     // Last longword.  In the case where length == 4 we compare the
 803     // same longword twice, but that's still faster than another
 804     // conditional branch.
 805     if (str1_isL == str2_isL) {
 806       ldr(tmp1, Address(str1));
 807       ldr(tmp2, Address(str2));
 808     } else if (isLU) {
 809       ldrs(vtmp, Address(str1));
 810       ldr(tmp2, Address(str2));
 811       zip1(vtmp, T8B, vtmp, vtmpZ);
 812       fmovd(tmp1, vtmp);
 813     } else { // UL
 814       ldrs(vtmp, Address(str2));
 815       ldr(tmp1, Address(str1));
 816       zip1(vtmp, T8B, vtmp, vtmpZ);
 817       fmovd(tmp2, vtmp);
 818     }
 819     bind(TAIL_CHECK);
 820     eor(rscratch2, tmp1, tmp2);
 821     cbz(rscratch2, DONE);
 822 
 823     // Find the first different characters in the longwords and
 824     // compute their difference.
 825     bind(DIFF);
 826     rev(rscratch2, rscratch2);
 827     clz(rscratch2, rscratch2);
 828     andr(rscratch2, rscratch2, isLL ? -8 : -16);
 829     lsrv(tmp1, tmp1, rscratch2);
 830     (this->*ext_chr)(tmp1, tmp1);
 831     lsrv(tmp2, tmp2, rscratch2);
 832     (this->*ext_chr)(tmp2, tmp2);
 833     subw(result, tmp1, tmp2);
 834     b(DONE);
 835   }
 836 
 837   bind(STUB);
 838     RuntimeAddress stub = NULL;
 839     switch(ae) {
 840       case StrIntrinsicNode::LL:
 841         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
 842         break;
 843       case StrIntrinsicNode::UU:
 844         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
 845         break;
 846       case StrIntrinsicNode::LU:
 847         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
 848         break;
 849       case StrIntrinsicNode::UL:
 850         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
 851         break;
 852       default:
 853         ShouldNotReachHere();
 854      }
 855     assert(stub.target() != NULL, "compare_long_string stub has not been generated");
 856     trampoline_call(stub);
 857     b(DONE);
 858 
 859   bind(SHORT_STRING);
 860   // Is the minimum length zero?
 861   cbz(cnt2, DONE);
 862   // arrange code to do most branches while loading and loading next characters
 863   // while comparing previous
 864   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
 865   subs(cnt2, cnt2, 1);
 866   br(EQ, SHORT_LAST_INIT);
 867   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
 868   b(SHORT_LOOP_START);
 869   bind(SHORT_LOOP);
 870   subs(cnt2, cnt2, 1);
 871   br(EQ, SHORT_LAST);
 872   bind(SHORT_LOOP_START);
 873   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
 874   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
 875   cmp(tmp1, cnt1);
 876   br(NE, SHORT_LOOP_TAIL);
 877   subs(cnt2, cnt2, 1);
 878   br(EQ, SHORT_LAST2);
 879   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
 880   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
 881   cmp(tmp2, rscratch1);
 882   br(EQ, SHORT_LOOP);
 883   sub(result, tmp2, rscratch1);
 884   b(DONE);
 885   bind(SHORT_LOOP_TAIL);
 886   sub(result, tmp1, cnt1);
 887   b(DONE);
 888   bind(SHORT_LAST2);
 889   cmp(tmp2, rscratch1);
 890   br(EQ, DONE);
 891   sub(result, tmp2, rscratch1);
 892 
 893   b(DONE);
 894   bind(SHORT_LAST_INIT);
 895   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
 896   bind(SHORT_LAST);
 897   cmp(tmp1, cnt1);
 898   br(EQ, DONE);
 899   sub(result, tmp1, cnt1);
 900 
 901   bind(DONE);
 902 
 903   BLOCK_COMMENT("} string_compare");
 904 }
 905 
 906 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
 907                                      FloatRegister src2, int cond, bool isQ) {
 908   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
 909   if (bt == T_FLOAT || bt == T_DOUBLE) {
 910     switch (cond) {
 911       case BoolTest::eq: fcmeq(dst, size, src1, src2); break;
 912       case BoolTest::ne: {
 913         fcmeq(dst, size, src1, src2);
 914         notr(dst, T16B, dst);
 915         break;
 916       }
 917       case BoolTest::ge: fcmge(dst, size, src1, src2); break;
 918       case BoolTest::gt: fcmgt(dst, size, src1, src2); break;
 919       case BoolTest::le: fcmge(dst, size, src2, src1); break;
 920       case BoolTest::lt: fcmgt(dst, size, src2, src1); break;
 921       default:
 922         assert(false, "unsupported");
 923         ShouldNotReachHere();
 924     }
 925   } else {
 926     switch (cond) {
 927       case BoolTest::eq: cmeq(dst, size, src1, src2); break;
 928       case BoolTest::ne: {
 929         cmeq(dst, size, src1, src2);
 930         notr(dst, T16B, dst);
 931         break;
 932       }
 933       case BoolTest::ge: cmge(dst, size, src1, src2); break;
 934       case BoolTest::gt: cmgt(dst, size, src1, src2); break;
 935       case BoolTest::le: cmge(dst, size, src2, src1); break;
 936       case BoolTest::lt: cmgt(dst, size, src2, src1); break;
 937       case BoolTest::uge: cmhs(dst, size, src1, src2); break;
 938       case BoolTest::ugt: cmhi(dst, size, src1, src2); break;
 939       case BoolTest::ult: cmhi(dst, size, src2, src1); break;
 940       case BoolTest::ule: cmhs(dst, size, src2, src1); break;
 941       default:
 942         assert(false, "unsupported");
 943         ShouldNotReachHere();
 944     }
 945   }
 946 }
 947 
 948 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
 949                                     FloatRegister zn, FloatRegister zm, int cond) {
 950   assert(pg->is_governing(), "This register has to be a governing predicate register");
 951   FloatRegister z1 = zn, z2 = zm;
 952   // Convert the original BoolTest condition to Assembler::condition.
 953   Condition condition;
 954   switch (cond) {
 955     case BoolTest::eq: condition = Assembler::EQ; break;
 956     case BoolTest::ne: condition = Assembler::NE; break;
 957     case BoolTest::le: z1 = zm; z2 = zn; condition = Assembler::GE; break;
 958     case BoolTest::ge: condition = Assembler::GE; break;
 959     case BoolTest::lt: z1 = zm; z2 = zn; condition = Assembler::GT; break;
 960     case BoolTest::gt: condition = Assembler::GT; break;
 961     default:
 962       assert(false, "unsupported compare condition");
 963       ShouldNotReachHere();
 964   }
 965 
 966   SIMD_RegVariant size = elemType_to_regVariant(bt);
 967   if (bt == T_FLOAT || bt == T_DOUBLE) {
 968     sve_fcm(condition, pd, size, pg, z1, z2);
 969   } else {
 970     assert(is_integral_type(bt), "unsupported element type");
 971     sve_cmp(condition, pd, size, pg, z1, z2);
 972   }
 973 }
 974 
 975 // Get index of the last mask lane that is set
 976 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
 977   SIMD_RegVariant size = elemType_to_regVariant(bt);
 978   sve_rev(ptmp, size, src);
 979   sve_brkb(ptmp, ptrue, ptmp, false);
 980   sve_cntp(dst, size, ptrue, ptmp);
 981   movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
 982   subw(dst, rscratch1, dst);
 983 }
 984 
 985 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
 986                                           FloatRegister src, SIMD_RegVariant src_size) {
 987   assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
 988   if (src_size == B) {
 989     switch (dst_size) {
 990     case H:
 991       sve_sunpklo(dst, H, src);
 992       break;
 993     case S:
 994       sve_sunpklo(dst, H, src);
 995       sve_sunpklo(dst, S, dst);
 996       break;
 997     case D:
 998       sve_sunpklo(dst, H, src);
 999       sve_sunpklo(dst, S, dst);
1000       sve_sunpklo(dst, D, dst);
1001       break;
1002     default:
1003       ShouldNotReachHere();
1004     }
1005   } else if (src_size == H) {
1006     if (dst_size == S) {
1007       sve_sunpklo(dst, S, src);
1008     } else { // D
1009       sve_sunpklo(dst, S, src);
1010       sve_sunpklo(dst, D, dst);
1011     }
1012   } else if (src_size == S) {
1013     sve_sunpklo(dst, D, src);
1014   }
1015 }
1016 
1017 // Vector narrow from src to dst with specified element sizes.
1018 // High part of dst vector will be filled with zero.
1019 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1020                                           FloatRegister src, SIMD_RegVariant src_size,
1021                                           FloatRegister tmp) {
1022   assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1023   sve_dup(tmp, src_size, 0);
1024   if (src_size == D) {
1025     switch (dst_size) {
1026     case S:
1027       sve_uzp1(dst, S, src, tmp);
1028       break;
1029     case H:
1030       sve_uzp1(dst, S, src, tmp);
1031       sve_uzp1(dst, H, dst, tmp);
1032       break;
1033     case B:
1034       sve_uzp1(dst, S, src, tmp);
1035       sve_uzp1(dst, H, dst, tmp);
1036       sve_uzp1(dst, B, dst, tmp);
1037       break;
1038     default:
1039       ShouldNotReachHere();
1040     }
1041   } else if (src_size == S) {
1042     if (dst_size == H) {
1043       sve_uzp1(dst, H, src, tmp);
1044     } else { // B
1045       sve_uzp1(dst, H, src, tmp);
1046       sve_uzp1(dst, B, dst, tmp);
1047     }
1048   } else if (src_size == H) {
1049     sve_uzp1(dst, B, src, tmp);
1050   }
1051 }
1052 
1053 // Extend src predicate to dst predicate with the same lane count but larger
1054 // element size, e.g. 64Byte -> 512Long
1055 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1056                                              uint dst_element_length_in_bytes,
1057                                              uint src_element_length_in_bytes) {
1058   if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1059     sve_punpklo(dst, src);
1060   } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1061     sve_punpklo(dst, src);
1062     sve_punpklo(dst, dst);
1063   } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1064     sve_punpklo(dst, src);
1065     sve_punpklo(dst, dst);
1066     sve_punpklo(dst, dst);
1067   } else {
1068     assert(false, "unsupported");
1069     ShouldNotReachHere();
1070   }
1071 }
1072 
1073 // Narrow src predicate to dst predicate with the same lane count but
1074 // smaller element size, e.g. 512Long -> 64Byte
1075 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src,
1076                                              uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
1077   // The insignificant bits in src predicate are expected to be zero.
1078   if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1079     sve_uzp1(dst, B, src, src);
1080   } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1081     sve_uzp1(dst, H, src, src);
1082     sve_uzp1(dst, B, dst, dst);
1083   } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1084     sve_uzp1(dst, S, src, src);
1085     sve_uzp1(dst, H, dst, dst);
1086     sve_uzp1(dst, B, dst, dst);
1087   } else {
1088     assert(false, "unsupported");
1089     ShouldNotReachHere();
1090   }
1091 }
1092 
1093 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
1094                                             FloatRegister src2, PRegister pg, FloatRegister tmp) {
1095   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
1096   assert(pg->is_governing(), "This register has to be a governing predicate register");
1097   assert_different_registers(src1, dst);
1098   // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
1099   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1100   switch (opc) {
1101     case Op_AddReductionVI: {
1102       sve_uaddv(tmp, size, pg, src2);
1103       smov(dst, tmp, size, 0);
1104       if (bt == T_BYTE) {
1105         addw(dst, src1, dst, ext::sxtb);
1106       } else if (bt == T_SHORT) {
1107         addw(dst, src1, dst, ext::sxth);
1108       } else {
1109         addw(dst, dst, src1);
1110       }
1111       break;
1112     }
1113     case Op_AddReductionVL: {
1114       sve_uaddv(tmp, size, pg, src2);
1115       umov(dst, tmp, size, 0);
1116       add(dst, dst, src1);
1117       break;
1118     }
1119     case Op_AndReductionV: {
1120       sve_andv(tmp, size, pg, src2);
1121       if (bt == T_LONG) {
1122         umov(dst, tmp, size, 0);
1123         andr(dst, dst, src1);
1124       } else {
1125         smov(dst, tmp, size, 0);
1126         andw(dst, dst, src1);
1127       }
1128       break;
1129     }
1130     case Op_OrReductionV: {
1131       sve_orv(tmp, size, pg, src2);
1132       if (bt == T_LONG) {
1133         umov(dst, tmp, size, 0);
1134         orr(dst, dst, src1);
1135       } else {
1136         smov(dst, tmp, size, 0);
1137         orrw(dst, dst, src1);
1138       }
1139       break;
1140     }
1141     case Op_XorReductionV: {
1142       sve_eorv(tmp, size, pg, src2);
1143       if (bt == T_LONG) {
1144         umov(dst, tmp, size, 0);
1145         eor(dst, dst, src1);
1146       } else {
1147         smov(dst, tmp, size, 0);
1148         eorw(dst, dst, src1);
1149       }
1150       break;
1151     }
1152     case Op_MaxReductionV: {
1153       sve_smaxv(tmp, size, pg, src2);
1154       if (bt == T_LONG) {
1155         umov(dst, tmp, size, 0);
1156         cmp(dst, src1);
1157         csel(dst, dst, src1, Assembler::GT);
1158       } else {
1159         smov(dst, tmp, size, 0);
1160         cmpw(dst, src1);
1161         cselw(dst, dst, src1, Assembler::GT);
1162       }
1163       break;
1164     }
1165     case Op_MinReductionV: {
1166       sve_sminv(tmp, size, pg, src2);
1167       if (bt == T_LONG) {
1168         umov(dst, tmp, size, 0);
1169         cmp(dst, src1);
1170         csel(dst, dst, src1, Assembler::LT);
1171       } else {
1172         smov(dst, tmp, size, 0);
1173         cmpw(dst, src1);
1174         cselw(dst, dst, src1, Assembler::LT);
1175       }
1176       break;
1177     }
1178     default:
1179       assert(false, "unsupported");
1180       ShouldNotReachHere();
1181   }
1182 
1183   if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
1184     if (bt == T_BYTE) {
1185       sxtb(dst, dst);
1186     } else if (bt == T_SHORT) {
1187       sxth(dst, dst);
1188     }
1189   }
1190 }