/*
 * Copyright (c) 2020, 2022, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/matcher.hpp"
#include "opto/output.hpp"
#include "opto/subnode.hpp"
#include "runtime/stubRoutines.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

void C2_MacroAssembler::emit_entry_barrier_stub(C2EntryBarrierStub* stub) {
  bind(stub->slow_path());
  movptr(rscratch1, (uintptr_t) StubRoutines::aarch64::method_entry_barrier());
  blr(rscratch1);
  b(stub->continuation());

  bind(stub->guard());
  relocate(entry_guard_Relocation::spec());
  emit_int32(0);   // nmethod guard value
}

int C2_MacroAssembler::entry_barrier_stub_size() {
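  // 6 words of 4 bytes each should cover the stub above: up to 3
  // instructions for the movptr, plus the blr, the branch to the
  // continuation, and the int32 guard value.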
  return 4 * 6;
}

// Search for str1 in str2 and return index or -1
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

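  // When the pattern length is unknown at compile time (icnt1 == -1), the
  // instructions below dispatch, in effect, as:
  //   if (cnt1 < 8)                    -> LINEARSEARCH
  //   else if (cnt1 >= 256 ||
  //            cnt1 >= cnt2 / 4)       -> LINEARSTUB (stub call or medium
  //                                       linear scan)
  //   else                             -> Boyer-Moore code below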
  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

// The Boyer-Moore algorithm is based on the description here:
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// This describes an algorithm with 2 shift rules: the 'Bad Character' rule
// and the 'Good Suffix' rule.
//
// These rules are essentially heuristics for how far we can shift the
// pattern along the search string.
//
// The implementation here uses the 'Bad Character' rule only because of the
// complexity of initialisation for the 'Good Suffix' rule.
//
// This is also known as the Boyer-Moore-Horspool algorithm:
//
// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
//
// This particular implementation has a few Java-specific optimizations.
//
// #define ASIZE 256
//
//    int bm(unsigned char *x, int m, unsigned char *y, int n) {
//       int i, j;
//       unsigned c;
//       unsigned char bc[ASIZE];
//
//       /* Preprocessing */
//       for (i = 0; i < ASIZE; ++i)
//          bc[i] = m;
//       for (i = 0; i < m - 1; ) {
//          c = x[i];
//          ++i;
//          // c < 256 for Latin1 string, so no need for branch
//          #ifdef PATTERN_STRING_IS_LATIN1
//          bc[c] = m - i;
//          #else
//          if (c < ASIZE) bc[c] = m - i;
//          #endif
//       }
//
//       /* Searching */
//       j = 0;
//       while (j <= n - m) {
//          c = y[j + m - 1];
//          if (x[m-1] == c) {
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//            if (i < 0) return j;
//          }
//          // c < 256 for Latin1 string, so no need for branch
//          #ifdef SOURCE_STRING_IS_LATIN1
//          // LL case: (c < 256) is always true. Remove branch
//          j += bc[y[j+m-1]];
//          #endif
//          #ifndef PATTERN_STRING_IS_UTF
//          // UU case: need if (c < ASIZE) check. Skip 1 character if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += 1;
//          #endif
//          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
//          // UL case: need if (c < ASIZE) check. Skip <pattern length> if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += m;
//          #endif
//       }
//    }

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >= 8, so we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed (8 LL or 4 UU chars) and half a
    // register for the UL case. We'll re-read the last character in the inner
    // pre-loop code to have a single outer pre-loop load
    const int firstStep = isL ? 7 : 3;

    const int ASIZE = 256;
    const int STORED_BYTES = 32; // number of bytes stored per instruction
    sub(sp, sp, ASIZE);
    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
    mov(ch1, sp);
    BIND(BM_INIT_LOOP);
      stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
      subs(tmp5, tmp5, 1);
      br(GT, BM_INIT_LOOP);
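      // The ASIZE-byte table at sp now holds the default shift, cnt1 (taken
      // from the low byte of v0, which was dup'ed from cnt1 above), in every
      // slot; BCLOOP below overwrites the entries for characters that
      // actually occur in the pattern.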

      sub(cnt1tmp, cnt1, 1);
      mov(tmp5, str2);
      add(str2end, str2, result_tmp, LSL, str2_chr_shift);
      sub(ch2, cnt1, 1);
      mov(tmp3, str1);
    BIND(BCLOOP);
      (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
      if (!str1_isL) {
        subs(zr, ch1, ASIZE);
        br(HS, BCSKIP);
      }
      strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
      subs(ch2, ch2, 1);
      br(GT, BCLOOP);
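      // If the preprocessing loop above matches the pseudocode, bc[c] is now
      // cnt1 - 1 - i for the last position i < cnt1 - 1 at which character c
      // occurs in the pattern, and cnt1 for characters that do not occur.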

      add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
      if (str1_isL == str2_isL) {
        // load last 8 bytes (8LL/4UU symbols)
        ldr(tmp6, Address(tmp6, -wordSize));
      } else {
        ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes (4 symbols)
        // convert Latin1 to UTF. We'll have to wait until the load completes,
        // but it's still faster than per-character loads+checks
        lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
        ubfx(ch1, tmp6, 8, 8); // str1[N-2]
        ubfx(ch2, tmp6, 16, 8); // str1[N-3]
        andr(tmp6, tmp6, 0xFF); // str1[N-4]
        orr(ch2, ch1, ch2, LSL, 16);
        orr(tmp6, tmp6, tmp3, LSL, 48);
        orr(tmp6, tmp6, ch2, LSL, 16);
      }
    BIND(BMLOOPSTR2);
      (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
      sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
      if (str1_isL == str2_isL) {
        // re-init tmp3. It's free because it executes in parallel with the
        // load above. The alternative is to initialize it before the loop,
        // but that would affect performance on in-order systems with 2 or
        // more ld/st pipelines
        lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
      }
      if (!isL) { // UU/UL case
        lsl(ch2, cnt1tmp, 1); // offset in bytes
      }
      cmp(tmp3, skipch);
      br(NE, BMSKIP);
      ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
      mov(ch1, tmp6);
      if (isL) {
        b(BMLOOPSTR1_AFTER_LOAD);
      } else {
        sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
        b(BMLOOPSTR1_CMP);
      }
    BIND(BMLOOPSTR1);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    BIND(BMLOOPSTR1_AFTER_LOAD);
      subs(cnt1tmp, cnt1tmp, 1);
      br(LT, BMLOOPSTR1_LASTCMP);
    BIND(BMLOOPSTR1_CMP);
      cmp(ch1, ch2);
      br(EQ, BMLOOPSTR1);
    BIND(BMSKIP);
      if (!isL) {
        // if we've met a UTF symbol while searching the Latin1 pattern, then
        // we can skip cnt1 symbols
        if (str1_isL != str2_isL) {
          mov(result_tmp, cnt1);
        } else {
          mov(result_tmp, 1);
        }
        subs(zr, skipch, ASIZE);
        br(HS, BMADV);
      }
      ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
      sub(cnt1tmp, cnt1, 1);
      add(str2, str2, result_tmp, LSL, str2_chr_shift);
      cmp(str2, str2end);
      br(LE, BMLOOPSTR2);
      add(sp, sp, ASIZE);
      b(NOMATCH);
    BIND(BMLOOPSTR1_LASTCMP);
      cmp(ch1, ch2);
      br(NE, BMSKIP);
    BIND(BMMATCH);
      sub(result, str2, tmp5);
      if (!str2_isL) lsr(result, result, 1);
      add(sp, sp, ASIZE);
      b(DONE);

    BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
    br(LT, LINEAR_MEDIUM);
    mov(result, zr);
    RuntimeAddress stub = NULL;
    if (isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
      assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
    } else if (str1_isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
    } else {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
      assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
    }
    trampoline_call(stub);
    b(DONE);
  }

  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    Register str2tmp = tmp2;
    Register first = tmp3;

    if (icnt1 == -1)
    {
        Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

        cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
        br(LT, DOSHORT);
      BIND(LINEAR_MEDIUM);
        (this->*str1_load_1chr)(first, Address(str1));
        lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
        sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(FIRST_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmp(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        adds(cnt1tmp, cnt1_neg, str1_chr_size);
        add(cnt2tmp, cnt2_neg, str2_chr_size);
        br(GE, MATCH);

      BIND(STR1_NEXT);
        (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        adds(cnt1tmp, cnt1tmp, str1_chr_size);
        add(cnt2tmp, cnt2tmp, str2_chr_size);
        br(LT, STR1_NEXT);
        b(MATCH);

      BIND(DOSHORT);
      if (str1_isL == str2_isL) {
        cmp(cnt1, (u1)2);
        br(LT, DO1);
        br(GT, DO3);
      }
    }

    if (icnt1 == 4) {
      Label CH1_LOOP;

        (this->*load_4chr)(ch1, str1);
        sub(result_tmp, cnt2, 4);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(CH1_LOOP);
        (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
      Label CH1_LOOP;

      BIND(DO2);
        (this->*load_2chr)(ch1, str1);
        if (icnt1 == 2) {
          sub(result_tmp, cnt2, 2);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(CH1_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

      BIND(DO3);
        (this->*load_2chr)(first, str1);
        (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
        if (icnt1 == 3) {
          sub(result_tmp, cnt2, 3);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(FIRST_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmpw(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        b(MATCH);
    }

    if (icnt1 == -1 || icnt1 == 1) {
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;

      BIND(DO1);
        (this->*str1_load_1chr)(ch1, str1);
        cmp(cnt2, (u1)8);
        br(LT, DO1_SHORT);

        sub(result_tmp, cnt2, 8/str2_chr_size);
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
        mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

        if (str2_isL) {
          orr(ch1, ch1, ch1, LSL, 8);
        }
        orr(ch1, ch1, ch1, LSL, 16);
        orr(ch1, ch1, ch1, LSL, 32);
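        // SWAR zero-detection: ch1 holds the search char replicated into
        // every byte (or halfword) lane, so each lane x of chunk ^ ch1 is
        // zero exactly at a match, and (x - 0x01) & ~(x | 0x7f) (with
        // 0x0001/0x7fff for halfwords) has its top bit set iff x == 0.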
      BIND(CH1_LOOP);
        ldr(ch2, Address(str2, cnt2_neg));
        eor(ch2, ch1, ch2);
        sub(tmp1, ch2, tmp3);
        orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
        bics(tmp1, tmp1, tmp2);
        br(NE, HAS_ZERO);
        adds(cnt2_neg, cnt2_neg, 8);
        br(LT, CH1_LOOP);

        cmp(cnt2_neg, (u1)8);
        mov(cnt2_neg, 0);
        br(LT, CH1_LOOP);
        b(NOMATCH);

      BIND(HAS_ZERO);
        rev(tmp1, tmp1);
        clz(tmp1, tmp1);
        add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
        b(MATCH);

      BIND(DO1_SHORT);
        mov(result_tmp, cnt2);
        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
      BIND(DO1_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmpw(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);

void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);
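  // Same SWAR trick as in string_indexof above: with ch replicated into
  // every halfword, ((chunk ^ ch) - 0x0001...) & ~((chunk ^ ch) | 0x7fff...)
  // is non-zero iff some halfword of the chunk equals ch.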

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7fff7fff7fff7fff);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1, Address::uxtw(1)));
    sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
    ldrh(ch1, Address(str1, cnt1_neg));
    cmpw(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 2);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}

void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
                                                Register ch, Register result,
                                                FloatRegister ztmp1,
                                                FloatRegister ztmp2,
                                                PRegister tmp_pg,
                                                PRegister tmp_pdn, bool isL)
{
  // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
  assert(tmp_pg->is_governing(),
         "this register has to be a governing predicate register");

  Label LOOP, MATCH, DONE, NOMATCH;
  Register vec_len = rscratch1;
  Register idx = rscratch2;

  SIMD_RegVariant T = isL ? B : H;

  cbz(cnt1, NOMATCH);

  // Broadcast the search char across every lane of the vector.
  sve_dup(ztmp2, T, ch);
  if (isL) {
    sve_cntb(vec_len);
  } else {
    sve_cnth(vec_len);
  }
  mov(idx, 0);

  // Generate a predicate to control the reading of the input string.
  sve_whilelt(tmp_pg, T, idx, cnt1);
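  // In effect, the loop below does:
  //   do {
  //     vec = load(tmp_pg, str1 + idx);  // inactive lanes are not loaded
  //     idx += vec_len;
  //     tmp_pdn = (vec == ztmp2);        // per-lane compare against ch
  //     if (any lane of tmp_pdn is set) goto MATCH;
  //     tmp_pg = whilelt(idx, cnt1);
  //   } while (tmp_pg has an active lane);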

  BIND(LOOP);
    // Read a vector of 8- or 16-bit data depending on the string type. Note
    // that inactive elements indicated by the predicate register won't cause
    // a data read from memory to the destination vector.
    if (isL) {
      sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
    } else {
      sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
    }
    add(idx, idx, vec_len);

    // Perform the comparison. An element of the destination predicate is set
    // to active if the particular char is matched.
    sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);

    // Branch if the particular char is found.
    br(NE, MATCH);

    sve_whilelt(tmp_pg, T, idx, cnt1);

    // Loop back if the particular char is not found.
    br(MI, LOOP);

  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);

  BIND(MATCH);
    // Undo the index increment.
    sub(idx, idx, vec_len);

    // Crop the predicate to find the location of the first match.
    sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
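    // BRKA leaves the lanes of tmp_pdn active up to and including the first
    // match, so counting them with INCP should add (lane_of_match + 1);
    // together with the adjustments around it, result = idx + lane_of_match.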
    add(result, idx, -1);
    sve_incp(result, T, tmp_pdn);
  BIND(DONE);
}

void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)8);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 8);
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 8);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);

  mov(tmp3, 0x0101010101010101);
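  // Byte-sized variant of the SWAR trick used in string_indexof_char: for
  // each byte x of chunk ^ ch, (x - 0x01) & ~(x | 0x7f) has its top bit set
  // iff x == 0, i.e. iff that byte of the chunk equals ch.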

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1));
    sub(cnt1_neg, zr, cnt1);
  BIND(DO1_LOOP);
    ldrb(ch1, Address(str1, cnt1_neg));
    cmp(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 1);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg);
  BIND(DONE);
}

// Compare strings.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; the result, however, is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      ldr(tmp1, Address(str1));
      ldrs(vtmp, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFF);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFF);
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
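    // rscratch2 now holds the bit offset of the first difference, rounded
    // down to a character boundary; shifting both words right by it brings
    // the first pair of differing characters down to bit 0.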
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
    RuntimeAddress stub = NULL;
    switch(ae) {
      case StrIntrinsicNode::LL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
        break;
      case StrIntrinsicNode::UU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
        break;
      case StrIntrinsicNode::LU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
        break;
      case StrIntrinsicNode::UL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
        break;
      default:
        ShouldNotReachHere();
    }
    assert(stub.target() != NULL, "compare_long_string stub has not been generated");
    trampoline_call(stub);
    b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // Arrange the code so that most branches are taken while characters load,
  // and the next characters load while the previous ones are compared.
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}

void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
                                     FloatRegister src2, int cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
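  // AdvSIMD provides only the eq/ge/gt (and unsigned hs/hi) register-register
  // compares, so ne is synthesized below as eq followed by a bitwise not,
  // and the le/lt (ule/ult) cases by swapping the two source operands.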
  if (bt == T_FLOAT || bt == T_DOUBLE) {
    switch (cond) {
      case BoolTest::eq: fcmeq(dst, size, src1, src2); break;
      case BoolTest::ne: {
        fcmeq(dst, size, src1, src2);
        notr(dst, T16B, dst);
        break;
      }
      case BoolTest::ge: fcmge(dst, size, src1, src2); break;
      case BoolTest::gt: fcmgt(dst, size, src1, src2); break;
      case BoolTest::le: fcmge(dst, size, src2, src1); break;
      case BoolTest::lt: fcmgt(dst, size, src2, src1); break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  } else {
    switch (cond) {
      case BoolTest::eq: cmeq(dst, size, src1, src2); break;
      case BoolTest::ne: {
        cmeq(dst, size, src1, src2);
        notr(dst, T16B, dst);
        break;
      }
      case BoolTest::ge: cmge(dst, size, src1, src2); break;
      case BoolTest::gt: cmgt(dst, size, src1, src2); break;
      case BoolTest::le: cmge(dst, size, src2, src1); break;
      case BoolTest::lt: cmgt(dst, size, src2, src1); break;
      case BoolTest::uge: cmhs(dst, size, src1, src2); break;
      case BoolTest::ugt: cmhi(dst, size, src1, src2); break;
      case BoolTest::ult: cmhi(dst, size, src2, src1); break;
      case BoolTest::ule: cmhs(dst, size, src2, src1); break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  }
}

// Collect the least significant bit of each byte into the low byte of dst
// and clear the higher garbage bits.
void C2_MacroAssembler::bytemask_compress(Register dst) {
  // Example input, dst = 0x01 00 00 00 01 01 00 01
  // The "??" bytes are garbage.
  orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
  orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
  orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
  andr(dst, dst, 0xff);                   // dst = 0x8D
}

// Pack the lowest-numbered bit of each mask element in src into a long value
// in dst, at most the first 64 lane elements.
// Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM.
void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
                                         FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
  assert_different_registers(dst, rscratch1);
  assert_different_registers(vtmp1, vtmp2);

  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example:   src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
  // Expected:  dst = 0x658D

  // Convert the mask into vector with sequential bytes.
  // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
  sve_cpy(vtmp1, size, src, 1, false);
  if (bt != T_BYTE) {
    sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
  }

  if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
    // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
    // is to compress each significant bit of the byte in a cross-lane way. Due
    // to the lack of a cross-lane bit-compress instruction, we use BEXT
    // (bit-compress in each lane) with the biggest lane size (T = D) then
    // concatenate the results.

    // The second source input of BEXT, initialized with 0x01 in each byte.
    // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
    sve_dup(vtmp2, B, 1);

    // BEXT vtmp1.D, vtmp1.D, vtmp2.D
    // vtmp1 = 0x0001010000010001 | 0x0100000001010001
    // vtmp2 = 0x0101010101010101 | 0x0101010101010101
    //         ---------------------------------------
    // vtmp1 = 0x0000000000000065 | 0x000000000000008D
    sve_bext(vtmp1, D, vtmp1, vtmp2);

    // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the
    // result to dst.
    // vtmp1 = 0x0000000000000000 | 0x000000000000658D
    // dst   = 0x658D
    if (lane_cnt <= 8) {
      // No need to concatenate.
      umov(dst, vtmp1, B, 0);
    } else if (lane_cnt <= 16) {
      ins(vtmp1, B, vtmp1, 1, 8);
      umov(dst, vtmp1, H, 0);
    } else {
      // As the lane count is 64 at most, the final expected value must be in
      // the lowest 64 bits after narrowing vtmp1 from D to B.
      sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
      umov(dst, vtmp1, D, 0);
    }
  } else if (UseSVE > 0) {
    // Compress the lowest 8 bytes.
    fmovd(dst, vtmp1);
    bytemask_compress(dst);
    if (lane_cnt <= 8) return;

    // Repeat on higher bytes and join the results.
    // Compress 8 bytes in each iteration.
    for (int idx = 1; idx < (lane_cnt / 8); idx++) {
      sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
      bytemask_compress(rscratch1);
      orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
    }
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Unpack the mask, a long value in src, into predicate register dst based on the
// corresponding data type. Note that dst can support at most 64 lanes.
// The example below gives the expected dst predicate register for different types,
// with a valid src (0x658D) on a machine with a 1024-bit vector size.
// BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
// SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
// INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
// LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
//
// The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D,
// which has 24 significant bits, would be an invalid input if the dst predicate
// register refers to a LONG type 1024-bit vector, which has at most 16 lanes.
void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
                                           FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
         lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example:   src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
  // Expected:  dst = 0b01101001 10001101

  // Put long value from general purpose register into the first lane of vector.
  // vtmp1 = 0x0000000000000000 | 0x000000000000658D
  sve_dup(vtmp1, B, 0);
  mov(vtmp1, D, 0, src);

  // As sve_cmp generates the mask with a minimum granularity of one byte, we
  // have to transform the value in the first lane, which is currently a bit
  // mask, into a byte mask. This can be done with SVE2's BDEP instruction.

  // The first source input of the BDEP instruction. Deposit each byte into every 8 bytes.
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  if (lane_cnt <= 8) {
    // Nothing to do, as only one byte exists.
  } else if (lane_cnt <= 16) {
    ins(vtmp1, B, vtmp1, 8, 1);
    mov(vtmp1, B, 1, zr);
  } else {
    sve_vector_extend(vtmp1, D, vtmp1, B);
  }

  // The second source input of BDEP instruction, initialized with 0x01 for each byte.
  // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
  sve_dup(vtmp2, B, 1);

  // BDEP vtmp1.D, vtmp1.D, vtmp2.D
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  // vtmp2 = 0x0101010101010101 | 0x0101010101010101
  //         ---------------------------------------
  // vtmp1 = 0x0001010000010001 | 0x0100000001010001
  sve_bdep(vtmp1, D, vtmp1, vtmp2);

  if (bt != T_BYTE) {
    sve_vector_extend(vtmp1, size, vtmp1, B);
  }
  // Generate the mask according to the given vector, in which the elements
  // have been extended to the expected type.
  // dst = 0b01101001 10001101
  sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
}

// Clobbers: rflags
void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
                                    FloatRegister zn, FloatRegister zm, int cond) {
  assert(pg->is_governing(), "This register has to be a governing predicate register");
  FloatRegister z1 = zn, z2 = zm;
  // Convert the original BoolTest condition to Assembler::condition.
  Condition condition;
  switch (cond) {
    case BoolTest::eq: condition = Assembler::EQ; break;
    case BoolTest::ne: condition = Assembler::NE; break;
    case BoolTest::le: z1 = zm; z2 = zn; condition = Assembler::GE; break;
    case BoolTest::ge: condition = Assembler::GE; break;
    case BoolTest::lt: z1 = zm; z2 = zn; condition = Assembler::GT; break;
    case BoolTest::gt: condition = Assembler::GT; break;
    default:
      assert(false, "unsupported compare condition");
      ShouldNotReachHere();
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (bt == T_FLOAT || bt == T_DOUBLE) {
    sve_fcm(condition, pd, size, pg, z1, z2);
  } else {
    assert(is_integral_type(bt), "unsupported element type");
    sve_cmp(condition, pd, size, pg, z1, z2);
  }
}

// Get index of the last mask lane that is set
void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
  SIMD_RegVariant size = elemType_to_regVariant(bt);
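  // The idea: reverse the predicate and break before the first active lane;
  // CNTP then counts the lanes after the last set lane of the original, so
  // the wanted index is (lane_cnt - 1) - CNTP, where lane_cnt is
  // MaxVectorSize / element size.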
  sve_rev(ptmp, size, src);
  sve_brkb(ptmp, ptrue, ptmp, false);
  sve_cntp(dst, size, ptrue, ptmp);
  movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
  subw(dst, rscratch1, dst);
}

// Extend integer vector src to dst with the same lane count
// but larger element size, e.g. 4B -> 4I
void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
                                           FloatRegister src, BasicType src_bt) {
  if (src_bt == T_BYTE) {
    if (dst_bt == T_SHORT) {
      // 4B/8B to 4S/8S
      assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
      sxtl(dst, T8H, src, T8B);
    } else {
      // 4B to 4I
      assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
      sxtl(dst, T8H, src, T8B);
      sxtl(dst, T4S, dst, T4H);
    }
  } else if (src_bt == T_SHORT) {
    // 4S to 4I
    assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
    sxtl(dst, T4S, src, T4H);
  } else if (src_bt == T_INT) {
    // 2I to 2L
    assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
    sxtl(dst, T2D, src, T2S);
  } else {
    ShouldNotReachHere();
  }
}

// Narrow integer vector src down to dst with the same lane count
// but smaller element size, e.g. 4I -> 4B
void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
                                           FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
  if (src_bt == T_SHORT) {
    // 4S/8S to 4B/8B
    assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_BYTE, "unsupported");
    xtn(dst, T8B, src, T8H);
  } else if (src_bt == T_INT) {
    // 4I to 4B/4S
    assert(src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
    xtn(dst, T4H, src, T4S);
    if (dst_bt == T_BYTE) {
      xtn(dst, T8B, dst, T8H);
    }
  } else if (src_bt == T_LONG) {
    // 2L to 2I
    assert(src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_INT, "unsupported");
    xtn(dst, T2S, src, T2D);
  } else {
    ShouldNotReachHere();
  }
}

void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
                                          FloatRegister src, SIMD_RegVariant src_size) {
  assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
  if (src_size == B) {
    switch (dst_size) {
    case H:
      sve_sunpklo(dst, H, src);
      break;
    case S:
      sve_sunpklo(dst, H, src);
      sve_sunpklo(dst, S, dst);
      break;
    case D:
      sve_sunpklo(dst, H, src);
      sve_sunpklo(dst, S, dst);
      sve_sunpklo(dst, D, dst);
      break;
    default:
      ShouldNotReachHere();
    }
  } else if (src_size == H) {
    if (dst_size == S) {
      sve_sunpklo(dst, S, src);
    } else { // D
      sve_sunpklo(dst, S, src);
      sve_sunpklo(dst, D, dst);
    }
  } else if (src_size == S) {
    sve_sunpklo(dst, D, src);
  }
}

// Vector narrow from src to dst with specified element sizes.
// High part of dst vector will be filled with zero.
void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
                                          FloatRegister src, SIMD_RegVariant src_size,
                                          FloatRegister tmp) {
  assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
  assert_different_registers(src, tmp);
  sve_dup(tmp, src_size, 0);
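  // tmp is all-zero: UZP1 packs the even-indexed elements of src into the
  // lower half of dst and those of tmp into the upper half, so using a zero
  // vector as the second source keeps the high part of dst zero-filled.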
1244   if (src_size == D) {
1245     switch (dst_size) {
1246     case S:
1247       sve_uzp1(dst, S, src, tmp);
1248       break;
1249     case H:
1250       assert_different_registers(dst, tmp);
1251       sve_uzp1(dst, S, src, tmp);
1252       sve_uzp1(dst, H, dst, tmp);
1253       break;
1254     case B:
1255       assert_different_registers(dst, tmp);
1256       sve_uzp1(dst, S, src, tmp);
1257       sve_uzp1(dst, H, dst, tmp);
1258       sve_uzp1(dst, B, dst, tmp);
1259       break;
1260     default:
1261       ShouldNotReachHere();
1262     }
1263   } else if (src_size == S) {
1264     if (dst_size == H) {
1265       sve_uzp1(dst, H, src, tmp);
1266     } else { // B
1267       assert_different_registers(dst, tmp);
1268       sve_uzp1(dst, H, src, tmp);
1269       sve_uzp1(dst, B, dst, tmp);
1270     }
1271   } else if (src_size == H) {
1272     sve_uzp1(dst, B, src, tmp);
1273   }
1274 }
1275 
1276 // Extend src predicate to dst predicate with the same lane count but larger
1277 // element size, e.g. 64Byte -> 512Long
1278 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1279                                              uint dst_element_length_in_bytes,
1280                                              uint src_element_length_in_bytes) {
1281   if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1282     sve_punpklo(dst, src);
1283   } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1284     sve_punpklo(dst, src);
1285     sve_punpklo(dst, dst);
1286   } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1287     sve_punpklo(dst, src);
1288     sve_punpklo(dst, dst);
1289     sve_punpklo(dst, dst);
1290   } else {
1291     assert(false, "unsupported");
1292     ShouldNotReachHere();
1293   }
1294 }
1295 
1296 // Narrow src predicate to dst predicate with the same lane count but
1297 // smaller element size, e.g. 512Long -> 64Byte
1298 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src,
1299                                              uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
1300   // The insignificant bits in src predicate are expected to be zero.
1301   if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1302     sve_uzp1(dst, B, src, src);
1303   } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1304     sve_uzp1(dst, H, src, src);
1305     sve_uzp1(dst, B, dst, dst);
1306   } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1307     sve_uzp1(dst, S, src, src);
1308     sve_uzp1(dst, H, dst, dst);
1309     sve_uzp1(dst, B, dst, dst);
1310   } else {
1311     assert(false, "unsupported");
1312     ShouldNotReachHere();
1313   }
1314 }
1315 
1316 // Vector reduction add for integral type with ASIMD instructions.
1317 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1318                                                  Register isrc, FloatRegister vsrc,
1319                                                  unsigned vector_length_in_bytes,
1320                                                  FloatRegister vtmp) {
1321   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1322   assert_different_registers(dst, isrc);
1323   bool isQ = vector_length_in_bytes == 16;
1324 
1325   BLOCK_COMMENT("neon_reduce_add_integral {");
1326     switch(bt) {
1327       case T_BYTE:
1328         addv(vtmp, isQ ? T16B : T8B, vsrc);
1329         smov(dst, vtmp, B, 0);
1330         addw(dst, dst, isrc, ext::sxtb);
1331         break;
1332       case T_SHORT:
1333         addv(vtmp, isQ ? T8H : T4H, vsrc);
1334         smov(dst, vtmp, H, 0);
1335         addw(dst, dst, isrc, ext::sxth);
1336         break;
1337       case T_INT:
1338         isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1339         umov(dst, vtmp, S, 0);
1340         addw(dst, dst, isrc);
1341         break;
1342       case T_LONG:
1343         assert(isQ, "unsupported");
1344         addpd(vtmp, vsrc);
1345         umov(dst, vtmp, D, 0);
1346         add(dst, dst, isrc);
1347         break;
1348       default:
1349         assert(false, "unsupported");
1350         ShouldNotReachHere();
1351     }
1352   BLOCK_COMMENT("} neon_reduce_add_integral");
1353 }
1354 
1355 // Vector reduction multiply for integral type with ASIMD instructions.
1356 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
1357 // Clobbers: rscratch1
1358 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
1359                                                  Register isrc, FloatRegister vsrc,
1360                                                  unsigned vector_length_in_bytes,
1361                                                  FloatRegister vtmp1, FloatRegister vtmp2) {
1362   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1363   bool isQ = vector_length_in_bytes == 16;
1364 
1365   BLOCK_COMMENT("neon_reduce_mul_integral {");
1366     switch(bt) {
1367       case T_BYTE:
1368         if (isQ) {
1369           // Multiply the lower half and higher half of vector iteratively.
1370           // vtmp1 = vsrc[8:15]
1371           ins(vtmp1, D, vsrc, 0, 1);
1372           // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
1373           mulv(vtmp1, T8B, vtmp1, vsrc);
1374           // vtmp2 = vtmp1[4:7]
1375           ins(vtmp2, S, vtmp1, 0, 1);
1376           // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
1377           mulv(vtmp1, T8B, vtmp2, vtmp1);
1378         } else {
1379           ins(vtmp1, S, vsrc, 0, 1);
1380           mulv(vtmp1, T8B, vtmp1, vsrc);
1381         }
1382         // vtmp2 = vtmp1[2:3]
1383         ins(vtmp2, H, vtmp1, 0, 1);
1384         // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
1385         mulv(vtmp2, T8B, vtmp2, vtmp1);
1386         // dst = vtmp2[0] * isrc * vtmp2[1]
1387         umov(rscratch1, vtmp2, B, 0);
1388         mulw(dst, rscratch1, isrc);
1389         sxtb(dst, dst);
1390         umov(rscratch1, vtmp2, B, 1);
1391         mulw(dst, rscratch1, dst);
1392         sxtb(dst, dst);
1393         break;
1394       case T_SHORT:
1395         if (isQ) {
1396           ins(vtmp2, D, vsrc, 0, 1);
1397           mulv(vtmp2, T4H, vtmp2, vsrc);
1398           ins(vtmp1, S, vtmp2, 0, 1);
1399           mulv(vtmp1, T4H, vtmp1, vtmp2);
1400         } else {
1401           ins(vtmp1, S, vsrc, 0, 1);
1402           mulv(vtmp1, T4H, vtmp1, vsrc);
1403         }
1404         umov(rscratch1, vtmp1, H, 0);
1405         mulw(dst, rscratch1, isrc);
1406         sxth(dst, dst);
1407         umov(rscratch1, vtmp1, H, 1);
1408         mulw(dst, rscratch1, dst);
1409         sxth(dst, dst);
1410         break;
1411       case T_INT:
1412         if (isQ) {
1413           ins(vtmp1, D, vsrc, 0, 1);
1414           mulv(vtmp1, T2S, vtmp1, vsrc);
1415         } else {
1416           vtmp1 = vsrc;
1417         }
1418         umov(rscratch1, vtmp1, S, 0);
1419         mul(dst, rscratch1, isrc);
1420         umov(rscratch1, vtmp1, S, 1);
1421         mul(dst, rscratch1, dst);
1422         break;
1423       case T_LONG:
1424         umov(rscratch1, vsrc, D, 0);
1425         mul(dst, isrc, rscratch1);
1426         umov(rscratch1, vsrc, D, 1);
1427         mul(dst, dst, rscratch1);
1428         break;
1429       default:
1430         assert(false, "unsupported");
1431         ShouldNotReachHere();
1432     }
1433   BLOCK_COMMENT("} neon_reduce_mul_integral");
1434 }
1435 
1436 // Vector reduction multiply for floating-point type with ASIMD instructions.
1437 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
1438                                            FloatRegister fsrc, FloatRegister vsrc,
1439                                            unsigned vector_length_in_bytes,
1440                                            FloatRegister vtmp) {
1441   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1442   bool isQ = vector_length_in_bytes == 16;
1443 
1444   BLOCK_COMMENT("neon_reduce_mul_fp {");
1445     switch(bt) {
1446       case T_FLOAT:
1447         fmuls(dst, fsrc, vsrc);
1448         ins(vtmp, S, vsrc, 0, 1);
1449         fmuls(dst, dst, vtmp);
1450         if (isQ) {
1451           ins(vtmp, S, vsrc, 0, 2);
1452           fmuls(dst, dst, vtmp);
1453           ins(vtmp, S, vsrc, 0, 3);
1454           fmuls(dst, dst, vtmp);
1455         }
1456         break;
1457       case T_DOUBLE:
1458         assert(isQ, "unsupported");
1459         fmuld(dst, fsrc, vsrc);
1460         ins(vtmp, D, vsrc, 0, 1);
1461         fmuld(dst, dst, vtmp);
1462         break;
1463       default:
1464         assert(false, "unsupported");
1465         ShouldNotReachHere();
1466     }
1467   BLOCK_COMMENT("} neon_reduce_mul_fp");
1468 }
1469 
1470 // Helper to select the scalar logical instruction for a reduction opcode
1471 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
1472                                                    Register Rn, Register Rm,
1473                                                    enum shift_kind kind, unsigned shift) {
1474   switch(opc) {
1475     case Op_AndReductionV:
1476       is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
1477       break;
1478     case Op_OrReductionV:
1479       is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
1480       break;
1481     case Op_XorReductionV:
1482       is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
1483       break;
1484     default:
1485       assert(false, "unsupported");
1486       ShouldNotReachHere();
1487   }
1488 }
1489 
1490 // Vector reduction logical operations And, Or, Xor
1491 // Clobbers: rscratch1
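     // Both 64-bit halves of the vector are moved to general registers and combined
     // there; the remaining folds are plain scalar shift-and-operate steps. For
     // example, for T_INT with a 128-bit vector:
     //   dst = vsrc<127:64> op vsrc<63:0>  (two UMOVs plus one 64-bit op)
     //   dst = dst op (dst >>> 32)         (fold the two 32-bit words)
     //   dst = isrc op dst                 (merge the scalar input, 32-bit op)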
1492 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
1493                                             Register isrc, FloatRegister vsrc,
1494                                             unsigned vector_length_in_bytes) {
1495   assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
1496          "unsupported");
1497   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1498   assert_different_registers(dst, isrc);
1499   bool isQ = vector_length_in_bytes == 16;
1500 
1501   BLOCK_COMMENT("neon_reduce_logical {");
1502     umov(rscratch1, vsrc, isQ ? D : S, 0);
1503     umov(dst, vsrc, isQ ? D : S, 1);
1504     neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
1505     switch(bt) {
1506       case T_BYTE:
1507         if (isQ) {
1508           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1509         }
1510         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1511         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
1512         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1513         sxtb(dst, dst);
1514         break;
1515       case T_SHORT:
1516         if (isQ) {
1517           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1518         }
1519         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1520         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1521         sxth(dst, dst);
1522         break;
1523       case T_INT:
1524         if (isQ) {
1525           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1526         }
1527         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1528         break;
1529       case T_LONG:
1530         assert(isQ, "unsupported");
1531         neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
1532         break;
1533       default:
1534         assert(false, "unsupported");
1535         ShouldNotReachHere();
1536     }
1537   BLOCK_COMMENT("} neon_reduce_logical");
1538 }
1539 
1540 // Vector reduction min/max for integral type with ASIMD instructions.
1541 // Note: vtmp is not used and is expected to be fnoreg for the T_LONG case.
1542 // Clobbers: rscratch1, rflags
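     // For T_BYTE/T_SHORT/T_INT the whole vector is reduced by one SMINV/SMAXV (or a
     // pairwise SMINP/SMAXP for the two-lane T2S case, which SMINV/SMAXV does not
     // support), and the scalar lane 0 is then merged with "isrc" via CMP + CSEL.
     // T_LONG has no across-vector min/max instruction, so both lanes are moved to
     // general registers and compared there.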
1543 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
1544                                                     Register isrc, FloatRegister vsrc,
1545                                                     unsigned vector_length_in_bytes,
1546                                                     FloatRegister vtmp) {
1547   assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
1548   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1549   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
1550   assert_different_registers(dst, isrc);
1551   bool isQ = vector_length_in_bytes == 16;
1552   bool is_min = opc == Op_MinReductionV;
1553 
1554   BLOCK_COMMENT("neon_reduce_minmax_integral {");
1555     if (bt == T_LONG) {
1556       assert(vtmp == fnoreg, "should be");
1557       assert(isQ, "should be");
1558       umov(rscratch1, vsrc, D, 0);
1559       cmp(isrc, rscratch1);
1560       csel(dst, isrc, rscratch1, is_min ? LT : GT);
1561       umov(rscratch1, vsrc, D, 1);
1562       cmp(dst, rscratch1);
1563       csel(dst, dst, rscratch1, is_min ? LT : GT);
1564     } else {
1565       SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1566       if (size == T2S) {
1567         is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
1568       } else {
1569         is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
1570       }
1571       if (bt == T_INT) {
1572         umov(dst, vtmp, S, 0);
1573       } else {
1574         smov(dst, vtmp, elemType_to_regVariant(bt), 0);
1575       }
1576       cmpw(dst, isrc);
1577       cselw(dst, dst, isrc, is_min ? LT : GT);
1578     }
1579   BLOCK_COMMENT("} neon_reduce_minmax_integral");
1580 }
1581 
1582 // Vector reduction for integral type with SVE instructions.
1583 // Supported operations are Add, And, Or, Xor, Max, Min.
1584 // Clobbers: rflags when opc is Op_MaxReductionV or Op_MinReductionV.
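     // Every opcode follows the same shape: an SVE horizontal reduction into "tmp",
     // a move of lane 0 into "dst" (sign- or zero-extending as the element size
     // requires), and one scalar instruction merging "src1". E.g. Op_AddReductionVI
     // for T_SHORT emits:
     //   sve_uaddv(tmp, H, pg, src2);      // tmp = sum of the active H lanes
     //   smov(dst, tmp, H, 0);             // sign-extend lane 0 into dst
     //   addw(dst, src1, dst, ext::sxth);  // dst = src1 + sign-extended sum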
1585 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
1586                                             FloatRegister src2, PRegister pg, FloatRegister tmp) {
1587   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
1588   assert(pg->is_governing(), "This register has to be a governing predicate register");
1589   assert_different_registers(src1, dst);
1590   // Registers "dst" and "tmp" will be clobbered, while "src1" and "src2" are preserved.
1591   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1592   switch (opc) {
1593     case Op_AddReductionVI: {
1594       sve_uaddv(tmp, size, pg, src2);
1595       if (bt == T_BYTE) {
1596         smov(dst, tmp, size, 0);
1597         addw(dst, src1, dst, ext::sxtb);
1598       } else if (bt == T_SHORT) {
1599         smov(dst, tmp, size, 0);
1600         addw(dst, src1, dst, ext::sxth);
1601       } else {
1602         umov(dst, tmp, size, 0);
1603         addw(dst, dst, src1);
1604       }
1605       break;
1606     }
1607     case Op_AddReductionVL: {
1608       sve_uaddv(tmp, size, pg, src2);
1609       umov(dst, tmp, size, 0);
1610       add(dst, dst, src1);
1611       break;
1612     }
1613     case Op_AndReductionV: {
1614       sve_andv(tmp, size, pg, src2);
1615       if (bt == T_INT || bt == T_LONG) {
1616         umov(dst, tmp, size, 0);
1617       } else {
1618         smov(dst, tmp, size, 0);
1619       }
1620       if (bt == T_LONG) {
1621         andr(dst, dst, src1);
1622       } else {
1623         andw(dst, dst, src1);
1624       }
1625       break;
1626     }
1627     case Op_OrReductionV: {
1628       sve_orv(tmp, size, pg, src2);
1629       if (bt == T_INT || bt == T_LONG) {
1630         umov(dst, tmp, size, 0);
1631       } else {
1632         smov(dst, tmp, size, 0);
1633       }
1634       if (bt == T_LONG) {
1635         orr(dst, dst, src1);
1636       } else {
1637         orrw(dst, dst, src1);
1638       }
1639       break;
1640     }
1641     case Op_XorReductionV: {
1642       sve_eorv(tmp, size, pg, src2);
1643       if (bt == T_INT || bt == T_LONG) {
1644         umov(dst, tmp, size, 0);
1645       } else {
1646         smov(dst, tmp, size, 0);
1647       }
1648       if (bt == T_LONG) {
1649         eor(dst, dst, src1);
1650       } else {
1651         eorw(dst, dst, src1);
1652       }
1653       break;
1654     }
1655     case Op_MaxReductionV: {
1656       sve_smaxv(tmp, size, pg, src2);
1657       if (bt == T_INT || bt == T_LONG) {
1658         umov(dst, tmp, size, 0);
1659       } else {
1660         smov(dst, tmp, size, 0);
1661       }
1662       if (bt == T_LONG) {
1663         cmp(dst, src1);
1664         csel(dst, dst, src1, Assembler::GT);
1665       } else {
1666         cmpw(dst, src1);
1667         cselw(dst, dst, src1, Assembler::GT);
1668       }
1669       break;
1670     }
1671     case Op_MinReductionV: {
1672       sve_sminv(tmp, size, pg, src2);
1673       if (bt == T_INT || bt == T_LONG) {
1674         umov(dst, tmp, size, 0);
1675       } else {
1676         smov(dst, tmp, size, 0);
1677       }
1678       if (bt == T_LONG) {
1679         cmp(dst, src1);
1680         csel(dst, dst, src1, Assembler::LT);
1681       } else {
1682         cmpw(dst, src1);
1683         cselw(dst, dst, src1, Assembler::LT);
1684       }
1685       break;
1686     }
1687     default:
1688       assert(false, "unsupported");
1689       ShouldNotReachHere();
1690   }
1691 
1692   if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
1693     if (bt == T_BYTE) {
1694       sxtb(dst, dst);
1695     } else if (bt == T_SHORT) {
1696       sxth(dst, dst);
1697     }
1698   }
1699 }
1700 
1701 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), and to
1702 // false otherwise. The input "lane_cnt" should be no greater than the max vector length of
1703 // the basic type. Clobbers: rscratch1, rflags.
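     // SVE "ptrue" can encode the lane counts 1-8, 16, 32, 64, 128 and 256, as well as
     // the POW2/MUL4/MUL3 patterns, directly in its immediate; every other count falls
     // back to "whilelow(dst, zr, lane_cnt)", which sets exactly the first lane_cnt
     // lanes of the predicate.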
1704 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
1705   uint32_t max_vector_length = Matcher::max_vector_size(bt);
1706   assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
1707 
1708   // Set all elements to false if the input "lane_cnt" is zero.
1709   if (lane_cnt == 0) {
1710     sve_pfalse(dst);
1711     return;
1712   }
1713 
1714   SIMD_RegVariant size = elemType_to_regVariant(bt);
1715   assert(size != Q, "invalid size");
1716 
1717   // Set all elements to true if "lane_cnt" equals the max lane count.
1718   if (lane_cnt == max_vector_length) {
1719     sve_ptrue(dst, size, /* ALL */ 0b11111);
1720     return;
1721   }
1722 
1723   // Lane counts directly encodable as a "ptrue" pattern.
1724   switch(lane_cnt) {
1725   case 1: /* VL1 */
1726   case 2: /* VL2 */
1727   case 3: /* VL3 */
1728   case 4: /* VL4 */
1729   case 5: /* VL5 */
1730   case 6: /* VL6 */
1731   case 7: /* VL7 */
1732   case 8: /* VL8 */
1733     sve_ptrue(dst, size, lane_cnt);
1734     return;
1735   case 16:
1736     sve_ptrue(dst, size, /* VL16 */ 0b01001);
1737     return;
1738   case 32:
1739     sve_ptrue(dst, size, /* VL32 */ 0b01010);
1740     return;
1741   case 64:
1742     sve_ptrue(dst, size, /* VL64 */ 0b01011);
1743     return;
1744   case 128:
1745     sve_ptrue(dst, size, /* VL128 */ 0b01100);
1746     return;
1747   case 256:
1748     sve_ptrue(dst, size, /* VL256 */ 0b01101);
1749     return;
1750   default:
1751     break;
1752   }
1753 
1754   // Special patterns for "ptrue".
1755   if (lane_cnt == round_down_power_of_2(max_vector_length)) {
1756     sve_ptrue(dst, size, /* POW2 */ 0b00000);
1757   } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
1758     sve_ptrue(dst, size, /* MUL4 */ 0b11101);
1759   } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
1760     sve_ptrue(dst, size, /* MUL3 */ 0b11110);
1761   } else {
1762     // Fall back to "whilelow" for the remaining cases.
1763     mov(rscratch1, lane_cnt);
1764     sve_whilelow(dst, size, zr, rscratch1);
1765   }
1766 }
1767 
1768 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
1769 // Any remaining elements of dst will be filled with zero.
1770 // Clobbers: rscratch1
1771 // Preserves: src, mask
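     // SVE "compact" only supports S and D elements, so each half of the H vector is
     // widened to S, compacted, and narrowed back to H. The compressed upper half is
     // then shifted left (across lanes) by the number of elements kept from the lower
     // half, using "tbl" with an index vector built by "index" (out-of-range indices
     // read as zero), and OR-ed into the result.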
1772 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
1773                                            FloatRegister vtmp1, FloatRegister vtmp2,
1774                                            PRegister pgtmp) {
1775   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
1776   assert_different_registers(dst, src, vtmp1, vtmp2);
1777   assert_different_registers(mask, pgtmp);
1778 
1779   // Example input:   src   = 8888 7777 6666 5555 4444 3333 2222 1111
1780   //                  mask  = 0001 0000 0000 0001 0001 0000 0001 0001
1781   // Expected result: dst   = 0000 0000 0000 8888 5555 4444 2222 1111
1782   sve_dup(vtmp2, H, 0);
1783 
1784   // Extend the lower half to type INT.
1785   // dst = 00004444 00003333 00002222 00001111
1786   sve_uunpklo(dst, S, src);
1787   // pgtmp = 00000001 00000000 00000001 00000001
1788   sve_punpklo(pgtmp, mask);
1789   // Pack the active INT-sized elements to the right,
1790   // and fill the remaining elements with zero.
1791   // dst = 00000000 00004444 00002222 00001111
1792   sve_compact(dst, S, dst, pgtmp);
1793   // Narrow the result back to type SHORT.
1794   // dst = 0000 0000 0000 0000 0000 4444 2222 1111
1795   sve_uzp1(dst, H, dst, vtmp2);
1796   // Count the active elements of the lower half.
1797   // rscratch1 = 3
1798   sve_cntp(rscratch1, S, ptrue, pgtmp);
1799 
1800   // Repeat for the upper half.
1801   // pgtmp = 00000001 00000000 00000000 00000001
1802   sve_punpkhi(pgtmp, mask);
1803   // vtmp1 = 00008888 00007777 00006666 00005555
1804   sve_uunpkhi(vtmp1, S, src);
1805   // vtmp1 = 00000000 00000000 00008888 00005555
1806   sve_compact(vtmp1, S, vtmp1, pgtmp);
1807   // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
1808   sve_uzp1(vtmp1, H, vtmp1, vtmp2);
1809 
1810   // Compressed low:   dst   = 0000 0000 0000 0000 0000 4444 2222 1111
1811   // Compressed high:  vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
1812   // Shift the compressed high part left (across lanes) by TRUE_CNT lanes, where
1813   // TRUE_CNT is the number of active elements in the compressed low part.
1814   neg(rscratch1, rscratch1);
1815   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
1816   sve_index(vtmp2, H, rscratch1, 1);
1817   // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
1818   sve_tbl(vtmp1, H, vtmp1, vtmp2);
1819 
1820   // Combine the shifted compressed high part with the compressed low part.
1821   // dst = 0000 0000 0000 8888 5555 4444 2222 1111
1822   sve_orr(dst, dst, vtmp1);
1823 }
1824 
1825 // Clobbers: rscratch1, rscratch2
1826 // Preserves: src, mask
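     // Same approach as sve_compress_short: widen each half of the B vector to H,
     // compress it with sve_compress_short, narrow the result back to B, and splice
     // the two compressed halves together.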
1827 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
1828                                           FloatRegister vtmp1, FloatRegister vtmp2,
1829                                           FloatRegister vtmp3, FloatRegister vtmp4,
1830                                           PRegister ptmp, PRegister pgtmp) {
1831   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
1832   assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
1833   assert_different_registers(mask, ptmp, pgtmp);
1834   // Example input:   src   = 88 77 66 55 44 33 22 11
1835   //                  mask  = 01 00 00 01 01 00 01 01
1836   // Expected result: dst   = 00 00 00 88 55 44 22 11
1837 
1838   sve_dup(vtmp4, B, 0);
1839   // Extend the lower half to type SHORT.
1840   // vtmp1 = 0044 0033 0022 0011
1841   sve_uunpklo(vtmp1, H, src);
1842   // ptmp = 0001 0000 0001 0001
1843   sve_punpklo(ptmp, mask);
1844   // Count the active elements of the lower half.
1845   // rscratch2 = 3
1846   sve_cntp(rscratch2, H, ptrue, ptmp);
1847   // Pack the active SHORT-sized elements to the right,
1848   // and fill the remaining elements with zero.
1849   // dst = 0000 0044 0022 0011
1850   sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
1851   // Narrow the result back to type BYTE.
1852   // dst = 00 00 00 00 00 44 22 11
1853   sve_uzp1(dst, B, dst, vtmp4);
1854 
1855   // Repeat for the upper half.
1856   // ptmp = 0001 0000 0000 0001
1857   sve_punpkhi(ptmp, mask);
1858   // vtmp2 = 0088 0077 0066 0055
1859   sve_uunpkhi(vtmp2, H, src);
1860   // vtmp1 = 0000 0000 0088 0055
1861   sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);
1862 
1863   sve_dup(vtmp4, B, 0);
1864   // vtmp1 = 00 00 00 00 00 00 88 55
1865   sve_uzp1(vtmp1, B, vtmp1, vtmp4);
1866 
1867   // Compressed low:   dst   = 00 00 00 00 00 44 22 11
1868   // Compressed high:  vtmp1 = 00 00 00 00 00 00 88 55
1869   // Shift the compressed high part left (across lanes) by TRUE_CNT lanes, where
1870   // TRUE_CNT is the number of active elements in the compressed low part.
1871   neg(rscratch2, rscratch2);
1872   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
1873   sve_index(vtmp2, B, rscratch2, 1);
1874   // vtmp1 = 00 00 00 88 55 00 00 00
1875   sve_tbl(vtmp1, B, vtmp1, vtmp2);
1876   // Combine the shifted compressed high part with the compressed low part.
1877   // dst = 00 00 00 88 55 44 22 11
1878   sve_orr(dst, dst, vtmp1);
1879 }
1880 
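     // Reverse the bits of each element. ASIMD "rbit" only reverses the bits within
     // each byte, so elements wider than a byte have their byte order reversed first;
     // the two steps compose to a full per-element bit reversal.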
1881 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
1882   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
1883   SIMD_Arrangement size = isQ ? T16B : T8B;
1884   if (bt == T_BYTE) {
1885     rbit(dst, size, src);
1886   } else {
1887     neon_reverse_bytes(dst, src, bt, isQ);
1888     rbit(dst, size, dst);
1889   }
1890 }
1891 
1892 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
1893   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
1894   SIMD_Arrangement size = isQ ? T16B : T8B;
1895   switch (bt) {
1896     case T_BYTE:
1897       if (dst != src) {
1898         orr(dst, size, src, src);
1899       }
1900       break;
1901     case T_SHORT:
1902       rev16(dst, size, src);
1903       break;
1904     case T_INT:
1905       rev32(dst, size, src);
1906       break;
1907     case T_LONG:
1908       rev64(dst, size, src);
1909       break;
1910     default:
1911       assert(false, "unsupported");
1912       ShouldNotReachHere();
1913   }
1914 }
1915 
1916 // Extract a scalar element from an SVE vector at position 'idx'.
1917 // The input elements in src are expected to be of integral type.
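     // When the element lies in the lower 128 bits, a NEON umov/smov can reach it
     // directly; otherwise the vector is first rotated with SVE "ext" (source and
     // destination being the same register) so that the wanted element lands in lane 0.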
1918 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
1919                                              int idx, FloatRegister vtmp) {
1920   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
1921   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1922   if (regVariant_to_elemBits(size) * idx < 128) { // use a lower-cost NEON instruction
1923     if (bt == T_INT || bt == T_LONG) {
1924       umov(dst, src, size, idx);
1925     } else {
1926       smov(dst, src, size, idx);
1927     }
1928   } else {
1929     sve_orr(vtmp, src, src);
1930     sve_ext(vtmp, vtmp, idx << size);
1931     if (bt == T_INT || bt == T_LONG) {
1932       umov(dst, vtmp, size, 0);
1933     } else {
1934       smov(dst, vtmp, size, 0);
1935     }
1936   }
1937 }
1938 
1939 // java.lang.Math::round intrinsics
1940 
1941 // Clobbers: rscratch1, rflags
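     // Math.round rounds halfway cases towards positive infinity, which matches FCVTAS
     // (ties away from zero) everywhere except for negative halfway inputs, e.g.
     // Math.round(-2.5f) == -2 but fcvtas(-2.5f) == -3. The FCVTAS result is therefore
     // replaced by floor(src + 0.5) on exactly the lanes where the two can differ:
     // negative inputs with a magnitude below 2^23 (float) or 2^52 (double), found by
     // an unsigned bit-pattern compare of -src against that threshold. NaN lanes are
     // excluded from the fix-up and keep the FCVTAS result of 0, as Math.round requires.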
1942 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
1943                                           FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
1944   assert_different_registers(tmp1, tmp2, tmp3, src, dst);
1945   switch (T) {
1946     case T2S:
1947     case T4S:
1948       fmovs(tmp1, T, 0.5f);
1949       mov(rscratch1, jint_cast(0x1.0p23f));
1950       break;
1951     case T2D:
1952       fmovd(tmp1, T, 0.5);
1953       mov(rscratch1, julong_cast(0x1.0p52));
1954       break;
1955     default:
1956       assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
1957   }
1958   fadd(tmp1, T, tmp1, src);
1959   fcvtms(tmp1, T, tmp1);
1960   // tmp1 = floor(src + 0.5), the addition rounding ties to even
1961 
1962   fcvtas(dst, T, src);
1963   // dst = round(src), ties to away
1964 
1965   fneg(tmp3, T, src);
1966   dup(tmp2, T, rscratch1);
1967   cmhs(tmp3, T, tmp3, tmp2);
1968   // tmp3 is now a lane mask: all ones where the FCVTAS result is kept
1969 
1970   bif(dst, T16B, tmp1, tmp3);
1971   // result in dst
1972 }
1973 
1974 // Clobbers: rscratch1, rflags
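     // Same correction as in vector_round_neon: start from FRINTA (ties away from
     // zero) and, under a predicate, overwrite the lanes that need floor(src + 0.5)
     // instead; the branch skips the fix-up entirely when no lane is affected.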
1975 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
1976                                          FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
1977   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
1978   assert_different_registers(tmp1, tmp2, src, dst);
1979 
1980   switch (T) {
1981     case S:
1982       mov(rscratch1, jint_cast(0x1.0p23f));
1983       break;
1984     case D:
1985       mov(rscratch1, julong_cast(0x1.0p52));
1986       break;
1987     default:
1988       assert(T == S || T == D, "invalid register variant");
1989   }
1990 
1991   sve_frinta(dst, T, ptrue, src);
1992   // dst = round(src), ties to away
1993 
1994   Label none;
1995 
1996   sve_fneg(tmp1, T, ptrue, src);
1997   sve_dup(tmp2, T, rscratch1);
1998   sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
1999   br(EQ, none);
2000   {
2001     sve_cpy(tmp1, T, pgtmp, 0.5);
2002     sve_fadd(tmp1, T, pgtmp, src);
2003     sve_frintm(dst, T, pgtmp, tmp1);
2004     // dst = floor(src + 0.5), the addition rounding ties to even
2005   }
2006   bind(none);
2007 
2008   sve_fcvtzs(dst, T, ptrue, dst, T);
2009   // result in dst
2010 }
2011 
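     // java.lang.Math::signum intrinsics
     //
     // signum(x) is x itself for +-0.0 and NaN, and +-1.0 with the sign of x otherwise.
     // The NEON variant builds a lane mask with FACGT (|src| > 0.0) and shifts it right
     // by one so the sign bit is clear; BSL then takes the magnitude bits from "one"
     // and the sign bit from "src" on matching lanes, while lanes with a zero mask
     // (+-0.0 and NaN) pass "src" through unchanged.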
2012 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2013                                            FloatRegister one, SIMD_Arrangement T) {
2014   assert_different_registers(dst, src, zero, one);
2015   assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2016 
2017   facgt(dst, T, src, zero);
2018   ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
2019   bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2020 }
2021 
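     // The SVE variant works on an integer copy of src: AND with the sign-bit mask
     // keeps only the sign, OR with the bit pattern of +1.0 turns that into +-1.0, and
     // SEL picks the +-1.0 lanes where FAC(GT) proved |src| > 0.0, passing src through
     // elsewhere.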
2022 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2023                                           FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
2024   assert_different_registers(dst, src, zero, one, vtmp);
2025   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2026 
2027   sve_orr(vtmp, src, src);
2028   sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN, 1 otherwise
2029   switch (T) {
2030     case S:
2031       sve_and(vtmp, T, min_jint);       // Keep only the sign bit of each float lane of src
2032       sve_orr(vtmp, T, jint_cast(1.0)); // OR in the bit pattern of +1.0, making each lane
2033                                         // +1.0 or -1.0 depending on the sign of the float value
2034       break;
2035     case D:
2036       sve_and(vtmp, T, min_jlong);
2037       sve_orr(vtmp, T, jlong_cast(1.0));
2038       break;
2039     default:
2040       assert(false, "unsupported");
2041       ShouldNotReachHere();
2042   }
2043   sve_sel(dst, T, pgtmp, vtmp, src); // Select from vtmp where pgtmp is set, from src otherwise
2044                                      // Result in dst
2045 }
2046 
2047 bool C2_MacroAssembler::in_scratch_emit_size() {
2048   if (ciEnv::current()->task() != NULL) {
2049     PhaseOutput* phase_output = Compile::current()->output();
2050     if (phase_output != NULL && phase_output->in_scratch_emit_size()) {
2051       return true;
2052     }
2053   }
2054   return MacroAssembler::in_scratch_emit_size();
2055 }