/*
 * Copyright (c) 2020, 2022, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/matcher.hpp"
#include "opto/output.hpp"
#include "opto/subnode.hpp"
#include "runtime/stubRoutines.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

void C2_MacroAssembler::entry_barrier() {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  if (BarrierSet::barrier_set()->barrier_set_nmethod() != NULL) {
    // Dummy labels for just measuring the code size
    Label dummy_slow_path;
    Label dummy_continuation;
    Label dummy_guard;
    Label* slow_path = &dummy_slow_path;
    Label* continuation = &dummy_continuation;
    Label* guard = &dummy_guard;
    if (!Compile::current()->output()->in_scratch_emit_size()) {
      // Use real labels from actual stub when not emitting code for the purpose of measuring its size
      C2EntryBarrierStub* stub = Compile::current()->output()->entry_barrier_table()->add_entry_barrier();
      slow_path = &stub->slow_path();
      continuation = &stub->continuation();
      guard = &stub->guard();
    }
    // In the C2 code, we move the non-hot part of nmethod entry barriers out-of-line to a stub.
    bs->nmethod_entry_barrier(this, slow_path, continuation, guard);
  }
}

void C2_MacroAssembler::emit_entry_barrier_stub(C2EntryBarrierStub* stub) {
  bind(stub->slow_path());
  movptr(rscratch1, (uintptr_t) StubRoutines::aarch64::method_entry_barrier());
  blr(rscratch1);
  b(stub->continuation());

  bind(stub->guard());
  relocate(entry_guard_Relocation::spec());
  emit_int32(0);   // nmethod guard value
}

int C2_MacroAssembler::entry_barrier_stub_size() {
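  // 6 words of 4 bytes each, matching what emit_entry_barrier_stub() emits:
  // movptr of the stub address (a 3-instruction mov/movk/movk sequence),
  // then blr, b, and one word for the guard value.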
  return 4 * 6;
}

// Search for str1 in str2 and return index or -1
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For a larger pattern and source we use a simplified Boyer-Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
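    // If cnt1 < 256, the ccmp below compares cnt1 with cnt2/4; otherwise it
    // forces GE (nzcv = 0b0000). So we branch to LINEARSTUB when cnt1 >= 256
    // or when cnt1 >= cnt2/4, i.e. when the source is not at least 4x the
    // pattern length.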
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

// The Boyer-Moore algorithm is based on the description here:
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// This describes an algorithm with two shift rules: the 'Bad Character' rule
// and the 'Good Suffix' rule.
//
// These rules are essentially heuristics for how far we can shift the
// pattern along the search string.
//
// The implementation here uses the 'Bad Character' rule only because of the
// complexity of initialisation for the 'Good Suffix' rule.
//
// This is also known as the Boyer-Moore-Horspool algorithm:
//
// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
//
// This particular implementation has a few Java-specific optimizations.
//
// #define ASIZE 256
//
//    int bm(unsigned char *x, int m, unsigned char *y, int n) {
//       int i, j;
//       unsigned c;
//       unsigned char bc[ASIZE];
//
//       /* Preprocessing */
//       for (i = 0; i < ASIZE; ++i)
//          bc[i] = m;
//       for (i = 0; i < m - 1; ) {
//          c = x[i];
//          ++i;
//          // c < 256 for Latin1 string, so no need for a branch
//          #ifdef PATTERN_STRING_IS_LATIN1
//          bc[c] = m - i;
//          #else
//          if (c < ASIZE) bc[c] = m - i;
//          #endif
//       }
//
//       /* Searching */
//       j = 0;
//       while (j <= n - m) {
//          c = y[j+m-1];
//          if (x[m-1] == c)
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//          if (i < 0) return j;
//          // c < 256 for Latin1 string, so no need for a branch
//          #ifdef SOURCE_STRING_IS_LATIN1
//          // LL case: (c < 256) always true. Remove branch
//          j += bc[y[j+m-1]];
//          #endif
//          #ifdef SOURCE_AND_PATTERN_ARE_UTF
//          // UU case: need if (c < ASIZE) check. Skip 1 character if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += 1;
//          #endif
//          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
//          // UL case: need if (c < ASIZE) check. Skip <pattern length> if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += m;
//          #endif
//       }
//       return -1;
//    }

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >= 8, so we can read at least one register for cases when
    // UTF->Latin1 conversion is not needed (8 LL or 4 UU symbols) and half a
    // register for the UL case. We'll re-read the last character in the inner
    // pre-loop code to keep a single outer pre-loop load.
    const int firstStep = isL ? 7 : 3;

    const int ASIZE = 256;
    const int STORED_BYTES = 32; // number of bytes stored per instruction
    sub(sp, sp, ASIZE);
    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
    mov(ch1, sp);
    BIND(BM_INIT_LOOP);
      stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
      subs(tmp5, tmp5, 1);
      br(GT, BM_INIT_LOOP);

      sub(cnt1tmp, cnt1, 1);
      mov(tmp5, str2);
      add(str2end, str2, result_tmp, LSL, str2_chr_shift);
      sub(ch2, cnt1, 1);
      mov(tmp3, str1);
    BIND(BCLOOP);
      (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
      if (!str1_isL) {
        subs(zr, ch1, ASIZE);
        br(HS, BCSKIP);
      }
      strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
      subs(ch2, ch2, 1);
      br(GT, BCLOOP);

      add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
      if (str1_isL == str2_isL) {
        // load last 8 bytes (8LL/4UU symbols)
        ldr(tmp6, Address(tmp6, -wordSize));
      } else {
        ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes (4 symbols)
        // convert Latin1 to UTF. We'll have to wait until the load completes,
        // but it's still faster than per-character loads+checks
        lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
        ubfx(ch1, tmp6, 8, 8); // str1[N-2]
        ubfx(ch2, tmp6, 16, 8); // str1[N-3]
        andr(tmp6, tmp6, 0xFF); // str1[N-4]
        orr(ch2, ch1, ch2, LSL, 16);
        orr(tmp6, tmp6, tmp3, LSL, 48);
        orr(tmp6, tmp6, ch2, LSL, 16);
      }
    BIND(BMLOOPSTR2);
      (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
      sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
      if (str1_isL == str2_isL) {
        // re-init tmp3. It's free because it executes in parallel with the
        // load above. The alternative is to initialize it before the loop,
        // but that would hurt performance on in-order systems with 2 or more
        // ld/st pipelines.
        lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
      }
      if (!isL) { // UU/UL case
        lsl(ch2, cnt1tmp, 1); // offset in bytes
      }
      cmp(tmp3, skipch);
      br(NE, BMSKIP);
      ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
      mov(ch1, tmp6);
      if (isL) {
        b(BMLOOPSTR1_AFTER_LOAD);
      } else {
        sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
        b(BMLOOPSTR1_CMP);
      }
    BIND(BMLOOPSTR1);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    BIND(BMLOOPSTR1_AFTER_LOAD);
      subs(cnt1tmp, cnt1tmp, 1);
      br(LT, BMLOOPSTR1_LASTCMP);
    BIND(BMLOOPSTR1_CMP);
      cmp(ch1, ch2);
      br(EQ, BMLOOPSTR1);
    BIND(BMSKIP);
      if (!isL) {
        // if we've met a UTF symbol while searching for a Latin1 pattern, then
        // we can skip cnt1 symbols
        if (str1_isL != str2_isL) {
          mov(result_tmp, cnt1);
        } else {
          mov(result_tmp, 1);
        }
        subs(zr, skipch, ASIZE);
        br(HS, BMADV);
      }
      ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
      sub(cnt1tmp, cnt1, 1);
      add(str2, str2, result_tmp, LSL, str2_chr_shift);
      cmp(str2, str2end);
      br(LE, BMLOOPSTR2);
      add(sp, sp, ASIZE);
      b(NOMATCH);
    BIND(BMLOOPSTR1_LASTCMP);
      cmp(ch1, ch2);
      br(NE, BMSKIP);
    BIND(BMMATCH);
      sub(result, str2, tmp5);
      if (!str2_isL) lsr(result, result, 1);
      add(sp, sp, ASIZE);
      b(DONE);

    BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
    br(LT, LINEAR_MEDIUM);
    mov(result, zr);
    RuntimeAddress stub = NULL;
    if (isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
      assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
    } else if (str1_isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
    } else {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
      assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
    }
    trampoline_call(stub);
    b(DONE);
  }

  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    Register str2tmp = tmp2;
    Register first = tmp3;

    if (icnt1 == -1)
    {
        Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

        cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
        br(LT, DOSHORT);
      BIND(LINEAR_MEDIUM);
        (this->*str1_load_1chr)(first, Address(str1));
        lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
        sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(FIRST_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmp(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        adds(cnt1tmp, cnt1_neg, str1_chr_size);
        add(cnt2tmp, cnt2_neg, str2_chr_size);
        br(GE, MATCH);

      BIND(STR1_NEXT);
        (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        adds(cnt1tmp, cnt1tmp, str1_chr_size);
        add(cnt2tmp, cnt2tmp, str2_chr_size);
        br(LT, STR1_NEXT);
        b(MATCH);

      BIND(DOSHORT);
      if (str1_isL == str2_isL) {
        cmp(cnt1, (u1)2);
        br(LT, DO1);
        br(GT, DO3);
      }
    }

    if (icnt1 == 4) {
      Label CH1_LOOP;

        (this->*load_4chr)(ch1, str1);
        sub(result_tmp, cnt2, 4);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(CH1_LOOP);
        (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
      Label CH1_LOOP;

      BIND(DO2);
        (this->*load_2chr)(ch1, str1);
        if (icnt1 == 2) {
          sub(result_tmp, cnt2, 2);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(CH1_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

      BIND(DO3);
        (this->*load_2chr)(first, str1);
        (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
        if (icnt1 == 3) {
          sub(result_tmp, cnt2, 3);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(FIRST_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmpw(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        b(MATCH);
    }

    if (icnt1 == -1 || icnt1 == 1) {
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;

      BIND(DO1);
        (this->*str1_load_1chr)(ch1, str1);
        cmp(cnt2, (u1)8);
        br(LT, DO1_SHORT);

        sub(result_tmp, cnt2, 8/str2_chr_size);
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
        mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

        if (str2_isL) {
          orr(ch1, ch1, ch1, LSL, 8);
        }
        orr(ch1, ch1, ch1, LSL, 16);
        orr(ch1, ch1, ch1, LSL, 32);
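        // SWAR zero-element test: after the eor below, a matching element
        // becomes zero, and (x - 0x01..01) & ~(x | 0x7f..7f) is non-zero iff
        // x contains a zero element. The lowest set bit marks the first
        // match; rev+clz at HAS_ZERO locate it.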
      BIND(CH1_LOOP);
        ldr(ch2, Address(str2, cnt2_neg));
        eor(ch2, ch1, ch2);
        sub(tmp1, ch2, tmp3);
        orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
        bics(tmp1, tmp1, tmp2);
        br(NE, HAS_ZERO);
        adds(cnt2_neg, cnt2_neg, 8);
        br(LT, CH1_LOOP);

        cmp(cnt2_neg, (u1)8);
        mov(cnt2_neg, 0);
        br(LT, CH1_LOOP);
        b(NOMATCH);

      BIND(HAS_ZERO);
        rev(tmp1, tmp1);
        clz(tmp1, tmp1);
        add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
        b(MATCH);

      BIND(DO1_SHORT);
        mov(result_tmp, cnt2);
        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
      BIND(DO1_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmpw(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);

void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);

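  // Same SWAR zero-element test as in string_indexof above, here on
  // 16-bit elements.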
  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7fff7fff7fff7fff);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1, Address::uxtw(1)));
    sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
    ldrh(ch1, Address(str1, cnt1_neg));
    cmpw(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 2);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}

void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
                                                Register ch, Register result,
                                                FloatRegister ztmp1,
                                                FloatRegister ztmp2,
                                                PRegister tmp_pg,
                                                PRegister tmp_pdn, bool isL)
{
  // Note that `tmp_pdn` should *NOT* be used as a governing predicate register.
  assert(tmp_pg->is_governing(),
         "this register has to be a governing predicate register");

  Label LOOP, MATCH, DONE, NOMATCH;
  Register vec_len = rscratch1;
  Register idx = rscratch2;

  SIMD_RegVariant T = isL ? B : H;

  cbz(cnt1, NOMATCH);

  // Assign the particular char throughout the vector.
  sve_dup(ztmp2, T, ch);
  if (isL) {
    sve_cntb(vec_len);
  } else {
    sve_cnth(vec_len);
  }
  mov(idx, 0);

  // Generate a predicate to control the reading of the input string.
  sve_whilelt(tmp_pg, T, idx, cnt1);

  BIND(LOOP);
    // Read a vector of 8- or 16-bit data depending on the string type. Note
    // that inactive elements indicated by the predicate register won't cause
    // a data read from memory to the destination vector.
    if (isL) {
      sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
    } else {
      sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
    }
    add(idx, idx, vec_len);

    // Perform the comparison. An element of the destination predicate is set
    // to active if the particular char is matched.
    sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);

    // Branch if the particular char is found.
    br(NE, MATCH);

    sve_whilelt(tmp_pg, T, idx, cnt1);

    // Loop back if the particular char is not found.
    br(MI, LOOP);

  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);

  BIND(MATCH);
    // Undo the index increment.
    sub(idx, idx, vec_len);

    // Crop the mask to the lanes up to and including the first match (BRKA),
    // then count them to find its location.
    sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
    add(result, idx, -1);
    sve_incp(result, T, tmp_pdn);
  BIND(DONE);
}

void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)8);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 8);
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 8);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);

  mov(tmp3, 0x0101010101010101);

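  // Same SWAR zero-element test as in string_indexof above, here on
  // byte elements.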
  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1));
    sub(cnt1_neg, zr, cnt1);
  BIND(DO1_LOOP);
    ldrb(ch1, Address(str1, cnt1_neg));
    cmp(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 1);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg);
  BIND(DONE);
}

// Compare strings.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; the result, however, is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      ldr(tmp1, Address(str1));
      ldrs(vtmp, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFF);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFF);
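    // rev+clz locate the lowest differing byte (the loads are little-endian);
    // andr rounds that bit index down to a character boundary, and the lsrv
    // shifts bring the first differing characters of both words to bit 0.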
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
    RuntimeAddress stub = NULL;
    switch(ae) {
      case StrIntrinsicNode::LL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
        break;
      case StrIntrinsicNode::UU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
        break;
      case StrIntrinsicNode::LU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
        break;
      case StrIntrinsicNode::UL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
        break;
      default:
        ShouldNotReachHere();
    }
    assert(stub.target() != NULL, "compare_long_string stub has not been generated");
    trampoline_call(stub);
    b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // Arrange the code so that most branches are taken while characters are
  // being loaded, and the next characters are loaded while the previous
  // ones are compared.
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}

void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
                                     FloatRegister src2, int cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
  if (bt == T_FLOAT || bt == T_DOUBLE) {
    switch (cond) {
      case BoolTest::eq: fcmeq(dst, size, src1, src2); break;
      case BoolTest::ne: {
        fcmeq(dst, size, src1, src2);
        notr(dst, T16B, dst);
        break;
      }
      case BoolTest::ge: fcmge(dst, size, src1, src2); break;
      case BoolTest::gt: fcmgt(dst, size, src1, src2); break;
      case BoolTest::le: fcmge(dst, size, src2, src1); break;
      case BoolTest::lt: fcmgt(dst, size, src2, src1); break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  } else {
    switch (cond) {
      case BoolTest::eq: cmeq(dst, size, src1, src2); break;
      case BoolTest::ne: {
        cmeq(dst, size, src1, src2);
        notr(dst, T16B, dst);
        break;
      }
      case BoolTest::ge: cmge(dst, size, src1, src2); break;
      case BoolTest::gt: cmgt(dst, size, src1, src2); break;
      case BoolTest::le: cmge(dst, size, src2, src1); break;
      case BoolTest::lt: cmgt(dst, size, src2, src1); break;
      case BoolTest::uge: cmhs(dst, size, src1, src2); break;
      case BoolTest::ugt: cmhi(dst, size, src1, src2); break;
      case BoolTest::ult: cmhi(dst, size, src2, src1); break;
      case BoolTest::ule: cmhs(dst, size, src2, src1); break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  }
}

// Compress the least significant bit of each byte to the rightmost and clear
// the higher garbage bits.
void C2_MacroAssembler::bytemask_compress(Register dst) {
  // Example input, dst = 0x01 00 00 00 01 01 00 01
  // The "??" bytes are garbage.
  orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
  orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
  orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
  andr(dst, dst, 0xff);                   // dst = 0x8D
}
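
// A scalar sketch of the fold above (the "??" garbage bits do not matter,
// since the final andr masks them off):
//   uint64_t bytemask_compress(uint64_t x) {
//     x |= x >> 7;    // gather bit 0 of adjacent byte pairs
//     x |= x >> 14;   // gather the 2-bit groups into nibbles
//     x |= x >> 28;   // gather the nibbles into the low byte
//     return x & 0xff;
//   }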

// Pack the lowest-numbered bit of each mask element in src into a long value
// in dst, covering at most the first 64 lane elements.
// Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM.
void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
                                         FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
  assert_different_registers(dst, rscratch1);
  assert_different_registers(vtmp1, vtmp2);

  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example:   src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
  // Expected:  dst = 0x658D

  // Convert the mask into a vector with sequential bytes.
  // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
  sve_cpy(vtmp1, size, src, 1, false);
  if (bt != T_BYTE) {
    sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
  }

  if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
    // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
    // is to compress each significant bit of the byte in a cross-lane way. Due
    // to the lack of a cross-lane bit-compress instruction, we use BEXT
    // (bit-compress in each lane) with the biggest lane size (T = D), then
    // concatenate the results.

    // The second source input of BEXT, initialized with 0x01 in each byte.
    // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
    sve_dup(vtmp2, B, 1);

    // BEXT vtmp1.D, vtmp1.D, vtmp2.D
    // vtmp1 = 0x0001010000010001 | 0x0100000001010001
    // vtmp2 = 0x0101010101010101 | 0x0101010101010101
    //         ---------------------------------------
    // vtmp1 = 0x0000000000000065 | 0x000000000000008D
    sve_bext(vtmp1, D, vtmp1, vtmp2);

    // Concatenate the least significant 8 bits in each 8 bytes, and extract
    // the result to dst.
    // vtmp1 = 0x0000000000000000 | 0x000000000000658D
    // dst   = 0x658D
    if (lane_cnt <= 8) {
      // No need to concatenate.
      umov(dst, vtmp1, B, 0);
    } else if (lane_cnt <= 16) {
      ins(vtmp1, B, vtmp1, 1, 8);
      umov(dst, vtmp1, H, 0);
    } else {
      // As the lane count is 64 at most, the final expected value must be in
      // the lowest 64 bits after narrowing vtmp1 from D to B.
      sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
      umov(dst, vtmp1, D, 0);
    }
  } else if (UseSVE > 0) {
    // Compress the lowest 8 bytes.
    fmovd(dst, vtmp1);
    bytemask_compress(dst);
    if (lane_cnt <= 8) return;

    // Repeat on higher bytes and join the results.
    // Compress 8 bytes in each iteration.
    for (int idx = 1; idx < (lane_cnt / 8); idx++) {
      sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
      bytemask_compress(rscratch1);
      orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
    }
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Unpack the mask, a long value in src, into predicate register dst based on the
// corresponding data type. Note that dst can support at most 64 lanes.
// The example below gives the expected dst predicate register for different
// types, with a valid src (0x658D) on a machine with a 1024-bit vector size.
// BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
// SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
// INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
// LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
//
// The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D,
// which has 24 significant bits, would be an invalid input if the dst predicate
// register refers to a LONG type 1024-bit vector, which has at most 16 lanes.
void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
                                           FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
         lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example:   src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
  // Expected:  dst = 0b01100101 10001101

  // Put the long value from the general-purpose register into the first lane
  // of the vector.
  // vtmp1 = 0x0000000000000000 | 0x000000000000658D
  sve_dup(vtmp1, B, 0);
  mov(vtmp1, D, 0, src);

  // As sve_cmp generates the mask with byte granularity at minimum, we must
  // first expand the bit mask in the first lane into a byte mask, which can
  // be done with SVE2's BDEP instruction.

  // The first source input of the BDEP instruction: deposit one mask byte
  // into each 8-byte lane.
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  if (lane_cnt <= 8) {
    // Nothing to do, as only one byte exists.
  } else if (lane_cnt <= 16) {
    ins(vtmp1, B, vtmp1, 8, 1);
    mov(vtmp1, B, 1, zr);
  } else {
    sve_vector_extend(vtmp1, D, vtmp1, B);
  }

  // The second source input of the BDEP instruction, initialized with 0x01
  // for each byte.
  // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
  sve_dup(vtmp2, B, 1);

  // BDEP vtmp1.D, vtmp1.D, vtmp2.D
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  // vtmp2 = 0x0101010101010101 | 0x0101010101010101
  //         ---------------------------------------
  // vtmp1 = 0x0001010000010001 | 0x0100000001010001
  sve_bdep(vtmp1, D, vtmp1, vtmp2);

  if (bt != T_BYTE) {
    sve_vector_extend(vtmp1, size, vtmp1, B);
  }
  // Generate the mask according to the given vector, in which the elements
  // have been extended to the expected type.
  // dst = 0b01100101 10001101
  sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
}

// Clobbers: rflags
void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
                                    FloatRegister zn, FloatRegister zm, int cond) {
  assert(pg->is_governing(), "This register has to be a governing predicate register");
  FloatRegister z1 = zn, z2 = zm;
  // Convert the original BoolTest condition to Assembler::condition.
  Condition condition;
  switch (cond) {
    case BoolTest::eq: condition = Assembler::EQ; break;
    case BoolTest::ne: condition = Assembler::NE; break;
    case BoolTest::le: z1 = zm; z2 = zn; condition = Assembler::GE; break;
    case BoolTest::ge: condition = Assembler::GE; break;
    case BoolTest::lt: z1 = zm; z2 = zn; condition = Assembler::GT; break;
    case BoolTest::gt: condition = Assembler::GT; break;
    default:
      assert(false, "unsupported compare condition");
      ShouldNotReachHere();
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (bt == T_FLOAT || bt == T_DOUBLE) {
    sve_fcm(condition, pd, size, pg, z1, z2);
  } else {
    assert(is_integral_type(bt), "unsupported element type");
    sve_cmp(condition, pd, size, pg, z1, z2);
  }
}

// Get index of the last mask lane that is set
void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
  SIMD_RegVariant size = elemType_to_regVariant(bt);
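  // Reverse the mask, count the lanes before the first set lane of the
  // reversed mask (BRKB + CNTP), then flip the index back:
  // last_set = (lane_count - 1) - lanes_before_first_set(reversed).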
  sve_rev(ptmp, size, src);
  sve_brkb(ptmp, ptrue, ptmp, false);
  sve_cntp(dst, size, ptrue, ptmp);
  movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
  subw(dst, rscratch1, dst);
}

// Extend integer vector src to dst with the same lane count
// but larger element size, e.g. 4B -> 4I
void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
                                           FloatRegister src, BasicType src_bt) {
  if (src_bt == T_BYTE) {
    if (dst_bt == T_SHORT) {
      // 4B/8B to 4S/8S
      assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
      sxtl(dst, T8H, src, T8B);
    } else {
      // 4B to 4I
      assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
      sxtl(dst, T8H, src, T8B);
      sxtl(dst, T4S, dst, T4H);
    }
  } else if (src_bt == T_SHORT) {
    // 4S to 4I
    assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
    sxtl(dst, T4S, src, T4H);
  } else if (src_bt == T_INT) {
    // 2I to 2L
    assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
    sxtl(dst, T2D, src, T2S);
  } else {
    ShouldNotReachHere();
  }
}

// Narrow integer vector src down to dst with the same lane count
// but smaller element size, e.g. 4I -> 4B
void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
                                           FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
  if (src_bt == T_SHORT) {
    // 4S/8S to 4B/8B
    assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_BYTE, "unsupported");
    xtn(dst, T8B, src, T8H);
  } else if (src_bt == T_INT) {
    // 4I to 4B/4S
    assert(src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
    xtn(dst, T4H, src, T4S);
    if (dst_bt == T_BYTE) {
      xtn(dst, T8B, dst, T8H);
    }
  } else if (src_bt == T_LONG) {
    // 2L to 2I
    assert(src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_INT, "unsupported");
    xtn(dst, T2S, src, T2D);
  } else {
    ShouldNotReachHere();
  }
}

void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
                                          FloatRegister src, SIMD_RegVariant src_size) {
  assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
  if (src_size == B) {
    switch (dst_size) {
    case H:
      sve_sunpklo(dst, H, src);
      break;
    case S:
      sve_sunpklo(dst, H, src);
      sve_sunpklo(dst, S, dst);
      break;
    case D:
      sve_sunpklo(dst, H, src);
      sve_sunpklo(dst, S, dst);
      sve_sunpklo(dst, D, dst);
      break;
    default:
      ShouldNotReachHere();
    }
  } else if (src_size == H) {
    if (dst_size == S) {
      sve_sunpklo(dst, S, src);
    } else { // D
      sve_sunpklo(dst, S, src);
      sve_sunpklo(dst, D, dst);
    }
  } else if (src_size == S) {
    sve_sunpklo(dst, D, src);
  }
}

// Vector narrow from src to dst with specified element sizes.
// High part of dst vector will be filled with zero.
void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
                                          FloatRegister src, SIMD_RegVariant src_size,
                                          FloatRegister tmp) {
  assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
  assert_different_registers(src, tmp);
  sve_dup(tmp, src_size, 0);
  if (src_size == D) {
    switch (dst_size) {
    case S:
      sve_uzp1(dst, S, src, tmp);
      break;
    case H:
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, S, src, tmp);
      sve_uzp1(dst, H, dst, tmp);
      break;
    case B:
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, S, src, tmp);
      sve_uzp1(dst, H, dst, tmp);
      sve_uzp1(dst, B, dst, tmp);
      break;
    default:
      ShouldNotReachHere();
    }
  } else if (src_size == S) {
    if (dst_size == H) {
      sve_uzp1(dst, H, src, tmp);
    } else { // B
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, H, src, tmp);
      sve_uzp1(dst, B, dst, tmp);
    }
  } else if (src_size == H) {
    sve_uzp1(dst, B, src, tmp);
  }
}

// Extend src predicate to dst predicate with the same lane count but larger
// element size, e.g. 64Byte -> 512Long
void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
                                             uint dst_element_length_in_bytes,
                                             uint src_element_length_in_bytes) {
  if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
    sve_punpklo(dst, src);
  } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
    sve_punpklo(dst, src);
    sve_punpklo(dst, dst);
  } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
    sve_punpklo(dst, src);
    sve_punpklo(dst, dst);
    sve_punpklo(dst, dst);
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Narrow src predicate to dst predicate with the same lane count but
// smaller element size, e.g. 512Long -> 64Byte
void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src,
                                             uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
  // The insignificant bits in src predicate are expected to be zero.
  if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
    sve_uzp1(dst, B, src, src);
  } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
    sve_uzp1(dst, H, src, src);
    sve_uzp1(dst, B, dst, dst);
  } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
    sve_uzp1(dst, S, src, src);
    sve_uzp1(dst, H, dst, dst);
    sve_uzp1(dst, B, dst, dst);
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Vector reduction add for integral type with ASIMD instructions.
void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
                                                 Register isrc, FloatRegister vsrc,
                                                 unsigned vector_length_in_bytes,
                                                 FloatRegister vtmp) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_add_integral {");
    switch(bt) {
      case T_BYTE:
        addv(vtmp, isQ ? T16B : T8B, vsrc);
        smov(dst, vtmp, B, 0);
        addw(dst, dst, isrc, ext::sxtb);
        break;
      case T_SHORT:
        addv(vtmp, isQ ? T8H : T4H, vsrc);
        smov(dst, vtmp, H, 0);
        addw(dst, dst, isrc, ext::sxth);
        break;
      case T_INT:
        isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
        umov(dst, vtmp, S, 0);
        addw(dst, dst, isrc);
        break;
      case T_LONG:
        assert(isQ, "unsupported");
        addpd(vtmp, vsrc);
        umov(dst, vtmp, D, 0);
        add(dst, dst, isrc);
        break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  BLOCK_COMMENT("} neon_reduce_add_integral");
}

// Vector reduction multiply for integral type with ASIMD instructions.
// Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
// Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
                                                 Register isrc, FloatRegister vsrc,
                                                 unsigned vector_length_in_bytes,
                                                 FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_mul_integral {");
    switch(bt) {
      case T_BYTE:
        if (isQ) {
          // Multiply the lower half and higher half of the vector iteratively.
          // vtmp1 = vsrc[8:15]
          ins(vtmp1, D, vsrc, 0, 1);
          // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
          mulv(vtmp1, T8B, vtmp1, vsrc);
          // vtmp2 = vtmp1[4:7]
          ins(vtmp2, S, vtmp1, 0, 1);
          // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
          mulv(vtmp1, T8B, vtmp2, vtmp1);
        } else {
          ins(vtmp1, S, vsrc, 0, 1);
          mulv(vtmp1, T8B, vtmp1, vsrc);
        }
        // vtmp2 = vtmp1[2:3]
        ins(vtmp2, H, vtmp1, 0, 1);
        // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
        mulv(vtmp2, T8B, vtmp2, vtmp1);
        // dst = vtmp2[0] * isrc * vtmp2[1]
        umov(rscratch1, vtmp2, B, 0);
        mulw(dst, rscratch1, isrc);
        sxtb(dst, dst);
        umov(rscratch1, vtmp2, B, 1);
        mulw(dst, rscratch1, dst);
        sxtb(dst, dst);
        break;
      case T_SHORT:
        if (isQ) {
          ins(vtmp2, D, vsrc, 0, 1);
          mulv(vtmp2, T4H, vtmp2, vsrc);
          ins(vtmp1, S, vtmp2, 0, 1);
          mulv(vtmp1, T4H, vtmp1, vtmp2);
        } else {
          ins(vtmp1, S, vsrc, 0, 1);
          mulv(vtmp1, T4H, vtmp1, vsrc);
        }
        umov(rscratch1, vtmp1, H, 0);
        mulw(dst, rscratch1, isrc);
        sxth(dst, dst);
        umov(rscratch1, vtmp1, H, 1);
        mulw(dst, rscratch1, dst);
        sxth(dst, dst);
        break;
      case T_INT:
        if (isQ) {
          ins(vtmp1, D, vsrc, 0, 1);
          mulv(vtmp1, T2S, vtmp1, vsrc);
        } else {
          vtmp1 = vsrc;
        }
        umov(rscratch1, vtmp1, S, 0);
        mul(dst, rscratch1, isrc);
        umov(rscratch1, vtmp1, S, 1);
        mul(dst, rscratch1, dst);
        break;
      case T_LONG:
        umov(rscratch1, vsrc, D, 0);
        mul(dst, isrc, rscratch1);
        umov(rscratch1, vsrc, D, 1);
        mul(dst, dst, rscratch1);
        break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  BLOCK_COMMENT("} neon_reduce_mul_integral");
}
1457 
1458 // Vector reduction multiply for floating-point type with ASIMD instructions.
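// The lanes are multiplied strictly in order (dst = fsrc * v[0] * v[1] * ...)
// rather than by pairwise halving: floating-point multiplication is not
// associative, so the reduction order is part of the observable result.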
1459 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
1460                                            FloatRegister fsrc, FloatRegister vsrc,
1461                                            unsigned vector_length_in_bytes,
1462                                            FloatRegister vtmp) {
1463   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1464   bool isQ = vector_length_in_bytes == 16;
1465 
1466   BLOCK_COMMENT("neon_reduce_mul_fp {");
    switch (bt) {
1468       case T_FLOAT:
1469         fmuls(dst, fsrc, vsrc);
1470         ins(vtmp, S, vsrc, 0, 1);
1471         fmuls(dst, dst, vtmp);
1472         if (isQ) {
1473           ins(vtmp, S, vsrc, 0, 2);
1474           fmuls(dst, dst, vtmp);
1475           ins(vtmp, S, vsrc, 0, 3);
1476           fmuls(dst, dst, vtmp);
1477          }
1478         break;
1479       case T_DOUBLE:
1480         assert(isQ, "unsupported");
1481         fmuld(dst, fsrc, vsrc);
1482         ins(vtmp, D, vsrc, 0, 1);
1483         fmuld(dst, dst, vtmp);
1484         break;
1485       default:
1486         assert(false, "unsupported");
1487         ShouldNotReachHere();
1488     }
1489   BLOCK_COMMENT("} neon_reduce_mul_fp");
1490 }
1491 
1492 // Helper to select logical instruction
1493 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
1494                                                    Register Rn, Register Rm,
1495                                                    enum shift_kind kind, unsigned shift) {
  switch (opc) {
1497     case Op_AndReductionV:
1498       is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
1499       break;
1500     case Op_OrReductionV:
1501       is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
1502       break;
1503     case Op_XorReductionV:
1504       is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
1505       break;
1506     default:
1507       assert(false, "unsupported");
1508       ShouldNotReachHere();
1509   }
1510 }
1511 
1512 // Vector reduction logical operations And, Or, Xor
1513 // Clobbers: rscratch1
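// The vector is moved to general-purpose registers as one or two 64-bit
// halves and the reduction finishes there, folding the register onto a
// shifted copy of itself so the active width halves each step. Sketch for
// T_BYTE from a 128-bit source (illustrative pseudo-code):
//   x = op(lane1, lane0);     // combine the two 64-bit halves
//   x = op(x, x >> 32);
//   x = op(x, x >> 16);
//   x = op(x, x >> 8);
//   dst = sign_extend8(op(x, isrc));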
1514 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
1515                                             Register isrc, FloatRegister vsrc,
1516                                             unsigned vector_length_in_bytes) {
1517   assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
1518          "unsupported");
1519   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1520   assert_different_registers(dst, isrc);
1521   bool isQ = vector_length_in_bytes == 16;
1522 
1523   BLOCK_COMMENT("neon_reduce_logical {");
1524     umov(rscratch1, vsrc, isQ ? D : S, 0);
1525     umov(dst, vsrc, isQ ? D : S, 1);
1526     neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
    switch (bt) {
1528       case T_BYTE:
1529         if (isQ) {
1530           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1531         }
1532         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1533         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
1534         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1535         sxtb(dst, dst);
1536         break;
1537       case T_SHORT:
1538         if (isQ) {
1539           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1540         }
1541         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1542         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1543         sxth(dst, dst);
1544         break;
1545       case T_INT:
1546         if (isQ) {
1547           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1548         }
1549         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1550         break;
1551       case T_LONG:
1552         assert(isQ, "unsupported");
1553         neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
1554         break;
1555       default:
1556         assert(false, "unsupported");
1557         ShouldNotReachHere();
1558     }
1559   BLOCK_COMMENT("} neon_reduce_logical");
1560 }
1561 
1562 // Vector reduction min/max for integral type with ASIMD instructions.
// Note: vtmp is not used and is expected to be fnoreg in the T_LONG case.
1564 // Clobbers: rscratch1, rflags
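// For sub-long types the cross-lane part is done in the vector unit
// (SMINV/SMAXV, or a single SMINP/SMAXP for the two-lane T2S case) and the
// result is then merged with isrc via cmp/csel, roughly:
//   dst = is_min ? MIN(min_across_lanes(vsrc), isrc)
//                : MAX(max_across_lanes(vsrc), isrc);
// T_LONG has no across-lanes min/max instruction, so both lanes are moved
// to general-purpose registers and reduced entirely with cmp/csel.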
1565 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
1566                                                     Register isrc, FloatRegister vsrc,
1567                                                     unsigned vector_length_in_bytes,
1568                                                     FloatRegister vtmp) {
1569   assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
1570   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1571   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
1572   assert_different_registers(dst, isrc);
1573   bool isQ = vector_length_in_bytes == 16;
1574   bool is_min = opc == Op_MinReductionV;
1575 
1576   BLOCK_COMMENT("neon_reduce_minmax_integral {");
1577     if (bt == T_LONG) {
1578       assert(vtmp == fnoreg, "should be");
1579       assert(isQ, "should be");
1580       umov(rscratch1, vsrc, D, 0);
1581       cmp(isrc, rscratch1);
1582       csel(dst, isrc, rscratch1, is_min ? LT : GT);
1583       umov(rscratch1, vsrc, D, 1);
1584       cmp(dst, rscratch1);
1585       csel(dst, dst, rscratch1, is_min ? LT : GT);
1586     } else {
1587       SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1588       if (size == T2S) {
1589         is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
1590       } else {
1591         is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
1592       }
1593       if (bt == T_INT) {
1594         umov(dst, vtmp, S, 0);
1595       } else {
1596         smov(dst, vtmp, elemType_to_regVariant(bt), 0);
1597       }
1598       cmpw(dst, isrc);
1599       cselw(dst, dst, isrc, is_min ? LT : GT);
1600     }
1601   BLOCK_COMMENT("} neon_reduce_minmax_integral");
1602 }
1603 
1604 // Vector reduction for integral type with SVE instruction.
1605 // Supported operations are Add, And, Or, Xor, Max, Min.
// rflags is clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
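// Every case follows the same pattern: reduce the lanes of src2 that are
// active under pg into lane 0 of tmp with the SVE reduction instruction,
// move lane 0 into a general-purpose register (sign-extending for sub-int
// element types), then fold in src1 with the matching scalar operation,
// e.g. for Op_OrReductionV roughly: dst = orv(pg, src2) | src1.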
1607 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
1608                                             FloatRegister src2, PRegister pg, FloatRegister tmp) {
1609   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
1610   assert(pg->is_governing(), "This register has to be a governing predicate register");
1611   assert_different_registers(src1, dst);
  // Registers "dst" and "tmp" are clobbered, while "src1" and "src2" are preserved.
1613   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1614   switch (opc) {
1615     case Op_AddReductionVI: {
1616       sve_uaddv(tmp, size, pg, src2);
1617       if (bt == T_BYTE) {
1618         smov(dst, tmp, size, 0);
1619         addw(dst, src1, dst, ext::sxtb);
1620       } else if (bt == T_SHORT) {
1621         smov(dst, tmp, size, 0);
1622         addw(dst, src1, dst, ext::sxth);
1623       } else {
1624         umov(dst, tmp, size, 0);
1625         addw(dst, dst, src1);
1626       }
1627       break;
1628     }
1629     case Op_AddReductionVL: {
1630       sve_uaddv(tmp, size, pg, src2);
1631       umov(dst, tmp, size, 0);
1632       add(dst, dst, src1);
1633       break;
1634     }
1635     case Op_AndReductionV: {
1636       sve_andv(tmp, size, pg, src2);
1637       if (bt == T_INT || bt == T_LONG) {
1638         umov(dst, tmp, size, 0);
1639       } else {
1640         smov(dst, tmp, size, 0);
1641       }
1642       if (bt == T_LONG) {
1643         andr(dst, dst, src1);
1644       } else {
1645         andw(dst, dst, src1);
1646       }
1647       break;
1648     }
1649     case Op_OrReductionV: {
1650       sve_orv(tmp, size, pg, src2);
1651       if (bt == T_INT || bt == T_LONG) {
1652         umov(dst, tmp, size, 0);
1653       } else {
1654         smov(dst, tmp, size, 0);
1655       }
1656       if (bt == T_LONG) {
1657         orr(dst, dst, src1);
1658       } else {
1659         orrw(dst, dst, src1);
1660       }
1661       break;
1662     }
1663     case Op_XorReductionV: {
1664       sve_eorv(tmp, size, pg, src2);
1665       if (bt == T_INT || bt == T_LONG) {
1666         umov(dst, tmp, size, 0);
1667       } else {
1668         smov(dst, tmp, size, 0);
1669       }
1670       if (bt == T_LONG) {
1671         eor(dst, dst, src1);
1672       } else {
1673         eorw(dst, dst, src1);
1674       }
1675       break;
1676     }
1677     case Op_MaxReductionV: {
1678       sve_smaxv(tmp, size, pg, src2);
1679       if (bt == T_INT || bt == T_LONG) {
1680         umov(dst, tmp, size, 0);
1681       } else {
1682         smov(dst, tmp, size, 0);
1683       }
1684       if (bt == T_LONG) {
1685         cmp(dst, src1);
1686         csel(dst, dst, src1, Assembler::GT);
1687       } else {
1688         cmpw(dst, src1);
1689         cselw(dst, dst, src1, Assembler::GT);
1690       }
1691       break;
1692     }
1693     case Op_MinReductionV: {
1694       sve_sminv(tmp, size, pg, src2);
1695       if (bt == T_INT || bt == T_LONG) {
1696         umov(dst, tmp, size, 0);
1697       } else {
1698         smov(dst, tmp, size, 0);
1699       }
1700       if (bt == T_LONG) {
1701         cmp(dst, src1);
1702         csel(dst, dst, src1, Assembler::LT);
1703       } else {
1704         cmpw(dst, src1);
1705         cselw(dst, dst, src1, Assembler::LT);
1706       }
1707       break;
1708     }
1709     default:
1710       assert(false, "unsupported");
1711       ShouldNotReachHere();
1712   }
1713 
1714   if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
1715     if (bt == T_BYTE) {
1716       sxtb(dst, dst);
1717     } else if (bt == T_SHORT) {
1718       sxth(dst, dst);
1719     }
1720   }
1721 }
1722 
// Set elements of the dst predicate to true for lanes in the range [0, lane_cnt), and
// to false otherwise. The input "lane_cnt" must be smaller than or equal to the supported
// max vector length of the basic type.
// Clobbers: rscratch1, rflags
1726 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
1727   uint32_t max_vector_length = Matcher::max_vector_size(bt);
1728   assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
1729 
1730   // Set all elements to false if the input "lane_cnt" is zero.
1731   if (lane_cnt == 0) {
1732     sve_pfalse(dst);
1733     return;
1734   }
1735 
1736   SIMD_RegVariant size = elemType_to_regVariant(bt);
1737   assert(size != Q, "invalid size");
1738 
  // Set all elements to true if "lane_cnt" equals the max lane count.
1740   if (lane_cnt == max_vector_length) {
1741     sve_ptrue(dst, size, /* ALL */ 0b11111);
1742     return;
1743   }
1744 
  // Fixed VL* patterns for "ptrue".
  switch (lane_cnt) {
1747   case 1: /* VL1 */
1748   case 2: /* VL2 */
1749   case 3: /* VL3 */
1750   case 4: /* VL4 */
1751   case 5: /* VL5 */
1752   case 6: /* VL6 */
1753   case 7: /* VL7 */
1754   case 8: /* VL8 */
1755     sve_ptrue(dst, size, lane_cnt);
1756     return;
1757   case 16:
1758     sve_ptrue(dst, size, /* VL16 */ 0b01001);
1759     return;
1760   case 32:
1761     sve_ptrue(dst, size, /* VL32 */ 0b01010);
1762     return;
1763   case 64:
1764     sve_ptrue(dst, size, /* VL64 */ 0b01011);
1765     return;
1766   case 128:
1767     sve_ptrue(dst, size, /* VL128 */ 0b01100);
1768     return;
1769   case 256:
1770     sve_ptrue(dst, size, /* VL256 */ 0b01101);
1771     return;
1772   default:
1773     break;
1774   }
1775 
1776   // Special patterns for "ptrue".
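  // For instance, with a 512-bit vector and bt == T_INT, max_vector_length
  // is 16: lane_cnt == 15 matches MUL3 (15 == 16 - 16 % 3), whereas
  // lane_cnt == 12 matches none of the patterns and is handled by the
  // "whilelow" fallback below.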
1777   if (lane_cnt == round_down_power_of_2(max_vector_length)) {
1778     sve_ptrue(dst, size, /* POW2 */ 0b00000);
1779   } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
1780     sve_ptrue(dst, size, /* MUL4 */ 0b11101);
1781   } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
1782     sve_ptrue(dst, size, /* MUL3 */ 0b11110);
1783   } else {
    // Fall back to "whilelow" for the remaining cases.
1785     mov(rscratch1, lane_cnt);
1786     sve_whilelow(dst, size, zr, rscratch1);
1787   }
1788 }
1789 
1790 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
1791 // Any remaining elements of dst will be filled with zero.
1792 // Clobbers: rscratch1
1793 // Preserves: src, mask
1794 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
1795                                            FloatRegister vtmp1, FloatRegister vtmp2,
1796                                            PRegister pgtmp) {
1797   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
1798   assert_different_registers(dst, src, vtmp1, vtmp2);
1799   assert_different_registers(mask, pgtmp);
1800 
1801   // Example input:   src   = 8888 7777 6666 5555 4444 3333 2222 1111
1802   //                  mask  = 0001 0000 0000 0001 0001 0000 0001 0001
1803   // Expected result: dst   = 0000 0000 0000 8888 5555 4444 2222 1111
1804   sve_dup(vtmp2, H, 0);
1805 
1806   // Extend lowest half to type INT.
1807   // dst = 00004444 00003333 00002222 00001111
1808   sve_uunpklo(dst, S, src);
1809   // pgtmp = 00000001 00000000 00000001 00000001
1810   sve_punpklo(pgtmp, mask);
  // Pack the active INT-sized elements to the right (the lowest-numbered
  // lanes), and fill the remaining lanes with zero.
1813   // dst = 00000000 00004444 00002222 00001111
1814   sve_compact(dst, S, dst, pgtmp);
1815   // Narrow the result back to type SHORT.
1816   // dst = 0000 0000 0000 0000 0000 4444 2222 1111
1817   sve_uzp1(dst, H, dst, vtmp2);
1818   // Count the active elements of lowest half.
1819   // rscratch1 = 3
1820   sve_cntp(rscratch1, S, ptrue, pgtmp);
1821 
  // Repeat for the highest half.
1823   // pgtmp = 00000001 00000000 00000000 00000001
1824   sve_punpkhi(pgtmp, mask);
1825   // vtmp1 = 00008888 00007777 00006666 00005555
1826   sve_uunpkhi(vtmp1, S, src);
1827   // vtmp1 = 00000000 00000000 00008888 00005555
1828   sve_compact(vtmp1, S, vtmp1, pgtmp);
1829   // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
1830   sve_uzp1(vtmp1, H, vtmp1, vtmp2);
1831 
1832   // Compressed low:   dst   = 0000 0000 0000 0000 0000 4444 2222 1111
  // Compressed high:  vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  // Left shift (cross-lane) the compressed high part by TRUE_CNT lanes, where
  // TRUE_CNT is the number of active elements in the compressed low part.
1836   neg(rscratch1, rscratch1);
1837   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
1838   sve_index(vtmp2, H, rscratch1, 1);
1839   // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
1840   sve_tbl(vtmp1, H, vtmp1, vtmp2);
1841 
  // Combine the compressed high part (after the shift) with the compressed low part.
1843   // dst = 0000 0000 0000 8888 5555 4444 2222 1111
1844   sve_orr(dst, dst, vtmp1);
1845 }
1846 
1847 // Clobbers: rscratch1, rscratch2
1848 // Preserves: src, mask
1849 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
1850                                           FloatRegister vtmp1, FloatRegister vtmp2,
1851                                           FloatRegister vtmp3, FloatRegister vtmp4,
1852                                           PRegister ptmp, PRegister pgtmp) {
1853   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
1854   assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
1855   assert_different_registers(mask, ptmp, pgtmp);
1856   // Example input:   src   = 88 77 66 55 44 33 22 11
1857   //                  mask  = 01 00 00 01 01 00 01 01
1858   // Expected result: dst   = 00 00 00 88 55 44 22 11
1859 
1860   sve_dup(vtmp4, B, 0);
1861   // Extend lowest half to type SHORT.
1862   // vtmp1 = 0044 0033 0022 0011
1863   sve_uunpklo(vtmp1, H, src);
1864   // ptmp = 0001 0000 0001 0001
1865   sve_punpklo(ptmp, mask);
1866   // Count the active elements of lowest half.
1867   // rscratch2 = 3
1868   sve_cntp(rscratch2, H, ptrue, ptmp);
  // Pack the active SHORT-sized elements to the right (the lowest-numbered
  // lanes), and fill the remaining lanes with zero.
1871   // dst = 0000 0044 0022 0011
1872   sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
1873   // Narrow the result back to type BYTE.
1874   // dst = 00 00 00 00 00 44 22 11
1875   sve_uzp1(dst, B, dst, vtmp4);
1876 
  // Repeat for the highest half.
1878   // ptmp = 0001 0000 0000 0001
1879   sve_punpkhi(ptmp, mask);
  // vtmp2 = 0088 0077 0066 0055
1881   sve_uunpkhi(vtmp2, H, src);
1882   // vtmp1 = 0000 0000 0088 0055
1883   sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);
1884 
1885   sve_dup(vtmp4, B, 0);
1886   // vtmp1 = 00 00 00 00 00 00 88 55
1887   sve_uzp1(vtmp1, B, vtmp1, vtmp4);
1888 
1889   // Compressed low:   dst   = 00 00 00 00 00 44 22 11
1890   // Compressed high:  vtmp1 = 00 00 00 00 00 00 88 55
  // Left shift (cross-lane) the compressed high part by TRUE_CNT lanes, where
  // TRUE_CNT is the number of active elements in the compressed low part.
1893   neg(rscratch2, rscratch2);
1894   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
1895   sve_index(vtmp2, B, rscratch2, 1);
1896   // vtmp1 = 00 00 00 88 55 00 00 00
1897   sve_tbl(vtmp1, B, vtmp1, vtmp2);
  // Combine the compressed high part (after the shift) with the compressed low part.
1899   // dst = 00 00 00 88 55 44 22 11
1900   sve_orr(dst, dst, vtmp1);
1901 }
1902 
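// Reverse the order of bits within each element of size bt. RBIT reverses
// bits only within each byte, so for elements wider than a byte the bytes
// of each element are reversed first; e.g. for a 32-bit lane (illustrative):
//   0xAABBCCDD --rev32--> 0xDDCCBBAA --rbit--> full 32-bit bit reversal,
// since moving byte j to byte (3 - j) and reversing the bits inside each
// byte together map bit i to bit (31 - i).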
1903 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
1904   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
1905   SIMD_Arrangement size = isQ ? T16B : T8B;
1906   if (bt == T_BYTE) {
1907     rbit(dst, size, src);
1908   } else {
1909     neon_reverse_bytes(dst, src, bt, isQ);
1910     rbit(dst, size, dst);
1911   }
1912 }
1913 
1914 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
1915   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
1916   SIMD_Arrangement size = isQ ? T16B : T8B;
1917   switch (bt) {
1918     case T_BYTE:
1919       if (dst != src) {
1920         orr(dst, size, src, src);
1921       }
1922       break;
1923     case T_SHORT:
1924       rev16(dst, size, src);
1925       break;
1926     case T_INT:
1927       rev32(dst, size, src);
1928       break;
1929     case T_LONG:
1930       rev64(dst, size, src);
1931       break;
1932     default:
1933       assert(false, "unsupported");
1934       ShouldNotReachHere();
1935   }
1936 }
1937 
// Extract a scalar element from an SVE vector at position 'idx'.
1939 // The input elements in src are expected to be of integral type.
1940 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
1941                                              int idx, FloatRegister vtmp) {
1942   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
1943   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (regVariant_to_elemBits(size) * idx < 128) { // generate a lower-cost NEON instruction
1945     if (bt == T_INT || bt == T_LONG) {
1946       umov(dst, src, size, idx);
1947     } else {
1948       smov(dst, src, size, idx);
1949     }
1950   } else {
1951     sve_orr(vtmp, src, src);
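    // The EXT immediate is a byte offset: "size" encodes log2(element
    // bytes), so "idx << size" equals idx * element_size_in_bytes.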
1952     sve_ext(vtmp, vtmp, idx << size);
1953     if (bt == T_INT || bt == T_LONG) {
1954       umov(dst, vtmp, size, 0);
1955     } else {
1956       smov(dst, vtmp, size, 0);
1957     }
1958   }
1959 }
1960 
1961 // java.lang.Math::round intrinsics
1962 
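// Java's Math.round rounds ties towards positive infinity. For positive
// inputs, NaNs, and large-magnitude values that are already integral this
// agrees with "round half away from zero" (fcvtas/frinta); it differs only
// for the remaining negative lanes at exact ties (round(-2.5) == -2, not
// -3), so those lanes take a floor(src + 0.5) path instead. The unsigned
// compare of the negated bit pattern against the bits of 2^23 (float) or
// 2^52 (double) builds exactly that lane selection mask.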
1963 // Clobbers: rscratch1, rflags
1964 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
1965                                           FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
1966   assert_different_registers(tmp1, tmp2, tmp3, src, dst);
1967   switch (T) {
1968     case T2S:
1969     case T4S:
1970       fmovs(tmp1, T, 0.5f);
1971       mov(rscratch1, jint_cast(0x1.0p23f));
1972       break;
1973     case T2D:
1974       fmovd(tmp1, T, 0.5);
1975       mov(rscratch1, julong_cast(0x1.0p52));
1976       break;
1977     default:
1978       assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
1979   }
1980   fadd(tmp1, T, tmp1, src);
1981   fcvtms(tmp1, T, tmp1);
  // tmp1 = floor(src + 0.5) (the fadd rounds to nearest, ties to even)
1983 
1984   fcvtas(dst, T, src);
1985   // dst = round(src), ties to away
1986 
1987   fneg(tmp3, T, src);
1988   dup(tmp2, T, rscratch1);
1989   cmhs(tmp3, T, tmp3, tmp2);
  // tmp3 is now a per-lane mask: all ones in lanes where the fcvtas result is kept
1991 
1992   bif(dst, T16B, tmp1, tmp3);
1993   // result in dst
1994 }
1995 
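// Same selection strategy as vector_round_neon above: the lanes that need
// the floor(src + 0.5) correction are collected in pgtmp and recomputed
// with frintm; the branch merely skips that path when no lane needs it.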
1996 // Clobbers: rscratch1, rflags
1997 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
1998                                          FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
1999   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2000   assert_different_registers(tmp1, tmp2, src, dst);
2001 
2002   switch (T) {
2003     case S:
2004       mov(rscratch1, jint_cast(0x1.0p23f));
2005       break;
2006     case D:
2007       mov(rscratch1, julong_cast(0x1.0p52));
2008       break;
2009     default:
2010       assert(T == S || T == D, "invalid register variant");
2011   }
2012 
2013   sve_frinta(dst, T, ptrue, src);
2014   // dst = round(src), ties to away
2015 
2016   Label none;
2017 
2018   sve_fneg(tmp1, T, ptrue, src);
2019   sve_dup(tmp2, T, rscratch1);
2020   sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
2021   br(EQ, none);
2022   {
2023     sve_cpy(tmp1, T, pgtmp, 0.5);
2024     sve_fadd(tmp1, T, pgtmp, src);
2025     sve_frintm(dst, T, pgtmp, tmp1);
2026     // dst = floor(src + 0.5, ties to even)
2027   }
2028   bind(none);
2029 
2030   sve_fcvtzs(dst, T, ptrue, dst, T);
2031   // result in dst
2032 }
2033 
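// Vector signum: dst = (src > 0) ? 1.0 : ((src < 0) ? -1.0 : src).
// facgt/ushr build a per-lane mask that is 0x7FF..F for lanes with
// |src| > 0 and 0 otherwise, so +/-0.0 and NaN pass through unchanged;
// bsl then combines the magnitude bits of "one" with the sign bit of
// "src" in the selected lanes, yielding +/-1.0 with the sign of src.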
2034 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2035                                            FloatRegister one, SIMD_Arrangement T) {
2036   assert_different_registers(dst, src, zero, one);
2037   assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2038 
2039   facgt(dst, T, src, zero);
  ushr(dst, T, dst, 1); // dst = 0 for +/-0.0 and NaN, 0x7FF..F otherwise
2041   bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2042 }
2043 
2044 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2045                                           FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
  assert_different_registers(dst, src, zero, one, vtmp);
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");

  sve_orr(vtmp, src, src);
  sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp = 0 for +/-0.0 and NaN, 1 otherwise
  switch (T) {
    case S:
      sve_and(vtmp, T, min_jint);       // Extract the sign bit of the float value in every lane of src
      sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
                                        // on the sign of the float value
      break;
    case D:
      sve_and(vtmp, T, min_jlong);
      sve_orr(vtmp, T, jlong_cast(1.0));
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
                                     // Result in dst
2067 }
2068 
2069 bool C2_MacroAssembler::in_scratch_emit_size() {
2070   if (ciEnv::current()->task() != NULL) {
2071     PhaseOutput* phase_output = Compile::current()->output();
2072     if (phase_output != NULL && phase_output->in_scratch_emit_size()) {
2073       return true;
2074     }
2075   }
2076   return MacroAssembler::in_scratch_emit_size();
2077 }