/*
 * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/matcher.hpp"
#include "opto/output.hpp"
#include "opto/subnode.hpp"
#include "runtime/stubRoutines.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

void C2_MacroAssembler::entry_barrier() {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
    // Dummy labels for just measuring the code size
    Label dummy_slow_path;
    Label dummy_continuation;
    Label dummy_guard;
    Label* slow_path = &dummy_slow_path;
    Label* continuation = &dummy_continuation;
    Label* guard = &dummy_guard;
    if (!Compile::current()->output()->in_scratch_emit_size()) {
      // Use real labels from actual stub when not emitting code for the purpose of measuring its size
      C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
      Compile::current()->output()->add_stub(stub);
      slow_path = &stub->entry();
      continuation = &stub->continuation();
      guard = &stub->guard();
    }
    // In the C2 code, we move the non-hot part of nmethod entry barriers out-of-line to a stub.
    bs->nmethod_entry_barrier(this, slow_path, continuation, guard);
  }
}

// Search for str1 in str2 and return index or -1
// Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer-Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }
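
// Illustrative C sketch of the dispatch above (our notation, not generated
// code; m = cnt1 = pattern length, n = cnt2 = source length, labels as used
// in this method):
//   if (m < 8)                goto LINEARSEARCH;  // pattern too short for BM
//   if (m >= 256 || m >= n/4) goto LINEARSTUB;    // BM setup not worth it
//   /* otherwise 8 <= m < 256 and n > 4*m: fall through to Boyer-Moore */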

// The Boyer-Moore algorithm is based on the description here:-
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// This describes an algorithm with 2 shift rules: the 'Bad Character' rule
// and the 'Good Suffix' rule.
//
// These rules are essentially heuristics for how far we can shift the
// pattern along the search string.
//
// The implementation here uses the 'Bad Character' rule only because of the
// complexity of initialisation for the 'Good Suffix' rule.
//
// This is also known as the Boyer-Moore-Horspool algorithm:-
//
// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
//
// This particular implementation has a few Java-specific optimizations.
//
// #define ASIZE 256
//
//    int bm(unsigned char *x, int m, unsigned char *y, int n) {
//       int i, j;
//       unsigned c;
//       unsigned char bc[ASIZE];
//
//       /* Preprocessing */
//       for (i = 0; i < ASIZE; ++i)
//          bc[i] = m;
//       for (i = 0; i < m - 1; ) {
//          c = x[i];
//          ++i;
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef PATTERN_STRING_IS_LATIN1
//          bc[c] = m - i;
//          #else
//          if (c < ASIZE) bc[c] = m - i;
//          #endif
//       }
//
//       /* Searching */
//       j = 0;
//       while (j <= n - m) {
//          c = y[j+m-1];
//          if (x[m-1] == c) {
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//            if (i < 0) return j;
//          }
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef SOURCE_STRING_IS_LATIN1
//          // LL case: (c < 256) always true. Remove branch
//          j += bc[y[j+m-1]];
//          #endif
//          #ifdef SOURCE_AND_PATTERN_STRINGS_ARE_UTF
//          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += 1;
//          #endif
//          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
//          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += m;
//          #endif
//       }
//       return -1;
//    }

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >= 8, so we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed (8 LL or 4 UU symbols) and half a
    // register for the UL case. We'll re-read the last character in the inner
    // pre-loop code to have a single outer pre-loop load.
    const int firstStep = isL ? 7 : 3;

    const int ASIZE = 256;
    const int STORED_BYTES = 32; // amount of bytes stored per instruction
    sub(sp, sp, ASIZE);
    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
    mov(ch1, sp);
    BIND(BM_INIT_LOOP);
      stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
      subs(tmp5, tmp5, 1);
      br(GT, BM_INIT_LOOP);

      sub(cnt1tmp, cnt1, 1);
      mov(tmp5, str2);
      add(str2end, str2, result_tmp, LSL, str2_chr_shift);
      sub(ch2, cnt1, 1);
      mov(tmp3, str1);
    BIND(BCLOOP);
      (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
      if (!str1_isL) {
        subs(zr, ch1, ASIZE);
        br(HS, BCSKIP);
      }
      strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
      subs(ch2, ch2, 1);
      br(GT, BCLOOP);

      add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
      if (str1_isL == str2_isL) {
        // load last 8 bytes (8LL/4UU symbols)
        ldr(tmp6, Address(tmp6, -wordSize));
      } else {
        ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes (4 symbols)
        // convert Latin1 to UTF. We'll have to wait until load completed, but
        // it's still faster than per-character loads+checks
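        // Illustrative sketch of the widening below: the ldrw put the last 4
        // Latin1 chars b0..b3 (b3 == str1[N-1]) into tmp6 as
        // 0x00000000_b3b2b1b0; the shifts and orrs rebuild them as 4 UTF-16
        // chars:
        //   tmp6 = (u64)b3 << 48 | (u64)b2 << 32 | (u64)b1 << 16 | b0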
        lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
        ubfx(ch1, tmp6, 8, 8); // str1[N-3]
        ubfx(ch2, tmp6, 16, 8); // str1[N-2]
        andr(tmp6, tmp6, 0xFF); // str1[N-4]
        orr(ch2, ch1, ch2, LSL, 16);
        orr(tmp6, tmp6, tmp3, LSL, 48);
        orr(tmp6, tmp6, ch2, LSL, 16);
      }
    BIND(BMLOOPSTR2);
      (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
      sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
      if (str1_isL == str2_isL) {
        // re-init tmp3. It's for free because it's executed in parallel with
        // load above. Alternative is to initialize it before loop, but it'll
        // affect performance on in-order systems with 2 or more ld/st pipelines
        lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
      }
      if (!isL) { // UU/UL case
        lsl(ch2, cnt1tmp, 1); // offset in bytes
      }
      cmp(tmp3, skipch);
      br(NE, BMSKIP);
      ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
      mov(ch1, tmp6);
      if (isL) {
        b(BMLOOPSTR1_AFTER_LOAD);
      } else {
        sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
        b(BMLOOPSTR1_CMP);
      }
    BIND(BMLOOPSTR1);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    BIND(BMLOOPSTR1_AFTER_LOAD);
      subs(cnt1tmp, cnt1tmp, 1);
      br(LT, BMLOOPSTR1_LASTCMP);
    BIND(BMLOOPSTR1_CMP);
      cmp(ch1, ch2);
      br(EQ, BMLOOPSTR1);
    BIND(BMSKIP);
      if (!isL) {
        // if we've met UTF symbol while searching Latin1 pattern, then we can
        // skip cnt1 symbols
        if (str1_isL != str2_isL) {
          mov(result_tmp, cnt1);
        } else {
          mov(result_tmp, 1);
        }
        subs(zr, skipch, ASIZE);
        br(HS, BMADV);
      }
      ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
      sub(cnt1tmp, cnt1, 1);
      add(str2, str2, result_tmp, LSL, str2_chr_shift);
      cmp(str2, str2end);
      br(LE, BMLOOPSTR2);
      add(sp, sp, ASIZE);
      b(NOMATCH);
    BIND(BMLOOPSTR1_LASTCMP);
      cmp(ch1, ch2);
      br(NE, BMSKIP);
    BIND(BMMATCH);
      sub(result, str2, tmp5);
      if (!str2_isL) lsr(result, result, 1);
      add(sp, sp, ASIZE);
      b(DONE);

    BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
    br(LT, LINEAR_MEDIUM);
    mov(result, zr);
    RuntimeAddress stub = nullptr;
    if (isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
      assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
    } else if (str1_isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
    } else {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
      assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
    }
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);
  }

  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    Register str2tmp = tmp2;
    Register first = tmp3;

    if (icnt1 == -1)
    {
        Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

        cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
        br(LT, DOSHORT);
      BIND(LINEAR_MEDIUM);
        (this->*str1_load_1chr)(first, Address(str1));
        lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
        sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(FIRST_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmp(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        adds(cnt1tmp, cnt1_neg, str1_chr_size);
        add(cnt2tmp, cnt2_neg, str2_chr_size);
        br(GE, MATCH);

      BIND(STR1_NEXT);
        (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        adds(cnt1tmp, cnt1tmp, str1_chr_size);
        add(cnt2tmp, cnt2tmp, str2_chr_size);
        br(LT, STR1_NEXT);
        b(MATCH);

      BIND(DOSHORT);
      if (str1_isL == str2_isL) {
        cmp(cnt1, (u1)2);
        br(LT, DO1);
        br(GT, DO3);
      }
    }

    if (icnt1 == 4) {
      Label CH1_LOOP;

        (this->*load_4chr)(ch1, str1);
        sub(result_tmp, cnt2, 4);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(CH1_LOOP);
        (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
      Label CH1_LOOP;

      BIND(DO2);
        (this->*load_2chr)(ch1, str1);
        if (icnt1 == 2) {
          sub(result_tmp, cnt2, 2);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(CH1_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

      BIND(DO3);
        (this->*load_2chr)(first, str1);
        (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
        if (icnt1 == 3) {
          sub(result_tmp, cnt2, 3);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(FIRST_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmpw(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        b(MATCH);
    }

    if (icnt1 == -1 || icnt1 == 1) {
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;

      BIND(DO1);
        (this->*str1_load_1chr)(ch1, str1);
        cmp(cnt2, (u1)8);
        br(LT, DO1_SHORT);

        sub(result_tmp, cnt2, 8/str2_chr_size);
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
        mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

        if (str2_isL) {
          orr(ch1, ch1, ch1, LSL, 8);
        }
        orr(ch1, ch1, ch1, LSL, 16);
        orr(ch1, ch1, ch1, LSL, 32);
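        // The loop below uses the classic SWAR zero-lane test. C sketch of
        // the idea (illustrative only), with ch1 holding the broadcast char:
        //   v   = chunk ^ ch1;                      // matching lane becomes 0
        //   hit = (v - 0x0101..) & ~(v | 0x7f7f..); // 0x0001../0x7fff.. for halfwords
        // 'hit' has the top bit of a lane set iff that lane is zero, i.e. the
        // char matched; rev + clz then locate the first such lane.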
      BIND(CH1_LOOP);
        ldr(ch2, Address(str2, cnt2_neg));
        eor(ch2, ch1, ch2);
        sub(tmp1, ch2, tmp3);
        orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
        bics(tmp1, tmp1, tmp2);
        br(NE, HAS_ZERO);
        adds(cnt2_neg, cnt2_neg, 8);
        br(LT, CH1_LOOP);

        cmp(cnt2_neg, (u1)8);
        mov(cnt2_neg, 0);
        br(LT, CH1_LOOP);
        b(NOMATCH);

      BIND(HAS_ZERO);
        rev(tmp1, tmp1);
        clz(tmp1, tmp1);
        add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
        b(MATCH);

      BIND(DO1_SHORT);
        mov(result_tmp, cnt2);
        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
      BIND(DO1_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmpw(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);

void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7fff7fff7fff7fff);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1, Address::uxtw(1)));
    sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
    ldrh(ch1, Address(str1, cnt1_neg));
    cmpw(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 2);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}

void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
                                                Register ch, Register result,
                                                FloatRegister ztmp1,
                                                FloatRegister ztmp2,
                                                PRegister tmp_pg,
                                                PRegister tmp_pdn, bool isL)
{
  // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
  assert(tmp_pg->is_governing(),
         "this register has to be a governing predicate register");

  Label LOOP, MATCH, DONE, NOMATCH;
  Register vec_len = rscratch1;
  Register idx = rscratch2;

  SIMD_RegVariant T = isL ? B : H;

  cbz(cnt1, NOMATCH);

  // Assign the particular char throughout the vector.
  sve_dup(ztmp2, T, ch);
  if (isL) {
    sve_cntb(vec_len);
  } else {
    sve_cnth(vec_len);
  }
  mov(idx, 0);

  // Generate a predicate to control the reading of input string.
  sve_whilelt(tmp_pg, T, idx, cnt1);

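  // Scalar sketch of the loop below (illustrative; vec_len = lanes per
  // iteration, and whilelt keeps the tail iteration partial):
  //   for (idx = 0; idx < cnt1; idx += vec_len)
  //     if (any char in str1[idx .. min(idx + vec_len, cnt1)) == ch) goto MATCH;
  //   goto NOMATCH;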
  BIND(LOOP);
    // Read a vector of 8- or 16-bit data depending on the string type. Note
    // that inactive elements indicated by the predicate register won't cause
    // a data read from memory to the destination vector.
    if (isL) {
      sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
    } else {
      sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
    }
    add(idx, idx, vec_len);

    // Perform the comparison. An element of the destination predicate is set
    // to active if the particular char is matched.
    sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);

    // Branch if the particular char is found.
    br(NE, MATCH);

    sve_whilelt(tmp_pg, T, idx, cnt1);

    // Loop back if the particular char is not found.
    br(MI, LOOP);

  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);

  BIND(MATCH);
    // Undo the index increment.
    sub(idx, idx, vec_len);

    // Crop the vector to find its location.
    sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
    add(result, idx, -1);
    sve_incp(result, T, tmp_pdn);
  BIND(DONE);
}

void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
                                             Register ch, Register result,
                                             Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)8);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 8);
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 8);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);

  mov(tmp3, 0x0101010101010101);

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1));
    sub(cnt1_neg, zr, cnt1);
  BIND(DO1_LOOP);
    ldrb(ch1, Address(str1, cnt1_neg));
    cmp(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 1);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg);
  BIND(DONE);
}

// Compare strings.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, but the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      ldr(tmp1, Address(str1));
      ldrs(vtmp, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFF);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
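    // Sketch of the idea (illustrative): loads are little-endian, so rev + clz
    // yield the bit index of the lowest differing byte; andr rounds that down
    // to a character boundary, and the lsrv shifts bring the first differing
    // character of each operand into the low bits before zero-extension.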
    bind(DIFF);
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
    RuntimeAddress stub = nullptr;
    switch(ae) {
      case StrIntrinsicNode::LL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
        break;
      case StrIntrinsicNode::UU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
        break;
      case StrIntrinsicNode::LU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
        break;
      case StrIntrinsicNode::UL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
        break;
      default:
        ShouldNotReachHere();
    }
    assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // Arrange the code to do most branches while loading, and to load the next
  // characters while comparing the previous ones.
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}

void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
                                     FloatRegister src2, Condition cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
  FloatRegister zn = src1, zm = src2;
  bool needs_negation = false;
  switch (cond) {
    case LT: cond = GT; zn = src2; zm = src1; break;
    case LE: cond = GE; zn = src2; zm = src1; break;
    case LO: cond = HI; zn = src2; zm = src1; break;
    case LS: cond = HS; zn = src2; zm = src1; break;
    case NE: cond = EQ; needs_negation = true; break;
    default:
      break;
  }

  if (is_floating_point_type(bt)) {
    fcm(cond, dst, size, zn, zm);
  } else {
    cm(cond, dst, size, zn, zm);
  }

  if (needs_negation) {
    notr(dst, isQ ? T16B : T8B, dst);
  }
}

void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
                                          Condition cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
  if (bt == T_FLOAT || bt == T_DOUBLE) {
    if (cond == Assembler::NE) {
      fcm(Assembler::EQ, dst, size, src);
      notr(dst, isQ ? T16B : T8B, dst);
    } else {
      fcm(cond, dst, size, src);
    }
  } else {
    if (cond == Assembler::NE) {
      cm(Assembler::EQ, dst, size, src);
      notr(dst, isQ ? T16B : T8B, dst);
    } else {
      cm(cond, dst, size, src);
    }
  }
}

// Compress the least significant bit of each byte to the rightmost and clear
// the higher garbage bits.
void C2_MacroAssembler::bytemask_compress(Register dst) {
  // Example input, dst = 0x01 00 00 00 01 01 00 01
  // The "??" bytes are garbage.
  orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
  orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
  orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
  andr(dst, dst, 0xff);                   // dst = 0x8D
}
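
// Equivalent C sketch of the fold above (illustrative only): each input byte
// is 0x00 or 0x01, and three shift-or steps gather the 8 byte-LSBs into the
// low byte, with the higher garbage cleared by the final mask.
//   uint64_t bytemask_compress_ref(uint64_t x) {
//     x |= x >> 7;    // combine adjacent byte bits pairwise
//     x |= x >> 14;   // combine the pairs
//     x |= x >> 28;   // combine the two halves
//     return x & 0xff;
//   }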

// Pack the lowest-numbered bit of each mask element in src into a long value
// in dst, at most the first 64 lane elements.
// Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM.
void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
                                         FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
  assert_different_registers(dst, rscratch1);
  assert_different_registers(vtmp1, vtmp2);

  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example:   src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
  // Expected:  dst = 0x658D

  // Convert the mask into vector with sequential bytes.
  // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
  sve_cpy(vtmp1, size, src, 1, false);
  if (bt != T_BYTE) {
    sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
  }

  if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
    // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
    // is to compress each significant bit of the byte in a cross-lane way. Due
    // to the lack of a cross-lane bit-compress instruction, we use BEXT
    // (bit-compress in each lane) with the biggest lane size (T = D) then
    // concatenate the results.

    // The second source input of BEXT, initialized with 0x01 in each byte.
    // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
    sve_dup(vtmp2, B, 1);

    // BEXT vtmp1.D, vtmp1.D, vtmp2.D
    // vtmp1 = 0x0001010000010001 | 0x0100000001010001
    // vtmp2 = 0x0101010101010101 | 0x0101010101010101
    //         ---------------------------------------
    // vtmp1 = 0x0000000000000065 | 0x000000000000008D
    sve_bext(vtmp1, D, vtmp1, vtmp2);

    // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the
    // result to dst.
    // vtmp1 = 0x0000000000000000 | 0x000000000000658D
    // dst   = 0x658D
    if (lane_cnt <= 8) {
      // No need to concatenate.
      umov(dst, vtmp1, B, 0);
    } else if (lane_cnt <= 16) {
      ins(vtmp1, B, vtmp1, 1, 8);
      umov(dst, vtmp1, H, 0);
    } else {
      // As the lane count is 64 at most, the final expected value must be in
      // the lowest 64 bits after narrowing vtmp1 from D to B.
      sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
      umov(dst, vtmp1, D, 0);
    }
  } else if (UseSVE > 0) {
    // Compress the lowest 8 bytes.
    fmovd(dst, vtmp1);
    bytemask_compress(dst);
    if (lane_cnt <= 8) return;

    // Repeat on higher bytes and join the results.
    // Compress 8 bytes in each iteration.
    for (int idx = 1; idx < (lane_cnt / 8); idx++) {
      sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
      bytemask_compress(rscratch1);
      orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
    }
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Unpack the mask, a long value in src, into predicate register dst based on the
// corresponding data type. Note that dst can support at most 64 lanes.
// Below example gives the expected dst predicate register in different types, with
// a valid src (0x658D) on a 1024-bit vector size machine.
// BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
// SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
// INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
// LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
//
// The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D which
// has 24 significant bits would be an invalid input if dst predicate register refers to
// a LONG type 1024-bit vector, which has at most 16 lanes.
void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
                                           FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
         lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example:   src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
  // Expected:  dst = 0b01100101 10001101

  // Put long value from general purpose register into the first lane of vector.
  // vtmp1 = 0x0000000000000000 | 0x000000000000658D
  sve_dup(vtmp1, B, 0);
  mov(vtmp1, D, 0, src);

  // Since sve_cmp generates the mask with a minimum granularity of one byte,
  // we need to transform the bitwise mask in the first lane into a bytewise
  // mask, which can be done with SVE2's BDEP instruction.

  // The first source input of the BDEP instruction. Deposit each byte into
  // every 8 bytes.
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  if (lane_cnt <= 8) {
    // Nothing to do, as only one byte exists.
  } else if (lane_cnt <= 16) {
    ins(vtmp1, B, vtmp1, 8, 1);
    mov(vtmp1, B, 1, zr);
  } else {
    sve_vector_extend(vtmp1, D, vtmp1, B);
  }

  // The second source input of BDEP instruction, initialized with 0x01 for each byte.
  // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
  sve_dup(vtmp2, B, 1);

  // BDEP vtmp1.D, vtmp1.D, vtmp2.D
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  // vtmp2 = 0x0101010101010101 | 0x0101010101010101
  //         ---------------------------------------
  // vtmp1 = 0x0001010000010001 | 0x0100000001010001
  sve_bdep(vtmp1, D, vtmp1, vtmp2);

  if (bt != T_BYTE) {
    sve_vector_extend(vtmp1, size, vtmp1, B);
  }
  // Generate mask according to the given vector, in which the elements have been
  // extended to expected type.
  // dst = 0b01100101 10001101
  sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
}

// Clobbers: rflags
void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
                                    FloatRegister zn, FloatRegister zm, Condition cond) {
  assert(pg->is_governing(), "This register has to be a governing predicate register");
  FloatRegister z1 = zn, z2 = zm;
  switch (cond) {
    case LE: z1 = zm; z2 = zn; cond = GE; break;
    case LT: z1 = zm; z2 = zn; cond = GT; break;
    case LO: z1 = zm; z2 = zn; cond = HI; break;
    case LS: z1 = zm; z2 = zn; cond = HS; break;
    default:
      break;
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (is_floating_point_type(bt)) {
    sve_fcm(cond, pd, size, pg, z1, z2);
  } else {
    assert(is_integral_type(bt), "unsupported element type");
    sve_cmp(cond, pd, size, pg, z1, z2);
  }
}

// Get index of the last mask lane that is set
void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
  SIMD_RegVariant size = elemType_to_regVariant(bt);
  sve_rev(ptmp, size, src);
  sve_brkb(ptmp, ptrue, ptmp, false);
  sve_cntp(dst, size, ptrue, ptmp);
  movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
  subw(dst, rscratch1, dst);
}
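
// Sketch of the idea above (illustrative): reversing the predicate turns
// "last set lane" into "first set lane"; brkb then keeps exactly the lanes
// before that first set lane, so cntp counts how many lanes follow the last
// set lane in the original, giving lasttrue = (lane_cnt - 1) - cntp_result.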

// Extend integer vector src to dst with the same lane count
// but larger element size, e.g. 4B -> 4I
void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
                                           FloatRegister src, BasicType src_bt) {
  if (src_bt == T_BYTE) {
    if (dst_bt == T_SHORT) {
      // 4B/8B to 4S/8S
      assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
      sxtl(dst, T8H, src, T8B);
    } else {
      // 4B to 4I
      assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
      sxtl(dst, T8H, src, T8B);
      sxtl(dst, T4S, dst, T4H);
    }
  } else if (src_bt == T_SHORT) {
    // 4S to 4I
    assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
    sxtl(dst, T4S, src, T4H);
  } else if (src_bt == T_INT) {
    // 2I to 2L
    assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
    sxtl(dst, T2D, src, T2S);
  } else {
    ShouldNotReachHere();
  }
}

// Narrow integer vector src down to dst with the same lane count
// but smaller element size, e.g. 4I -> 4B
void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
                                           FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
  if (src_bt == T_SHORT) {
    // 4S/8S to 4B/8B
    assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_BYTE, "unsupported");
    xtn(dst, T8B, src, T8H);
  } else if (src_bt == T_INT) {
    // 4I to 4B/4S
    assert(src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
    xtn(dst, T4H, src, T4S);
    if (dst_bt == T_BYTE) {
      xtn(dst, T8B, dst, T8H);
    }
  } else if (src_bt == T_LONG) {
    // 2L to 2I
    assert(src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_INT, "unsupported");
    xtn(dst, T2S, src, T2D);
  } else {
    ShouldNotReachHere();
  }
}

void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
                                          FloatRegister src, SIMD_RegVariant src_size) {
  assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
  if (src_size == B) {
    switch (dst_size) {
    case H:
      sve_sunpklo(dst, H, src);
      break;
    case S:
      sve_sunpklo(dst, H, src);
      sve_sunpklo(dst, S, dst);
      break;
    case D:
      sve_sunpklo(dst, H, src);
      sve_sunpklo(dst, S, dst);
      sve_sunpklo(dst, D, dst);
      break;
    default:
      ShouldNotReachHere();
    }
  } else if (src_size == H) {
    if (dst_size == S) {
      sve_sunpklo(dst, S, src);
    } else { // D
      sve_sunpklo(dst, S, src);
      sve_sunpklo(dst, D, dst);
    }
  } else if (src_size == S) {
    sve_sunpklo(dst, D, src);
  }
}

// Vector narrow from src to dst with specified element sizes.
// High part of dst vector will be filled with zero.
void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
                                          FloatRegister src, SIMD_RegVariant src_size,
                                          FloatRegister tmp) {
  assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
  assert_different_registers(src, tmp);
  sve_dup(tmp, src_size, 0);
  if (src_size == D) {
    switch (dst_size) {
    case S:
      sve_uzp1(dst, S, src, tmp);
      break;
    case H:
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, S, src, tmp);
      sve_uzp1(dst, H, dst, tmp);
      break;
    case B:
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, S, src, tmp);
      sve_uzp1(dst, H, dst, tmp);
      sve_uzp1(dst, B, dst, tmp);
      break;
    default:
      ShouldNotReachHere();
    }
  } else if (src_size == S) {
    if (dst_size == H) {
      sve_uzp1(dst, H, src, tmp);
    } else { // B
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, H, src, tmp);
      sve_uzp1(dst, B, dst, tmp);
    }
  } else if (src_size == H) {
    sve_uzp1(dst, B, src, tmp);
  }
}

// Extend src predicate to dst predicate with the same lane count but larger
// element size, e.g. 64Byte -> 512Long
void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
                                             uint dst_element_length_in_bytes,
                                             uint src_element_length_in_bytes) {
  if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
    sve_punpklo(dst, src);
  } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
    sve_punpklo(dst, src);
    sve_punpklo(dst, dst);
  } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
    sve_punpklo(dst, src);
    sve_punpklo(dst, dst);
    sve_punpklo(dst, dst);
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Narrow src predicate to dst predicate with the same lane count but
// smaller element size, e.g. 512Long -> 64Byte
void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
                                             uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
  // The insignificant bits in src predicate are expected to be zero.
  // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
  // passed as the second argument. An example narrowing operation with a given mask would be -
  // 128Long -> 64Int on a 128-bit machine, i.e. 2L -> 2I
1322   // Mask (for 2 Longs) : TF
1323   // Predicate register for the above mask (16 bits) : 00000001 00000000
1324   // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
1325   // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0)
1326   assert_different_registers(src, ptmp);
1327   assert_different_registers(dst, ptmp);
1328   sve_pfalse(ptmp);
1329   if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1330     sve_uzp1(dst, B, src, ptmp);
1331   } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1332     sve_uzp1(dst, H, src, ptmp);
1333     sve_uzp1(dst, B, dst, ptmp);
1334   } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1335     sve_uzp1(dst, S, src, ptmp);
1336     sve_uzp1(dst, H, dst, ptmp);
1337     sve_uzp1(dst, B, dst, ptmp);
1338   } else {
1339     assert(false, "unsupported");
1340     ShouldNotReachHere();
1341   }
1342 }
1343 
1344 // Vector reduction add for integral type with ASIMD instructions.
1345 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1346                                                  Register isrc, FloatRegister vsrc,
1347                                                  unsigned vector_length_in_bytes,
1348                                                  FloatRegister vtmp) {
1349   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1350   assert_different_registers(dst, isrc);
1351   bool isQ = vector_length_in_bytes == 16;
1352 
1353   BLOCK_COMMENT("neon_reduce_add_integral {");
1354     switch(bt) {
1355       case T_BYTE:
1356         addv(vtmp, isQ ? T16B : T8B, vsrc);
1357         smov(dst, vtmp, B, 0);
1358         addw(dst, dst, isrc, ext::sxtb);
1359         break;
1360       case T_SHORT:
1361         addv(vtmp, isQ ? T8H : T4H, vsrc);
1362         smov(dst, vtmp, H, 0);
1363         addw(dst, dst, isrc, ext::sxth);
1364         break;
1365       case T_INT:
1366         isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1367         umov(dst, vtmp, S, 0);
1368         addw(dst, dst, isrc);
1369         break;
1370       case T_LONG:
1371         assert(isQ, "unsupported");
1372         addpd(vtmp, vsrc);
1373         umov(dst, vtmp, D, 0);
1374         add(dst, dst, isrc);
1375         break;
1376       default:
1377         assert(false, "unsupported");
1378         ShouldNotReachHere();
1379     }
1380   BLOCK_COMMENT("} neon_reduce_add_integral");
1381 }
1382 
1383 // Vector reduction multiply for integral type with ASIMD instructions.
1384 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
1385 // Clobbers: rscratch1
1386 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
1387                                                  Register isrc, FloatRegister vsrc,
1388                                                  unsigned vector_length_in_bytes,
1389                                                  FloatRegister vtmp1, FloatRegister vtmp2) {
1390   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1391   bool isQ = vector_length_in_bytes == 16;
1392 
1393   BLOCK_COMMENT("neon_reduce_mul_integral {");
1394     switch(bt) {
1395       case T_BYTE:
1396         if (isQ) {
1397           // Multiply the lower half and higher half of vector iteratively.
1398           // vtmp1 = vsrc[8:15]
1399           ins(vtmp1, D, vsrc, 0, 1);
1400           // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
1401           mulv(vtmp1, T8B, vtmp1, vsrc);
1402           // vtmp2 = vtmp1[4:7]
1403           ins(vtmp2, S, vtmp1, 0, 1);
1404           // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
1405           mulv(vtmp1, T8B, vtmp2, vtmp1);
1406         } else {
1407           ins(vtmp1, S, vsrc, 0, 1);
          mulv(vtmp1, T8B, vtmp1, vsrc);
        }
        // vtmp2 = vtmp1[2:3]
        ins(vtmp2, H, vtmp1, 0, 1);
        // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
        mulv(vtmp2, T8B, vtmp2, vtmp1);
        // dst = vtmp2[0] * isrc * vtmp2[1]
        umov(rscratch1, vtmp2, B, 0);
        mulw(dst, rscratch1, isrc);
        sxtb(dst, dst);
        umov(rscratch1, vtmp2, B, 1);
        mulw(dst, rscratch1, dst);
        sxtb(dst, dst);
        break;
      case T_SHORT:
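        // Same iterative halving as the T_BYTE case, applied to H-sized lanes.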
        if (isQ) {
          ins(vtmp2, D, vsrc, 0, 1);
          mulv(vtmp2, T4H, vtmp2, vsrc);
          ins(vtmp1, S, vtmp2, 0, 1);
          mulv(vtmp1, T4H, vtmp1, vtmp2);
        } else {
          ins(vtmp1, S, vsrc, 0, 1);
          mulv(vtmp1, T4H, vtmp1, vsrc);
        }
        umov(rscratch1, vtmp1, H, 0);
        mulw(dst, rscratch1, isrc);
        sxth(dst, dst);
        umov(rscratch1, vtmp1, H, 1);
        mulw(dst, rscratch1, dst);
        sxth(dst, dst);
        break;
      case T_INT:
        if (isQ) {
          ins(vtmp1, D, vsrc, 0, 1);
          mulv(vtmp1, T2S, vtmp1, vsrc);
        } else {
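          // A 64-bit vector has only two S lanes, which can be read directly
          // from vsrc.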
          vtmp1 = vsrc;
        }
        umov(rscratch1, vtmp1, S, 0);
        mul(dst, rscratch1, isrc);
        umov(rscratch1, vtmp1, S, 1);
        mul(dst, rscratch1, dst);
        break;
      case T_LONG:
        umov(rscratch1, vsrc, D, 0);
        mul(dst, isrc, rscratch1);
        umov(rscratch1, vsrc, D, 1);
        mul(dst, dst, rscratch1);
        break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  BLOCK_COMMENT("} neon_reduce_mul_integral");
}

// Vector reduction multiply for floating-point type with ASIMD instructions.
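// The scalar fmuls/fmuld read lane 0 of their vector operand, so each remaining
// lane is moved down to lane 0 of vtmp and multiplied into dst in turn, starting
// from the scalar input fsrc.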
void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
                                           FloatRegister fsrc, FloatRegister vsrc,
                                           unsigned vector_length_in_bytes,
                                           FloatRegister vtmp) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_mul_fp {");
    switch (bt) {
      case T_FLOAT:
        fmuls(dst, fsrc, vsrc);
        ins(vtmp, S, vsrc, 0, 1);
        fmuls(dst, dst, vtmp);
        if (isQ) {
          ins(vtmp, S, vsrc, 0, 2);
          fmuls(dst, dst, vtmp);
          ins(vtmp, S, vsrc, 0, 3);
          fmuls(dst, dst, vtmp);
        }
        break;
      case T_DOUBLE:
        assert(isQ, "unsupported");
        fmuld(dst, fsrc, vsrc);
        ins(vtmp, D, vsrc, 0, 1);
        fmuld(dst, dst, vtmp);
        break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  BLOCK_COMMENT("} neon_reduce_mul_fp");
}

// Helper to select the scalar logical instruction for a logical reduction opcode.
void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
                                                   Register Rn, Register Rm,
                                                   enum shift_kind kind, unsigned shift) {
  switch (opc) {
    case Op_AndReductionV:
      is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
      break;
    case Op_OrReductionV:
      is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
      break;
    case Op_XorReductionV:
      is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
}

// Vector reduction logical operations And, Or, Xor
// Clobbers: rscratch1
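// Both halves of the vector are moved to general-purpose registers and folded
// with a scalar logical instruction; the result is then repeatedly folded with
// a shifted copy of itself, halving the live width, before the scalar input
// isrc is merged in.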
void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
                                            Register isrc, FloatRegister vsrc,
                                            unsigned vector_length_in_bytes) {
  assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
         "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_logical {");
    umov(rscratch1, vsrc, isQ ? D : S, 0);
    umov(dst, vsrc, isQ ? D : S, 1);
    neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
    switch (bt) {
      case T_BYTE:
        if (isQ) {
          neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
        }
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
        sxtb(dst, dst);
        break;
      case T_SHORT:
        if (isQ) {
          neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
        }
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
        sxth(dst, dst);
        break;
      case T_INT:
        if (isQ) {
          neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
        }
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
        break;
      case T_LONG:
        assert(isQ, "unsupported");
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
        break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  BLOCK_COMMENT("} neon_reduce_logical");
}

// Vector reduction min/max for integral type with ASIMD instructions.
// Note: vtmp is not used and is expected to be fnoreg for the T_LONG case.
// Clobbers: rscratch1, rflags
void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
                                                    Register isrc, FloatRegister vsrc,
                                                    unsigned vector_length_in_bytes,
                                                    FloatRegister vtmp) {
  assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;
  bool is_min = opc == Op_MinReductionV;

  BLOCK_COMMENT("neon_reduce_minmax_integral {");
    if (bt == T_LONG) {
      assert(vtmp == fnoreg, "should be");
      assert(isQ, "should be");
      umov(rscratch1, vsrc, D, 0);
      cmp(isrc, rscratch1);
      csel(dst, isrc, rscratch1, is_min ? LT : GT);
      umov(rscratch1, vsrc, D, 1);
      cmp(dst, rscratch1);
      csel(dst, dst, rscratch1, is_min ? LT : GT);
    } else {
      SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
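      // SMINV/SMAXV do not accept the 2S arrangement, so a 64-bit int vector is
      // folded with a pairwise SMINP/SMAXP instead.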
      if (size == T2S) {
        is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
      } else {
        is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
      }
      if (bt == T_INT) {
        umov(dst, vtmp, S, 0);
      } else {
        smov(dst, vtmp, elemType_to_regVariant(bt), 0);
      }
      cmpw(dst, isrc);
      cselw(dst, dst, isrc, is_min ? LT : GT);
    }
  BLOCK_COMMENT("} neon_reduce_minmax_integral");
}

// Vector reduction for integral type with SVE instructions.
// Supported operations are Add, And, Or, Xor, Max, Min.
// rflags is clobbered when opc is Op_MaxReductionV or Op_MinReductionV.
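// Sub-word results are read back with smov so that the following 32-bit scalar
// instruction sees a correctly sign-extended value; word and doubleword results
// use umov.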
void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
                                            FloatRegister src2, PRegister pg, FloatRegister tmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  assert(pg->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(src1, dst);
  // Registers "dst" and "tmp" are clobbered; "src1" and "src2" are preserved.
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  switch (opc) {
    case Op_AddReductionVI: {
      sve_uaddv(tmp, size, pg, src2);
      if (bt == T_BYTE) {
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxtb);
      } else if (bt == T_SHORT) {
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxth);
      } else {
        umov(dst, tmp, size, 0);
        addw(dst, dst, src1);
      }
      break;
    }
    case Op_AddReductionVL: {
      sve_uaddv(tmp, size, pg, src2);
      umov(dst, tmp, size, 0);
      add(dst, dst, src1);
      break;
    }
    case Op_AndReductionV: {
      sve_andv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        andr(dst, dst, src1);
      } else {
        andw(dst, dst, src1);
      }
      break;
    }
    case Op_OrReductionV: {
      sve_orv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        orr(dst, dst, src1);
      } else {
        orrw(dst, dst, src1);
      }
      break;
    }
    case Op_XorReductionV: {
      sve_eorv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        eor(dst, dst, src1);
      } else {
        eorw(dst, dst, src1);
      }
      break;
    }
    case Op_MaxReductionV: {
      sve_smaxv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        cmp(dst, src1);
        csel(dst, dst, src1, Assembler::GT);
      } else {
        cmpw(dst, src1);
        cselw(dst, dst, src1, Assembler::GT);
      }
      break;
    }
    case Op_MinReductionV: {
      sve_sminv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        cmp(dst, src1);
        csel(dst, dst, src1, Assembler::LT);
      } else {
        cmpw(dst, src1);
        cselw(dst, dst, src1, Assembler::LT);
      }
      break;
    }
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }

  if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
    if (bt == T_BYTE) {
      sxtb(dst, dst);
    } else if (bt == T_SHORT) {
      sxth(dst, dst);
    }
  }
}

// Set elements of the dst predicate to true for lanes in the range [0, lane_cnt),
// and to false otherwise. The input "lane_cnt" must not exceed the supported max
// vector length of the basic type. Clobbers: rscratch1 and rFlagsReg.
void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
  uint32_t max_vector_length = Matcher::max_vector_size(bt);
  assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");

  // Set all elements to false if the input "lane_cnt" is zero.
  if (lane_cnt == 0) {
    sve_pfalse(dst);
    return;
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  assert(size != Q, "invalid size");
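  // SVE predicate element sizes are limited to B, H, S and D.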

  // Set all true if "lane_cnt" equals the max lane count.
  if (lane_cnt == max_vector_length) {
    sve_ptrue(dst, size, /* ALL */ 0b11111);
    return;
  }

  // Fixed numbers for "ptrue".
  switch (lane_cnt) {
  case 1: /* VL1 */
  case 2: /* VL2 */
  case 3: /* VL3 */
  case 4: /* VL4 */
  case 5: /* VL5 */
  case 6: /* VL6 */
  case 7: /* VL7 */
  case 8: /* VL8 */
    sve_ptrue(dst, size, lane_cnt);
    return;
  case 16:
    sve_ptrue(dst, size, /* VL16 */ 0b01001);
    return;
  case 32:
    sve_ptrue(dst, size, /* VL32 */ 0b01010);
    return;
  case 64:
    sve_ptrue(dst, size, /* VL64 */ 0b01011);
    return;
  case 128:
    sve_ptrue(dst, size, /* VL128 */ 0b01100);
    return;
  case 256:
    sve_ptrue(dst, size, /* VL256 */ 0b01101);
    return;
  default:
    break;
  }

  // Special patterns for "ptrue".
  if (lane_cnt == round_down_power_of_2(max_vector_length)) {
    sve_ptrue(dst, size, /* POW2 */ 0b00000);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
    sve_ptrue(dst, size, /* MUL4 */ 0b11101);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
    sve_ptrue(dst, size, /* MUL3 */ 0b11110);
  } else {
    // Encode to "whileltw" for the remaining cases.
    mov(rscratch1, lane_cnt);
    sve_whileltw(dst, size, zr, rscratch1);
  }
}

// Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
// Any remaining elements of dst will be filled with zero.
// Clobbers: rscratch1
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
                                           FloatRegister vtmp1, FloatRegister vtmp2,
                                           PRegister pgtmp) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2);
  assert_different_registers(mask, pgtmp);

  // Example input:   src   = 8888 7777 6666 5555 4444 3333 2222 1111
  //                  mask  = 0001 0000 0000 0001 0001 0000 0001 0001
  // Expected result: dst   = 0000 0000 0000 8888 5555 4444 2222 1111
  sve_dup(vtmp2, H, 0);

  // Extend the lowest half to type INT.
  // dst = 00004444 00003333 00002222 00001111
  sve_uunpklo(dst, S, src);
  // pgtmp = 00000001 00000000 00000001 00000001
  sve_punpklo(pgtmp, mask);
  // Pack the active elements, as INT-sized lanes, to the right,
  // and fill the remaining elements with zero.
  // dst = 00000000 00004444 00002222 00001111
  sve_compact(dst, S, dst, pgtmp);
  // Narrow the result back to type SHORT.
  // dst = 0000 0000 0000 0000 0000 4444 2222 1111
  sve_uzp1(dst, H, dst, vtmp2);
  // Count the active elements of the lowest half.
  // rscratch1 = 3
  sve_cntp(rscratch1, S, ptrue, pgtmp);

  // Repeat for the highest half.
  // pgtmp = 00000001 00000000 00000000 00000001
  sve_punpkhi(pgtmp, mask);
  // vtmp1 = 00008888 00007777 00006666 00005555
  sve_uunpkhi(vtmp1, S, src);
  // vtmp1 = 00000000 00000000 00008888 00005555
  sve_compact(vtmp1, S, vtmp1, pgtmp);
  // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  sve_uzp1(vtmp1, H, vtmp1, vtmp2);

  // Compressed low:   dst   = 0000 0000 0000 0000 0000 4444 2222 1111
  // Compressed high:  vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  // Shift the compressed high part left (across lanes) by TRUE_CNT lanes, where
  // TRUE_CNT is the number of active elements in the compressed low part.
  neg(rscratch1, rscratch1);
  // vtmp2 = {4 3 2 1 0 -1 -2 -3}
  sve_index(vtmp2, H, rscratch1, 1);
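  // Negative indices are out of range for sve_tbl and yield zero, so zeros are
  // shifted into the low TRUE_CNT lanes.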
  // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
  sve_tbl(vtmp1, H, vtmp1, vtmp2);

  // Combine the shifted compressed high part with the compressed low part.
  // dst = 0000 0000 0000 8888 5555 4444 2222 1111
  sve_orr(dst, dst, vtmp1);
}

// Clobbers: rscratch1, rscratch2
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
                                          FloatRegister vtmp1, FloatRegister vtmp2,
                                          FloatRegister vtmp3, FloatRegister vtmp4,
                                          PRegister ptmp, PRegister pgtmp) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
  assert_different_registers(mask, ptmp, pgtmp);
  // Example input:   src   = 88 77 66 55 44 33 22 11
  //                  mask  = 01 00 00 01 01 00 01 01
  // Expected result: dst   = 00 00 00 88 55 44 22 11

  sve_dup(vtmp4, B, 0);
  // Extend the lowest half to type SHORT.
  // vtmp1 = 0044 0033 0022 0011
  sve_uunpklo(vtmp1, H, src);
  // ptmp = 0001 0000 0001 0001
  sve_punpklo(ptmp, mask);
  // Count the active elements of the lowest half.
  // rscratch2 = 3
  sve_cntp(rscratch2, H, ptrue, ptmp);
  // Pack the active elements, as SHORT-sized lanes, to the right,
  // and fill the remaining elements with zero.
  // dst = 0000 0044 0022 0011
  sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
  // Narrow the result back to type BYTE.
  // dst = 00 00 00 00 00 44 22 11
  sve_uzp1(dst, B, dst, vtmp4);

  // Repeat for the highest half.
  // ptmp = 0001 0000 0000 0001
  sve_punpkhi(ptmp, mask);
  // vtmp2 = 0088 0077 0066 0055
  sve_uunpkhi(vtmp2, H, src);
  // vtmp1 = 0000 0000 0088 0055
  sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);

  sve_dup(vtmp4, B, 0);
  // vtmp1 = 00 00 00 00 00 00 88 55
  sve_uzp1(vtmp1, B, vtmp1, vtmp4);

  // Compressed low:   dst   = 00 00 00 00 00 44 22 11
  // Compressed high:  vtmp1 = 00 00 00 00 00 00 88 55
  // Shift the compressed high part left (across lanes) by TRUE_CNT lanes, where
  // TRUE_CNT is the number of active elements in the compressed low part.
  neg(rscratch2, rscratch2);
  // vtmp2 = {4 3 2 1 0 -1 -2 -3}
  sve_index(vtmp2, B, rscratch2, 1);
  // vtmp1 = 00 00 00 88 55 00 00 00
  sve_tbl(vtmp1, B, vtmp1, vtmp2);
  // Combine the shifted compressed high part with the compressed low part.
  // dst = 00 00 00 88 55 44 22 11
  sve_orr(dst, dst, vtmp1);
}

void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
  SIMD_Arrangement size = isQ ? T16B : T8B;
  if (bt == T_BYTE) {
    rbit(dst, size, src);
  } else {
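    // rbit reverses the bits within each byte, so wider elements must have
    // their bytes reversed first.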
    neon_reverse_bytes(dst, src, bt, isQ);
    rbit(dst, size, dst);
  }
}

void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
  SIMD_Arrangement size = isQ ? T16B : T8B;
  switch (bt) {
    case T_BYTE:
      if (dst != src) {
        orr(dst, size, src, src);
      }
      break;
    case T_SHORT:
      rev16(dst, size, src);
      break;
    case T_INT:
      rev32(dst, size, src);
      break;
    case T_LONG:
      rev64(dst, size, src);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
}

// Extract a scalar element from an SVE vector at position 'idx'.
// The input elements in src are expected to be of integral type.
void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
                                             int idx, FloatRegister vtmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, src, size, idx);
    } else {
      smov(dst, src, size, idx);
    }
  } else {
    sve_orr(vtmp, src, src);
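    // sve_ext shifts the requested element down to lane 0; it takes a byte
    // offset, so idx is scaled by the element size ('size' is the log2 of the
    // element's byte width).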
    sve_ext(vtmp, vtmp, idx << size);
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, vtmp, size, 0);
    } else {
      smov(dst, vtmp, size, 0);
    }
  }
}

// java.lang.Math::round intrinsics

// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                          FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
  assert_different_registers(tmp1, tmp2, tmp3, src, dst);
  switch (T) {
    case T2S:
    case T4S:
      fmovs(tmp1, T, 0.5f);
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case T2D:
      fmovd(tmp1, T, 0.5);
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
  }
  fadd(tmp1, T, tmp1, src);
  fcvtms(tmp1, T, tmp1);
  // tmp1 = floor(src + 0.5, ties to even)

  fcvtas(dst, T, src);
  // dst = round(src), ties to away

  fneg(tmp3, T, src);
  dup(tmp2, T, rscratch1);
  cm(HS, tmp3, T, tmp3, tmp2);
  // tmp3 is now a set of flags
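  // A lane of tmp3 is all ones when -src, reinterpreted as an unsigned integer,
  // is >= 2^23 (2^52 for T2D). That covers every non-negative input, every NaN,
  // and every negative input too large to have a fractional part; for those
  // lanes the ties-to-away result already matches Math.round semantics. The bif
  // below keeps dst for such lanes and inserts floor(src + 0.5) for the
  // remaining small-magnitude negative lanes.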

  bif(dst, T16B, tmp1, tmp3);
  // result in dst
}

// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                         FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(tmp1, tmp2, src, dst);

  switch (T) {
    case S:
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case D:
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == S || T == D, "invalid register variant");
  }

  sve_frinta(dst, T, ptrue, src);
  // dst = round(src), ties to away

  Label none;

  sve_fneg(tmp1, T, ptrue, src);
  sve_dup(tmp2, T, rscratch1);
  sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
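  // pgtmp now marks the lanes where -src, compared as an unsigned integer, is
  // <= 2^23 (2^52 for D): the small-magnitude negative lanes whose Math.round
  // result can differ from frinta's. Skip the fix-up when no lane matches.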
  br(EQ, none);
  {
    sve_cpy(tmp1, T, pgtmp, 0.5);
    sve_fadd(tmp1, T, pgtmp, src);
    sve_frintm(dst, T, pgtmp, tmp1);
    // dst = floor(src + 0.5, ties to even)
  }
  bind(none);

  sve_fcvtzs(dst, T, ptrue, dst, T);
  // result in dst
}

void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                           FloatRegister one, SIMD_Arrangement T) {
  assert_different_registers(dst, src, zero, one);
  assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");

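  // facgt sets a lane to all ones when |src| > 0.0 and to zero for +-0.0 and
  // NaN; shifting that mask right by one clears its sign bit. bsl then combines
  // the magnitude bits of 'one' with the sign bit of src (giving +-1.0) where
  // the mask is set, and passes src through unchanged where it is zero.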
  facgt(dst, T, src, zero);
  ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
  bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
}

void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                          FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
  assert_different_registers(dst, src, zero, one, vtmp);
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");

  sve_orr(vtmp, src, src);
  sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise
  switch (T) {
  case S:
    sve_and(vtmp, T, min_jint); // Extract the sign bit of the float value in every lane of src
    sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
                                      // on the sign of the float value
    break;
  case D:
    sve_and(vtmp, T, min_jlong);
    sve_orr(vtmp, T, jlong_cast(1.0));
    break;
  default:
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
  sve_sel(dst, T, pgtmp, vtmp, src); // Select either src or vtmp based on the predicate register pgtmp
                                     // Result in dst
}

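// C2 keeps its own scratch-emit state in PhaseOutput, so when inside a
// compilation task consult it before falling back to the shared
// MacroAssembler flag.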
bool C2_MacroAssembler::in_scratch_emit_size() {
  if (ciEnv::current()->task() != nullptr) {
    PhaseOutput* phase_output = Compile::current()->output();
    if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
      return true;
    }
  }
  return MacroAssembler::in_scratch_emit_size();
}