/*
 * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/matcher.hpp"
#include "opto/output.hpp"
#include "opto/subnode.hpp"
#include "runtime/stubRoutines.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

void C2_MacroAssembler::entry_barrier() {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
    // Dummy labels for just measuring the code size
    Label dummy_slow_path;
    Label dummy_continuation;
    Label dummy_guard;
    Label* slow_path = &dummy_slow_path;
    Label* continuation = &dummy_continuation;
    Label* guard = &dummy_guard;
    if (!Compile::current()->output()->in_scratch_emit_size()) {
      // Use real labels from actual stub when not emitting code for the purpose of measuring its size
      C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
      Compile::current()->output()->add_stub(stub);
      slow_path = &stub->entry();
      continuation = &stub->continuation();
      guard = &stub->guard();
    }
    // In the C2 code, we move the non-hot part of nmethod entry barriers out-of-line to a stub.
    bs->nmethod_entry_barrier(this, slow_path, continuation, guard);
  }
}

int C2_MacroAssembler::entry_barrier_stub_size() {
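  // Assumed upper bound for the out-of-line entry barrier stub:
  // 6 instructions of 4 bytes each.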
  return 4 * 6;
}

// Search for str1 in str2 and return index or -1
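//
// Arguments:
//  str2, cnt2: the source string and its length (in characters)
//  str1, cnt1: the pattern string and its length (in characters)
//  icnt1: the pattern length when it is a compile-time constant, -1 otherwise
//  ae: the encoding pair of the two strings (StrIntrinsicNode::LL/UU/UL/LU)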
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer-Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

// The Boyer-Moore algorithm is based on the description here:
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// This describes an algorithm with two shift rules: the 'Bad Character' rule
// and the 'Good Suffix' rule.
//
// These rules are essentially heuristics for how far we can shift the
// pattern along the search string.
//
// The implementation here uses the 'Bad Character' rule only because of the
// complexity of initialisation for the 'Good Suffix' rule.
//
// This is also known as the Boyer-Moore-Horspool algorithm:
//
// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
//
// This particular implementation has a few Java-specific optimizations.
//
// #define ASIZE 256
//
//    int bm(unsigned char *x, int m, unsigned char *y, int n) {
//       int i, j;
//       unsigned c;
//       unsigned char bc[ASIZE];
//
//       /* Preprocessing */
//       for (i = 0; i < ASIZE; ++i)
//          bc[i] = m;
//       for (i = 0; i < m - 1; ) {
//          c = x[i];
//          ++i;
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef PATTERN_STRING_IS_LATIN1
//          bc[c] = m - i;
//          #else
//          if (c < ASIZE) bc[c] = m - i;
//          #endif
//       }
//
//       /* Searching */
//       j = 0;
//       while (j <= n - m) {
//          c = y[j+m-1];
//          if (x[m-1] == c) {
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//            if (i < 0) return j;
//          }
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef SOURCE_STRING_IS_LATIN1
//          // LL case: (c < 256) always true. Remove branch
//          j += bc[y[j+m-1]];
//          #endif
//          #ifdef PATTERN_STRING_IS_UTF
//          // UU case: need if (c < ASIZE) check. Skip 1 character if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += 1;
//          #endif
//          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
//          // UL case: need if (c < ASIZE) check. Skip <pattern length> if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += m;
//          #endif
//       }
//       return -1;
//    }

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >= 8, so we can read at least one register for the cases
    // when UTF->Latin1 conversion is not needed (8 LL or 4 UU chars) and half
    // a register for the UL case. We'll re-read the last character in the
    // inner pre-loop code to have a single outer pre-loop load.
    const int firstStep = isL ? 7 : 3;

    const int ASIZE = 256;
    const int STORED_BYTES = 32; // number of bytes stored per instruction
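    // Fill the 256-byte bad-character table on the stack with the pattern
    // length (the dup above replicated cnt1 into every byte of v0), storing
    // 32 bytes per iteration.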
    sub(sp, sp, ASIZE);
    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
    mov(ch1, sp);
    BIND(BM_INIT_LOOP);
      stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
      subs(tmp5, tmp5, 1);
      br(GT, BM_INIT_LOOP);

      sub(cnt1tmp, cnt1, 1);
      mov(tmp5, str2);
      add(str2end, str2, result_tmp, LSL, str2_chr_shift);
      sub(ch2, cnt1, 1);
      mov(tmp3, str1);
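    // Preprocessing loop: for each of the first m-1 pattern characters, store
    // its skip distance bc[c] = m - 1 - i (ch2 counts down from m - 1);
    // characters >= ASIZE keep the default skip of m.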
    BIND(BCLOOP);
      (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
      if (!str1_isL) {
        subs(zr, ch1, ASIZE);
        br(HS, BCSKIP);
      }
      strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
      subs(ch2, ch2, 1);
      br(GT, BCLOOP);

      add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
      if (str1_isL == str2_isL) {
        // load last 8 bytes (8LL/4UU symbols)
        ldr(tmp6, Address(tmp6, -wordSize));
      } else {
        ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes (4 symbols)
        // convert Latin1 to UTF. We'll have to wait until the load completes,
        // but it's still faster than per-character loads+checks
        lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
        ubfx(ch1, tmp6, 8, 8); // str1[N-2]
        ubfx(ch2, tmp6, 16, 8); // str1[N-3]
        andr(tmp6, tmp6, 0xFF); // str1[N-4]
        orr(ch2, ch1, ch2, LSL, 16);
        orr(tmp6, tmp6, tmp3, LSL, 48);
        orr(tmp6, tmp6, ch2, LSL, 16);
      }
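    // Main search loop: skipch is the source character aligned with the last
    // pattern character, which is kept in tmp3. On a mismatch, advance str2
    // by the bad-character skip; otherwise compare the rest of the pattern
    // backwards.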
    BIND(BMLOOPSTR2);
      (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
      sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
      if (str1_isL == str2_isL) {
        // re-init tmp3. It's for free because it's executed in parallel with
        // load above. Alternative is to initialize it before loop, but it'll
        // affect performance on in-order systems with 2 or more ld/st pipelines
        lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
      }
      if (!isL) { // UU/UL case
        lsl(ch2, cnt1tmp, 1); // offset in bytes
      }
      cmp(tmp3, skipch);
      br(NE, BMSKIP);
      ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
      mov(ch1, tmp6);
      if (isL) {
        b(BMLOOPSTR1_AFTER_LOAD);
      } else {
        sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
        b(BMLOOPSTR1_CMP);
      }
    BIND(BMLOOPSTR1);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    BIND(BMLOOPSTR1_AFTER_LOAD);
      subs(cnt1tmp, cnt1tmp, 1);
      br(LT, BMLOOPSTR1_LASTCMP);
    BIND(BMLOOPSTR1_CMP);
      cmp(ch1, ch2);
      br(EQ, BMLOOPSTR1);
    BIND(BMSKIP);
      if (!isL) {
        // if we've met a UTF symbol while searching for a Latin1 pattern, then
        // we can skip cnt1 symbols
        if (str1_isL != str2_isL) {
          mov(result_tmp, cnt1);
        } else {
          mov(result_tmp, 1);
        }
        subs(zr, skipch, ASIZE);
        br(HS, BMADV);
      }
      ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
      sub(cnt1tmp, cnt1, 1);
      add(str2, str2, result_tmp, LSL, str2_chr_shift);
      cmp(str2, str2end);
      br(LE, BMLOOPSTR2);
      add(sp, sp, ASIZE);
      b(NOMATCH);
    BIND(BMLOOPSTR1_LASTCMP);
      cmp(ch1, ch2);
      br(NE, BMSKIP);
    BIND(BMMATCH);
      sub(result, str2, tmp5);
      if (!str2_isL) lsr(result, result, 1);
      add(sp, sp, ASIZE);
      b(DONE);

    BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
    br(LT, LINEAR_MEDIUM);
    mov(result, zr);
    RuntimeAddress stub = nullptr;
    if (isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
      assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
    } else if (str1_isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
    } else {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
      assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
    }
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);
  }

  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    Register str2tmp = tmp2;
    Register first = tmp3;

    if (icnt1 == -1)
    {
        Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

        cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
        br(LT, DOSHORT);
      BIND(LINEAR_MEDIUM);
        (this->*str1_load_1chr)(first, Address(str1));
        lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
        sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(FIRST_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmp(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        adds(cnt1tmp, cnt1_neg, str1_chr_size);
        add(cnt2tmp, cnt2_neg, str2_chr_size);
        br(GE, MATCH);

      BIND(STR1_NEXT);
        (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        adds(cnt1tmp, cnt1tmp, str1_chr_size);
        add(cnt2tmp, cnt2tmp, str2_chr_size);
        br(LT, STR1_NEXT);
        b(MATCH);

      BIND(DOSHORT);
      if (str1_isL == str2_isL) {
        cmp(cnt1, (u1)2);
        br(LT, DO1);
        br(GT, DO3);
      }
    }

    if (icnt1 == 4) {
      Label CH1_LOOP;

        (this->*load_4chr)(ch1, str1);
        sub(result_tmp, cnt2, 4);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(CH1_LOOP);
        (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
      Label CH1_LOOP;

      BIND(DO2);
        (this->*load_2chr)(ch1, str1);
        if (icnt1 == 2) {
          sub(result_tmp, cnt2, 2);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(CH1_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

      BIND(DO3);
        (this->*load_2chr)(first, str1);
        (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
        if (icnt1 == 3) {
          sub(result_tmp, cnt2, 3);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(FIRST_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmpw(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        b(MATCH);
    }

    if (icnt1 == -1 || icnt1 == 1) {
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;

      BIND(DO1);
        (this->*str1_load_1chr)(ch1, str1);
        cmp(cnt2, (u1)8);
        br(LT, DO1_SHORT);

        sub(result_tmp, cnt2, 8/str2_chr_size);
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
        mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

        if (str2_isL) {
          orr(ch1, ch1, ch1, LSL, 8);
        }
        orr(ch1, ch1, ch1, LSL, 16);
        orr(ch1, ch1, ch1, LSL, 32);
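
      // SWAR match detection: after the eor in the loop below, ch2 has a zero
      // byte (or 16-bit char) exactly at the positions where the character was
      // found. tmp1 = (x - 0x01..01) & ~x & 0x80..80 (via sub/orr/bics) is
      // non-zero iff x has a zero element; rev+clz at HAS_ZERO then locate
      // the lowest such position.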
      BIND(CH1_LOOP);
        ldr(ch2, Address(str2, cnt2_neg));
        eor(ch2, ch1, ch2);
        sub(tmp1, ch2, tmp3);
        orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
        bics(tmp1, tmp1, tmp2);
        br(NE, HAS_ZERO);
        adds(cnt2_neg, cnt2_neg, 8);
        br(LT, CH1_LOOP);

        cmp(cnt2_neg, (u1)8);
        mov(cnt2_neg, 0);
        br(LT, CH1_LOOP);
        b(NOMATCH);

      BIND(HAS_ZERO);
        rev(tmp1, tmp1);
        clz(tmp1, tmp1);
        add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
        b(MATCH);

      BIND(DO1_SHORT);
        mov(result_tmp, cnt2);
        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
      BIND(DO1_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmpw(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);

void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);

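  // SWAR zero-detection, as in string_indexof: after the eor below, a zero
  // 16-bit element of ch1 marks a position where the char matched; the
  // sub/orr/bics sequence sets flags iff such an element exists.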
  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7fff7fff7fff7fff);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1, Address::uxtw(1)));
    sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
    ldrh(ch1, Address(str1, cnt1_neg));
    cmpw(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 2);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}

void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
                                                Register ch, Register result,
                                                FloatRegister ztmp1,
                                                FloatRegister ztmp2,
                                                PRegister tmp_pg,
                                                PRegister tmp_pdn, bool isL)
{
  // Note that `tmp_pdn` should *NOT* be used as a governing predicate register.
  assert(tmp_pg->is_governing(),
         "this register has to be a governing predicate register");

  Label LOOP, MATCH, DONE, NOMATCH;
  Register vec_len = rscratch1;
  Register idx = rscratch2;

  SIMD_RegVariant T = isL ? B : H;

  cbz(cnt1, NOMATCH);

  // Assign the particular char throughout the vector.
  sve_dup(ztmp2, T, ch);
  if (isL) {
    sve_cntb(vec_len);
  } else {
    sve_cnth(vec_len);
  }
  mov(idx, 0);

  // Generate a predicate to control the reading of input string.
  sve_whilelt(tmp_pg, T, idx, cnt1);

  BIND(LOOP);
    // Read a vector of 8- or 16-bit data depending on the string type. Note
    // that inactive elements indicated by the predicate register won't cause
    // a data read from memory to the destination vector.
    if (isL) {
      sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
    } else {
      sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
    }
    add(idx, idx, vec_len);

    // Perform the comparison. An element of the destination predicate is set
    // to active if the particular char is matched.
    sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);

    // Branch if the particular char is found.
    br(NE, MATCH);

    sve_whilelt(tmp_pg, T, idx, cnt1);

    // Loop back if the particular char is not found.
    br(MI, LOOP);

  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);

  BIND(MATCH);
    // Undo the index increment.
    sub(idx, idx, vec_len);

    // Crop the vector to find its location.
    sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
    add(result, idx, -1);
    sve_incp(result, T, tmp_pdn);
  BIND(DONE);
}

void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)8);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 8);
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 8);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);

  mov(tmp3, 0x0101010101010101);

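  // The same SWAR zero-detection as above, on 8-bit elements.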
  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1));
    sub(cnt1_neg, zr, cnt1);
  BIND(DO1_LOOP);
    ldrb(ch1, Address(str1, cnt1_neg));
    cmp(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 1);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg);
  BIND(DONE);
}

// Compare strings.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, but the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      ldr(tmp1, Address(str1));
      ldrs(vtmp, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFF);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFF);
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
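    // Round the bit index of the first difference down to a character
    // boundary (8 or 16 bits), then shift both words so that the differing
    // characters end up in the low bits.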
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
    RuntimeAddress stub = nullptr;
    switch(ae) {
      case StrIntrinsicNode::LL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
        break;
      case StrIntrinsicNode::UU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
        break;
      case StrIntrinsicNode::LU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
        break;
      case StrIntrinsicNode::UL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
        break;
      default:
        ShouldNotReachHere();
    }
    assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // Arrange the code so that most branches are taken while loading, and the
  // next characters are loaded while the previous ones are being compared.
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}

void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
                                     FloatRegister src2, int cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
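  // NEON provides compare instructions only for EQ, GE/GT and unsigned HS/HI;
  // NE is synthesized by inverting EQ, and LE/LT (ULE/ULT) by swapping the
  // source operands.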
  if (bt == T_FLOAT || bt == T_DOUBLE) {
    switch (cond) {
      case BoolTest::eq: fcmeq(dst, size, src1, src2); break;
      case BoolTest::ne: {
        fcmeq(dst, size, src1, src2);
        notr(dst, T16B, dst);
        break;
      }
      case BoolTest::ge: fcmge(dst, size, src1, src2); break;
      case BoolTest::gt: fcmgt(dst, size, src1, src2); break;
      case BoolTest::le: fcmge(dst, size, src2, src1); break;
      case BoolTest::lt: fcmgt(dst, size, src2, src1); break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  } else {
    switch (cond) {
      case BoolTest::eq: cmeq(dst, size, src1, src2); break;
      case BoolTest::ne: {
        cmeq(dst, size, src1, src2);
        notr(dst, T16B, dst);
        break;
      }
      case BoolTest::ge: cmge(dst, size, src1, src2); break;
      case BoolTest::gt: cmgt(dst, size, src1, src2); break;
      case BoolTest::le: cmge(dst, size, src2, src1); break;
      case BoolTest::lt: cmgt(dst, size, src2, src1); break;
      case BoolTest::uge: cmhs(dst, size, src1, src2); break;
      case BoolTest::ugt: cmhi(dst, size, src1, src2); break;
      case BoolTest::ult: cmhi(dst, size, src2, src1); break;
      case BoolTest::ule: cmhs(dst, size, src2, src1); break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  }
}

// Compress the least significant bit of each byte to the rightmost and clear
// the higher garbage bits.
void C2_MacroAssembler::bytemask_compress(Register dst) {
  // Example input, dst = 0x01 00 00 00 01 01 00 01
  // The "??" bytes are garbage.
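  // Each orr with a right shift folds the flag bits of neighbouring bytes
  // together (by 7, then 14, then 28 bits), so the eight least significant
  // bits of the bytes accumulate in the low byte.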
  orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
  orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
  orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
  andr(dst, dst, 0xff);                   // dst = 0x8D
}

// Pack the lowest-numbered bit of each mask element in src into a long value
// in dst, at most the first 64 lane elements.
// Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM.
void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
                                         FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
  assert_different_registers(dst, rscratch1);
  assert_different_registers(vtmp1, vtmp2);

  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example:   src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
  // Expected:  dst = 0x658D

  // Convert the mask into vector with sequential bytes.
  // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
  sve_cpy(vtmp1, size, src, 1, false);
  if (bt != T_BYTE) {
    sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
  }

  if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
    // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
    // is to compress each significant bit of the byte in a cross-lane way. Due
    // to the lack of a cross-lane bit-compress instruction, we use BEXT
    // (bit-compress in each lane) with the biggest lane size (T = D) then
    // concatenate the results.

    // The second source input of BEXT, initialized with 0x01 in each byte.
    // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
    sve_dup(vtmp2, B, 1);

    // BEXT vtmp1.D, vtmp1.D, vtmp2.D
    // vtmp1 = 0x0001010000010001 | 0x0100000001010001
    // vtmp2 = 0x0101010101010101 | 0x0101010101010101
    //         ---------------------------------------
    // vtmp1 = 0x0000000000000065 | 0x000000000000008D
    sve_bext(vtmp1, D, vtmp1, vtmp2);

    // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the
    // result to dst.
    // vtmp1 = 0x0000000000000000 | 0x000000000000658D
    // dst   = 0x658D
    if (lane_cnt <= 8) {
      // No need to concatenate.
      umov(dst, vtmp1, B, 0);
    } else if (lane_cnt <= 16) {
      ins(vtmp1, B, vtmp1, 1, 8);
      umov(dst, vtmp1, H, 0);
    } else {
      // As the lane count is 64 at most, the final expected value must be in
      // the lowest 64 bits after narrowing vtmp1 from D to B.
      sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
      umov(dst, vtmp1, D, 0);
    }
  } else if (UseSVE > 0) {
    // Compress the lowest 8 bytes.
    fmovd(dst, vtmp1);
    bytemask_compress(dst);
    if (lane_cnt <= 8) return;

    // Repeat on higher bytes and join the results.
    // Compress 8 bytes in each iteration.
    for (int idx = 1; idx < (lane_cnt / 8); idx++) {
      sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
      bytemask_compress(rscratch1);
      orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
    }
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Unpack the mask, a long value in src, into predicate register dst based on the
// corresponding data type. Note that dst can support at most 64 lanes.
// The example below gives the expected dst predicate register in different types,
// with a valid src (0x658D) on a 1024-bit vector size machine.
// BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
// SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
// INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
// LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
//
// The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D which
// has 24 significant bits would be an invalid input if dst predicate register refers to
// a LONG type 1024-bit vector, which has at most 16 lanes.
void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
                                           FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
         lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example:   src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
  // Expected:  dst = 0b01101001 10001101

  // Put long value from general purpose register into the first lane of vector.
  // vtmp1 = 0x0000000000000000 | 0x000000000000658D
  sve_dup(vtmp1, B, 0);
  mov(vtmp1, D, 0, src);
  // As sve_cmp generates the mask value with the minimum unit in byte, we
  // should transform the value in the first lane, which is a mask in bits
  // now, to a mask in bytes. This can be done by SVE2's BDEP instruction.

  // The first source input of the BDEP instruction. Deposit each byte in every 8 bytes.
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  if (lane_cnt <= 8) {
    // Nothing to do, as only one byte exists.
  } else if (lane_cnt <= 16) {
    ins(vtmp1, B, vtmp1, 8, 1);
    mov(vtmp1, B, 1, zr);
  } else {
    sve_vector_extend(vtmp1, D, vtmp1, B);
  }

  // The second source input of BDEP instruction, initialized with 0x01 for each byte.
  // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
  sve_dup(vtmp2, B, 1);

  // BDEP vtmp1.D, vtmp1.D, vtmp2.D
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  // vtmp2 = 0x0101010101010101 | 0x0101010101010101
  //         ---------------------------------------
  // vtmp1 = 0x0001010000010001 | 0x0100000001010001
  sve_bdep(vtmp1, D, vtmp1, vtmp2);

  if (bt != T_BYTE) {
    sve_vector_extend(vtmp1, size, vtmp1, B);
  }
  // Generate mask according to the given vector, in which the elements have been
  // extended to expected type.
  // dst = 0b01101001 10001101
  sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
}

// Clobbers: rflags
void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
                                    FloatRegister zn, FloatRegister zm, int cond) {
  assert(pg->is_governing(), "This register has to be a governing predicate register");
  FloatRegister z1 = zn, z2 = zm;
  // Convert the original BoolTest condition to Assembler::condition.
  Condition condition;
  switch (cond) {
    case BoolTest::eq: condition = Assembler::EQ; break;
    case BoolTest::ne: condition = Assembler::NE; break;
    case BoolTest::le: z1 = zm; z2 = zn; condition = Assembler::GE; break;
    case BoolTest::ge: condition = Assembler::GE; break;
    case BoolTest::lt: z1 = zm; z2 = zn; condition = Assembler::GT; break;
    case BoolTest::gt: condition = Assembler::GT; break;
    default:
      assert(false, "unsupported compare condition");
      ShouldNotReachHere();
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (bt == T_FLOAT || bt == T_DOUBLE) {
    sve_fcm(condition, pd, size, pg, z1, z2);
  } else {
    assert(is_integral_type(bt), "unsupported element type");
    sve_cmp(condition, pd, size, pg, z1, z2);
  }
}

// Get index of the last mask lane that is set
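// The lane order is reversed first; BRKB then sets all the lanes before the
// first active one, and CNTP counts them. That count is the number of lanes
// above the last set lane of the original mask, so subtracting it from the
// lane count minus one yields the index.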
void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
  SIMD_RegVariant size = elemType_to_regVariant(bt);
  sve_rev(ptmp, size, src);
  sve_brkb(ptmp, ptrue, ptmp, false);
  sve_cntp(dst, size, ptrue, ptmp);
  movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
  subw(dst, rscratch1, dst);
}

// Extend integer vector src to dst with the same lane count
// but larger element size, e.g. 4B -> 4I
void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
                                           FloatRegister src, BasicType src_bt) {
  if (src_bt == T_BYTE) {
    if (dst_bt == T_SHORT) {
      // 4B/8B to 4S/8S
      assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
      sxtl(dst, T8H, src, T8B);
    } else {
      // 4B to 4I
      assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
      sxtl(dst, T8H, src, T8B);
      sxtl(dst, T4S, dst, T4H);
    }
  } else if (src_bt == T_SHORT) {
    // 4S to 4I
    assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
    sxtl(dst, T4S, src, T4H);
  } else if (src_bt == T_INT) {
    // 2I to 2L
    assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
    sxtl(dst, T2D, src, T2S);
  } else {
    ShouldNotReachHere();
  }
}

// Narrow integer vector src down to dst with the same lane count
// but smaller element size, e.g. 4I -> 4B
void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
                                           FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
  if (src_bt == T_SHORT) {
    // 4S/8S to 4B/8B
    assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_BYTE, "unsupported");
    xtn(dst, T8B, src, T8H);
  } else if (src_bt == T_INT) {
    // 4I to 4B/4S
    assert(src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
    xtn(dst, T4H, src, T4S);
    if (dst_bt == T_BYTE) {
      xtn(dst, T8B, dst, T8H);
    }
  } else if (src_bt == T_LONG) {
    // 2L to 2I
    assert(src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_INT, "unsupported");
    xtn(dst, T2S, src, T2D);
  } else {
    ShouldNotReachHere();
  }
}

void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
                                          FloatRegister src, SIMD_RegVariant src_size) {
  assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
  if (src_size == B) {
    switch (dst_size) {
    case H:
      sve_sunpklo(dst, H, src);
      break;
    case S:
      sve_sunpklo(dst, H, src);
      sve_sunpklo(dst, S, dst);
      break;
    case D:
      sve_sunpklo(dst, H, src);
      sve_sunpklo(dst, S, dst);
      sve_sunpklo(dst, D, dst);
      break;
    default:
      ShouldNotReachHere();
    }
  } else if (src_size == H) {
    if (dst_size == S) {
      sve_sunpklo(dst, S, src);
    } else { // D
      sve_sunpklo(dst, S, src);
      sve_sunpklo(dst, D, dst);
    }
  } else if (src_size == S) {
    sve_sunpklo(dst, D, src);
  }
}

// Vector narrow from src to dst with specified element sizes.
// High part of dst vector will be filled with zero.
void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
                                          FloatRegister src, SIMD_RegVariant src_size,
                                          FloatRegister tmp) {
  assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
  assert_different_registers(src, tmp);
  sve_dup(tmp, src_size, 0);
  if (src_size == D) {
    switch (dst_size) {
    case S:
      sve_uzp1(dst, S, src, tmp);
      break;
    case H:
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, S, src, tmp);
      sve_uzp1(dst, H, dst, tmp);
      break;
    case B:
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, S, src, tmp);
      sve_uzp1(dst, H, dst, tmp);
      sve_uzp1(dst, B, dst, tmp);
      break;
    default:
      ShouldNotReachHere();
    }
  } else if (src_size == S) {
    if (dst_size == H) {
      sve_uzp1(dst, H, src, tmp);
    } else { // B
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, H, src, tmp);
      sve_uzp1(dst, B, dst, tmp);
    }
  } else if (src_size == H) {
    sve_uzp1(dst, B, src, tmp);
  }
}

// Extend src predicate to dst predicate with the same lane count but larger
// element size, e.g. 64Byte -> 512Long
void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
                                             uint dst_element_length_in_bytes,
                                             uint src_element_length_in_bytes) {
  if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
    sve_punpklo(dst, src);
  } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
    sve_punpklo(dst, src);
    sve_punpklo(dst, dst);
  } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
    sve_punpklo(dst, src);
    sve_punpklo(dst, dst);
    sve_punpklo(dst, dst);
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Narrow src predicate to dst predicate with the same lane count but
// smaller element size, e.g. 512Long -> 64Byte
void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src,
                                             uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
  // The insignificant bits in src predicate are expected to be zero.
  if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
    sve_uzp1(dst, B, src, src);
  } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
    sve_uzp1(dst, H, src, src);
    sve_uzp1(dst, B, dst, dst);
  } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
    sve_uzp1(dst, S, src, src);
    sve_uzp1(dst, H, dst, dst);
    sve_uzp1(dst, B, dst, dst);
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Vector reduction add for integral type with ASIMD instructions.
void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
                                                 Register isrc, FloatRegister vsrc,
                                                 unsigned vector_length_in_bytes,
                                                 FloatRegister vtmp) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_add_integral {");
    switch(bt) {
      case T_BYTE:
        addv(vtmp, isQ ? T16B : T8B, vsrc);
        smov(dst, vtmp, B, 0);
        addw(dst, dst, isrc, ext::sxtb);
        break;
      case T_SHORT:
        addv(vtmp, isQ ? T8H : T4H, vsrc);
        smov(dst, vtmp, H, 0);
        addw(dst, dst, isrc, ext::sxth);
        break;
      case T_INT:
        isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
        umov(dst, vtmp, S, 0);
        addw(dst, dst, isrc);
        break;
      case T_LONG:
        assert(isQ, "unsupported");
        addpd(vtmp, vsrc);
        umov(dst, vtmp, D, 0);
        add(dst, dst, isrc);
        break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  BLOCK_COMMENT("} neon_reduce_add_integral");
}
1376 
1377 // Vector reduction multiply for integral type with ASIMD instructions.
1378 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
1379 // Clobbers: rscratch1
1380 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
1381                                                  Register isrc, FloatRegister vsrc,
1382                                                  unsigned vector_length_in_bytes,
1383                                                  FloatRegister vtmp1, FloatRegister vtmp2) {
1384   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1385   bool isQ = vector_length_in_bytes == 16;
1386 
1387   BLOCK_COMMENT("neon_reduce_mul_integral {");
1388     switch(bt) {
1389       case T_BYTE:
1390         if (isQ) {
1391           // Multiply the lower half and higher half of vector iteratively.
1392           // vtmp1 = vsrc[8:15]
1393           ins(vtmp1, D, vsrc, 0, 1);
1394           // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
1395           mulv(vtmp1, T8B, vtmp1, vsrc);
1396           // vtmp2 = vtmp1[4:7]
1397           ins(vtmp2, S, vtmp1, 0, 1);
1398           // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
1399           mulv(vtmp1, T8B, vtmp2, vtmp1);
1400         } else {
1401           ins(vtmp1, S, vsrc, 0, 1);
1402           mulv(vtmp1, T8B, vtmp1, vsrc);
1403         }
1404         // vtmp2 = vtmp1[2:3]
1405         ins(vtmp2, H, vtmp1, 0, 1);
1406         // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
1407         mulv(vtmp2, T8B, vtmp2, vtmp1);
1408         // dst = vtmp2[0] * isrc * vtmp2[1]
1409         umov(rscratch1, vtmp2, B, 0);
1410         mulw(dst, rscratch1, isrc);
1411         sxtb(dst, dst);
1412         umov(rscratch1, vtmp2, B, 1);
1413         mulw(dst, rscratch1, dst);
1414         sxtb(dst, dst);
1415         break;
1416       case T_SHORT:
1417         if (isQ) {
1418           ins(vtmp2, D, vsrc, 0, 1);
1419           mulv(vtmp2, T4H, vtmp2, vsrc);
1420           ins(vtmp1, S, vtmp2, 0, 1);
1421           mulv(vtmp1, T4H, vtmp1, vtmp2);
1422         } else {
1423           ins(vtmp1, S, vsrc, 0, 1);
1424           mulv(vtmp1, T4H, vtmp1, vsrc);
1425         }
1426         umov(rscratch1, vtmp1, H, 0);
1427         mulw(dst, rscratch1, isrc);
1428         sxth(dst, dst);
1429         umov(rscratch1, vtmp1, H, 1);
1430         mulw(dst, rscratch1, dst);
1431         sxth(dst, dst);
1432         break;
1433       case T_INT:
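        // Fold the upper two lanes into the lower two for 128-bit vectors; a
        // 64-bit vector already has only two lanes.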
1434         if (isQ) {
1435           ins(vtmp1, D, vsrc, 0, 1);
1436           mulv(vtmp1, T2S, vtmp1, vsrc);
1437         } else {
1438           vtmp1 = vsrc;
1439         }
1440         umov(rscratch1, vtmp1, S, 0);
1441         mul(dst, rscratch1, isrc);
1442         umov(rscratch1, vtmp1, S, 1);
1443         mul(dst, rscratch1, dst);
1444         break;
1445       case T_LONG:
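        // Only two lanes, so multiply them into the scalar one at a time.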
1446         umov(rscratch1, vsrc, D, 0);
1447         mul(dst, isrc, rscratch1);
1448         umov(rscratch1, vsrc, D, 1);
1449         mul(dst, dst, rscratch1);
1450         break;
1451       default:
1452         assert(false, "unsupported");
1453         ShouldNotReachHere();
1454     }
1455   BLOCK_COMMENT("} neon_reduce_mul_integral");
1456 }
1457 
1458 // Vector reduction multiply for floating-point type with ASIMD instructions.
1459 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
1460                                            FloatRegister fsrc, FloatRegister vsrc,
1461                                            unsigned vector_length_in_bytes,
1462                                            FloatRegister vtmp) {
1463   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1464   bool isQ = vector_length_in_bytes == 16;
1465 
1466   BLOCK_COMMENT("neon_reduce_mul_fp {");
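    // Floating-point multiplication is not associative, so the lanes are
    // multiplied into the scalar strictly in order.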
    switch (bt) {
1468       case T_FLOAT:
1469         fmuls(dst, fsrc, vsrc);
1470         ins(vtmp, S, vsrc, 0, 1);
1471         fmuls(dst, dst, vtmp);
1472         if (isQ) {
1473           ins(vtmp, S, vsrc, 0, 2);
1474           fmuls(dst, dst, vtmp);
1475           ins(vtmp, S, vsrc, 0, 3);
1476           fmuls(dst, dst, vtmp);
        }
1478         break;
1479       case T_DOUBLE:
1480         assert(isQ, "unsupported");
1481         fmuld(dst, fsrc, vsrc);
1482         ins(vtmp, D, vsrc, 0, 1);
1483         fmuld(dst, dst, vtmp);
1484         break;
1485       default:
1486         assert(false, "unsupported");
1487         ShouldNotReachHere();
1488     }
1489   BLOCK_COMMENT("} neon_reduce_mul_fp");
1490 }
1491 
// Helper to select the logical instruction for And/Or/Xor reductions
1493 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
1494                                                    Register Rn, Register Rm,
1495                                                    enum shift_kind kind, unsigned shift) {
  switch (opc) {
1497     case Op_AndReductionV:
1498       is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
1499       break;
1500     case Op_OrReductionV:
1501       is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
1502       break;
1503     case Op_XorReductionV:
1504       is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
1505       break;
1506     default:
1507       assert(false, "unsupported");
1508       ShouldNotReachHere();
1509   }
1510 }
1511 
1512 // Vector reduction logical operations And, Or, Xor
1513 // Clobbers: rscratch1
1514 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
1515                                             Register isrc, FloatRegister vsrc,
1516                                             unsigned vector_length_in_bytes) {
1517   assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
1518          "unsupported");
1519   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1520   assert_different_registers(dst, isrc);
1521   bool isQ = vector_length_in_bytes == 16;
1522 
1523   BLOCK_COMMENT("neon_reduce_logical {");
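    // Fold the two halves of the vector into a single scalar, then keep
    // folding the scalar in halves (shifting by 32, 16 and 8 bits as needed)
    // down to the element size before merging with isrc.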
1524     umov(rscratch1, vsrc, isQ ? D : S, 0);
1525     umov(dst, vsrc, isQ ? D : S, 1);
1526     neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
    switch (bt) {
1528       case T_BYTE:
1529         if (isQ) {
1530           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1531         }
1532         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1533         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
1534         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1535         sxtb(dst, dst);
1536         break;
1537       case T_SHORT:
1538         if (isQ) {
1539           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1540         }
1541         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1542         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1543         sxth(dst, dst);
1544         break;
1545       case T_INT:
1546         if (isQ) {
1547           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1548         }
1549         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1550         break;
1551       case T_LONG:
1552         assert(isQ, "unsupported");
1553         neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
1554         break;
1555       default:
1556         assert(false, "unsupported");
1557         ShouldNotReachHere();
1558     }
1559   BLOCK_COMMENT("} neon_reduce_logical");
1560 }
1561 
1562 // Vector reduction min/max for integral type with ASIMD instructions.
// Note: vtmp is not used and is expected to be fnoreg for the T_LONG case.
1564 // Clobbers: rscratch1, rflags
1565 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
1566                                                     Register isrc, FloatRegister vsrc,
1567                                                     unsigned vector_length_in_bytes,
1568                                                     FloatRegister vtmp) {
1569   assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
1570   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1571   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
1572   assert_different_registers(dst, isrc);
1573   bool isQ = vector_length_in_bytes == 16;
1574   bool is_min = opc == Op_MinReductionV;
1575 
1576   BLOCK_COMMENT("neon_reduce_minmax_integral {");
1577     if (bt == T_LONG) {
1578       assert(vtmp == fnoreg, "should be");
1579       assert(isQ, "should be");
1580       umov(rscratch1, vsrc, D, 0);
1581       cmp(isrc, rscratch1);
1582       csel(dst, isrc, rscratch1, is_min ? LT : GT);
1583       umov(rscratch1, vsrc, D, 1);
1584       cmp(dst, rscratch1);
1585       csel(dst, dst, rscratch1, is_min ? LT : GT);
1586     } else {
1587       SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1588       if (size == T2S) {
1589         is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
1590       } else {
1591         is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
1592       }
1593       if (bt == T_INT) {
1594         umov(dst, vtmp, S, 0);
1595       } else {
1596         smov(dst, vtmp, elemType_to_regVariant(bt), 0);
1597       }
1598       cmpw(dst, isrc);
1599       cselw(dst, dst, isrc, is_min ? LT : GT);
1600     }
1601   BLOCK_COMMENT("} neon_reduce_minmax_integral");
1602 }
1603 
// Vector reduction for integral type with SVE instructions.
1605 // Supported operations are Add, And, Or, Xor, Max, Min.
// rflags is clobbered when opc is Op_MaxReductionV or Op_MinReductionV.
1607 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
1608                                             FloatRegister src2, PRegister pg, FloatRegister tmp) {
1609   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
1610   assert(pg->is_governing(), "This register has to be a governing predicate register");
1611   assert_different_registers(src1, dst);
1612   // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
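  // Each case reduces the vector with the matching SVE horizontal instruction,
  // moves lane 0 to the GPR with the required extension, then folds in src1.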
1613   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1614   switch (opc) {
1615     case Op_AddReductionVI: {
1616       sve_uaddv(tmp, size, pg, src2);
1617       if (bt == T_BYTE) {
1618         smov(dst, tmp, size, 0);
1619         addw(dst, src1, dst, ext::sxtb);
1620       } else if (bt == T_SHORT) {
1621         smov(dst, tmp, size, 0);
1622         addw(dst, src1, dst, ext::sxth);
1623       } else {
1624         umov(dst, tmp, size, 0);
1625         addw(dst, dst, src1);
1626       }
1627       break;
1628     }
1629     case Op_AddReductionVL: {
1630       sve_uaddv(tmp, size, pg, src2);
1631       umov(dst, tmp, size, 0);
1632       add(dst, dst, src1);
1633       break;
1634     }
1635     case Op_AndReductionV: {
1636       sve_andv(tmp, size, pg, src2);
1637       if (bt == T_INT || bt == T_LONG) {
1638         umov(dst, tmp, size, 0);
1639       } else {
1640         smov(dst, tmp, size, 0);
1641       }
1642       if (bt == T_LONG) {
1643         andr(dst, dst, src1);
1644       } else {
1645         andw(dst, dst, src1);
1646       }
1647       break;
1648     }
1649     case Op_OrReductionV: {
1650       sve_orv(tmp, size, pg, src2);
1651       if (bt == T_INT || bt == T_LONG) {
1652         umov(dst, tmp, size, 0);
1653       } else {
1654         smov(dst, tmp, size, 0);
1655       }
1656       if (bt == T_LONG) {
1657         orr(dst, dst, src1);
1658       } else {
1659         orrw(dst, dst, src1);
1660       }
1661       break;
1662     }
1663     case Op_XorReductionV: {
1664       sve_eorv(tmp, size, pg, src2);
1665       if (bt == T_INT || bt == T_LONG) {
1666         umov(dst, tmp, size, 0);
1667       } else {
1668         smov(dst, tmp, size, 0);
1669       }
1670       if (bt == T_LONG) {
1671         eor(dst, dst, src1);
1672       } else {
1673         eorw(dst, dst, src1);
1674       }
1675       break;
1676     }
1677     case Op_MaxReductionV: {
1678       sve_smaxv(tmp, size, pg, src2);
1679       if (bt == T_INT || bt == T_LONG) {
1680         umov(dst, tmp, size, 0);
1681       } else {
1682         smov(dst, tmp, size, 0);
1683       }
1684       if (bt == T_LONG) {
1685         cmp(dst, src1);
1686         csel(dst, dst, src1, Assembler::GT);
1687       } else {
1688         cmpw(dst, src1);
1689         cselw(dst, dst, src1, Assembler::GT);
1690       }
1691       break;
1692     }
1693     case Op_MinReductionV: {
1694       sve_sminv(tmp, size, pg, src2);
1695       if (bt == T_INT || bt == T_LONG) {
1696         umov(dst, tmp, size, 0);
1697       } else {
1698         smov(dst, tmp, size, 0);
1699       }
1700       if (bt == T_LONG) {
1701         cmp(dst, src1);
1702         csel(dst, dst, src1, Assembler::LT);
1703       } else {
1704         cmpw(dst, src1);
1705         cselw(dst, dst, src1, Assembler::LT);
1706       }
1707       break;
1708     }
1709     default:
1710       assert(false, "unsupported");
1711       ShouldNotReachHere();
1712   }
1713 
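  // The upper bits of src1 beyond the element size may be arbitrary, so
  // sign-extend sub-word results of the bitwise combine to a canonical value.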
1714   if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
1715     if (bt == T_BYTE) {
1716       sxtb(dst, dst);
1717     } else if (bt == T_SHORT) {
1718       sxth(dst, dst);
1719     }
1720   }
1721 }
1722 
1723 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
// to false otherwise. The input "lane_cnt" must not exceed the max vector length
// supported for the basic type. Clobbers: rscratch1 and rflags.
1726 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
1727   uint32_t max_vector_length = Matcher::max_vector_size(bt);
1728   assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
1729 
1730   // Set all elements to false if the input "lane_cnt" is zero.
1731   if (lane_cnt == 0) {
1732     sve_pfalse(dst);
1733     return;
1734   }
1735 
1736   SIMD_RegVariant size = elemType_to_regVariant(bt);
1737   assert(size != Q, "invalid size");
1738 
  // Set all elements to true if "lane_cnt" equals the max lane count.
1740   if (lane_cnt == max_vector_length) {
1741     sve_ptrue(dst, size, /* ALL */ 0b11111);
1742     return;
1743   }
1744 
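  // SVE "ptrue" can encode only a fixed set of element-count patterns: VL1-VL8,
  // VL16/32/64/128/256, POW2, MUL3, MUL4 and ALL. Any other count falls back to
  // "whilelow" at the end.
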
1745   // Fixed numbers for "ptrue".
  switch (lane_cnt) {
1747   case 1: /* VL1 */
1748   case 2: /* VL2 */
1749   case 3: /* VL3 */
1750   case 4: /* VL4 */
1751   case 5: /* VL5 */
1752   case 6: /* VL6 */
1753   case 7: /* VL7 */
1754   case 8: /* VL8 */
1755     sve_ptrue(dst, size, lane_cnt);
1756     return;
1757   case 16:
1758     sve_ptrue(dst, size, /* VL16 */ 0b01001);
1759     return;
1760   case 32:
1761     sve_ptrue(dst, size, /* VL32 */ 0b01010);
1762     return;
1763   case 64:
1764     sve_ptrue(dst, size, /* VL64 */ 0b01011);
1765     return;
1766   case 128:
1767     sve_ptrue(dst, size, /* VL128 */ 0b01100);
1768     return;
1769   case 256:
1770     sve_ptrue(dst, size, /* VL256 */ 0b01101);
1771     return;
1772   default:
1773     break;
1774   }
1775 
1776   // Special patterns for "ptrue".
1777   if (lane_cnt == round_down_power_of_2(max_vector_length)) {
1778     sve_ptrue(dst, size, /* POW2 */ 0b00000);
1779   } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
1780     sve_ptrue(dst, size, /* MUL4 */ 0b11101);
1781   } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
1782     sve_ptrue(dst, size, /* MUL3 */ 0b11110);
1783   } else {
    // Fall back to "whilelow" for the remaining cases.
1785     mov(rscratch1, lane_cnt);
1786     sve_whilelow(dst, size, zr, rscratch1);
1787   }
1788 }
1789 
1790 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
1791 // Any remaining elements of dst will be filled with zero.
1792 // Clobbers: rscratch1
1793 // Preserves: src, mask
1794 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
1795                                            FloatRegister vtmp1, FloatRegister vtmp2,
1796                                            PRegister pgtmp) {
1797   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
1798   assert_different_registers(dst, src, vtmp1, vtmp2);
1799   assert_different_registers(mask, pgtmp);
1800 
1801   // Example input:   src   = 8888 7777 6666 5555 4444 3333 2222 1111
1802   //                  mask  = 0001 0000 0000 0001 0001 0000 0001 0001
1803   // Expected result: dst   = 0000 0000 0000 8888 5555 4444 2222 1111
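
  // SVE "compact" supports only word and doubleword elements, so widen the
  // halfwords to words, compact each half, then narrow and recombine.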
1804   sve_dup(vtmp2, H, 0);
1805 
1806   // Extend lowest half to type INT.
1807   // dst = 00004444 00003333 00002222 00001111
1808   sve_uunpklo(dst, S, src);
1809   // pgtmp = 00000001 00000000 00000001 00000001
1810   sve_punpklo(pgtmp, mask);
  // Pack the active INT-sized elements to the right,
  // and fill the remaining lanes with zero.
1813   // dst = 00000000 00004444 00002222 00001111
1814   sve_compact(dst, S, dst, pgtmp);
1815   // Narrow the result back to type SHORT.
1816   // dst = 0000 0000 0000 0000 0000 4444 2222 1111
1817   sve_uzp1(dst, H, dst, vtmp2);
1818   // Count the active elements of lowest half.
1819   // rscratch1 = 3
1820   sve_cntp(rscratch1, S, ptrue, pgtmp);
1821 
  // Repeat for the highest half.
1823   // pgtmp = 00000001 00000000 00000000 00000001
1824   sve_punpkhi(pgtmp, mask);
1825   // vtmp1 = 00008888 00007777 00006666 00005555
1826   sve_uunpkhi(vtmp1, S, src);
1827   // vtmp1 = 00000000 00000000 00008888 00005555
1828   sve_compact(vtmp1, S, vtmp1, pgtmp);
1829   // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
1830   sve_uzp1(vtmp1, H, vtmp1, vtmp2);
1831 
  // Compressed low:   dst   = 0000 0000 0000 0000 0000 4444 2222 1111
  // Compressed high:  vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  // Shift the compressed high part left (across lanes) by TRUE_CNT lanes, where
  // TRUE_CNT is the number of active elements in the compressed low part.
1836   neg(rscratch1, rscratch1);
1837   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
1838   sve_index(vtmp2, H, rscratch1, 1);
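  // "tbl" writes zero to lanes whose index is out of range, so the negative
  // indices clear the lanes below TRUE_CNT while the rest shift up.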
1839   // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
1840   sve_tbl(vtmp1, H, vtmp1, vtmp2);
1841 
  // Combine the shifted compressed high part with the compressed low part.
1843   // dst = 0000 0000 0000 8888 5555 4444 2222 1111
1844   sve_orr(dst, dst, vtmp1);
1845 }
1846 
1847 // Clobbers: rscratch1, rscratch2
1848 // Preserves: src, mask
1849 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
1850                                           FloatRegister vtmp1, FloatRegister vtmp2,
1851                                           FloatRegister vtmp3, FloatRegister vtmp4,
1852                                           PRegister ptmp, PRegister pgtmp) {
1853   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
1854   assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
1855   assert_different_registers(mask, ptmp, pgtmp);
1856   // Example input:   src   = 88 77 66 55 44 33 22 11
1857   //                  mask  = 01 00 00 01 01 00 01 01
1858   // Expected result: dst   = 00 00 00 88 55 44 22 11
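  // There is no byte-sized "compact", so widen each half to SHORT, compress it
  // with sve_compress_short, then narrow back and splice the two halves together.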
1859 
1860   sve_dup(vtmp4, B, 0);
1861   // Extend lowest half to type SHORT.
1862   // vtmp1 = 0044 0033 0022 0011
1863   sve_uunpklo(vtmp1, H, src);
1864   // ptmp = 0001 0000 0001 0001
1865   sve_punpklo(ptmp, mask);
1866   // Count the active elements of lowest half.
1867   // rscratch2 = 3
1868   sve_cntp(rscratch2, H, ptrue, ptmp);
  // Pack the active SHORT-sized elements to the right,
  // and fill the remaining lanes with zero.
1871   // dst = 0000 0044 0022 0011
1872   sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
1873   // Narrow the result back to type BYTE.
1874   // dst = 00 00 00 00 00 44 22 11
1875   sve_uzp1(dst, B, dst, vtmp4);
1876 
  // Repeat for the highest half.
1878   // ptmp = 0001 0000 0000 0001
1879   sve_punpkhi(ptmp, mask);
  // vtmp2 = 0088 0077 0066 0055
  sve_uunpkhi(vtmp2, H, src);
1882   // vtmp1 = 0000 0000 0088 0055
1883   sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);
1884 
  sve_dup(vtmp4, B, 0); // vtmp4 was clobbered by the compress above, so re-zero it
1886   // vtmp1 = 00 00 00 00 00 00 88 55
1887   sve_uzp1(vtmp1, B, vtmp1, vtmp4);
1888 
1889   // Compressed low:   dst   = 00 00 00 00 00 44 22 11
1890   // Compressed high:  vtmp1 = 00 00 00 00 00 00 88 55
  // Shift the compressed high part left (across lanes) by TRUE_CNT lanes, where
  // TRUE_CNT is the number of active elements in the compressed low part.
1893   neg(rscratch2, rscratch2);
1894   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
1895   sve_index(vtmp2, B, rscratch2, 1);
1896   // vtmp1 = 00 00 00 88 55 00 00 00
1897   sve_tbl(vtmp1, B, vtmp1, vtmp2);
  // Combine the shifted compressed high part with the compressed low part.
1899   // dst = 00 00 00 88 55 44 22 11
1900   sve_orr(dst, dst, vtmp1);
1901 }
1902 
1903 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
1904   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
1905   SIMD_Arrangement size = isQ ? T16B : T8B;
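  // "rbit" reverses the bits within each byte only, so wider element types must
  // have their byte order reversed first.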
1906   if (bt == T_BYTE) {
1907     rbit(dst, size, src);
1908   } else {
1909     neon_reverse_bytes(dst, src, bt, isQ);
1910     rbit(dst, size, dst);
1911   }
1912 }
1913 
1914 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
1915   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
1916   SIMD_Arrangement size = isQ ? T16B : T8B;
1917   switch (bt) {
1918     case T_BYTE:
1919       if (dst != src) {
1920         orr(dst, size, src, src);
1921       }
1922       break;
1923     case T_SHORT:
1924       rev16(dst, size, src);
1925       break;
1926     case T_INT:
1927       rev32(dst, size, src);
1928       break;
1929     case T_LONG:
1930       rev64(dst, size, src);
1931       break;
1932     default:
1933       assert(false, "unsupported");
1934       ShouldNotReachHere();
1935   }
1936 }
1937 
// Extract a scalar element from an SVE vector at position 'idx'.
1939 // The input elements in src are expected to be of integral type.
1940 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
1941                                              int idx, FloatRegister vtmp) {
1942   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
1943   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (regVariant_to_elemBits(size) * idx < 128) { // generate a lower-cost NEON instruction
1945     if (bt == T_INT || bt == T_LONG) {
1946       umov(dst, src, size, idx);
1947     } else {
1948       smov(dst, src, size, idx);
1949     }
1950   } else {
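    // The element lies beyond the low 128 bits that NEON moves can address,
    // so shift it down to element 0 with "ext" first.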
1951     sve_orr(vtmp, src, src);
1952     sve_ext(vtmp, vtmp, idx << size);
1953     if (bt == T_INT || bt == T_LONG) {
1954       umov(dst, vtmp, size, 0);
1955     } else {
1956       smov(dst, vtmp, size, 0);
1957     }
1958   }
1959 }
1960 
1961 // java.lang.Math::round intrinsics
1962 
1963 // Clobbers: rscratch1, rflags
1964 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
1965                                           FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
1966   assert_different_registers(tmp1, tmp2, tmp3, src, dst);
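  // Math.round rounds half up. fcvtas (round to nearest, ties away from zero)
  // already gives that result for non-negative inputs, NaN, and values too large
  // to have a fractional part; only the remaining negative inputs need the
  // floor(src + 0.5) computation, selected via the mask built below.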
1967   switch (T) {
1968     case T2S:
1969     case T4S:
1970       fmovs(tmp1, T, 0.5f);
1971       mov(rscratch1, jint_cast(0x1.0p23f));
1972       break;
1973     case T2D:
1974       fmovd(tmp1, T, 0.5);
1975       mov(rscratch1, julong_cast(0x1.0p52));
1976       break;
1977     default:
1978       assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
1979   }
1980   fadd(tmp1, T, tmp1, src);
1981   fcvtms(tmp1, T, tmp1);
1982   // tmp1 = floor(src + 0.5, ties to even)
1983 
1984   fcvtas(dst, T, src);
1985   // dst = round(src), ties to away
1986 
1987   fneg(tmp3, T, src);
1988   dup(tmp2, T, rscratch1);
1989   cmhs(tmp3, T, tmp3, tmp2);
  // tmp3 is now a per-lane mask: all-ones where the ties-to-away result is
  // already correct (src non-negative, NaN, or too large to have a fraction)
1991 
1992   bif(dst, T16B, tmp1, tmp3);
1993   // result in dst
1994 }
1995 
1996 // Clobbers: rscratch1, rflags
1997 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
1998                                          FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
1999   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2000   assert_different_registers(tmp1, tmp2, src, dst);
2001 
2002   switch (T) {
2003     case S:
2004       mov(rscratch1, jint_cast(0x1.0p23f));
2005       break;
2006     case D:
2007       mov(rscratch1, julong_cast(0x1.0p52));
2008       break;
2009     default:
2010       assert(T == S || T == D, "invalid register variant");
2011   }
2012 
2013   sve_frinta(dst, T, ptrue, src);
2014   // dst = round(src), ties to away
2015 
2016   Label none;
2017 
2018   sve_fneg(tmp1, T, ptrue, src);
2019   sve_dup(tmp2, T, rscratch1);
2020   sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
2021   br(EQ, none);
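  // Skip the floor(src + 0.5) fix-up when no lane needs it.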
2022   {
2023     sve_cpy(tmp1, T, pgtmp, 0.5);
2024     sve_fadd(tmp1, T, pgtmp, src);
2025     sve_frintm(dst, T, pgtmp, tmp1);
2026     // dst = floor(src + 0.5, ties to even)
2027   }
2028   bind(none);
2029 
2030   sve_fcvtzs(dst, T, ptrue, dst, T);
2031   // result in dst
2032 }
2033 
2034 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2035                                            FloatRegister one, SIMD_Arrangement T) {
2036   assert_different_registers(dst, src, zero, one);
2037   assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2038 
2039   facgt(dst, T, src, zero);
2040   ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
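  // bsl takes bits from "one" where the mask is set and from src elsewhere:
  // normal lanes get +1.0's magnitude with src's sign bit, while +/-0.0 and
  // NaN lanes pass through unchanged.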
2041   bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2042 }
2043 
2044 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2045                                           FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
  assert_different_registers(dst, src, zero, one, vtmp);
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");

  sve_orr(vtmp, src, src);
  sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +/-0.0 and NaN, 1 otherwise
  switch (T) {
    case S:
      sve_and(vtmp, T, min_jint);       // Keep only the sign bit in every lane of src
      sve_orr(vtmp, T, jint_cast(1.0)); // OR it with the bits of +1.0 so that each lane
                                        // becomes +1.0 or -1.0, matching the sign of src
      break;
    case D:
      sve_and(vtmp, T, min_jlong);
      sve_orr(vtmp, T, jlong_cast(1.0));
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  sve_sel(dst, T, pgtmp, vtmp, src); // Select vtmp (+/-1.0) where pgtmp is set, src otherwise
                                     // Result in dst
2067 }
2068 
2069 bool C2_MacroAssembler::in_scratch_emit_size() {
2070   if (ciEnv::current()->task() != NULL) {
2071     PhaseOutput* phase_output = Compile::current()->output();
2072     if (phase_output != NULL && phase_output->in_scratch_emit_size()) {
2073       return true;
2074     }
2075   }
2076   return MacroAssembler::in_scratch_emit_size();
2077 }