1 /*
   2  * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2020, 2022, Huawei Technologies Co., Ltd. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/assembler.hpp"
  28 #include "asm/assembler.inline.hpp"
  29 #include "opto/c2_MacroAssembler.hpp"
  30 #include "opto/intrinsicnode.hpp"
  31 #include "opto/subnode.hpp"
  32 #include "runtime/stubRoutines.hpp"
  33 
  34 #ifdef PRODUCT
  35 #define BLOCK_COMMENT(str) /* nothing */
  36 #define STOP(error) stop(error)
  37 #else
  38 #define BLOCK_COMMENT(str) block_comment(str)
  39 #define STOP(error) block_comment(error); stop(error)
  40 #endif
  41 
  42 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  43 
  44 // short string
  45 // StringUTF16.indexOfChar
  46 // StringLatin1.indexOfChar
  47 void C2_MacroAssembler::string_indexof_char_short(Register str1, Register cnt1,
  48                                                   Register ch, Register result,
  49                                                   bool isL)
  50 {
  51   Register ch1 = t0;
  52   Register index = t1;
  53 
  54   BLOCK_COMMENT("string_indexof_char_short {");
  55 
  56   Label LOOP, LOOP1, LOOP4, LOOP8;
  57   Label MATCH,  MATCH1, MATCH2, MATCH3,
  58         MATCH4, MATCH5, MATCH6, MATCH7, NOMATCH;
  59 
  60   mv(result, -1);
  61   mv(index, zr);
  62 
  63   bind(LOOP);
  64   addi(t0, index, 8);
  65   ble(t0, cnt1, LOOP8);
  66   addi(t0, index, 4);
  67   ble(t0, cnt1, LOOP4);
  68   j(LOOP1);
  69 
  70   bind(LOOP8);
  71   isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0));
  72   beq(ch, ch1, MATCH);
  73   isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2));
  74   beq(ch, ch1, MATCH1);
  75   isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4));
  76   beq(ch, ch1, MATCH2);
  77   isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6));
  78   beq(ch, ch1, MATCH3);
  79   isL ? lbu(ch1, Address(str1, 4)) : lhu(ch1, Address(str1, 8));
  80   beq(ch, ch1, MATCH4);
  81   isL ? lbu(ch1, Address(str1, 5)) : lhu(ch1, Address(str1, 10));
  82   beq(ch, ch1, MATCH5);
  83   isL ? lbu(ch1, Address(str1, 6)) : lhu(ch1, Address(str1, 12));
  84   beq(ch, ch1, MATCH6);
  85   isL ? lbu(ch1, Address(str1, 7)) : lhu(ch1, Address(str1, 14));
  86   beq(ch, ch1, MATCH7);
  87   addi(index, index, 8);
  88   addi(str1, str1, isL ? 8 : 16);
  89   blt(index, cnt1, LOOP);
  90   j(NOMATCH);
  91 
  92   bind(LOOP4);
  93   isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0));
  94   beq(ch, ch1, MATCH);
  95   isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2));
  96   beq(ch, ch1, MATCH1);
  97   isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4));
  98   beq(ch, ch1, MATCH2);
  99   isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6));
 100   beq(ch, ch1, MATCH3);
 101   addi(index, index, 4);
 102   addi(str1, str1, isL ? 4 : 8);
 103   bge(index, cnt1, NOMATCH);
 104 
 105   bind(LOOP1);
 106   isL ? lbu(ch1, Address(str1)) : lhu(ch1, Address(str1));
 107   beq(ch, ch1, MATCH);
 108   addi(index, index, 1);
 109   addi(str1, str1, isL ? 1 : 2);
 110   blt(index, cnt1, LOOP1);
 111   j(NOMATCH);
 112 
 113   bind(MATCH1);
 114   addi(index, index, 1);
 115   j(MATCH);
 116 
 117   bind(MATCH2);
 118   addi(index, index, 2);
 119   j(MATCH);
 120 
 121   bind(MATCH3);
 122   addi(index, index, 3);
 123   j(MATCH);
 124 
 125   bind(MATCH4);
 126   addi(index, index, 4);
 127   j(MATCH);
 128 
 129   bind(MATCH5);
 130   addi(index, index, 5);
 131   j(MATCH);
 132 
 133   bind(MATCH6);
 134   addi(index, index, 6);
 135   j(MATCH);
 136 
 137   bind(MATCH7);
 138   addi(index, index, 7);
 139 
 140   bind(MATCH);
 141   mv(result, index);
 142   bind(NOMATCH);
 143   BLOCK_COMMENT("} string_indexof_char_short");
 144 }
 145 
 146 // StringUTF16.indexOfChar
 147 // StringLatin1.indexOfChar
 148 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
 149                                             Register ch, Register result,
 150                                             Register tmp1, Register tmp2,
 151                                             Register tmp3, Register tmp4,
 152                                             bool isL)
 153 {
 154   Label CH1_LOOP, HIT, NOMATCH, DONE, DO_LONG;
 155   Register ch1 = t0;
 156   Register orig_cnt = t1;
 157   Register mask1 = tmp3;
 158   Register mask2 = tmp2;
 159   Register match_mask = tmp1;
 160   Register trailing_char = tmp4;
 161   Register unaligned_elems = tmp4;
 162 
 163   BLOCK_COMMENT("string_indexof_char {");
 164   beqz(cnt1, NOMATCH);
 165 
 166   addi(t0, cnt1, isL ? -32 : -16);
 167   bgtz(t0, DO_LONG);
 168   string_indexof_char_short(str1, cnt1, ch, result, isL);
 169   j(DONE);
 170 
 171   bind(DO_LONG);
 172   mv(orig_cnt, cnt1);
 173   if (AvoidUnalignedAccesses) {
 174     Label ALIGNED;
 175     andi(unaligned_elems, str1, 0x7);
 176     beqz(unaligned_elems, ALIGNED);
 177     sub(unaligned_elems, unaligned_elems, 8);
 178     neg(unaligned_elems, unaligned_elems);
 179     if (!isL) {
 180       srli(unaligned_elems, unaligned_elems, 1);
 181     }
 182     // do unaligned part per element
 183     string_indexof_char_short(str1, unaligned_elems, ch, result, isL);
 184     bgez(result, DONE);
 185     mv(orig_cnt, cnt1);
 186     sub(cnt1, cnt1, unaligned_elems);
 187     bind(ALIGNED);
 188   }
 189 
 190   // duplicate ch
 191   if (isL) {
 192     slli(ch1, ch, 8);
 193     orr(ch, ch1, ch);
 194   }
 195   slli(ch1, ch, 16);
 196   orr(ch, ch1, ch);
 197   slli(ch1, ch, 32);
 198   orr(ch, ch1, ch);
 199 
 200   if (!isL) {
 201     slli(cnt1, cnt1, 1);
 202   }
 203 
 204   uint64_t mask0101 = UCONST64(0x0101010101010101);
 205   uint64_t mask0001 = UCONST64(0x0001000100010001);
 206   mv(mask1, isL ? mask0101 : mask0001);
 207   uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f);
 208   uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff);
 209   mv(mask2, isL ? mask7f7f : mask7fff);
 210 
 211   bind(CH1_LOOP);
 212   ld(ch1, Address(str1));
 213   addi(str1, str1, 8);
 214   addi(cnt1, cnt1, -8);
 215   compute_match_mask(ch1, ch, match_mask, mask1, mask2);
 216   bnez(match_mask, HIT);
 217   bgtz(cnt1, CH1_LOOP);
 218   j(NOMATCH);
 219 
 220   bind(HIT);
 221   ctzc_bit(trailing_char, match_mask, isL, ch1, result);
 222   srli(trailing_char, trailing_char, 3);
 223   addi(cnt1, cnt1, 8);
 224   ble(cnt1, trailing_char, NOMATCH);
 225   // match case
 226   if (!isL) {
 227     srli(cnt1, cnt1, 1);
 228     srli(trailing_char, trailing_char, 1);
 229   }
 230 
 231   sub(result, orig_cnt, cnt1);
 232   add(result, result, trailing_char);
 233   j(DONE);
 234 
 235   bind(NOMATCH);
 236   mv(result, -1);
 237 
 238   bind(DONE);
 239   BLOCK_COMMENT("} string_indexof_char");
 240 }
 241 
 242 typedef void (MacroAssembler::* load_chr_insn)(Register rd, const Address &adr, Register temp);
 243 
 244 // Search for needle in haystack and return index or -1
 245 // x10: result
 246 // x11: haystack
 247 // x12: haystack_len
 248 // x13: needle
 249 // x14: needle_len
 250 void C2_MacroAssembler::string_indexof(Register haystack, Register needle,
 251                                        Register haystack_len, Register needle_len,
 252                                        Register tmp1, Register tmp2,
 253                                        Register tmp3, Register tmp4,
 254                                        Register tmp5, Register tmp6,
 255                                        Register result, int ae)
 256 {
 257   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
 258 
 259   Label LINEARSEARCH, LINEARSTUB, DONE, NOMATCH;
 260 
 261   Register ch1 = t0;
 262   Register ch2 = t1;
 263   Register nlen_tmp = tmp1; // needle len tmp
 264   Register hlen_tmp = tmp2; // haystack len tmp
 265   Register result_tmp = tmp4;
 266 
 267   bool isLL = ae == StrIntrinsicNode::LL;
 268 
 269   bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
 270   bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
 271   int needle_chr_shift = needle_isL ? 0 : 1;
 272   int haystack_chr_shift = haystack_isL ? 0 : 1;
 273   int needle_chr_size = needle_isL ? 1 : 2;
 274   int haystack_chr_size = haystack_isL ? 1 : 2;
 275   load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu :
 276                               (load_chr_insn)&MacroAssembler::lhu;
 277   load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu :
 278                                 (load_chr_insn)&MacroAssembler::lhu;
 279 
 280   BLOCK_COMMENT("string_indexof {");
 281 
 282   // Note, inline_string_indexOf() generates checks:
 283   // if (pattern.count > src.count) return -1;
 284   // if (pattern.count == 0) return 0;
 285 
 286   // We have two strings, a source string in haystack, haystack_len and a pattern string
 287   // in needle, needle_len. Find the first occurence of pattern in source or return -1.
 288 
 289   // For larger pattern and source we use a simplified Boyer Moore algorithm.
 290   // With a small pattern and source we use linear scan.
 291 
 292   // needle_len >=8 && needle_len < 256 && needle_len < haystack_len/4, use bmh algorithm.
 293   sub(result_tmp, haystack_len, needle_len);
 294   // needle_len < 8, use linear scan
 295   sub(t0, needle_len, 8);
 296   bltz(t0, LINEARSEARCH);
 297   // needle_len >= 256, use linear scan
 298   sub(t0, needle_len, 256);
 299   bgez(t0, LINEARSTUB);
 300   // needle_len >= haystack_len/4, use linear scan
 301   srli(t0, haystack_len, 2);
 302   bge(needle_len, t0, LINEARSTUB);
 303 
 304   // Boyer-Moore-Horspool introduction:
 305   // The Boyer Moore alogorithm is based on the description here:-
 306   //
 307   // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
 308   //
 309   // This describes and algorithm with 2 shift rules. The 'Bad Character' rule
 310   // and the 'Good Suffix' rule.
 311   //
 312   // These rules are essentially heuristics for how far we can shift the
 313   // pattern along the search string.
 314   //
 315   // The implementation here uses the 'Bad Character' rule only because of the
 316   // complexity of initialisation for the 'Good Suffix' rule.
 317   //
 318   // This is also known as the Boyer-Moore-Horspool algorithm:
 319   //
 320   // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
 321   //
 322   // #define ASIZE 256
 323   //
 324   //    int bm(unsigned char *pattern, int m, unsigned char *src, int n) {
 325   //      int i, j;
 326   //      unsigned c;
 327   //      unsigned char bc[ASIZE];
 328   //
 329   //      /* Preprocessing */
 330   //      for (i = 0; i < ASIZE; ++i)
 331   //        bc[i] = m;
 332   //      for (i = 0; i < m - 1; ) {
 333   //        c = pattern[i];
 334   //        ++i;
 335   //        // c < 256 for Latin1 string, so, no need for branch
 336   //        #ifdef PATTERN_STRING_IS_LATIN1
 337   //        bc[c] = m - i;
 338   //        #else
 339   //        if (c < ASIZE) bc[c] = m - i;
 340   //        #endif
 341   //      }
 342   //
 343   //      /* Searching */
 344   //      j = 0;
 345   //      while (j <= n - m) {
 346   //        c = src[i+j];
 347   //        if (pattern[m-1] == c)
 348   //          int k;
 349   //          for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k);
 350   //          if (k < 0) return j;
 351   //          // c < 256 for Latin1 string, so, no need for branch
 352   //          #ifdef SOURCE_STRING_IS_LATIN1_AND_PATTERN_STRING_IS_LATIN1
 353   //          // LL case: (c< 256) always true. Remove branch
 354   //          j += bc[pattern[j+m-1]];
 355   //          #endif
 356   //          #ifdef SOURCE_STRING_IS_UTF_AND_PATTERN_STRING_IS_UTF
 357   //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
 358   //          if (c < ASIZE)
 359   //            j += bc[pattern[j+m-1]];
 360   //          else
 361   //            j += 1
 362   //          #endif
 363   //          #ifdef SOURCE_IS_UTF_AND_PATTERN_IS_LATIN1
 364   //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
 365   //          if (c < ASIZE)
 366   //            j += bc[pattern[j+m-1]];
 367   //          else
 368   //            j += m
 369   //          #endif
 370   //      }
 371   //      return -1;
 372   //    }
 373 
 374   // temp register:t0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, result
 375   Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
 376         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
 377 
 378   Register haystack_end = haystack_len;
 379   Register skipch = tmp2;
 380 
 381   // pattern length is >=8, so, we can read at least 1 register for cases when
 382   // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
 383   // UL case. We'll re-read last character in inner pre-loop code to have
 384   // single outer pre-loop load
 385   const int firstStep = isLL ? 7 : 3;
 386 
 387   const int ASIZE = 256;
 388   const int STORE_BYTES = 8; // 8 bytes stored per instruction(sd)
 389 
 390   sub(sp, sp, ASIZE);
 391 
 392   // init BC offset table with default value: needle_len
 393   slli(t0, needle_len, 8);
 394   orr(t0, t0, needle_len); // [63...16][needle_len][needle_len]
 395   slli(tmp1, t0, 16);
 396   orr(t0, tmp1, t0); // [63...32][needle_len][needle_len][needle_len][needle_len]
 397   slli(tmp1, t0, 32);
 398   orr(tmp5, tmp1, t0); // tmp5: 8 elements [needle_len]
 399 
 400   mv(ch1, sp);  // ch1 is t0
 401   mv(tmp6, ASIZE / STORE_BYTES); // loop iterations
 402 
 403   bind(BM_INIT_LOOP);
 404   // for (i = 0; i < ASIZE; ++i)
 405   //   bc[i] = m;
 406   for (int i = 0; i < 4; i++) {
 407     sd(tmp5, Address(ch1, i * wordSize));
 408   }
 409   add(ch1, ch1, 32);
 410   sub(tmp6, tmp6, 4);
 411   bgtz(tmp6, BM_INIT_LOOP);
 412 
 413   sub(nlen_tmp, needle_len, 1); // m - 1, index of the last element in pattern
 414   Register orig_haystack = tmp5;
 415   mv(orig_haystack, haystack);
 416   // result_tmp = tmp4
 417   shadd(haystack_end, result_tmp, haystack, haystack_end, haystack_chr_shift);
 418   sub(ch2, needle_len, 1); // bc offset init value, ch2 is t1
 419   mv(tmp3, needle);
 420 
 421   //  for (i = 0; i < m - 1; ) {
 422   //    c = pattern[i];
 423   //    ++i;
 424   //    // c < 256 for Latin1 string, so, no need for branch
 425   //    #ifdef PATTERN_STRING_IS_LATIN1
 426   //    bc[c] = m - i;
 427   //    #else
 428   //    if (c < ASIZE) bc[c] = m - i;
 429   //    #endif
 430   //  }
 431   bind(BCLOOP);
 432   (this->*needle_load_1chr)(ch1, Address(tmp3), noreg);
 433   add(tmp3, tmp3, needle_chr_size);
 434   if (!needle_isL) {
 435     // ae == StrIntrinsicNode::UU
 436     mv(tmp6, ASIZE);
 437     bgeu(ch1, tmp6, BCSKIP);
 438   }
 439   add(tmp4, sp, ch1);
 440   sb(ch2, Address(tmp4)); // store skip offset to BC offset table
 441 
 442   bind(BCSKIP);
 443   sub(ch2, ch2, 1); // for next pattern element, skip distance -1
 444   bgtz(ch2, BCLOOP);
 445 
 446   // tmp6: pattern end, address after needle
 447   shadd(tmp6, needle_len, needle, tmp6, needle_chr_shift);
 448   if (needle_isL == haystack_isL) {
 449     // load last 8 bytes (8LL/4UU symbols)
 450     ld(tmp6, Address(tmp6, -wordSize));
 451   } else {
 452     // UL: from UTF-16(source) search Latin1(pattern)
 453     lwu(tmp6, Address(tmp6, -wordSize / 2)); // load last 4 bytes(4 symbols)
 454     // convert Latin1 to UTF. eg: 0x0000abcd -> 0x0a0b0c0d
 455     // We'll have to wait until load completed, but it's still faster than per-character loads+checks
 456     srli(tmp3, tmp6, BitsPerByte * (wordSize / 2 - needle_chr_size)); // pattern[m-1], eg:0x0000000a
 457     slli(ch2, tmp6, XLEN - 24);
 458     srli(ch2, ch2, XLEN - 8); // pattern[m-2], 0x0000000b
 459     slli(ch1, tmp6, XLEN - 16);
 460     srli(ch1, ch1, XLEN - 8); // pattern[m-3], 0x0000000c
 461     andi(tmp6, tmp6, 0xff); // pattern[m-4], 0x0000000d
 462     slli(ch2, ch2, 16);
 463     orr(ch2, ch2, ch1); // 0x00000b0c
 464     slli(result, tmp3, 48); // use result as temp register
 465     orr(tmp6, tmp6, result); // 0x0a00000d
 466     slli(result, ch2, 16);
 467     orr(tmp6, tmp6, result); // UTF-16:0x0a0b0c0d
 468   }
 469 
 470   // i = m - 1;
 471   // skipch = j + i;
 472   // if (skipch == pattern[m - 1]
 473   //   for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k);
 474   // else
 475   //   move j with bad char offset table
 476   bind(BMLOOPSTR2);
 477   // compare pattern to source string backward
 478   shadd(result, nlen_tmp, haystack, result, haystack_chr_shift);
 479   (this->*haystack_load_1chr)(skipch, Address(result), noreg);
 480   sub(nlen_tmp, nlen_tmp, firstStep); // nlen_tmp is positive here, because needle_len >= 8
 481   if (needle_isL == haystack_isL) {
 482     // re-init tmp3. It's for free because it's executed in parallel with
 483     // load above. Alternative is to initialize it before loop, but it'll
 484     // affect performance on in-order systems with 2 or more ld/st pipelines
 485     srli(tmp3, tmp6, BitsPerByte * (wordSize - needle_chr_size)); // UU/LL: pattern[m-1]
 486   }
 487   if (!isLL) { // UU/UL case
 488     slli(ch2, nlen_tmp, 1); // offsets in bytes
 489   }
 490   bne(tmp3, skipch, BMSKIP); // if not equal, skipch is bad char
 491   add(result, haystack, isLL ? nlen_tmp : ch2);
 492   ld(ch2, Address(result)); // load 8 bytes from source string
 493   mv(ch1, tmp6);
 494   if (isLL) {
 495     j(BMLOOPSTR1_AFTER_LOAD);
 496   } else {
 497     sub(nlen_tmp, nlen_tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
 498     j(BMLOOPSTR1_CMP);
 499   }
 500 
 501   bind(BMLOOPSTR1);
 502   shadd(ch1, nlen_tmp, needle, ch1, needle_chr_shift);
 503   (this->*needle_load_1chr)(ch1, Address(ch1), noreg);
 504   shadd(ch2, nlen_tmp, haystack, ch2, haystack_chr_shift);
 505   (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
 506 
 507   bind(BMLOOPSTR1_AFTER_LOAD);
 508   sub(nlen_tmp, nlen_tmp, 1);
 509   bltz(nlen_tmp, BMLOOPSTR1_LASTCMP);
 510 
 511   bind(BMLOOPSTR1_CMP);
 512   beq(ch1, ch2, BMLOOPSTR1);
 513 
 514   bind(BMSKIP);
 515   if (!isLL) {
 516     // if we've met UTF symbol while searching Latin1 pattern, then we can
 517     // skip needle_len symbols
 518     if (needle_isL != haystack_isL) {
 519       mv(result_tmp, needle_len);
 520     } else {
 521       mv(result_tmp, 1);
 522     }
 523     mv(t0, ASIZE);
 524     bgeu(skipch, t0, BMADV);
 525   }
 526   add(result_tmp, sp, skipch);
 527   lbu(result_tmp, Address(result_tmp)); // load skip offset
 528 
 529   bind(BMADV);
 530   sub(nlen_tmp, needle_len, 1);
 531   // move haystack after bad char skip offset
 532   shadd(haystack, result_tmp, haystack, result, haystack_chr_shift);
 533   ble(haystack, haystack_end, BMLOOPSTR2);
 534   add(sp, sp, ASIZE);
 535   j(NOMATCH);
 536 
 537   bind(BMLOOPSTR1_LASTCMP);
 538   bne(ch1, ch2, BMSKIP);
 539 
 540   bind(BMMATCH);
 541   sub(result, haystack, orig_haystack);
 542   if (!haystack_isL) {
 543     srli(result, result, 1);
 544   }
 545   add(sp, sp, ASIZE);
 546   j(DONE);
 547 
 548   bind(LINEARSTUB);
 549   sub(t0, needle_len, 16); // small patterns still should be handled by simple algorithm
 550   bltz(t0, LINEARSEARCH);
 551   mv(result, zr);
 552   RuntimeAddress stub = NULL;
 553   if (isLL) {
 554     stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ll());
 555     assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
 556   } else if (needle_isL) {
 557     stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ul());
 558     assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
 559   } else {
 560     stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_uu());
 561     assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
 562   }
 563   trampoline_call(stub);
 564   j(DONE);
 565 
 566   bind(NOMATCH);
 567   mv(result, -1);
 568   j(DONE);
 569 
 570   bind(LINEARSEARCH);
 571   string_indexof_linearscan(haystack, needle, haystack_len, needle_len, tmp1, tmp2, tmp3, tmp4, -1, result, ae);
 572 
 573   bind(DONE);
 574   BLOCK_COMMENT("} string_indexof");
 575 }
 576 
 577 // string_indexof
 578 // result: x10
 579 // src: x11
 580 // src_count: x12
 581 // pattern: x13
 582 // pattern_count: x14 or 1/2/3/4
 583 void C2_MacroAssembler::string_indexof_linearscan(Register haystack, Register needle,
 584                                                Register haystack_len, Register needle_len,
 585                                                Register tmp1, Register tmp2,
 586                                                Register tmp3, Register tmp4,
 587                                                int needle_con_cnt, Register result, int ae)
 588 {
 589   // Note:
 590   // needle_con_cnt > 0 means needle_len register is invalid, needle length is constant
 591   // for UU/LL: needle_con_cnt[1, 4], UL: needle_con_cnt = 1
 592   assert(needle_con_cnt <= 4, "Invalid needle constant count");
 593   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
 594 
 595   Register ch1 = t0;
 596   Register ch2 = t1;
 597   Register hlen_neg = haystack_len, nlen_neg = needle_len;
 598   Register nlen_tmp = tmp1, hlen_tmp = tmp2, result_tmp = tmp4;
 599 
 600   bool isLL = ae == StrIntrinsicNode::LL;
 601 
 602   bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
 603   bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
 604   int needle_chr_shift = needle_isL ? 0 : 1;
 605   int haystack_chr_shift = haystack_isL ? 0 : 1;
 606   int needle_chr_size = needle_isL ? 1 : 2;
 607   int haystack_chr_size = haystack_isL ? 1 : 2;
 608 
 609   load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu :
 610                               (load_chr_insn)&MacroAssembler::lhu;
 611   load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu :
 612                                 (load_chr_insn)&MacroAssembler::lhu;
 613   load_chr_insn load_2chr = isLL ? (load_chr_insn)&MacroAssembler::lhu : (load_chr_insn)&MacroAssembler::lwu;
 614   load_chr_insn load_4chr = isLL ? (load_chr_insn)&MacroAssembler::lwu : (load_chr_insn)&MacroAssembler::ld;
 615 
 616   Label DO1, DO2, DO3, MATCH, NOMATCH, DONE;
 617 
 618   Register first = tmp3;
 619 
 620   if (needle_con_cnt == -1) {
 621     Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
 622 
 623     sub(t0, needle_len, needle_isL == haystack_isL ? 4 : 2);
 624     bltz(t0, DOSHORT);
 625 
 626     (this->*needle_load_1chr)(first, Address(needle), noreg);
 627     slli(t0, needle_len, needle_chr_shift);
 628     add(needle, needle, t0);
 629     neg(nlen_neg, t0);
 630     slli(t0, result_tmp, haystack_chr_shift);
 631     add(haystack, haystack, t0);
 632     neg(hlen_neg, t0);
 633 
 634     bind(FIRST_LOOP);
 635     add(t0, haystack, hlen_neg);
 636     (this->*haystack_load_1chr)(ch2, Address(t0), noreg);
 637     beq(first, ch2, STR1_LOOP);
 638 
 639     bind(STR2_NEXT);
 640     add(hlen_neg, hlen_neg, haystack_chr_size);
 641     blez(hlen_neg, FIRST_LOOP);
 642     j(NOMATCH);
 643 
 644     bind(STR1_LOOP);
 645     add(nlen_tmp, nlen_neg, needle_chr_size);
 646     add(hlen_tmp, hlen_neg, haystack_chr_size);
 647     bgez(nlen_tmp, MATCH);
 648 
 649     bind(STR1_NEXT);
 650     add(ch1, needle, nlen_tmp);
 651     (this->*needle_load_1chr)(ch1, Address(ch1), noreg);
 652     add(ch2, haystack, hlen_tmp);
 653     (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
 654     bne(ch1, ch2, STR2_NEXT);
 655     add(nlen_tmp, nlen_tmp, needle_chr_size);
 656     add(hlen_tmp, hlen_tmp, haystack_chr_size);
 657     bltz(nlen_tmp, STR1_NEXT);
 658     j(MATCH);
 659 
 660     bind(DOSHORT);
 661     if (needle_isL == haystack_isL) {
 662       sub(t0, needle_len, 2);
 663       bltz(t0, DO1);
 664       bgtz(t0, DO3);
 665     }
 666   }
 667 
 668   if (needle_con_cnt == 4) {
 669     Label CH1_LOOP;
 670     (this->*load_4chr)(ch1, Address(needle), noreg);
 671     sub(result_tmp, haystack_len, 4);
 672     slli(tmp3, result_tmp, haystack_chr_shift); // result as tmp
 673     add(haystack, haystack, tmp3);
 674     neg(hlen_neg, tmp3);
 675 
 676     bind(CH1_LOOP);
 677     add(ch2, haystack, hlen_neg);
 678     (this->*load_4chr)(ch2, Address(ch2), noreg);
 679     beq(ch1, ch2, MATCH);
 680     add(hlen_neg, hlen_neg, haystack_chr_size);
 681     blez(hlen_neg, CH1_LOOP);
 682     j(NOMATCH);
 683   }
 684 
 685   if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 2) {
 686     Label CH1_LOOP;
 687     BLOCK_COMMENT("string_indexof DO2 {");
 688     bind(DO2);
 689     (this->*load_2chr)(ch1, Address(needle), noreg);
 690     if (needle_con_cnt == 2) {
 691       sub(result_tmp, haystack_len, 2);
 692     }
 693     slli(tmp3, result_tmp, haystack_chr_shift);
 694     add(haystack, haystack, tmp3);
 695     neg(hlen_neg, tmp3);
 696 
 697     bind(CH1_LOOP);
 698     add(tmp3, haystack, hlen_neg);
 699     (this->*load_2chr)(ch2, Address(tmp3), noreg);
 700     beq(ch1, ch2, MATCH);
 701     add(hlen_neg, hlen_neg, haystack_chr_size);
 702     blez(hlen_neg, CH1_LOOP);
 703     j(NOMATCH);
 704     BLOCK_COMMENT("} string_indexof DO2");
 705   }
 706 
 707   if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 3) {
 708     Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
 709     BLOCK_COMMENT("string_indexof DO3 {");
 710 
 711     bind(DO3);
 712     (this->*load_2chr)(first, Address(needle), noreg);
 713     (this->*needle_load_1chr)(ch1, Address(needle, 2 * needle_chr_size), noreg);
 714     if (needle_con_cnt == 3) {
 715       sub(result_tmp, haystack_len, 3);
 716     }
 717     slli(hlen_tmp, result_tmp, haystack_chr_shift);
 718     add(haystack, haystack, hlen_tmp);
 719     neg(hlen_neg, hlen_tmp);
 720 
 721     bind(FIRST_LOOP);
 722     add(ch2, haystack, hlen_neg);
 723     (this->*load_2chr)(ch2, Address(ch2), noreg);
 724     beq(first, ch2, STR1_LOOP);
 725 
 726     bind(STR2_NEXT);
 727     add(hlen_neg, hlen_neg, haystack_chr_size);
 728     blez(hlen_neg, FIRST_LOOP);
 729     j(NOMATCH);
 730 
 731     bind(STR1_LOOP);
 732     add(hlen_tmp, hlen_neg, 2 * haystack_chr_size);
 733     add(ch2, haystack, hlen_tmp);
 734     (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
 735     bne(ch1, ch2, STR2_NEXT);
 736     j(MATCH);
 737     BLOCK_COMMENT("} string_indexof DO3");
 738   }
 739 
 740   if (needle_con_cnt == -1 || needle_con_cnt == 1) {
 741     Label DO1_LOOP;
 742 
 743     BLOCK_COMMENT("string_indexof DO1 {");
 744     bind(DO1);
 745     (this->*needle_load_1chr)(ch1, Address(needle), noreg);
 746     sub(result_tmp, haystack_len, 1);
 747     mv(tmp3, result_tmp);
 748     if (haystack_chr_shift) {
 749       slli(tmp3, result_tmp, haystack_chr_shift);
 750     }
 751     add(haystack, haystack, tmp3);
 752     neg(hlen_neg, tmp3);
 753 
 754     bind(DO1_LOOP);
 755     add(tmp3, haystack, hlen_neg);
 756     (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg);
 757     beq(ch1, ch2, MATCH);
 758     add(hlen_neg, hlen_neg, haystack_chr_size);
 759     blez(hlen_neg, DO1_LOOP);
 760     BLOCK_COMMENT("} string_indexof DO1");
 761   }
 762 
 763   bind(NOMATCH);
 764   mv(result, -1);
 765   j(DONE);
 766 
 767   bind(MATCH);
 768   srai(t0, hlen_neg, haystack_chr_shift);
 769   add(result, result_tmp, t0);
 770 
 771   bind(DONE);
 772 }
 773 
 774 // Compare strings.
 775 void C2_MacroAssembler::string_compare(Register str1, Register str2,
 776                                     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
 777                                     Register tmp3, int ae)
 778 {
 779   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
 780       DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
 781       SHORT_LOOP_START, TAIL_CHECK, L;
 782 
 783   const int STUB_THRESHOLD = 64 + 8;
 784   bool isLL = ae == StrIntrinsicNode::LL;
 785   bool isLU = ae == StrIntrinsicNode::LU;
 786   bool isUL = ae == StrIntrinsicNode::UL;
 787 
 788   bool str1_isL = isLL || isLU;
 789   bool str2_isL = isLL || isUL;
 790 
 791   // for L strings, 1 byte for 1 character
 792   // for U strings, 2 bytes for 1 character
 793   int str1_chr_size = str1_isL ? 1 : 2;
 794   int str2_chr_size = str2_isL ? 1 : 2;
 795   int minCharsInWord = isLL ? wordSize : wordSize / 2;
 796 
 797   load_chr_insn str1_load_chr = str1_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu;
 798   load_chr_insn str2_load_chr = str2_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu;
 799 
 800   BLOCK_COMMENT("string_compare {");
 801 
 802   // Bizzarely, the counts are passed in bytes, regardless of whether they
 803   // are L or U strings, however the result is always in characters.
 804   if (!str1_isL) {
 805     sraiw(cnt1, cnt1, 1);
 806   }
 807   if (!str2_isL) {
 808     sraiw(cnt2, cnt2, 1);
 809   }
 810 
 811   // Compute the minimum of the string lengths and save the difference in result.
 812   sub(result, cnt1, cnt2);
 813   bgt(cnt1, cnt2, L);
 814   mv(cnt2, cnt1);
 815   bind(L);
 816 
 817   // A very short string
 818   li(t0, minCharsInWord);
 819   ble(cnt2, t0, SHORT_STRING);
 820 
 821   // Compare longwords
 822   // load first parts of strings and finish initialization while loading
 823   {
 824     if (str1_isL == str2_isL) { // LL or UU
 825       // load 8 bytes once to compare
 826       ld(tmp1, Address(str1));
 827       beq(str1, str2, DONE);
 828       ld(tmp2, Address(str2));
 829       li(t0, STUB_THRESHOLD);
 830       bge(cnt2, t0, STUB);
 831       sub(cnt2, cnt2, minCharsInWord);
 832       beqz(cnt2, TAIL_CHECK);
 833       // convert cnt2 from characters to bytes
 834       if (!str1_isL) {
 835         slli(cnt2, cnt2, 1);
 836       }
 837       add(str2, str2, cnt2);
 838       add(str1, str1, cnt2);
 839       sub(cnt2, zr, cnt2);
 840     } else if (isLU) { // LU case
 841       lwu(tmp1, Address(str1));
 842       ld(tmp2, Address(str2));
 843       li(t0, STUB_THRESHOLD);
 844       bge(cnt2, t0, STUB);
 845       addi(cnt2, cnt2, -4);
 846       add(str1, str1, cnt2);
 847       sub(cnt1, zr, cnt2);
 848       slli(cnt2, cnt2, 1);
 849       add(str2, str2, cnt2);
 850       inflate_lo32(tmp3, tmp1);
 851       mv(tmp1, tmp3);
 852       sub(cnt2, zr, cnt2);
 853       addi(cnt1, cnt1, 4);
 854     } else { // UL case
 855       ld(tmp1, Address(str1));
 856       lwu(tmp2, Address(str2));
 857       li(t0, STUB_THRESHOLD);
 858       bge(cnt2, t0, STUB);
 859       addi(cnt2, cnt2, -4);
 860       slli(t0, cnt2, 1);
 861       sub(cnt1, zr, t0);
 862       add(str1, str1, t0);
 863       add(str2, str2, cnt2);
 864       inflate_lo32(tmp3, tmp2);
 865       mv(tmp2, tmp3);
 866       sub(cnt2, zr, cnt2);
 867       addi(cnt1, cnt1, 8);
 868     }
 869     addi(cnt2, cnt2, isUL ? 4 : 8);
 870     bgez(cnt2, TAIL);
 871     xorr(tmp3, tmp1, tmp2);
 872     bnez(tmp3, DIFFERENCE);
 873 
 874     // main loop
 875     bind(NEXT_WORD);
 876     if (str1_isL == str2_isL) { // LL or UU
 877       add(t0, str1, cnt2);
 878       ld(tmp1, Address(t0));
 879       add(t0, str2, cnt2);
 880       ld(tmp2, Address(t0));
 881       addi(cnt2, cnt2, 8);
 882     } else if (isLU) { // LU case
 883       add(t0, str1, cnt1);
 884       lwu(tmp1, Address(t0));
 885       add(t0, str2, cnt2);
 886       ld(tmp2, Address(t0));
 887       addi(cnt1, cnt1, 4);
 888       inflate_lo32(tmp3, tmp1);
 889       mv(tmp1, tmp3);
 890       addi(cnt2, cnt2, 8);
 891     } else { // UL case
 892       add(t0, str2, cnt2);
 893       lwu(tmp2, Address(t0));
 894       add(t0, str1, cnt1);
 895       ld(tmp1, Address(t0));
 896       inflate_lo32(tmp3, tmp2);
 897       mv(tmp2, tmp3);
 898       addi(cnt1, cnt1, 8);
 899       addi(cnt2, cnt2, 4);
 900     }
 901     bgez(cnt2, TAIL);
 902 
 903     xorr(tmp3, tmp1, tmp2);
 904     beqz(tmp3, NEXT_WORD);
 905     j(DIFFERENCE);
 906     bind(TAIL);
 907     xorr(tmp3, tmp1, tmp2);
 908     bnez(tmp3, DIFFERENCE);
 909     // Last longword.  In the case where length == 4 we compare the
 910     // same longword twice, but that's still faster than another
 911     // conditional branch.
 912     if (str1_isL == str2_isL) { // LL or UU
 913       ld(tmp1, Address(str1));
 914       ld(tmp2, Address(str2));
 915     } else if (isLU) { // LU case
 916       lwu(tmp1, Address(str1));
 917       ld(tmp2, Address(str2));
 918       inflate_lo32(tmp3, tmp1);
 919       mv(tmp1, tmp3);
 920     } else { // UL case
 921       lwu(tmp2, Address(str2));
 922       ld(tmp1, Address(str1));
 923       inflate_lo32(tmp3, tmp2);
 924       mv(tmp2, tmp3);
 925     }
 926     bind(TAIL_CHECK);
 927     xorr(tmp3, tmp1, tmp2);
 928     beqz(tmp3, DONE);
 929 
 930     // Find the first different characters in the longwords and
 931     // compute their difference.
 932     bind(DIFFERENCE);
 933     ctzc_bit(result, tmp3, isLL); // count zero from lsb to msb
 934     srl(tmp1, tmp1, result);
 935     srl(tmp2, tmp2, result);
 936     if (isLL) {
 937       andi(tmp1, tmp1, 0xFF);
 938       andi(tmp2, tmp2, 0xFF);
 939     } else {
 940       andi(tmp1, tmp1, 0xFFFF);
 941       andi(tmp2, tmp2, 0xFFFF);
 942     }
 943     sub(result, tmp1, tmp2);
 944     j(DONE);
 945   }
 946 
 947   bind(STUB);
 948   RuntimeAddress stub = NULL;
 949   switch (ae) {
 950     case StrIntrinsicNode::LL:
 951       stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LL());
 952       break;
 953     case StrIntrinsicNode::UU:
 954       stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UU());
 955       break;
 956     case StrIntrinsicNode::LU:
 957       stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LU());
 958       break;
 959     case StrIntrinsicNode::UL:
 960       stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UL());
 961       break;
 962     default:
 963       ShouldNotReachHere();
 964   }
 965   assert(stub.target() != NULL, "compare_long_string stub has not been generated");
 966   trampoline_call(stub);
 967   j(DONE);
 968 
 969   bind(SHORT_STRING);
 970   // Is the minimum length zero?
 971   beqz(cnt2, DONE);
 972   // arrange code to do most branches while loading and loading next characters
 973   // while comparing previous
 974   (this->*str1_load_chr)(tmp1, Address(str1), t0);
 975   addi(str1, str1, str1_chr_size);
 976   addi(cnt2, cnt2, -1);
 977   beqz(cnt2, SHORT_LAST_INIT);
 978   (this->*str2_load_chr)(cnt1, Address(str2), t0);
 979   addi(str2, str2, str2_chr_size);
 980   j(SHORT_LOOP_START);
 981   bind(SHORT_LOOP);
 982   addi(cnt2, cnt2, -1);
 983   beqz(cnt2, SHORT_LAST);
 984   bind(SHORT_LOOP_START);
 985   (this->*str1_load_chr)(tmp2, Address(str1), t0);
 986   addi(str1, str1, str1_chr_size);
 987   (this->*str2_load_chr)(t0, Address(str2), t0);
 988   addi(str2, str2, str2_chr_size);
 989   bne(tmp1, cnt1, SHORT_LOOP_TAIL);
 990   addi(cnt2, cnt2, -1);
 991   beqz(cnt2, SHORT_LAST2);
 992   (this->*str1_load_chr)(tmp1, Address(str1), t0);
 993   addi(str1, str1, str1_chr_size);
 994   (this->*str2_load_chr)(cnt1, Address(str2), t0);
 995   addi(str2, str2, str2_chr_size);
 996   beq(tmp2, t0, SHORT_LOOP);
 997   sub(result, tmp2, t0);
 998   j(DONE);
 999   bind(SHORT_LOOP_TAIL);
1000   sub(result, tmp1, cnt1);
1001   j(DONE);
1002   bind(SHORT_LAST2);
1003   beq(tmp2, t0, DONE);
1004   sub(result, tmp2, t0);
1005 
1006   j(DONE);
1007   bind(SHORT_LAST_INIT);
1008   (this->*str2_load_chr)(cnt1, Address(str2), t0);
1009   addi(str2, str2, str2_chr_size);
1010   bind(SHORT_LAST);
1011   beq(tmp1, cnt1, DONE);
1012   sub(result, tmp1, cnt1);
1013 
1014   bind(DONE);
1015 
1016   BLOCK_COMMENT("} string_compare");
1017 }
1018 
1019 void C2_MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
1020                                       Register tmp4, Register tmp5, Register tmp6, Register result,
1021                                       Register cnt1, int elem_size) {
1022   Label DONE, SAME, NEXT_DWORD, SHORT, TAIL, TAIL2, IS_TMP5_ZR;
1023   Register tmp1 = t0;
1024   Register tmp2 = t1;
1025   Register cnt2 = tmp2;  // cnt2 only used in array length compare
1026   Register elem_per_word = tmp6;
1027   int log_elem_size = exact_log2(elem_size);
1028   int length_offset = arrayOopDesc::length_offset_in_bytes();
1029   int base_offset   = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
1030 
1031   assert(elem_size == 1 || elem_size == 2, "must be char or byte");
1032   assert_different_registers(a1, a2, result, cnt1, t0, t1, tmp3, tmp4, tmp5, tmp6);
1033   li(elem_per_word, wordSize / elem_size);
1034 
1035   BLOCK_COMMENT("arrays_equals {");
1036 
1037   // if (a1 == a2), return true
1038   beq(a1, a2, SAME);
1039 
1040   mv(result, false);
1041   beqz(a1, DONE);
1042   beqz(a2, DONE);
1043   lwu(cnt1, Address(a1, length_offset));
1044   lwu(cnt2, Address(a2, length_offset));
1045   bne(cnt2, cnt1, DONE);
1046   beqz(cnt1, SAME);
1047 
1048   slli(tmp5, cnt1, 3 + log_elem_size);
1049   sub(tmp5, zr, tmp5);
1050   add(a1, a1, base_offset);
1051   add(a2, a2, base_offset);
1052   ld(tmp3, Address(a1, 0));
1053   ld(tmp4, Address(a2, 0));
1054   ble(cnt1, elem_per_word, SHORT); // short or same
1055 
1056   // Main 16 byte comparison loop with 2 exits
1057   bind(NEXT_DWORD); {
1058     ld(tmp1, Address(a1, wordSize));
1059     ld(tmp2, Address(a2, wordSize));
1060     sub(cnt1, cnt1, 2 * wordSize / elem_size);
1061     blez(cnt1, TAIL);
1062     bne(tmp3, tmp4, DONE);
1063     ld(tmp3, Address(a1, 2 * wordSize));
1064     ld(tmp4, Address(a2, 2 * wordSize));
1065     add(a1, a1, 2 * wordSize);
1066     add(a2, a2, 2 * wordSize);
1067     ble(cnt1, elem_per_word, TAIL2);
1068   } beq(tmp1, tmp2, NEXT_DWORD);
1069   j(DONE);
1070 
1071   bind(TAIL);
1072   xorr(tmp4, tmp3, tmp4);
1073   xorr(tmp2, tmp1, tmp2);
1074   sll(tmp2, tmp2, tmp5);
1075   orr(tmp5, tmp4, tmp2);
1076   j(IS_TMP5_ZR);
1077 
1078   bind(TAIL2);
1079   bne(tmp1, tmp2, DONE);
1080 
1081   bind(SHORT);
1082   xorr(tmp4, tmp3, tmp4);
1083   sll(tmp5, tmp4, tmp5);
1084 
1085   bind(IS_TMP5_ZR);
1086   bnez(tmp5, DONE);
1087 
1088   bind(SAME);
1089   mv(result, true);
1090   // That's it.
1091   bind(DONE);
1092 
1093   BLOCK_COMMENT("} array_equals");
1094 }
1095 
1096 // Compare Strings
1097 
1098 // For Strings we're passed the address of the first characters in a1
1099 // and a2 and the length in cnt1.
1100 // elem_size is the element size in bytes: either 1 or 2.
1101 // There are two implementations.  For arrays >= 8 bytes, all
1102 // comparisons (including the final one, which may overlap) are
1103 // performed 8 bytes at a time.  For strings < 8 bytes, we compare a
1104 // halfword, then a short, and then a byte.
1105 
1106 void C2_MacroAssembler::string_equals(Register a1, Register a2,
1107                                       Register result, Register cnt1, int elem_size)
1108 {
1109   Label SAME, DONE, SHORT, NEXT_WORD;
1110   Register tmp1 = t0;
1111   Register tmp2 = t1;
1112 
1113   assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte");
1114   assert_different_registers(a1, a2, result, cnt1, t0, t1);
1115 
1116   BLOCK_COMMENT("string_equals {");
1117 
1118   mv(result, false);
1119 
1120   // Check for short strings, i.e. smaller than wordSize.
1121   sub(cnt1, cnt1, wordSize);
1122   bltz(cnt1, SHORT);
1123 
1124   // Main 8 byte comparison loop.
1125   bind(NEXT_WORD); {
1126     ld(tmp1, Address(a1, 0));
1127     add(a1, a1, wordSize);
1128     ld(tmp2, Address(a2, 0));
1129     add(a2, a2, wordSize);
1130     sub(cnt1, cnt1, wordSize);
1131     bne(tmp1, tmp2, DONE);
1132   } bgtz(cnt1, NEXT_WORD);
1133 
1134   // Last longword.  In the case where length == 4 we compare the
1135   // same longword twice, but that's still faster than another
1136   // conditional branch.
1137   // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
1138   // length == 4.
1139   add(tmp1, a1, cnt1);
1140   ld(tmp1, Address(tmp1, 0));
1141   add(tmp2, a2, cnt1);
1142   ld(tmp2, Address(tmp2, 0));
1143   bne(tmp1, tmp2, DONE);
1144   j(SAME);
1145 
1146   bind(SHORT);
1147   Label TAIL03, TAIL01;
1148 
1149   // 0-7 bytes left.
1150   andi(t0, cnt1, 4);
1151   beqz(t0, TAIL03);
1152   {
1153     lwu(tmp1, Address(a1, 0));
1154     add(a1, a1, 4);
1155     lwu(tmp2, Address(a2, 0));
1156     add(a2, a2, 4);
1157     bne(tmp1, tmp2, DONE);
1158   }
1159 
1160   bind(TAIL03);
1161   // 0-3 bytes left.
1162   andi(t0, cnt1, 2);
1163   beqz(t0, TAIL01);
1164   {
1165     lhu(tmp1, Address(a1, 0));
1166     add(a1, a1, 2);
1167     lhu(tmp2, Address(a2, 0));
1168     add(a2, a2, 2);
1169     bne(tmp1, tmp2, DONE);
1170   }
1171 
1172   bind(TAIL01);
1173   if (elem_size == 1) { // Only needed when comparing 1-byte elements
1174     // 0-1 bytes left.
1175     andi(t0, cnt1, 1);
1176     beqz(t0, SAME);
1177     {
1178       lbu(tmp1, a1, 0);
1179       lbu(tmp2, a2, 0);
1180       bne(tmp1, tmp2, DONE);
1181     }
1182   }
1183 
1184   // Arrays are equal.
1185   bind(SAME);
1186   mv(result, true);
1187 
1188   // That's it.
1189   bind(DONE);
1190   BLOCK_COMMENT("} string_equals");
1191 }
1192 
1193 typedef void (Assembler::*conditional_branch_insn)(Register op1, Register op2, Label& label, bool is_far);
1194 typedef void (MacroAssembler::*float_conditional_branch_insn)(FloatRegister op1, FloatRegister op2, Label& label,
1195                                                               bool is_far, bool is_unordered);
1196 
1197 static conditional_branch_insn conditional_branches[] =
1198 {
1199   /* SHORT branches */
1200   (conditional_branch_insn)&Assembler::beq,
1201   (conditional_branch_insn)&Assembler::bgt,
1202   NULL, // BoolTest::overflow
1203   (conditional_branch_insn)&Assembler::blt,
1204   (conditional_branch_insn)&Assembler::bne,
1205   (conditional_branch_insn)&Assembler::ble,
1206   NULL, // BoolTest::no_overflow
1207   (conditional_branch_insn)&Assembler::bge,
1208 
1209   /* UNSIGNED branches */
1210   (conditional_branch_insn)&Assembler::beq,
1211   (conditional_branch_insn)&Assembler::bgtu,
1212   NULL,
1213   (conditional_branch_insn)&Assembler::bltu,
1214   (conditional_branch_insn)&Assembler::bne,
1215   (conditional_branch_insn)&Assembler::bleu,
1216   NULL,
1217   (conditional_branch_insn)&Assembler::bgeu
1218 };
1219 
1220 static float_conditional_branch_insn float_conditional_branches[] =
1221 {
1222   /* FLOAT SHORT branches */
1223   (float_conditional_branch_insn)&MacroAssembler::float_beq,
1224   (float_conditional_branch_insn)&MacroAssembler::float_bgt,
1225   NULL,  // BoolTest::overflow
1226   (float_conditional_branch_insn)&MacroAssembler::float_blt,
1227   (float_conditional_branch_insn)&MacroAssembler::float_bne,
1228   (float_conditional_branch_insn)&MacroAssembler::float_ble,
1229   NULL, // BoolTest::no_overflow
1230   (float_conditional_branch_insn)&MacroAssembler::float_bge,
1231 
1232   /* DOUBLE SHORT branches */
1233   (float_conditional_branch_insn)&MacroAssembler::double_beq,
1234   (float_conditional_branch_insn)&MacroAssembler::double_bgt,
1235   NULL,
1236   (float_conditional_branch_insn)&MacroAssembler::double_blt,
1237   (float_conditional_branch_insn)&MacroAssembler::double_bne,
1238   (float_conditional_branch_insn)&MacroAssembler::double_ble,
1239   NULL,
1240   (float_conditional_branch_insn)&MacroAssembler::double_bge
1241 };
1242 
1243 void C2_MacroAssembler::cmp_branch(int cmpFlag, Register op1, Register op2, Label& label, bool is_far) {
1244   assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(conditional_branches) / sizeof(conditional_branches[0])),
1245          "invalid conditional branch index");
1246   (this->*conditional_branches[cmpFlag])(op1, op2, label, is_far);
1247 }
1248 
1249 // This is a function should only be used by C2. Flip the unordered when unordered-greater, C2 would use
1250 // unordered-lesser instead of unordered-greater. Finally, commute the result bits at function do_one_bytecode().
1251 void C2_MacroAssembler::float_cmp_branch(int cmpFlag, FloatRegister op1, FloatRegister op2, Label& label, bool is_far) {
1252   assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(float_conditional_branches) / sizeof(float_conditional_branches[0])),
1253          "invalid float conditional branch index");
1254   int booltest_flag = cmpFlag & ~(C2_MacroAssembler::double_branch_mask);
1255   (this->*float_conditional_branches[cmpFlag])(op1, op2, label, is_far,
1256     (booltest_flag == (BoolTest::ge) || booltest_flag == (BoolTest::gt)) ? false : true);
1257 }
1258 
1259 void C2_MacroAssembler::enc_cmpUEqNeLeGt_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
1260   switch (cmpFlag) {
1261     case BoolTest::eq:
1262     case BoolTest::le:
1263       beqz(op1, L, is_far);
1264       break;
1265     case BoolTest::ne:
1266     case BoolTest::gt:
1267       bnez(op1, L, is_far);
1268       break;
1269     default:
1270       ShouldNotReachHere();
1271   }
1272 }
1273 
1274 void C2_MacroAssembler::enc_cmpEqNe_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
1275   switch (cmpFlag) {
1276     case BoolTest::eq:
1277       beqz(op1, L, is_far);
1278       break;
1279     case BoolTest::ne:
1280       bnez(op1, L, is_far);
1281       break;
1282     default:
1283       ShouldNotReachHere();
1284   }
1285 }
1286 
1287 void C2_MacroAssembler::enc_cmove(int cmpFlag, Register op1, Register op2, Register dst, Register src) {
1288   Label L;
1289   cmp_branch(cmpFlag ^ (1 << neg_cond_bits), op1, op2, L);
1290   mv(dst, src);
1291   bind(L);
1292 }
1293 
1294 // Set dst to NaN if any NaN input.
1295 void C2_MacroAssembler::minmax_FD(FloatRegister dst, FloatRegister src1, FloatRegister src2,
1296                                   bool is_double, bool is_min) {
1297   assert_different_registers(dst, src1, src2);
1298 
1299   Label Done;
1300   fsflags(zr);
1301   if (is_double) {
1302     is_min ? fmin_d(dst, src1, src2)
1303            : fmax_d(dst, src1, src2);
1304     // Checking NaNs
1305     flt_d(zr, src1, src2);
1306   } else {
1307     is_min ? fmin_s(dst, src1, src2)
1308            : fmax_s(dst, src1, src2);
1309     // Checking NaNs
1310     flt_s(zr, src1, src2);
1311   }
1312 
1313   frflags(t0);
1314   beqz(t0, Done);
1315 
1316   // In case of NaNs
1317   is_double ? fadd_d(dst, src1, src2)
1318             : fadd_s(dst, src1, src2);
1319 
1320   bind(Done);
1321 }
1322 
1323 void C2_MacroAssembler::element_compare(Register a1, Register a2, Register result, Register cnt, Register tmp1, Register tmp2,
1324                                         VectorRegister vr1, VectorRegister vr2, VectorRegister vrs, bool islatin, Label &DONE) {
1325   Label loop;
1326   Assembler::SEW sew = islatin ? Assembler::e8 : Assembler::e16;
1327 
1328   bind(loop);
1329   vsetvli(tmp1, cnt, sew, Assembler::m2);
1330   vlex_v(vr1, a1, sew);
1331   vlex_v(vr2, a2, sew);
1332   vmsne_vv(vrs, vr1, vr2);
1333   vfirst_m(tmp2, vrs);
1334   bgez(tmp2, DONE);
1335   sub(cnt, cnt, tmp1);
1336   if (!islatin) {
1337     slli(tmp1, tmp1, 1); // get byte counts
1338   }
1339   add(a1, a1, tmp1);
1340   add(a2, a2, tmp1);
1341   bnez(cnt, loop);
1342 
1343   mv(result, true);
1344 }
1345 
1346 void C2_MacroAssembler::string_equals_v(Register a1, Register a2, Register result, Register cnt, int elem_size) {
1347   Label DONE;
1348   Register tmp1 = t0;
1349   Register tmp2 = t1;
1350 
1351   BLOCK_COMMENT("string_equals_v {");
1352 
1353   mv(result, false);
1354 
1355   if (elem_size == 2) {
1356     srli(cnt, cnt, 1);
1357   }
1358 
1359   element_compare(a1, a2, result, cnt, tmp1, tmp2, v0, v2, v0, elem_size == 1, DONE);
1360 
1361   bind(DONE);
1362   BLOCK_COMMENT("} string_equals_v");
1363 }
1364 
1365 // used by C2 ClearArray patterns.
1366 // base: Address of a buffer to be zeroed
1367 // cnt: Count in HeapWords
1368 //
1369 // base, cnt, v0, v1 and t0 are clobbered.
1370 void C2_MacroAssembler::clear_array_v(Register base, Register cnt) {
1371   Label loop;
1372 
1373   // making zero words
1374   vsetvli(t0, cnt, Assembler::e64, Assembler::m4);
1375   vxor_vv(v0, v0, v0);
1376 
1377   bind(loop);
1378   vsetvli(t0, cnt, Assembler::e64, Assembler::m4);
1379   vse64_v(v0, base);
1380   sub(cnt, cnt, t0);
1381   shadd(base, t0, base, t0, 3);
1382   bnez(cnt, loop);
1383 }
1384 
1385 void C2_MacroAssembler::arrays_equals_v(Register a1, Register a2, Register result,
1386                                         Register cnt1, int elem_size) {
1387   Label DONE;
1388   Register tmp1 = t0;
1389   Register tmp2 = t1;
1390   Register cnt2 = tmp2;
1391   int length_offset = arrayOopDesc::length_offset_in_bytes();
1392   int base_offset = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
1393 
1394   BLOCK_COMMENT("arrays_equals_v {");
1395 
1396   // if (a1 == a2), return true
1397   mv(result, true);
1398   beq(a1, a2, DONE);
1399 
1400   mv(result, false);
1401   // if a1 == null or a2 == null, return false
1402   beqz(a1, DONE);
1403   beqz(a2, DONE);
1404   // if (a1.length != a2.length), return false
1405   lwu(cnt1, Address(a1, length_offset));
1406   lwu(cnt2, Address(a2, length_offset));
1407   bne(cnt1, cnt2, DONE);
1408 
1409   la(a1, Address(a1, base_offset));
1410   la(a2, Address(a2, base_offset));
1411 
1412   element_compare(a1, a2, result, cnt1, tmp1, tmp2, v0, v2, v0, elem_size == 1, DONE);
1413 
1414   bind(DONE);
1415 
1416   BLOCK_COMMENT("} arrays_equals_v");
1417 }
1418 
1419 void C2_MacroAssembler::string_compare_v(Register str1, Register str2, Register cnt1, Register cnt2,
1420                                          Register result, Register tmp1, Register tmp2, int encForm) {
1421   Label DIFFERENCE, DONE, L, loop;
1422   bool encLL = encForm == StrIntrinsicNode::LL;
1423   bool encLU = encForm == StrIntrinsicNode::LU;
1424   bool encUL = encForm == StrIntrinsicNode::UL;
1425 
1426   bool str1_isL = encLL || encLU;
1427   bool str2_isL = encLL || encUL;
1428 
1429   int minCharsInWord = encLL ? wordSize : wordSize / 2;
1430 
1431   BLOCK_COMMENT("string_compare {");
1432 
1433   // for Lating strings, 1 byte for 1 character
1434   // for UTF16 strings, 2 bytes for 1 character
1435   if (!str1_isL)
1436     sraiw(cnt1, cnt1, 1);
1437   if (!str2_isL)
1438     sraiw(cnt2, cnt2, 1);
1439 
1440   // if str1 == str2, return the difference
1441   // save the minimum of the string lengths in cnt2.
1442   sub(result, cnt1, cnt2);
1443   bgt(cnt1, cnt2, L);
1444   mv(cnt2, cnt1);
1445   bind(L);
1446 
1447   if (str1_isL == str2_isL) { // LL or UU
1448     element_compare(str1, str2, zr, cnt2, tmp1, tmp2, v2, v4, v1, encLL, DIFFERENCE);
1449     j(DONE);
1450   } else { // LU or UL
1451     Register strL = encLU ? str1 : str2;
1452     Register strU = encLU ? str2 : str1;
1453     VectorRegister vstr1 = encLU ? v4 : v0;
1454     VectorRegister vstr2 = encLU ? v0 : v4;
1455 
1456     bind(loop);
1457     vsetvli(tmp1, cnt2, Assembler::e8, Assembler::m2);
1458     vle8_v(vstr1, strL);
1459     vsetvli(tmp1, cnt2, Assembler::e16, Assembler::m4);
1460     vzext_vf2(vstr2, vstr1);
1461     vle16_v(vstr1, strU);
1462     vmsne_vv(v0, vstr2, vstr1);
1463     vfirst_m(tmp2, v0);
1464     bgez(tmp2, DIFFERENCE);
1465     sub(cnt2, cnt2, tmp1);
1466     add(strL, strL, tmp1);
1467     shadd(strU, tmp1, strU, tmp1, 1);
1468     bnez(cnt2, loop);
1469     j(DONE);
1470   }
1471   bind(DIFFERENCE);
1472   slli(tmp1, tmp2, 1);
1473   add(str1, str1, str1_isL ? tmp2 : tmp1);
1474   add(str2, str2, str2_isL ? tmp2 : tmp1);
1475   str1_isL ? lbu(tmp1, Address(str1, 0)) : lhu(tmp1, Address(str1, 0));
1476   str2_isL ? lbu(tmp2, Address(str2, 0)) : lhu(tmp2, Address(str2, 0));
1477   sub(result, tmp1, tmp2);
1478 
1479   bind(DONE);
1480 }
1481 
1482 void C2_MacroAssembler::byte_array_inflate_v(Register src, Register dst, Register len, Register tmp) {
1483   Label loop;
1484   assert_different_registers(src, dst, len, tmp, t0);
1485 
1486   BLOCK_COMMENT("byte_array_inflate_v {");
1487   bind(loop);
1488   vsetvli(tmp, len, Assembler::e8, Assembler::m2);
1489   vle8_v(v2, src);
1490   vsetvli(t0, len, Assembler::e16, Assembler::m4);
1491   vzext_vf2(v0, v2);
1492   vse16_v(v0, dst);
1493   sub(len, len, tmp);
1494   add(src, src, tmp);
1495   shadd(dst, tmp, dst, tmp, 1);
1496   bnez(len, loop);
1497   BLOCK_COMMENT("} byte_array_inflate_v");
1498 }
1499 
1500 // Compress char[] array to byte[].
1501 // result: the array length if every element in array can be encoded; 0, otherwise.
1502 void C2_MacroAssembler::char_array_compress_v(Register src, Register dst, Register len, Register result, Register tmp) {
1503   Label done;
1504   encode_iso_array_v(src, dst, len, result, tmp);
1505   beqz(len, done);
1506   mv(result, zr);
1507   bind(done);
1508 }
1509 
1510 // result: the number of elements had been encoded.
1511 void C2_MacroAssembler::encode_iso_array_v(Register src, Register dst, Register len, Register result, Register tmp) {
1512   Label loop, DIFFERENCE, DONE;
1513 
1514   BLOCK_COMMENT("encode_iso_array_v {");
1515   mv(result, 0);
1516 
1517   bind(loop);
1518   mv(tmp, 0xff);
1519   vsetvli(t0, len, Assembler::e16, Assembler::m2);
1520   vle16_v(v2, src);
1521   // if element > 0xff, stop
1522   vmsgtu_vx(v1, v2, tmp);
1523   vfirst_m(tmp, v1);
1524   vmsbf_m(v0, v1);
1525   // compress char to byte
1526   vsetvli(t0, len, Assembler::e8);
1527   vncvt_x_x_w(v1, v2, Assembler::v0_t);
1528   vse8_v(v1, dst, Assembler::v0_t);
1529 
1530   bgez(tmp, DIFFERENCE);
1531   add(result, result, t0);
1532   add(dst, dst, t0);
1533   sub(len, len, t0);
1534   shadd(src, t0, src, t0, 1);
1535   bnez(len, loop);
1536   j(DONE);
1537 
1538   bind(DIFFERENCE);
1539   add(result, result, tmp);
1540 
1541   bind(DONE);
1542   BLOCK_COMMENT("} encode_iso_array_v");
1543 }
1544 
1545 void C2_MacroAssembler::count_positives_v(Register ary, Register len, Register result, Register tmp) {
1546   Label LOOP, SET_RESULT, DONE;
1547 
1548   BLOCK_COMMENT("count_positives_v {");
1549   mv(result, zr);
1550 
1551   bind(LOOP);
1552   vsetvli(t0, len, Assembler::e8, Assembler::m4);
1553   vle8_v(v0, ary);
1554   vmslt_vx(v0, v0, zr);
1555   vfirst_m(tmp, v0);
1556   bgez(tmp, SET_RESULT);
1557   // if tmp == -1, all bytes are positive
1558   add(result, result, t0);
1559 
1560   sub(len, len, t0);
1561   add(ary, ary, t0);
1562   bnez(len, LOOP);
1563   j(DONE);
1564 
1565   // add remaining positive bytes count
1566   bind(SET_RESULT);
1567   add(result, result, tmp);
1568 
1569   bind(DONE);
1570   BLOCK_COMMENT("} count_positives_v");
1571 }
1572 
1573 void C2_MacroAssembler::string_indexof_char_v(Register str1, Register cnt1,
1574                                               Register ch, Register result,
1575                                               Register tmp1, Register tmp2,
1576                                               bool isL) {
1577   mv(result, zr);
1578 
1579   Label loop, MATCH, DONE;
1580   Assembler::SEW sew = isL ? Assembler::e8 : Assembler::e16;
1581   bind(loop);
1582   vsetvli(tmp1, cnt1, sew, Assembler::m4);
1583   vlex_v(v0, str1, sew);
1584   vmseq_vx(v0, v0, ch);
1585   vfirst_m(tmp2, v0);
1586   bgez(tmp2, MATCH); // if equal, return index
1587 
1588   add(result, result, tmp1);
1589   sub(cnt1, cnt1, tmp1);
1590   if (!isL) slli(tmp1, tmp1, 1);
1591   add(str1, str1, tmp1);
1592   bnez(cnt1, loop);
1593 
1594   mv(result, -1);
1595   j(DONE);
1596 
1597   bind(MATCH);
1598   add(result, result, tmp2);
1599 
1600   bind(DONE);
1601 }
1602 
1603 // Set dst to NaN if any NaN input.
1604 void C2_MacroAssembler::minmax_FD_v(VectorRegister dst, VectorRegister src1, VectorRegister src2,
1605                                     bool is_double, bool is_min) {
1606   assert_different_registers(dst, src1, src2);
1607 
1608   vsetvli(t0, x0, is_double ? Assembler::e64 : Assembler::e32);
1609 
1610   is_min ? vfmin_vv(dst, src1, src2)
1611          : vfmax_vv(dst, src1, src2);
1612 
1613   vmfne_vv(v0,  src1, src1);
1614   vfadd_vv(dst, src1, src1, Assembler::v0_t);
1615   vmfne_vv(v0,  src2, src2);
1616   vfadd_vv(dst, src2, src2, Assembler::v0_t);
1617 }
1618 
1619 // Set dst to NaN if any NaN input.
1620 void C2_MacroAssembler::reduce_minmax_FD_v(FloatRegister dst,
1621                                            FloatRegister src1, VectorRegister src2,
1622                                            VectorRegister tmp1, VectorRegister tmp2,
1623                                            bool is_double, bool is_min) {
1624   assert_different_registers(src2, tmp1, tmp2);
1625 
1626   Label L_done, L_NaN;
1627   vsetvli(t0, x0, is_double ? Assembler::e64 : Assembler::e32);
1628   vfmv_s_f(tmp2, src1);
1629 
1630   is_min ? vfredmin_vs(tmp1, src2, tmp2)
1631          : vfredmax_vs(tmp1, src2, tmp2);
1632 
1633   fsflags(zr);
1634   // Checking NaNs
1635   vmflt_vf(tmp2, src2, src1);
1636   frflags(t0);
1637   bnez(t0, L_NaN);
1638   j(L_done);
1639 
1640   bind(L_NaN);
1641   vfmv_s_f(tmp2, src1);
1642   vfredsum_vs(tmp1, src2, tmp2);
1643 
1644   bind(L_done);
1645   vfmv_f_s(dst, tmp1);
1646 }