1 /*
   2  * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2020, 2021, Huawei Technologies Co., Ltd. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/assembler.hpp"
  28 #include "asm/assembler.inline.hpp"
  29 #include "opto/c2_MacroAssembler.hpp"
  30 #include "opto/intrinsicnode.hpp"
  31 #include "opto/subnode.hpp"
  32 #include "runtime/stubRoutines.hpp"
  33 
  34 #ifdef PRODUCT
  35 #define BLOCK_COMMENT(str) /* nothing */
  36 #define STOP(error) stop(error)
  37 #else
  38 #define BLOCK_COMMENT(str) block_comment(str)
  39 #define STOP(error) block_comment(error); stop(error)
  40 #endif
  41 
  42 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  43 
  44 // short string
  45 // StringUTF16.indexOfChar
  46 // StringLatin1.indexOfChar
  47 void C2_MacroAssembler::string_indexof_char_short(Register str1, Register cnt1,
  48                                                   Register ch, Register result,
  49                                                   bool isL)
  50 {
  51   Register ch1 = t0;
  52   Register index = t1;
  53 
  54   BLOCK_COMMENT("string_indexof_char_short {");
  55 
  56   Label LOOP, LOOP1, LOOP4, LOOP8;
  57   Label MATCH,  MATCH1, MATCH2, MATCH3,
  58         MATCH4, MATCH5, MATCH6, MATCH7, NOMATCH;
  59 
  60   mv(result, -1);
  61   mv(index, zr);
  62 
  63   bind(LOOP);
  64   addi(t0, index, 8);
  65   ble(t0, cnt1, LOOP8);
  66   addi(t0, index, 4);
  67   ble(t0, cnt1, LOOP4);
  68   j(LOOP1);
  69 
  70   bind(LOOP8);
  71   isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0));
  72   beq(ch, ch1, MATCH);
  73   isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2));
  74   beq(ch, ch1, MATCH1);
  75   isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4));
  76   beq(ch, ch1, MATCH2);
  77   isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6));
  78   beq(ch, ch1, MATCH3);
  79   isL ? lbu(ch1, Address(str1, 4)) : lhu(ch1, Address(str1, 8));
  80   beq(ch, ch1, MATCH4);
  81   isL ? lbu(ch1, Address(str1, 5)) : lhu(ch1, Address(str1, 10));
  82   beq(ch, ch1, MATCH5);
  83   isL ? lbu(ch1, Address(str1, 6)) : lhu(ch1, Address(str1, 12));
  84   beq(ch, ch1, MATCH6);
  85   isL ? lbu(ch1, Address(str1, 7)) : lhu(ch1, Address(str1, 14));
  86   beq(ch, ch1, MATCH7);
  87   addi(index, index, 8);
  88   addi(str1, str1, isL ? 8 : 16);
  89   blt(index, cnt1, LOOP);
  90   j(NOMATCH);
  91 
  92   bind(LOOP4);
  93   isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0));
  94   beq(ch, ch1, MATCH);
  95   isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2));
  96   beq(ch, ch1, MATCH1);
  97   isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4));
  98   beq(ch, ch1, MATCH2);
  99   isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6));
 100   beq(ch, ch1, MATCH3);
 101   addi(index, index, 4);
 102   addi(str1, str1, isL ? 4 : 8);
 103   bge(index, cnt1, NOMATCH);
 104 
 105   bind(LOOP1);
 106   isL ? lbu(ch1, Address(str1)) : lhu(ch1, Address(str1));
 107   beq(ch, ch1, MATCH);
 108   addi(index, index, 1);
 109   addi(str1, str1, isL ? 1 : 2);
 110   blt(index, cnt1, LOOP1);
 111   j(NOMATCH);
 112 
 113   bind(MATCH1);
 114   addi(index, index, 1);
 115   j(MATCH);
 116 
 117   bind(MATCH2);
 118   addi(index, index, 2);
 119   j(MATCH);
 120 
 121   bind(MATCH3);
 122   addi(index, index, 3);
 123   j(MATCH);
 124 
 125   bind(MATCH4);
 126   addi(index, index, 4);
 127   j(MATCH);
 128 
 129   bind(MATCH5);
 130   addi(index, index, 5);
 131   j(MATCH);
 132 
 133   bind(MATCH6);
 134   addi(index, index, 6);
 135   j(MATCH);
 136 
 137   bind(MATCH7);
 138   addi(index, index, 7);
 139 
 140   bind(MATCH);
 141   mv(result, index);
 142   bind(NOMATCH);
 143   BLOCK_COMMENT("} string_indexof_char_short");
 144 }
 145 
 146 // StringUTF16.indexOfChar
 147 // StringLatin1.indexOfChar
 148 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
 149                                             Register ch, Register result,
 150                                             Register tmp1, Register tmp2,
 151                                             Register tmp3, Register tmp4,
 152                                             bool isL)
 153 {
 154   Label CH1_LOOP, HIT, NOMATCH, DONE, DO_LONG;
 155   Register ch1 = t0;
 156   Register orig_cnt = t1;
 157   Register mask1 = tmp3;
 158   Register mask2 = tmp2;
 159   Register match_mask = tmp1;
 160   Register trailing_char = tmp4;
 161   Register unaligned_elems = tmp4;
 162 
 163   BLOCK_COMMENT("string_indexof_char {");
 164   beqz(cnt1, NOMATCH);
 165 
 166   addi(t0, cnt1, isL ? -32 : -16);
 167   bgtz(t0, DO_LONG);
 168   string_indexof_char_short(str1, cnt1, ch, result, isL);
 169   j(DONE);
 170 
 171   bind(DO_LONG);
 172   mv(orig_cnt, cnt1);
 173   if (AvoidUnalignedAccesses) {
 174     Label ALIGNED;
 175     andi(unaligned_elems, str1, 0x7);
 176     beqz(unaligned_elems, ALIGNED);
 177     sub(unaligned_elems, unaligned_elems, 8);
 178     neg(unaligned_elems, unaligned_elems);
 179     if (!isL) {
 180       srli(unaligned_elems, unaligned_elems, 1);
 181     }
 182     // do unaligned part per element
 183     string_indexof_char_short(str1, unaligned_elems, ch, result, isL);
 184     bgez(result, DONE);
 185     mv(orig_cnt, cnt1);
 186     sub(cnt1, cnt1, unaligned_elems);
 187     bind(ALIGNED);
 188   }
 189 
 190   // duplicate ch
 191   if (isL) {
 192     slli(ch1, ch, 8);
 193     orr(ch, ch1, ch);
 194   }
 195   slli(ch1, ch, 16);
 196   orr(ch, ch1, ch);
 197   slli(ch1, ch, 32);
 198   orr(ch, ch1, ch);
 199 
 200   if (!isL) {
 201     slli(cnt1, cnt1, 1);
 202   }
 203 
 204   mv(mask1, isL ? 0x0101010101010101 : 0x0001000100010001);
 205   mv(mask2, isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 206 
 207   bind(CH1_LOOP);
 208   ld(ch1, Address(str1));
 209   addi(str1, str1, 8);
 210   addi(cnt1, cnt1, -8);
 211   compute_match_mask(ch1, ch, match_mask, mask1, mask2);
 212   bnez(match_mask, HIT);
 213   bgtz(cnt1, CH1_LOOP);
 214   j(NOMATCH);
 215 
 216   bind(HIT);
 217   ctzc_bit(trailing_char, match_mask, isL, ch1, result);
 218   srli(trailing_char, trailing_char, 3);
 219   addi(cnt1, cnt1, 8);
 220   ble(cnt1, trailing_char, NOMATCH);
 221   // match case
 222   if (!isL) {
 223     srli(cnt1, cnt1, 1);
 224     srli(trailing_char, trailing_char, 1);
 225   }
 226 
 227   sub(result, orig_cnt, cnt1);
 228   add(result, result, trailing_char);
 229   j(DONE);
 230 
 231   bind(NOMATCH);
 232   mv(result, -1);
 233 
 234   bind(DONE);
 235   BLOCK_COMMENT("} string_indexof_char");
 236 }
 237 
 238 typedef void (MacroAssembler::* load_chr_insn)(Register rd, const Address &adr, Register temp);
 239 
 240 // Search for needle in haystack and return index or -1
 241 // x10: result
 242 // x11: haystack
 243 // x12: haystack_len
 244 // x13: needle
 245 // x14: needle_len
 246 void C2_MacroAssembler::string_indexof(Register haystack, Register needle,
 247                                        Register haystack_len, Register needle_len,
 248                                        Register tmp1, Register tmp2,
 249                                        Register tmp3, Register tmp4,
 250                                        Register tmp5, Register tmp6,
 251                                        Register result, int ae)
 252 {
 253   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
 254 
 255   Label LINEARSEARCH, LINEARSTUB, DONE, NOMATCH;
 256 
 257   Register ch1 = t0;
 258   Register ch2 = t1;
 259   Register nlen_tmp = tmp1; // needle len tmp
 260   Register hlen_tmp = tmp2; // haystack len tmp
 261   Register result_tmp = tmp4;
 262 
 263   bool isLL = ae == StrIntrinsicNode::LL;
 264 
 265   bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
 266   bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
 267   int needle_chr_shift = needle_isL ? 0 : 1;
 268   int haystack_chr_shift = haystack_isL ? 0 : 1;
 269   int needle_chr_size = needle_isL ? 1 : 2;
 270   int haystack_chr_size = haystack_isL ? 1 : 2;
 271   load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu :
 272                               (load_chr_insn)&MacroAssembler::lhu;
 273   load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu :
 274                                 (load_chr_insn)&MacroAssembler::lhu;
 275 
 276   BLOCK_COMMENT("string_indexof {");
 277 
 278   // Note, inline_string_indexOf() generates checks:
 279   // if (pattern.count > src.count) return -1;
 280   // if (pattern.count == 0) return 0;
 281 
 282   // We have two strings, a source string in haystack, haystack_len and a pattern string
 283   // in needle, needle_len. Find the first occurence of pattern in source or return -1.
 284 
 285   // For larger pattern and source we use a simplified Boyer Moore algorithm.
 286   // With a small pattern and source we use linear scan.
 287 
 288   // needle_len >=8 && needle_len < 256 && needle_len < haystack_len/4, use bmh algorithm.
 289   sub(result_tmp, haystack_len, needle_len);
 290   // needle_len < 8, use linear scan
 291   sub(t0, needle_len, 8);
 292   bltz(t0, LINEARSEARCH);
 293   // needle_len >= 256, use linear scan
 294   sub(t0, needle_len, 256);
 295   bgez(t0, LINEARSTUB);
 296   // needle_len >= haystack_len/4, use linear scan
 297   srli(t0, haystack_len, 2);
 298   bge(needle_len, t0, LINEARSTUB);
 299 
 300   // Boyer-Moore-Horspool introduction:
 301   // The Boyer Moore alogorithm is based on the description here:-
 302   //
 303   // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
 304   //
 305   // This describes and algorithm with 2 shift rules. The 'Bad Character' rule
 306   // and the 'Good Suffix' rule.
 307   //
 308   // These rules are essentially heuristics for how far we can shift the
 309   // pattern along the search string.
 310   //
 311   // The implementation here uses the 'Bad Character' rule only because of the
 312   // complexity of initialisation for the 'Good Suffix' rule.
 313   //
 314   // This is also known as the Boyer-Moore-Horspool algorithm:
 315   //
 316   // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
 317   //
 318   // #define ASIZE 256
 319   //
 320   //    int bm(unsigned char *pattern, int m, unsigned char *src, int n) {
 321   //      int i, j;
 322   //      unsigned c;
 323   //      unsigned char bc[ASIZE];
 324   //
 325   //      /* Preprocessing */
 326   //      for (i = 0; i < ASIZE; ++i)
 327   //        bc[i] = m;
 328   //      for (i = 0; i < m - 1; ) {
 329   //        c = pattern[i];
 330   //        ++i;
 331   //        // c < 256 for Latin1 string, so, no need for branch
 332   //        #ifdef PATTERN_STRING_IS_LATIN1
 333   //        bc[c] = m - i;
 334   //        #else
 335   //        if (c < ASIZE) bc[c] = m - i;
 336   //        #endif
 337   //      }
 338   //
 339   //      /* Searching */
 340   //      j = 0;
 341   //      while (j <= n - m) {
 342   //        c = src[i+j];
 343   //        if (pattern[m-1] == c)
 344   //          int k;
 345   //          for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k);
 346   //          if (k < 0) return j;
 347   //          // c < 256 for Latin1 string, so, no need for branch
 348   //          #ifdef SOURCE_STRING_IS_LATIN1_AND_PATTERN_STRING_IS_LATIN1
 349   //          // LL case: (c< 256) always true. Remove branch
 350   //          j += bc[pattern[j+m-1]];
 351   //          #endif
 352   //          #ifdef SOURCE_STRING_IS_UTF_AND_PATTERN_STRING_IS_UTF
 353   //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
 354   //          if (c < ASIZE)
 355   //            j += bc[pattern[j+m-1]];
 356   //          else
 357   //            j += 1
 358   //          #endif
 359   //          #ifdef SOURCE_IS_UTF_AND_PATTERN_IS_LATIN1
 360   //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
 361   //          if (c < ASIZE)
 362   //            j += bc[pattern[j+m-1]];
 363   //          else
 364   //            j += m
 365   //          #endif
 366   //      }
 367   //      return -1;
 368   //    }
 369 
 370   // temp register:t0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, result
 371   Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
 372         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
 373 
 374   Register haystack_end = haystack_len;
 375   Register skipch = tmp2;
 376 
 377   // pattern length is >=8, so, we can read at least 1 register for cases when
 378   // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
 379   // UL case. We'll re-read last character in inner pre-loop code to have
 380   // single outer pre-loop load
 381   const int firstStep = isLL ? 7 : 3;
 382 
 383   const int ASIZE = 256;
 384   const int STORE_BYTES = 8; // 8 bytes stored per instruction(sd)
 385 
 386   sub(sp, sp, ASIZE);
 387 
 388   // init BC offset table with default value: needle_len
 389   slli(t0, needle_len, 8);
 390   orr(t0, t0, needle_len); // [63...16][needle_len][needle_len]
 391   slli(tmp1, t0, 16);
 392   orr(t0, tmp1, t0); // [63...32][needle_len][needle_len][needle_len][needle_len]
 393   slli(tmp1, t0, 32);
 394   orr(tmp5, tmp1, t0); // tmp5: 8 elements [needle_len]
 395 
 396   mv(ch1, sp);  // ch1 is t0
 397   mv(tmp6, ASIZE / STORE_BYTES); // loop iterations
 398 
 399   bind(BM_INIT_LOOP);
 400   // for (i = 0; i < ASIZE; ++i)
 401   //   bc[i] = m;
 402   for (int i = 0; i < 4; i++) {
 403     sd(tmp5, Address(ch1, i * wordSize));
 404   }
 405   add(ch1, ch1, 32);
 406   sub(tmp6, tmp6, 4);
 407   bgtz(tmp6, BM_INIT_LOOP);
 408 
 409   sub(nlen_tmp, needle_len, 1); // m - 1, index of the last element in pattern
 410   Register orig_haystack = tmp5;
 411   mv(orig_haystack, haystack);
 412   slli(haystack_end, result_tmp, haystack_chr_shift); // result_tmp = tmp4
 413   add(haystack_end, haystack, haystack_end);
 414   sub(ch2, needle_len, 1); // bc offset init value, ch2 is t1
 415   mv(tmp3, needle);
 416 
 417   //  for (i = 0; i < m - 1; ) {
 418   //    c = pattern[i];
 419   //    ++i;
 420   //    // c < 256 for Latin1 string, so, no need for branch
 421   //    #ifdef PATTERN_STRING_IS_LATIN1
 422   //    bc[c] = m - i;
 423   //    #else
 424   //    if (c < ASIZE) bc[c] = m - i;
 425   //    #endif
 426   //  }
 427   bind(BCLOOP);
 428   (this->*needle_load_1chr)(ch1, Address(tmp3), noreg);
 429   add(tmp3, tmp3, needle_chr_size);
 430   if (!needle_isL) {
 431     // ae == StrIntrinsicNode::UU
 432     mv(tmp6, ASIZE);
 433     bgeu(ch1, tmp6, BCSKIP);
 434   }
 435   add(tmp4, sp, ch1);
 436   sb(ch2, Address(tmp4)); // store skip offset to BC offset table
 437 
 438   bind(BCSKIP);
 439   sub(ch2, ch2, 1); // for next pattern element, skip distance -1
 440   bgtz(ch2, BCLOOP);
 441 
 442   slli(tmp6, needle_len, needle_chr_shift);
 443   add(tmp6, tmp6, needle); // tmp6: pattern end, address after needle
 444   if (needle_isL == haystack_isL) {
 445     // load last 8 bytes (8LL/4UU symbols)
 446     ld(tmp6, Address(tmp6, -wordSize));
 447   } else {
 448     // UL: from UTF-16(source) search Latin1(pattern)
 449     lwu(tmp6, Address(tmp6, -wordSize / 2)); // load last 4 bytes(4 symbols)
 450     // convert Latin1 to UTF. eg: 0x0000abcd -> 0x0a0b0c0d
 451     // We'll have to wait until load completed, but it's still faster than per-character loads+checks
 452     srli(tmp3, tmp6, BitsPerByte * (wordSize / 2 - needle_chr_size)); // pattern[m-1], eg:0x0000000a
 453     slli(ch2, tmp6, registerSize - 24);
 454     srli(ch2, ch2, registerSize - 8); // pattern[m-2], 0x0000000b
 455     slli(ch1, tmp6, registerSize - 16);
 456     srli(ch1, ch1, registerSize - 8); // pattern[m-3], 0x0000000c
 457     andi(tmp6, tmp6, 0xff); // pattern[m-4], 0x0000000d
 458     slli(ch2, ch2, 16);
 459     orr(ch2, ch2, ch1); // 0x00000b0c
 460     slli(result, tmp3, 48); // use result as temp register
 461     orr(tmp6, tmp6, result); // 0x0a00000d
 462     slli(result, ch2, 16);
 463     orr(tmp6, tmp6, result); // UTF-16:0x0a0b0c0d
 464   }
 465 
 466   // i = m - 1;
 467   // skipch = j + i;
 468   // if (skipch == pattern[m - 1]
 469   //   for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k);
 470   // else
 471   //   move j with bad char offset table
 472   bind(BMLOOPSTR2);
 473   // compare pattern to source string backward
 474   slli(result, nlen_tmp, haystack_chr_shift);
 475   add(result, haystack, result);
 476   (this->*haystack_load_1chr)(skipch, Address(result), noreg);
 477   sub(nlen_tmp, nlen_tmp, firstStep); // nlen_tmp is positive here, because needle_len >= 8
 478   if (needle_isL == haystack_isL) {
 479     // re-init tmp3. It's for free because it's executed in parallel with
 480     // load above. Alternative is to initialize it before loop, but it'll
 481     // affect performance on in-order systems with 2 or more ld/st pipelines
 482     srli(tmp3, tmp6, BitsPerByte * (wordSize - needle_chr_size)); // UU/LL: pattern[m-1]
 483   }
 484   if (!isLL) { // UU/UL case
 485     slli(ch2, nlen_tmp, 1); // offsets in bytes
 486   }
 487   bne(tmp3, skipch, BMSKIP); // if not equal, skipch is bad char
 488   add(result, haystack, isLL ? nlen_tmp : ch2);
 489   ld(ch2, Address(result)); // load 8 bytes from source string
 490   mv(ch1, tmp6);
 491   if (isLL) {
 492     j(BMLOOPSTR1_AFTER_LOAD);
 493   } else {
 494     sub(nlen_tmp, nlen_tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
 495     j(BMLOOPSTR1_CMP);
 496   }
 497 
 498   bind(BMLOOPSTR1);
 499   slli(ch1, nlen_tmp, needle_chr_shift);
 500   add(ch1, ch1, needle);
 501   (this->*needle_load_1chr)(ch1, Address(ch1), noreg);
 502   slli(ch2, nlen_tmp, haystack_chr_shift);
 503   add(ch2, haystack, ch2);
 504   (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
 505 
 506   bind(BMLOOPSTR1_AFTER_LOAD);
 507   sub(nlen_tmp, nlen_tmp, 1);
 508   bltz(nlen_tmp, BMLOOPSTR1_LASTCMP);
 509 
 510   bind(BMLOOPSTR1_CMP);
 511   beq(ch1, ch2, BMLOOPSTR1);
 512 
 513   bind(BMSKIP);
 514   if (!isLL) {
 515     // if we've met UTF symbol while searching Latin1 pattern, then we can
 516     // skip needle_len symbols
 517     if (needle_isL != haystack_isL) {
 518       mv(result_tmp, needle_len);
 519     } else {
 520       mv(result_tmp, 1);
 521     }
 522     mv(t0, ASIZE);
 523     bgeu(skipch, t0, BMADV);
 524   }
 525   add(result_tmp, sp, skipch);
 526   lbu(result_tmp, Address(result_tmp)); // load skip offset
 527 
 528   bind(BMADV);
 529   sub(nlen_tmp, needle_len, 1);
 530   slli(result, result_tmp, haystack_chr_shift);
 531   add(haystack, haystack, result); // move haystack after bad char skip offset
 532   ble(haystack, haystack_end, BMLOOPSTR2);
 533   add(sp, sp, ASIZE);
 534   j(NOMATCH);
 535 
 536   bind(BMLOOPSTR1_LASTCMP);
 537   bne(ch1, ch2, BMSKIP);
 538 
 539   bind(BMMATCH);
 540   sub(result, haystack, orig_haystack);
 541   if (!haystack_isL) {
 542     srli(result, result, 1);
 543   }
 544   add(sp, sp, ASIZE);
 545   j(DONE);
 546 
 547   bind(LINEARSTUB);
 548   sub(t0, needle_len, 16); // small patterns still should be handled by simple algorithm
 549   bltz(t0, LINEARSEARCH);
 550   mv(result, zr);
 551   RuntimeAddress stub = NULL;
 552   if (isLL) {
 553     stub = RuntimeAddress(StubRoutines::riscv64::string_indexof_linear_ll());
 554     assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
 555   } else if (needle_isL) {
 556     stub = RuntimeAddress(StubRoutines::riscv64::string_indexof_linear_ul());
 557     assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
 558   } else {
 559     stub = RuntimeAddress(StubRoutines::riscv64::string_indexof_linear_uu());
 560     assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
 561   }
 562   trampoline_call(stub);
 563   j(DONE);
 564 
 565   bind(NOMATCH);
 566   mv(result, -1);
 567   j(DONE);
 568 
 569   bind(LINEARSEARCH);
 570   string_indexof_linearscan(haystack, needle, haystack_len, needle_len, tmp1, tmp2, tmp3, tmp4, -1, result, ae);
 571 
 572   bind(DONE);
 573   BLOCK_COMMENT("} string_indexof");
 574 }
 575 
 576 // string_indexof
 577 // result: x10
 578 // src: x11
 579 // src_count: x12
 580 // pattern: x13
 581 // pattern_count: x14 or 1/2/3/4
 582 void C2_MacroAssembler::string_indexof_linearscan(Register haystack, Register needle,
 583                                                Register haystack_len, Register needle_len,
 584                                                Register tmp1, Register tmp2,
 585                                                Register tmp3, Register tmp4,
 586                                                int needle_con_cnt, Register result, int ae)
 587 {
 588   // Note:
 589   // needle_con_cnt > 0 means needle_len register is invalid, needle length is constant
 590   // for UU/LL: needle_con_cnt[1, 4], UL: needle_con_cnt = 1
 591   assert(needle_con_cnt <= 4, "Invalid needle constant count");
 592   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
 593 
 594   Register ch1 = t0;
 595   Register ch2 = t1;
 596   Register hlen_neg = haystack_len, nlen_neg = needle_len;
 597   Register nlen_tmp = tmp1, hlen_tmp = tmp2, result_tmp = tmp4;
 598 
 599   bool isLL = ae == StrIntrinsicNode::LL;
 600 
 601   bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
 602   bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
 603   int needle_chr_shift = needle_isL ? 0 : 1;
 604   int haystack_chr_shift = haystack_isL ? 0 : 1;
 605   int needle_chr_size = needle_isL ? 1 : 2;
 606   int haystack_chr_size = haystack_isL ? 1 : 2;
 607 
 608   load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu :
 609                               (load_chr_insn)&MacroAssembler::lhu;
 610   load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu :
 611                                 (load_chr_insn)&MacroAssembler::lhu;
 612   load_chr_insn load_2chr = isLL ? (load_chr_insn)&MacroAssembler::lhu : (load_chr_insn)&MacroAssembler::lwu;
 613   load_chr_insn load_4chr = isLL ? (load_chr_insn)&MacroAssembler::lwu : (load_chr_insn)&MacroAssembler::ld;
 614 
 615   Label DO1, DO2, DO3, MATCH, NOMATCH, DONE;
 616 
 617   Register first = tmp3;
 618 
 619   if (needle_con_cnt == -1) {
 620     Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
 621 
 622     sub(t0, needle_len, needle_isL == haystack_isL ? 4 : 2);
 623     bltz(t0, DOSHORT);
 624 
 625     (this->*needle_load_1chr)(first, Address(needle), noreg);
 626     slli(t0, needle_len, needle_chr_shift);
 627     add(needle, needle, t0);
 628     neg(nlen_neg, t0);
 629     slli(t0, result_tmp, haystack_chr_shift);
 630     add(haystack, haystack, t0);
 631     neg(hlen_neg, t0);
 632 
 633     bind(FIRST_LOOP);
 634     add(t0, haystack, hlen_neg);
 635     (this->*haystack_load_1chr)(ch2, Address(t0), noreg);
 636     beq(first, ch2, STR1_LOOP);
 637 
 638     bind(STR2_NEXT);
 639     add(hlen_neg, hlen_neg, haystack_chr_size);
 640     blez(hlen_neg, FIRST_LOOP);
 641     j(NOMATCH);
 642 
 643     bind(STR1_LOOP);
 644     add(nlen_tmp, nlen_neg, needle_chr_size);
 645     add(hlen_tmp, hlen_neg, haystack_chr_size);
 646     bgez(nlen_tmp, MATCH);
 647 
 648     bind(STR1_NEXT);
 649     add(ch1, needle, nlen_tmp);
 650     (this->*needle_load_1chr)(ch1, Address(ch1), noreg);
 651     add(ch2, haystack, hlen_tmp);
 652     (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
 653     bne(ch1, ch2, STR2_NEXT);
 654     add(nlen_tmp, nlen_tmp, needle_chr_size);
 655     add(hlen_tmp, hlen_tmp, haystack_chr_size);
 656     bltz(nlen_tmp, STR1_NEXT);
 657     j(MATCH);
 658 
 659     bind(DOSHORT);
 660     if (needle_isL == haystack_isL) {
 661       sub(t0, needle_len, 2);
 662       bltz(t0, DO1);
 663       bgtz(t0, DO3);
 664     }
 665   }
 666 
 667   if (needle_con_cnt == 4) {
 668     Label CH1_LOOP;
 669     (this->*load_4chr)(ch1, Address(needle), noreg);
 670     sub(result_tmp, haystack_len, 4);
 671     slli(tmp3, result_tmp, haystack_chr_shift); // result as tmp
 672     add(haystack, haystack, tmp3);
 673     neg(hlen_neg, tmp3);
 674 
 675     bind(CH1_LOOP);
 676     add(ch2, haystack, hlen_neg);
 677     (this->*load_4chr)(ch2, Address(ch2), noreg);
 678     beq(ch1, ch2, MATCH);
 679     add(hlen_neg, hlen_neg, haystack_chr_size);
 680     blez(hlen_neg, CH1_LOOP);
 681     j(NOMATCH);
 682   }
 683 
 684   if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 2) {
 685     Label CH1_LOOP;
 686     BLOCK_COMMENT("string_indexof DO2 {");
 687     bind(DO2);
 688     (this->*load_2chr)(ch1, Address(needle), noreg);
 689     if (needle_con_cnt == 2) {
 690       sub(result_tmp, haystack_len, 2);
 691     }
 692     slli(tmp3, result_tmp, haystack_chr_shift);
 693     add(haystack, haystack, tmp3);
 694     neg(hlen_neg, tmp3);
 695 
 696     bind(CH1_LOOP);
 697     add(tmp3, haystack, hlen_neg);
 698     (this->*load_2chr)(ch2, Address(tmp3), noreg);
 699     beq(ch1, ch2, MATCH);
 700     add(hlen_neg, hlen_neg, haystack_chr_size);
 701     blez(hlen_neg, CH1_LOOP);
 702     j(NOMATCH);
 703     BLOCK_COMMENT("} string_indexof DO2");
 704   }
 705 
 706   if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 3) {
 707     Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
 708     BLOCK_COMMENT("string_indexof DO3 {");
 709 
 710     bind(DO3);
 711     (this->*load_2chr)(first, Address(needle), noreg);
 712     (this->*needle_load_1chr)(ch1, Address(needle, 2 * needle_chr_size), noreg);
 713     if (needle_con_cnt == 3) {
 714       sub(result_tmp, haystack_len, 3);
 715     }
 716     slli(hlen_tmp, result_tmp, haystack_chr_shift);
 717     add(haystack, haystack, hlen_tmp);
 718     neg(hlen_neg, hlen_tmp);
 719 
 720     bind(FIRST_LOOP);
 721     add(ch2, haystack, hlen_neg);
 722     (this->*load_2chr)(ch2, Address(ch2), noreg);
 723     beq(first, ch2, STR1_LOOP);
 724 
 725     bind(STR2_NEXT);
 726     add(hlen_neg, hlen_neg, haystack_chr_size);
 727     blez(hlen_neg, FIRST_LOOP);
 728     j(NOMATCH);
 729 
 730     bind(STR1_LOOP);
 731     add(hlen_tmp, hlen_neg, 2 * haystack_chr_size);
 732     add(ch2, haystack, hlen_tmp);
 733     (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
 734     bne(ch1, ch2, STR2_NEXT);
 735     j(MATCH);
 736     BLOCK_COMMENT("} string_indexof DO3");
 737   }
 738 
 739   if (needle_con_cnt == -1 || needle_con_cnt == 1) {
 740     Label DO1_LOOP;
 741 
 742     BLOCK_COMMENT("string_indexof DO1 {");
 743     bind(DO1);
 744     (this->*needle_load_1chr)(ch1, Address(needle), noreg);
 745     sub(result_tmp, haystack_len, 1);
 746     mv(tmp3, result_tmp);
 747     if (haystack_chr_shift) {
 748       slli(tmp3, result_tmp, haystack_chr_shift);
 749     }
 750     add(haystack, haystack, tmp3);
 751     neg(hlen_neg, tmp3);
 752 
 753     bind(DO1_LOOP);
 754     add(tmp3, haystack, hlen_neg);
 755     (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg);
 756     beq(ch1, ch2, MATCH);
 757     add(hlen_neg, hlen_neg, haystack_chr_size);
 758     blez(hlen_neg, DO1_LOOP);
 759     BLOCK_COMMENT("} string_indexof DO1");
 760   }
 761 
 762   bind(NOMATCH);
 763   mv(result, -1);
 764   j(DONE);
 765 
 766   bind(MATCH);
 767   srai(t0, hlen_neg, haystack_chr_shift);
 768   add(result, result_tmp, t0);
 769 
 770   bind(DONE);
 771 }
 772 
 773 // Compare strings.
 774 void C2_MacroAssembler::string_compare(Register str1, Register str2,
 775                                     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
 776                                     Register tmp3, int ae)
 777 {
 778   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
 779       DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
 780       SHORT_LOOP_START, TAIL_CHECK, L;
 781 
 782   const int STUB_THRESHOLD = 64 + 8;
 783   bool isLL = ae == StrIntrinsicNode::LL;
 784   bool isLU = ae == StrIntrinsicNode::LU;
 785   bool isUL = ae == StrIntrinsicNode::UL;
 786 
 787   bool str1_isL = isLL || isLU;
 788   bool str2_isL = isLL || isUL;
 789 
 790   // for L strings, 1 byte for 1 character
 791   // for U strings, 2 bytes for 1 character
 792   int str1_chr_size = str1_isL ? 1 : 2;
 793   int str2_chr_size = str2_isL ? 1 : 2;
 794   int minCharsInWord = isLL ? wordSize : wordSize / 2;
 795 
 796   load_chr_insn str1_load_chr = str1_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu;
 797   load_chr_insn str2_load_chr = str2_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu;
 798 
 799   BLOCK_COMMENT("string_compare {");
 800 
 801   // Bizzarely, the counts are passed in bytes, regardless of whether they
 802   // are L or U strings, however the result is always in characters.
 803   if (!str1_isL) {
 804     sraiw(cnt1, cnt1, 1);
 805   }
 806   if (!str2_isL) {
 807     sraiw(cnt2, cnt2, 1);
 808   }
 809 
 810   // Compute the minimum of the string lengths and save the difference in result.
 811   sub(result, cnt1, cnt2);
 812   bgt(cnt1, cnt2, L);
 813   mv(cnt2, cnt1);
 814   bind(L);
 815 
 816   // A very short string
 817   li(t0, minCharsInWord);
 818   ble(cnt2, t0, SHORT_STRING);
 819 
 820   // Compare longwords
 821   // load first parts of strings and finish initialization while loading
 822   {
 823     if (str1_isL == str2_isL) { // LL or UU
 824       // load 8 bytes once to compare
 825       ld(tmp1, Address(str1));
 826       beq(str1, str2, DONE);
 827       ld(tmp2, Address(str2));
 828       li(t0, STUB_THRESHOLD);
 829       bge(cnt2, t0, STUB);
 830       sub(cnt2, cnt2, minCharsInWord);
 831       beqz(cnt2, TAIL_CHECK);
 832       // convert cnt2 from characters to bytes
 833       if(!str1_isL) {
 834         slli(cnt2, cnt2, 1);
 835       }
 836       add(str2, str2, cnt2);
 837       add(str1, str1, cnt2);
 838       sub(cnt2, zr, cnt2);
 839     } else if (isLU) { // LU case
 840       lwu(tmp1, Address(str1));
 841       ld(tmp2, Address(str2));
 842       li(t0, STUB_THRESHOLD);
 843       bge(cnt2, t0, STUB);
 844       addi(cnt2, cnt2, -4);
 845       add(str1, str1, cnt2);
 846       sub(cnt1, zr, cnt2);
 847       slli(cnt2, cnt2, 1);
 848       add(str2, str2, cnt2);
 849       inflate_lo32(tmp3, tmp1);
 850       mv(tmp1, tmp3);
 851       sub(cnt2, zr, cnt2);
 852       addi(cnt1, cnt1, 4);
 853     } else { // UL case
 854       ld(tmp1, Address(str1));
 855       lwu(tmp2, Address(str2));
 856       li(t0, STUB_THRESHOLD);
 857       bge(cnt2, t0, STUB);
 858       addi(cnt2, cnt2, -4);
 859       slli(t0, cnt2, 1);
 860       sub(cnt1, zr, t0);
 861       add(str1, str1, t0);
 862       add(str2, str2, cnt2);
 863       inflate_lo32(tmp3, tmp2);
 864       mv(tmp2, tmp3);
 865       sub(cnt2, zr, cnt2);
 866       addi(cnt1, cnt1, 8);
 867     }
 868     addi(cnt2, cnt2, isUL ? 4 : 8);
 869     bgez(cnt2, TAIL);
 870     xorr(tmp3, tmp1, tmp2);
 871     bnez(tmp3, DIFFERENCE);
 872 
 873     // main loop
 874     bind(NEXT_WORD);
 875     if (str1_isL == str2_isL) { // LL or UU
 876       add(t0, str1, cnt2);
 877       ld(tmp1, Address(t0));
 878       add(t0, str2, cnt2);
 879       ld(tmp2, Address(t0));
 880       addi(cnt2, cnt2, 8);
 881     } else if (isLU) { // LU case
 882       add(t0, str1, cnt1);
 883       lwu(tmp1, Address(t0));
 884       add(t0, str2, cnt2);
 885       ld(tmp2, Address(t0));
 886       addi(cnt1, cnt1, 4);
 887       inflate_lo32(tmp3, tmp1);
 888       mv(tmp1, tmp3);
 889       addi(cnt2, cnt2, 8);
 890     } else { // UL case
 891       add(t0, str2, cnt2);
 892       lwu(tmp2, Address(t0));
 893       add(t0, str1, cnt1);
 894       ld(tmp1, Address(t0));
 895       inflate_lo32(tmp3, tmp2);
 896       mv(tmp2, tmp3);
 897       addi(cnt1, cnt1, 8);
 898       addi(cnt2, cnt2, 4);
 899     }
 900     bgez(cnt2, TAIL);
 901 
 902     xorr(tmp3, tmp1, tmp2);
 903     beqz(tmp3, NEXT_WORD);
 904     j(DIFFERENCE);
 905     bind(TAIL);
 906     xorr(tmp3, tmp1, tmp2);
 907     bnez(tmp3, DIFFERENCE);
 908     // Last longword.  In the case where length == 4 we compare the
 909     // same longword twice, but that's still faster than another
 910     // conditional branch.
 911     if (str1_isL == str2_isL) { // LL or UU
 912       ld(tmp1, Address(str1));
 913       ld(tmp2, Address(str2));
 914     } else if (isLU) { // LU case
 915       lwu(tmp1, Address(str1));
 916       ld(tmp2, Address(str2));
 917       inflate_lo32(tmp3, tmp1);
 918       mv(tmp1, tmp3);
 919     } else { // UL case
 920       lwu(tmp2, Address(str2));
 921       ld(tmp1, Address(str1));
 922       inflate_lo32(tmp3, tmp2);
 923       mv(tmp2, tmp3);
 924     }
 925     bind(TAIL_CHECK);
 926     xorr(tmp3, tmp1, tmp2);
 927     beqz(tmp3, DONE);
 928 
 929     // Find the first different characters in the longwords and
 930     // compute their difference.
 931     bind(DIFFERENCE);
 932     ctzc_bit(result, tmp3, isLL); // count zero from lsb to msb
 933     srl(tmp1, tmp1, result);
 934     srl(tmp2, tmp2, result);
 935     if (isLL) {
 936       andi(tmp1, tmp1, 0xFF);
 937       andi(tmp2, tmp2, 0xFF);
 938     } else {
 939       andi(tmp1, tmp1, 0xFFFF);
 940       andi(tmp2, tmp2, 0xFFFF);
 941     }
 942     sub(result, tmp1, tmp2);
 943     j(DONE);
 944   }
 945 
 946   bind(STUB);
 947   RuntimeAddress stub = NULL;
 948   switch (ae) {
 949     case StrIntrinsicNode::LL:
 950       stub = RuntimeAddress(StubRoutines::riscv64::compare_long_string_LL());
 951       break;
 952     case StrIntrinsicNode::UU:
 953       stub = RuntimeAddress(StubRoutines::riscv64::compare_long_string_UU());
 954       break;
 955     case StrIntrinsicNode::LU:
 956       stub = RuntimeAddress(StubRoutines::riscv64::compare_long_string_LU());
 957       break;
 958     case StrIntrinsicNode::UL:
 959       stub = RuntimeAddress(StubRoutines::riscv64::compare_long_string_UL());
 960       break;
 961     default:
 962       ShouldNotReachHere();
 963   }
 964   assert(stub.target() != NULL, "compare_long_string stub has not been generated");
 965   trampoline_call(stub);
 966   j(DONE);
 967 
 968   bind(SHORT_STRING);
 969   // Is the minimum length zero?
 970   beqz(cnt2, DONE);
 971   // arrange code to do most branches while loading and loading next characters
 972   // while comparing previous
 973   (this->*str1_load_chr)(tmp1, Address(str1), t0);
 974   addi(str1, str1, str1_chr_size);
 975   addi(cnt2, cnt2, -1);
 976   beqz(cnt2, SHORT_LAST_INIT);
 977   (this->*str2_load_chr)(cnt1, Address(str2), t0);
 978   addi(str2, str2, str2_chr_size);
 979   j(SHORT_LOOP_START);
 980   bind(SHORT_LOOP);
 981   addi(cnt2, cnt2, -1);
 982   beqz(cnt2, SHORT_LAST);
 983   bind(SHORT_LOOP_START);
 984   (this->*str1_load_chr)(tmp2, Address(str1), t0);
 985   addi(str1, str1, str1_chr_size);
 986   (this->*str2_load_chr)(t0, Address(str2), t0);
 987   addi(str2, str2, str2_chr_size);
 988   bne(tmp1, cnt1, SHORT_LOOP_TAIL);
 989   addi(cnt2, cnt2, -1);
 990   beqz(cnt2, SHORT_LAST2);
 991   (this->*str1_load_chr)(tmp1, Address(str1), t0);
 992   addi(str1, str1, str1_chr_size);
 993   (this->*str2_load_chr)(cnt1, Address(str2), t0);
 994   addi(str2, str2, str2_chr_size);
 995   beq(tmp2, t0, SHORT_LOOP);
 996   sub(result, tmp2, t0);
 997   j(DONE);
 998   bind(SHORT_LOOP_TAIL);
 999   sub(result, tmp1, cnt1);
1000   j(DONE);
1001   bind(SHORT_LAST2);
1002   beq(tmp2, t0, DONE);
1003   sub(result, tmp2, t0);
1004 
1005   j(DONE);
1006   bind(SHORT_LAST_INIT);
1007   (this->*str2_load_chr)(cnt1, Address(str2), t0);
1008   addi(str2, str2, str2_chr_size);
1009   bind(SHORT_LAST);
1010   beq(tmp1, cnt1, DONE);
1011   sub(result, tmp1, cnt1);
1012 
1013   bind(DONE);
1014 
1015   BLOCK_COMMENT("} string_compare");
1016 }
1017 
1018 void C2_MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
1019                                       Register tmp4, Register tmp5, Register tmp6, Register result,
1020                                       Register cnt1, int elem_size) {
1021   Label DONE, SAME, NEXT_DWORD, SHORT, TAIL, TAIL2, IS_TMP5_ZR;
1022   Register tmp1 = t0;
1023   Register tmp2 = t1;
1024   Register cnt2 = tmp2;  // cnt2 only used in array length compare
1025   Register elem_per_word = tmp6;
1026   int log_elem_size = log2i_exact(elem_size);
1027   int length_offset = arrayOopDesc::length_offset_in_bytes();
1028   int base_offset   = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
1029 
1030   assert(elem_size == 1 || elem_size == 2, "must be char or byte");
1031   assert_different_registers(a1, a2, result, cnt1, t0, t1, tmp3, tmp4, tmp5, tmp6);
1032   li(elem_per_word, wordSize / elem_size);
1033 
1034   BLOCK_COMMENT("arrays_equals {");
1035 
1036   // if (a1 == a2), return true
1037   beq(a1, a2, SAME);
1038 
1039   mv(result, false);
1040   beqz(a1, DONE);
1041   beqz(a2, DONE);
1042   lwu(cnt1, Address(a1, length_offset));
1043   lwu(cnt2, Address(a2, length_offset));
1044   bne(cnt2, cnt1, DONE);
1045   beqz(cnt1, SAME);
1046 
1047   slli(tmp5, cnt1, 3 + log_elem_size);
1048   sub(tmp5, zr, tmp5);
1049   add(a1, a1, base_offset);
1050   add(a2, a2, base_offset);
1051   ld(tmp3, Address(a1, 0));
1052   ld(tmp4, Address(a2, 0));
1053   ble(cnt1, elem_per_word, SHORT); // short or same
1054 
1055   // Main 16 byte comparison loop with 2 exits
1056   bind(NEXT_DWORD); {
1057     ld(tmp1, Address(a1, wordSize));
1058     ld(tmp2, Address(a2, wordSize));
1059     sub(cnt1, cnt1, 2 * wordSize / elem_size);
1060     blez(cnt1, TAIL);
1061     bne(tmp3, tmp4, DONE);
1062     ld(tmp3, Address(a1, 2 * wordSize));
1063     ld(tmp4, Address(a2, 2 * wordSize));
1064     add(a1, a1, 2 * wordSize);
1065     add(a2, a2, 2 * wordSize);
1066     ble(cnt1, elem_per_word, TAIL2);
1067   } beq(tmp1, tmp2, NEXT_DWORD);
1068   j(DONE);
1069 
1070   bind(TAIL);
1071   xorr(tmp4, tmp3, tmp4);
1072   xorr(tmp2, tmp1, tmp2);
1073   sll(tmp2, tmp2, tmp5);
1074   orr(tmp5, tmp4, tmp2);
1075   j(IS_TMP5_ZR);
1076 
1077   bind(TAIL2);
1078   bne(tmp1, tmp2, DONE);
1079 
1080   bind(SHORT);
1081   xorr(tmp4, tmp3, tmp4);
1082   sll(tmp5, tmp4, tmp5);
1083 
1084   bind(IS_TMP5_ZR);
1085   bnez(tmp5, DONE);
1086 
1087   bind(SAME);
1088   mv(result, true);
1089   // That's it.
1090   bind(DONE);
1091 
1092   BLOCK_COMMENT("} array_equals");
1093 }
1094 
1095 // Compare Strings
1096 
1097 // For Strings we're passed the address of the first characters in a1
1098 // and a2 and the length in cnt1.
1099 // elem_size is the element size in bytes: either 1 or 2.
1100 // There are two implementations.  For arrays >= 8 bytes, all
1101 // comparisons (including the final one, which may overlap) are
1102 // performed 8 bytes at a time.  For strings < 8 bytes, we compare a
1103 // halfword, then a short, and then a byte.
1104 
1105 void C2_MacroAssembler::string_equals(Register a1, Register a2,
1106                                       Register result, Register cnt1, int elem_size)
1107 {
1108   Label SAME, DONE, SHORT, NEXT_WORD;
1109   Register tmp1 = t0;
1110   Register tmp2 = t1;
1111 
1112   assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte");
1113   assert_different_registers(a1, a2, result, cnt1, t0, t1);
1114 
1115   BLOCK_COMMENT("string_equals {");
1116 
1117   mv(result, false);
1118 
1119   // Check for short strings, i.e. smaller than wordSize.
1120   sub(cnt1, cnt1, wordSize);
1121   bltz(cnt1, SHORT);
1122 
1123   // Main 8 byte comparison loop.
1124   bind(NEXT_WORD); {
1125     ld(tmp1, Address(a1, 0));
1126     add(a1, a1, wordSize);
1127     ld(tmp2, Address(a2, 0));
1128     add(a2, a2, wordSize);
1129     sub(cnt1, cnt1, wordSize);
1130     bne(tmp1, tmp2, DONE);
1131   } bgtz(cnt1, NEXT_WORD);
1132 
1133   // Last longword.  In the case where length == 4 we compare the
1134   // same longword twice, but that's still faster than another
1135   // conditional branch.
1136   // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
1137   // length == 4.
1138   add(tmp1, a1, cnt1);
1139   ld(tmp1, Address(tmp1, 0));
1140   add(tmp2, a2, cnt1);
1141   ld(tmp2, Address(tmp2, 0));
1142   bne(tmp1, tmp2, DONE);
1143   j(SAME);
1144 
1145   bind(SHORT);
1146   Label TAIL03, TAIL01;
1147 
1148   // 0-7 bytes left.
1149   andi(t0, cnt1, 4);
1150   beqz(t0, TAIL03);
1151   {
1152     lwu(tmp1, Address(a1, 0));
1153     add(a1, a1, 4);
1154     lwu(tmp2, Address(a2, 0));
1155     add(a2, a2, 4);
1156     bne(tmp1, tmp2, DONE);
1157   }
1158 
1159   bind(TAIL03);
1160   // 0-3 bytes left.
1161   andi(t0, cnt1, 2);
1162   beqz(t0, TAIL01);
1163   {
1164     lhu(tmp1, Address(a1, 0));
1165     add(a1, a1, 2);
1166     lhu(tmp2, Address(a2, 0));
1167     add(a2, a2, 2);
1168     bne(tmp1, tmp2, DONE);
1169   }
1170 
1171   bind(TAIL01);
1172   if (elem_size == 1) { // Only needed when comparing 1-byte elements
1173     // 0-1 bytes left.
1174     andi(t0, cnt1, 1);
1175     beqz(t0, SAME);
1176     {
1177       lbu(tmp1, a1, 0);
1178       lbu(tmp2, a2, 0);
1179       bne(tmp1, tmp2, DONE);
1180     }
1181   }
1182 
1183   // Arrays are equal.
1184   bind(SAME);
1185   mv(result, true);
1186 
1187   // That's it.
1188   bind(DONE);
1189   BLOCK_COMMENT("} string_equals");
1190 }
1191 
1192 typedef void (Assembler::*conditional_branch_insn)(Register op1, Register op2, Label& label, bool is_far);
1193 typedef void (MacroAssembler::*float_conditional_branch_insn)(FloatRegister op1, FloatRegister op2, Label& label,
1194                                                               bool is_far, bool is_unordered);
1195 
1196 static conditional_branch_insn conditional_branches[] =
1197 {
1198   /* SHORT branches */
1199   (conditional_branch_insn)&Assembler::beq,
1200   (conditional_branch_insn)&Assembler::bgt,
1201   NULL, // BoolTest::overflow
1202   (conditional_branch_insn)&Assembler::blt,
1203   (conditional_branch_insn)&Assembler::bne,
1204   (conditional_branch_insn)&Assembler::ble,
1205   NULL, // BoolTest::no_overflow
1206   (conditional_branch_insn)&Assembler::bge,
1207 
1208   /* UNSIGNED branches */
1209   (conditional_branch_insn)&Assembler::beq,
1210   (conditional_branch_insn)&Assembler::bgtu,
1211   NULL,
1212   (conditional_branch_insn)&Assembler::bltu,
1213   (conditional_branch_insn)&Assembler::bne,
1214   (conditional_branch_insn)&Assembler::bleu,
1215   NULL,
1216   (conditional_branch_insn)&Assembler::bgeu
1217 };
1218 
1219 static float_conditional_branch_insn float_conditional_branches[] =
1220 {
1221   /* FLOAT SHORT branches */
1222   (float_conditional_branch_insn)&MacroAssembler::float_beq,
1223   (float_conditional_branch_insn)&MacroAssembler::float_bgt,
1224   NULL,  // BoolTest::overflow
1225   (float_conditional_branch_insn)&MacroAssembler::float_blt,
1226   (float_conditional_branch_insn)&MacroAssembler::float_bne,
1227   (float_conditional_branch_insn)&MacroAssembler::float_ble,
1228   NULL, // BoolTest::no_overflow
1229   (float_conditional_branch_insn)&MacroAssembler::float_bge,
1230 
1231   /* DOUBLE SHORT branches */
1232   (float_conditional_branch_insn)&MacroAssembler::double_beq,
1233   (float_conditional_branch_insn)&MacroAssembler::double_bgt,
1234   NULL,
1235   (float_conditional_branch_insn)&MacroAssembler::double_blt,
1236   (float_conditional_branch_insn)&MacroAssembler::double_bne,
1237   (float_conditional_branch_insn)&MacroAssembler::double_ble,
1238   NULL,
1239   (float_conditional_branch_insn)&MacroAssembler::double_bge
1240 };
1241 
1242 void C2_MacroAssembler::cmp_branch(int cmpFlag, Register op1, Register op2, Label& label, bool is_far) {
1243   assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(conditional_branches) / sizeof(conditional_branches[0])),
1244          "invalid conditional branch index");
1245   (this->*conditional_branches[cmpFlag])(op1, op2, label, is_far);
1246 }
1247 
1248 // This is a function should only be used by C2. Flip the unordered when unordered-greater, C2 would use
1249 // unordered-lesser instead of unordered-greater. Finally, commute the result bits at function do_one_bytecode().
1250 void C2_MacroAssembler::float_cmp_branch(int cmpFlag, FloatRegister op1, FloatRegister op2, Label& label, bool is_far) {
1251   assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(float_conditional_branches) / sizeof(float_conditional_branches[0])),
1252          "invalid float conditional branch index");
1253   int booltest_flag = cmpFlag & ~(C2_MacroAssembler::double_branch_mask);
1254   (this->*float_conditional_branches[cmpFlag])(op1, op2, label, is_far,
1255     (booltest_flag == (BoolTest::ge) || booltest_flag == (BoolTest::gt)) ? false : true);
1256 }
1257 
1258 void C2_MacroAssembler::enc_cmpUEqNeLeGt_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
1259   switch (cmpFlag) {
1260     case BoolTest::eq:
1261     case BoolTest::le:
1262       beqz(op1, L, is_far);
1263       break;
1264     case BoolTest::ne:
1265     case BoolTest::gt:
1266       bnez(op1, L, is_far);
1267       break;
1268     default:
1269       ShouldNotReachHere();
1270   }
1271 }
1272 
1273 void C2_MacroAssembler::enc_cmpEqNe_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
1274   switch (cmpFlag) {
1275     case BoolTest::eq:
1276       beqz(op1, L, is_far);
1277       break;
1278     case BoolTest::ne:
1279       bnez(op1, L, is_far);
1280       break;
1281     default:
1282       ShouldNotReachHere();
1283   }
1284 }
1285 
1286 void C2_MacroAssembler::enc_cmove(int cmpFlag, Register op1, Register op2, Register dst, Register src) {
1287   Label L;
1288   cmp_branch(cmpFlag ^ (1 << neg_cond_bits), op1, op2, L);
1289   mv(dst, src);
1290   bind(L);
1291 }
1292 
1293 // Set dst to NaN if any NaN input.
1294 void C2_MacroAssembler::minmax_FD(FloatRegister dst, FloatRegister src1, FloatRegister src2,
1295                                   bool is_double, bool is_min) {
1296   assert_different_registers(dst, src1, src2);
1297 
1298   Label Done;
1299   fsflags(zr);
1300   if (is_double) {
1301     is_min ? fmin_d(dst, src1, src2)
1302            : fmax_d(dst, src1, src2);
1303     // Checking NaNs
1304     flt_d(zr, src1, src2);
1305   } else {
1306     is_min ? fmin_s(dst, src1, src2)
1307            : fmax_s(dst, src1, src2);
1308     // Checking NaNs
1309     flt_s(zr, src1, src2);
1310   }
1311 
1312   frflags(t0);
1313   beqz(t0, Done);
1314 
1315   // In case of NaNs
1316   is_double ? fadd_d(dst, src1, src2)
1317             : fadd_s(dst, src1, src2);
1318 
1319   bind(Done);
1320 }
1321 
1322 void C2_MacroAssembler::element_compare(Register a1, Register a2, Register result, Register cnt, Register tmp1, Register tmp2,
1323                                         VectorRegister vr1, VectorRegister vr2, VectorRegister vrs, bool islatin, Label &DONE) {
1324   Label loop;
1325   Assembler::SEW sew = islatin ? Assembler::e8 : Assembler::e16;
1326 
1327   bind(loop);
1328   vsetvli(tmp1, cnt, sew, Assembler::m2);
1329   vlex_v(vr1, a1, sew);
1330   vlex_v(vr2, a2, sew);
1331   vmsne_vv(vrs, vr1, vr2);
1332   vfirst_m(tmp2, vrs);
1333   bgez(tmp2, DONE);
1334   sub(cnt, cnt, tmp1);
1335   if (!islatin) {
1336     slli(tmp1, tmp1, 1); // get byte counts
1337   }
1338   add(a1, a1, tmp1);
1339   add(a2, a2, tmp1);
1340   bnez(cnt, loop);
1341 
1342   mv(result, true);
1343 }
1344 
1345 void C2_MacroAssembler::string_equals_v(Register a1, Register a2, Register result, Register cnt, int elem_size) {
1346   Label DONE;
1347   Register tmp1 = t0;
1348   Register tmp2 = t1;
1349 
1350   BLOCK_COMMENT("string_equals_v {");
1351 
1352   mv(result, false);
1353 
1354   if (elem_size == 2) {
1355     srli(cnt, cnt, 1);
1356   }
1357 
1358   element_compare(a1, a2, result, cnt, tmp1, tmp2, v0, v2, v0, elem_size == 1, DONE);
1359 
1360   bind(DONE);
1361   BLOCK_COMMENT("} string_equals_v");
1362 }
1363 
1364 // used by C2 ClearArray patterns.
1365 // base: Address of a buffer to be zeroed
1366 // cnt: Count in HeapWords
1367 //
1368 // base, cnt, v0, v1 and t0 are clobbered.
1369 void C2_MacroAssembler::clear_array_v(Register base, Register cnt) {
1370   Label loop;
1371 
1372   // making zero words
1373   vsetvli(t0, cnt, Assembler::e64, Assembler::m4);
1374   vxor_vv(v0, v0, v0);
1375 
1376   bind(loop);
1377   vsetvli(t0, cnt, Assembler::e64, Assembler::m4);
1378   vse64_v(v0, base);
1379   sub(cnt, cnt, t0);
1380   slli(t0, t0, 3);
1381   add(base, base, t0);
1382   bnez(cnt, loop);
1383 }
1384 
1385 void C2_MacroAssembler::arrays_equals_v(Register a1, Register a2, Register result,
1386                                         Register cnt1, int elem_size) {
1387   Label DONE;
1388   Register tmp1 = t0;
1389   Register tmp2 = t1;
1390   Register cnt2 = tmp2;
1391   int length_offset = arrayOopDesc::length_offset_in_bytes();
1392   int base_offset = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
1393 
1394   BLOCK_COMMENT("arrays_equals_v {");
1395 
1396   // if (a1 == a2), return true
1397   mv(result, true);
1398   beq(a1, a2, DONE);
1399 
1400   mv(result, false);
1401   // if a1 == null or a2 == null, return false
1402   beqz(a1, DONE);
1403   beqz(a2, DONE);
1404   // if (a1.length != a2.length), return false
1405   lwu(cnt1, Address(a1, length_offset));
1406   lwu(cnt2, Address(a2, length_offset));
1407   bne(cnt1, cnt2, DONE);
1408 
1409   la(a1, Address(a1, base_offset));
1410   la(a2, Address(a2, base_offset));
1411 
1412   element_compare(a1, a2, result, cnt1, tmp1, tmp2, v0, v2, v0, elem_size == 1, DONE);
1413 
1414   bind(DONE);
1415 
1416   BLOCK_COMMENT("} arrays_equals_v");
1417 }
1418 
1419 void C2_MacroAssembler::string_compare_v(Register str1, Register str2, Register cnt1, Register cnt2,
1420                                          Register result, Register tmp1, Register tmp2, int encForm) {
1421   Label DIFFERENCE, DONE, L, loop;
1422   bool encLL = encForm == StrIntrinsicNode::LL;
1423   bool encLU = encForm == StrIntrinsicNode::LU;
1424   bool encUL = encForm == StrIntrinsicNode::UL;
1425 
1426   bool str1_isL = encLL || encLU;
1427   bool str2_isL = encLL || encUL;
1428 
1429   int minCharsInWord = encLL ? wordSize : wordSize / 2;
1430 
1431   BLOCK_COMMENT("string_compare {");
1432 
1433   // for Lating strings, 1 byte for 1 character
1434   // for UTF16 strings, 2 bytes for 1 character
1435   if (!str1_isL)
1436     sraiw(cnt1, cnt1, 1);
1437   if (!str2_isL)
1438     sraiw(cnt2, cnt2, 1);
1439 
1440   // if str1 == str2, return the difference
1441   // save the minimum of the string lengths in cnt2.
1442   sub(result, cnt1, cnt2);
1443   bgt(cnt1, cnt2, L);
1444   mv(cnt2, cnt1);
1445   bind(L);
1446 
1447   if (str1_isL == str2_isL) { // LL or UU
1448     element_compare(str1, str2, zr, cnt2, tmp1, tmp2, v2, v4, v1, encLL, DIFFERENCE);
1449     j(DONE);
1450   } else { // LU or UL
1451     Register strL = encLU ? str1 : str2;
1452     Register strU = encLU ? str2 : str1;
1453     VectorRegister vstr1 = encLU ? v4 : v0;
1454     VectorRegister vstr2 = encLU ? v0 : v4;
1455 
1456     bind(loop);
1457     vsetvli(tmp1, cnt2, Assembler::e8, Assembler::m2);
1458     vle8_v(vstr1, strL);
1459     vsetvli(tmp1, cnt2, Assembler::e16, Assembler::m4);
1460     vzext_vf2(vstr2, vstr1);
1461     vle16_v(vstr1, strU);
1462     vmsne_vv(v0, vstr2, vstr1);
1463     vfirst_m(tmp2, v0);
1464     bgez(tmp2, DIFFERENCE);
1465     sub(cnt2, cnt2, tmp1);
1466     add(strL, strL, tmp1);
1467     slli(tmp1, tmp1, 1);
1468     add(strU, strU, tmp1);
1469     bnez(cnt2, loop);
1470     j(DONE);
1471   }
1472   bind(DIFFERENCE);
1473   slli(tmp1, tmp2, 1);
1474   add(str1, str1, str1_isL ? tmp2 : tmp1);
1475   add(str2, str2, str2_isL ? tmp2 : tmp1);
1476   str1_isL ? lbu(tmp1, Address(str1, 0)) : lhu(tmp1, Address(str1, 0));
1477   str2_isL ? lbu(tmp2, Address(str2, 0)) : lhu(tmp2, Address(str2, 0));
1478   sub(result, tmp1, tmp2);
1479 
1480   bind(DONE);
1481 }
1482 
1483 void C2_MacroAssembler::byte_array_inflate_v(Register src, Register dst, Register len, Register tmp) {
1484   Label loop;
1485   assert_different_registers(src, dst, len, tmp, t0);
1486 
1487   BLOCK_COMMENT("byte_array_inflate_v {");
1488   bind(loop);
1489   vsetvli(tmp, len, Assembler::e8, Assembler::m2);
1490   vle8_v(v2, src);
1491   vsetvli(t0, len, Assembler::e16, Assembler::m4);
1492   vzext_vf2(v0, v2);
1493   vse16_v(v0, dst);
1494   sub(len, len, tmp);
1495   add(src, src, tmp);
1496   slli(tmp, tmp, 1);
1497   add(dst, dst, tmp);
1498   bnez(len, loop);
1499   BLOCK_COMMENT("} byte_array_inflate_v");
1500 }
1501 
1502 // Compress char[] array to byte[].
1503 // result: the array length if every element in array can be encoded; 0, otherwise.
1504 void C2_MacroAssembler::char_array_compress_v(Register src, Register dst, Register len, Register result, Register tmp) {
1505   Label done;
1506   encode_iso_array_v(src, dst, len, result, tmp);
1507   beqz(len, done);
1508   mv(result, zr);
1509   bind(done);
1510 }
1511 
1512 // result: the number of elements had been encoded.
1513 void C2_MacroAssembler::encode_iso_array_v(Register src, Register dst, Register len, Register result, Register tmp) {
1514   Label loop, DIFFERENCE, DONE;
1515 
1516   BLOCK_COMMENT("encode_iso_array_v {");
1517   mv(result, 0);
1518 
1519   bind(loop);
1520   mv(tmp, 0xff);
1521   vsetvli(t0, len, Assembler::e16, Assembler::m2);
1522   vle16_v(v2, src);
1523   // if element > 0xff, stop
1524   vmsgtu_vx(v1, v2, tmp);
1525   vfirst_m(tmp, v1);
1526   vmsbf_m(v0, v1);
1527   // compress char to byte
1528   vsetvli(t0, len, Assembler::e8);
1529   vncvt_x_x_w(v1, v2, Assembler::v0_t);
1530   vse8_v(v1, dst, Assembler::v0_t);
1531 
1532   bgez(tmp, DIFFERENCE);
1533   add(result, result, t0);
1534   add(dst, dst, t0);
1535   sub(len, len, t0);
1536   slli(t0, t0, 1);
1537   add(src, src, t0);
1538   bnez(len, loop);
1539   j(DONE);
1540 
1541   bind(DIFFERENCE);
1542   add(result, result, tmp);
1543 
1544   bind(DONE);
1545   BLOCK_COMMENT("} encode_iso_array_v");
1546 }
1547 
1548 void C2_MacroAssembler::has_negatives_v(Register ary1, Register len, Register result, Register tmp) {
1549   Label loop, DONE;
1550 
1551   mv(result, true);
1552 
1553   bind(loop);
1554   vsetvli(t0, len, Assembler::e8, Assembler::m4);
1555   vle8_v(v0, ary1);
1556   // if element highest bit is set, return true
1557   vmslt_vx(v0, v0, zr);
1558   vfirst_m(tmp, v0);
1559   bgez(tmp, DONE);
1560 
1561   sub(len, len, t0);
1562   add(ary1, ary1, t0);
1563   bnez(len, loop);
1564   mv(result, false);
1565 
1566   bind(DONE);
1567 }
1568 
1569 void C2_MacroAssembler::string_indexof_char_v(Register str1, Register cnt1,
1570                                               Register ch, Register result,
1571                                               Register tmp1, Register tmp2,
1572                                               bool isL) {
1573   mv(result, zr);
1574 
1575   Label loop, MATCH, DONE;
1576   Assembler::SEW sew = isL ? Assembler::e8 : Assembler::e16;
1577   bind(loop);
1578   vsetvli(tmp1, cnt1, sew, Assembler::m4);
1579   vlex_v(v0, str1, sew);
1580   vmseq_vx(v0, v0, ch);
1581   vfirst_m(tmp2, v0);
1582   bgez(tmp2, MATCH); // if equal, return index
1583 
1584   add(result, result, tmp1);
1585   sub(cnt1, cnt1, tmp1);
1586   if (!isL) slli(tmp1, tmp1, 1);
1587   add(str1, str1, tmp1);
1588   bnez(cnt1, loop);
1589 
1590   mv(result, -1);
1591   j(DONE);
1592 
1593   bind(MATCH);
1594   add(result, result, tmp2);
1595 
1596   bind(DONE);
1597 }
1598 
1599 // Set dst to NaN if any NaN input.
1600 void C2_MacroAssembler::minmax_FD_v(VectorRegister dst, VectorRegister src1, VectorRegister src2,
1601                                     bool is_double, bool is_min) {
1602   assert_different_registers(dst, src1, src2);
1603 
1604   vsetvli(t0, x0, is_double ? Assembler::e64 : Assembler::e32);
1605 
1606   is_min ? vfmin_vv(dst, src1, src2)
1607          : vfmax_vv(dst, src1, src2);
1608 
1609   vmfne_vv(v0,  src1, src1);
1610   vfadd_vv(dst, src1, src1, Assembler::v0_t);
1611   vmfne_vv(v0,  src2, src2);
1612   vfadd_vv(dst, src2, src2, Assembler::v0_t);
1613 }
1614 
1615 // Set dst to NaN if any NaN input.
1616 void C2_MacroAssembler::reduce_minmax_FD_v(FloatRegister dst,
1617                                            FloatRegister src1, VectorRegister src2,
1618                                            VectorRegister tmp1, VectorRegister tmp2,
1619                                            bool is_double, bool is_min) {
1620   assert_different_registers(src2, tmp1, tmp2);
1621 
1622   Label L_done, L_NaN;
1623   vsetvli(t0, x0, is_double ? Assembler::e64 : Assembler::e32);
1624   vfmv_s_f(tmp2, src1);
1625 
1626   is_min ? vfredmin_vs(tmp1, src2, tmp2)
1627          : vfredmax_vs(tmp1, src2, tmp2);
1628 
1629   fsflags(zr);
1630   // Checking NaNs
1631   vmflt_vf(tmp2, src2, src1);
1632   frflags(t0);
1633   bnez(t0, L_NaN);
1634   j(L_done);
1635 
1636   bind(L_NaN);
1637   vfmv_s_f(tmp2, src1);
1638   vfredsum_vs(tmp1, src2, tmp2);
1639 
1640   bind(L_done);
1641   vfmv_f_s(dst, tmp1);
1642 }