/*
 * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2020, 2022, Huawei Technologies Co., Ltd. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/output.hpp"
#include "opto/subnode.hpp"
#include "runtime/stubRoutines.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg,
                                  Register tmp1Reg, Register tmp2Reg, Register tmp3Reg) {
  // Use cr register to indicate the fast_lock result: zero for success; non-zero for failure.
  Register flag = t1;
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmp1Reg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;

  assert_different_registers(oop, box, tmp, disp_hdr, flag, tmp3Reg, t0);

  // Load markWord from object into displaced_header.
  ld(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(flag, oop);
    lwu(flag, Address(flag, Klass::access_flags_offset()));
    test_bit(flag, flag, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS));
    bnez(flag, cont, true /* is_far */);
  }

  // Check for existing monitor
  test_bit(t0, disp_hdr, exact_log2(markWord::monitor_value));
  bnez(t0, object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    mv(flag, 1); // Set non-zero flag to indicate 'failure' -> take slow-path
    j(cont);
  } else if (LockingMode == LM_LEGACY) {
    // Set tmp to be (markWord of object | UNLOCK_VALUE).
    ori(tmp, disp_hdr, markWord::unlocked_value);

    // Initialize the box. (Must happen before we update the object mark!)
    sd(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // Compare object markWord with an unlocked value (tmp) and if
    // equal exchange the stack address of our box with object markWord.
    // On failure disp_hdr contains the possibly locked markWord.
    cmpxchg(/*memory address*/oop, /*expected value*/tmp, /*new value*/box, Assembler::int64, Assembler::aq,
            Assembler::rl, /*result*/disp_hdr);
    mv(flag, zr);
    beq(disp_hdr, tmp, cont); // prepare zero flag and goto cont if we won the cas

    assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
    // If the compare-and-exchange succeeded, then we found an unlocked
    // object and have now locked it; we continue at label cont. Otherwise we
    // did not see an unlocked object, so try the fast recursive case.

    // Check if the owner is self by comparing the value in the
    // markWord of object (disp_hdr) with the stack pointer.
    sub(disp_hdr, disp_hdr, sp);
    mv(tmp, (intptr_t) (~(os::vm_page_size()-1) | (uintptr_t)markWord::lock_mask_in_place));
    // If (mark & lock_mask) == 0 and mark - sp < page_size, we are stack-locking and goto cont,
    // hence we can store 0 as the displaced header in the box, which indicates that it is a
    // recursive lock.
    andr(tmp/*==0?*/, disp_hdr, tmp);
    sd(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
    mv(flag, tmp); // we can use the value of tmp as the result here
    j(cont);
  } else {
    assert(LockingMode == LM_LIGHTWEIGHT, "");
    Label slow;
    lightweight_lock(oop, disp_hdr, tmp, tmp3Reg, slow);

    // Indicate success on completion.
    mv(flag, zr);
    j(count);
    bind(slow);
    mv(flag, 1); // Set non-zero flag to indicate 'failure' -> take slow-path
    j(no_count);
  }

  // Handle existing monitor.
  bind(object_has_monitor);
  // The object's monitor m is unlocked iff m->owner == nullptr,
  // otherwise m->owner may contain a thread or a stack address.
  //
  // Try to CAS m->owner from null to the current thread's lock id.
  add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset()) - markWord::monitor_value));
  Register tid = t0;
  ld(tid, Address(xthread, JavaThread::lock_id_offset()));
  cmpxchg(/*memory address*/tmp, /*expected value*/zr, /*new value*/tid, Assembler::int64, Assembler::aq,
          Assembler::rl, /*result*/flag); // cas succeeds if flag == zr(expected)

  if (LockingMode != LM_LIGHTWEIGHT) {
    // Store a non-null value into the box to avoid looking like a re-entrant
    // lock. The fast-path monitor unlock code checks for
    // markWord::monitor_value so use markWord::unused_mark which has the
    // relevant bit set, and also matches ObjectSynchronizer::slow_enter.
    mv(tmp, (address)markWord::unused_mark().value());
    sd(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
  }

  beqz(flag, cont); // CAS success means locking succeeded

  bne(flag, tid, cont); // Check for recursive locking

  // Recursive lock case
  mv(flag, zr);
  increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1, t0, tmp);

  bind(cont);
  // zero flag indicates success
  // non-zero flag indicates failure
  bnez(flag, no_count);

  bind(count);
  increment(Address(xthread, JavaThread::held_monitor_count_offset()), 1, t0, tmp);

  bind(no_count);
}
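
// A rough C-style sketch of the LM_LEGACY fast path above (illustrative only;
// mark_of() and cas() are placeholder helpers, not real runtime functions):
//
//   uintptr_t mark = mark_of(obj) | markWord::unlocked_value; // assume unlocked
//   box->displaced_header = mark;                  // saved for fast_unlock
//   if (cas(&obj->mark, mark, (uintptr_t)box)) {
//     return success;                              // we now own the stack-lock
//   }
//   // CAS failed: check for recursive stack-locking. In that case the mark
//   // holds the address of a BasicLock on our own stack, so mark - sp is a
//   // small positive value and the low lock bits are clear.
//   if (((mark_of(obj) - sp) & (~(page_size - 1) | lock_mask)) == 0) {
//     box->displaced_header = 0;                   // 0 marks a recursive lock
//     return success;
//   }
//   return failure;                                // inflate / take slow path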

void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg,
                                    Register tmp1Reg, Register tmp2Reg) {
  // Use cr register to indicate the fast_unlock result: zero for success; non-zero for failure.
  Register flag = t1;
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmp1Reg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;

  assert_different_registers(oop, box, tmp, disp_hdr, flag, t0);

  if (LockingMode == LM_LEGACY) {
    // Find the lock address and load the displaced header from the stack.
    ld(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // If the displaced header is 0, we have a recursive unlock.
    mv(flag, disp_hdr);
    beqz(disp_hdr, cont);
  }

  // Handle existing monitor.
  ld(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
  test_bit(t0, tmp, exact_log2(markWord::monitor_value));
  bnez(t0, object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    mv(flag, 1); // Set non-zero flag to indicate 'failure' -> take slow path
    j(cont);
  } else if (LockingMode == LM_LEGACY) {
    // Check if it is still a lightweight lock; this is true if we
    // see the stack address of the basicLock in the markWord of the
    // object.

    cmpxchg(/*memory address*/oop, /*expected value*/box, /*new value*/disp_hdr, Assembler::int64, Assembler::relaxed,
            Assembler::rl, /*result*/tmp);
    xorr(flag, box, tmp); // box == tmp if cas succeeds
    j(cont);
  } else {
    assert(LockingMode == LM_LIGHTWEIGHT, "");
    Label slow;
    lightweight_unlock(oop, tmp, box, disp_hdr, slow);

    // Indicate success on completion.
    mv(flag, zr);
    j(count);
    bind(slow);
    mv(flag, 1); // Set non-zero flag to indicate 'failure' -> take slow path
    j(no_count);
  }

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // Handle existing monitor.
  bind(object_has_monitor);
  STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
  add(tmp, tmp, -(int)markWord::monitor_value); // monitor

  if (LockingMode == LM_LIGHTWEIGHT) {
    // If the owner is anonymous, we need to fix it -- in an outline stub.
    Register tmp2 = disp_hdr;
    ld(tmp2, Address(tmp, ObjectMonitor::owner_offset()));
    mv(t0, (int64_t)ObjectMonitor::ANONYMOUS_OWNER);
    C2HandleAnonOMOwnerStub* stub = new (Compile::current()->comp_arena()) C2HandleAnonOMOwnerStub(tmp, tmp2);
    Compile::current()->output()->add_stub(stub);
    beq(t0, tmp2, stub->entry(), /* is_far */ true);
    bind(stub->continuation());
  }

  ld(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));

  Label notRecursive;
  beqz(disp_hdr, notRecursive); // Will be 0 if not recursive.

  // Recursive lock
  addi(disp_hdr, disp_hdr, -1);
  sd(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
  mv(flag, zr);
  j(cont);

  bind(notRecursive);
  ld(flag, Address(tmp, ObjectMonitor::EntryList_offset()));
  ld(disp_hdr, Address(tmp, ObjectMonitor::cxq_offset()));
  orr(flag, flag, disp_hdr); // Will be 0 if both are 0.
  bnez(flag, cont);
  // need a release store here
  la(tmp, Address(tmp, ObjectMonitor::owner_offset()));
  membar(MacroAssembler::LoadStore | MacroAssembler::StoreStore);
  sd(zr, Address(tmp)); // set unowned

  bind(cont);
  // zero flag indicates success
  // non-zero flag indicates failure
  bnez(flag, no_count);

  bind(count);
  decrement(Address(xthread, JavaThread::held_monitor_count_offset()), 1, t0, tmp);

  bind(no_count);
}
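
// Rough shape of the inflated-monitor exit above, as C-like pseudocode
// (illustrative only; field names follow ObjectMonitor):
//
//   if (m->recursions != 0) { m->recursions--; return success; }
//   if (m->EntryList != nullptr || m->cxq != nullptr) {
//     return failure;                  // a waiter exists: take the slow path
//   }
//   release_store(&m->owner, nullptr); // the LoadStore|StoreStore fence + sd zr
//   return success;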

// short string
// StringUTF16.indexOfChar
// StringLatin1.indexOfChar
void C2_MacroAssembler::string_indexof_char_short(Register str1, Register cnt1,
                                                  Register ch, Register result,
                                                  bool isL)
{
  Register ch1 = t0;
  Register index = t1;

  BLOCK_COMMENT("string_indexof_char_short {");

  Label LOOP, LOOP1, LOOP4, LOOP8;
  Label MATCH,  MATCH1, MATCH2, MATCH3,
        MATCH4, MATCH5, MATCH6, MATCH7, NOMATCH;

  mv(result, -1);
  mv(index, zr);

  bind(LOOP);
  addi(t0, index, 8);
  ble(t0, cnt1, LOOP8);
  addi(t0, index, 4);
  ble(t0, cnt1, LOOP4);
  j(LOOP1);

  bind(LOOP8);
  isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0));
  beq(ch, ch1, MATCH);
  isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2));
  beq(ch, ch1, MATCH1);
  isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4));
  beq(ch, ch1, MATCH2);
  isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6));
  beq(ch, ch1, MATCH3);
  isL ? lbu(ch1, Address(str1, 4)) : lhu(ch1, Address(str1, 8));
  beq(ch, ch1, MATCH4);
  isL ? lbu(ch1, Address(str1, 5)) : lhu(ch1, Address(str1, 10));
  beq(ch, ch1, MATCH5);
  isL ? lbu(ch1, Address(str1, 6)) : lhu(ch1, Address(str1, 12));
  beq(ch, ch1, MATCH6);
  isL ? lbu(ch1, Address(str1, 7)) : lhu(ch1, Address(str1, 14));
  beq(ch, ch1, MATCH7);
  addi(index, index, 8);
  addi(str1, str1, isL ? 8 : 16);
  blt(index, cnt1, LOOP);
  j(NOMATCH);

  bind(LOOP4);
  isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0));
  beq(ch, ch1, MATCH);
  isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2));
  beq(ch, ch1, MATCH1);
  isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4));
  beq(ch, ch1, MATCH2);
  isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6));
  beq(ch, ch1, MATCH3);
  addi(index, index, 4);
  addi(str1, str1, isL ? 4 : 8);
  bge(index, cnt1, NOMATCH);

  bind(LOOP1);
  isL ? lbu(ch1, Address(str1)) : lhu(ch1, Address(str1));
  beq(ch, ch1, MATCH);
  addi(index, index, 1);
  addi(str1, str1, isL ? 1 : 2);
  blt(index, cnt1, LOOP1);
  j(NOMATCH);

  bind(MATCH1);
  addi(index, index, 1);
  j(MATCH);

  bind(MATCH2);
  addi(index, index, 2);
  j(MATCH);

  bind(MATCH3);
  addi(index, index, 3);
  j(MATCH);

  bind(MATCH4);
  addi(index, index, 4);
  j(MATCH);

  bind(MATCH5);
  addi(index, index, 5);
  j(MATCH);

  bind(MATCH6);
  addi(index, index, 6);
  j(MATCH);

  bind(MATCH7);
  addi(index, index, 7);

  bind(MATCH);
  mv(result, index);
  bind(NOMATCH);
  BLOCK_COMMENT("} string_indexof_char_short");
}

// StringUTF16.indexOfChar
// StringLatin1.indexOfChar
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2,
                                            Register tmp3, Register tmp4,
                                            bool isL)
{
  Label CH1_LOOP, HIT, NOMATCH, DONE, DO_LONG;
  Register ch1 = t0;
  Register orig_cnt = t1;
  Register mask1 = tmp3;
  Register mask2 = tmp2;
  Register match_mask = tmp1;
  Register trailing_char = tmp4;
  Register unaligned_elems = tmp4;

  BLOCK_COMMENT("string_indexof_char {");
  beqz(cnt1, NOMATCH);

  addi(t0, cnt1, isL ? -32 : -16);
  bgtz(t0, DO_LONG);
  string_indexof_char_short(str1, cnt1, ch, result, isL);
  j(DONE);

  bind(DO_LONG);
  mv(orig_cnt, cnt1);
  if (AvoidUnalignedAccesses) {
    Label ALIGNED;
    andi(unaligned_elems, str1, 0x7);
    beqz(unaligned_elems, ALIGNED);
    sub(unaligned_elems, unaligned_elems, 8);
    neg(unaligned_elems, unaligned_elems);
    if (!isL) {
      srli(unaligned_elems, unaligned_elems, 1);
    }
    // do unaligned part per element
    string_indexof_char_short(str1, unaligned_elems, ch, result, isL);
    bgez(result, DONE);
    mv(orig_cnt, cnt1);
    sub(cnt1, cnt1, unaligned_elems);
    bind(ALIGNED);
  }

  // duplicate ch
  if (isL) {
    slli(ch1, ch, 8);
    orr(ch, ch1, ch);
  }
  slli(ch1, ch, 16);
  orr(ch, ch1, ch);
  slli(ch1, ch, 32);
  orr(ch, ch1, ch);

  if (!isL) {
    slli(cnt1, cnt1, 1);
  }

  uint64_t mask0101 = UCONST64(0x0101010101010101);
  uint64_t mask0001 = UCONST64(0x0001000100010001);
  mv(mask1, isL ? mask0101 : mask0001);
  uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f);
  uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff);
  mv(mask2, isL ? mask7f7f : mask7fff);
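
  // The two masks drive a standard SWAR zero-lane test (a sketch of the idiom
  // compute_match_mask() is assumed to implement): with x = chunk ^ repeated_ch,
  // a lane of x is zero exactly where a character matched, and
  //   match_mask = (x - mask1) & ~(x | mask2)
  // sets the top bit of every zero lane. Worked byte example (Latin1):
  //   x = 0x00: (0x00 - 0x01) & ~(0x00 | 0x7f) = 0xff & 0x80 = 0x80  -> hit
  //   x = 0x41: (0x41 - 0x01) & ~(0x41 | 0x7f) = 0x40 & 0x80 = 0x00  -> miss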

  bind(CH1_LOOP);
  ld(ch1, Address(str1));
  addi(str1, str1, 8);
  addi(cnt1, cnt1, -8);
  compute_match_mask(ch1, ch, match_mask, mask1, mask2);
  bnez(match_mask, HIT);
  bgtz(cnt1, CH1_LOOP);
  j(NOMATCH);

  bind(HIT);
  ctzc_bit(trailing_char, match_mask, isL, ch1, result);
  srli(trailing_char, trailing_char, 3);
  addi(cnt1, cnt1, 8);
  ble(cnt1, trailing_char, NOMATCH);
  // match case
  if (!isL) {
    srli(cnt1, cnt1, 1);
    srli(trailing_char, trailing_char, 1);
  }

  sub(result, orig_cnt, cnt1);
  add(result, result, trailing_char);
  j(DONE);

  bind(NOMATCH);
  mv(result, -1);

  bind(DONE);
  BLOCK_COMMENT("} string_indexof_char");
}

typedef void (MacroAssembler::* load_chr_insn)(Register rd, const Address &adr, Register temp);

// Search for needle in haystack and return index or -1
// x10: result
// x11: haystack
// x12: haystack_len
// x13: needle
// x14: needle_len
void C2_MacroAssembler::string_indexof(Register haystack, Register needle,
                                       Register haystack_len, Register needle_len,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       Register result, int ae)
{
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  Label LINEARSEARCH, LINEARSTUB, DONE, NOMATCH;

  Register ch1 = t0;
  Register ch2 = t1;
  Register nlen_tmp = tmp1; // needle len tmp
  Register hlen_tmp = tmp2; // haystack len tmp
  Register result_tmp = tmp4;

  bool isLL = ae == StrIntrinsicNode::LL;

  bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int needle_chr_shift = needle_isL ? 0 : 1;
  int haystack_chr_shift = haystack_isL ? 0 : 1;
  int needle_chr_size = needle_isL ? 1 : 2;
  int haystack_chr_size = haystack_isL ? 1 : 2;
  load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu :
                              (load_chr_insn)&MacroAssembler::lhu;
  load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu :
                                (load_chr_insn)&MacroAssembler::lhu;

  BLOCK_COMMENT("string_indexof {");

  // Note, inline_string_indexOf() generates checks:
  // if (pattern.count > src.count) return -1;
  // if (pattern.count == 0) return 0;

  // We have two strings, a source string in haystack, haystack_len and a pattern string
  // in needle, needle_len. Find the first occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  // needle_len >= 8 && needle_len < 256 && needle_len < haystack_len/4, use bmh algorithm.
  sub(result_tmp, haystack_len, needle_len);
  // needle_len < 8, use linear scan
  sub(t0, needle_len, 8);
  bltz(t0, LINEARSEARCH);
  // needle_len >= 256, use linear scan
  sub(t0, needle_len, 256);
  bgez(t0, LINEARSTUB);
  // needle_len >= haystack_len/4, use linear scan
  srli(t0, haystack_len, 2);
  bge(needle_len, t0, LINEARSTUB);

  // Boyer-Moore-Horspool introduction:
  // The Boyer Moore algorithm is based on the description here:
  //
  // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  //
  // This describes an algorithm with 2 shift rules. The 'Bad Character' rule
  // and the 'Good Suffix' rule.
  //
  // These rules are essentially heuristics for how far we can shift the
  // pattern along the search string.
  //
  // The implementation here uses the 'Bad Character' rule only because of the
  // complexity of initialisation for the 'Good Suffix' rule.
  //
  // This is also known as the Boyer-Moore-Horspool algorithm:
  //
  // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
  //
  // #define ASIZE 256
  //
  //    int bm(unsigned char *pattern, int m, unsigned char *src, int n) {
  //      int i, j;
  //      unsigned c;
  //      unsigned char bc[ASIZE];
  //
  //      /* Preprocessing */
  //      for (i = 0; i < ASIZE; ++i)
  //        bc[i] = m;
  //      for (i = 0; i < m - 1; ) {
  //        c = pattern[i];
  //        ++i;
  //        // c < 256 for a Latin1 string, so no need for a branch
  //        #ifdef PATTERN_STRING_IS_LATIN1
  //        bc[c] = m - i;
  //        #else
  //        if (c < ASIZE) bc[c] = m - i;
  //        #endif
  //      }
  //
  //      /* Searching */
  //      j = 0;
  //      while (j <= n - m) {
  //        c = src[j + m - 1];
  //        if (pattern[m - 1] == c) {
  //          int k;
  //          for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k);
  //          if (k < 0) return j;
  //        }
  //        // c < 256 for a Latin1 string, so no need for a branch
  //        #ifdef SOURCE_STRING_IS_LATIN1_AND_PATTERN_STRING_IS_LATIN1
  //        // LL case: (c < 256) always true. Remove branch
  //        j += bc[c];
  //        #endif
  //        #ifdef SOURCE_STRING_IS_UTF_AND_PATTERN_STRING_IS_UTF
  //        // UU case: need if (c < ASIZE) check. Skip 1 character if not.
  //        if (c < ASIZE)
  //          j += bc[c];
  //        else
  //          j += 1;
  //        #endif
  //        #ifdef SOURCE_IS_UTF_AND_PATTERN_IS_LATIN1
  //        // UL case: need if (c < ASIZE) check. Skip <pattern length> if not.
  //        if (c < ASIZE)
  //          j += bc[c];
  //        else
  //          j += m;
  //        #endif
  //      }
  //      return -1;
  //    }

  // temp registers: t0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, result
  Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;

  Register haystack_end = haystack_len;
  Register skipch = tmp2;

  // The pattern length is >= 8, so we can read at least one full register for the
  // cases where UTF->Latin1 conversion is not needed (8 LL or 4 UU characters),
  // and half a register for the UL case. We re-read the last character in the
  // inner pre-loop code, which allows a single load in the outer pre-loop.
  const int firstStep = isLL ? 7 : 3;

  const int ASIZE = 256;
  const int STORE_BYTES = 8; // 8 bytes stored per instruction (sd)

  sub(sp, sp, ASIZE);

  // init BC offset table with default value: needle_len
  slli(t0, needle_len, 8);
  orr(t0, t0, needle_len); // [63...16][needle_len][needle_len]
  slli(tmp1, t0, 16);
  orr(t0, tmp1, t0); // [63...32][needle_len][needle_len][needle_len][needle_len]
  slli(tmp1, t0, 32);
  orr(tmp5, tmp1, t0); // tmp5: 8 elements [needle_len]

  mv(ch1, sp);  // ch1 is t0
  mv(tmp6, ASIZE / STORE_BYTES); // loop iterations

  bind(BM_INIT_LOOP);
  // for (i = 0; i < ASIZE; ++i)
  //   bc[i] = m;
  for (int i = 0; i < 4; i++) {
    sd(tmp5, Address(ch1, i * wordSize));
  }
  add(ch1, ch1, 32);
  sub(tmp6, tmp6, 4);
  bgtz(tmp6, BM_INIT_LOOP);

  sub(nlen_tmp, needle_len, 1); // m - 1, index of the last element in pattern
  Register orig_haystack = tmp5;
  mv(orig_haystack, haystack);
  // result_tmp = tmp4
  shadd(haystack_end, result_tmp, haystack, haystack_end, haystack_chr_shift);
  sub(ch2, needle_len, 1); // bc offset init value, ch2 is t1
  mv(tmp3, needle);

  //  for (i = 0; i < m - 1; ) {
  //    c = pattern[i];
  //    ++i;
  //    // c < 256 for a Latin1 string, so no need for a branch
  //    #ifdef PATTERN_STRING_IS_LATIN1
  //    bc[c] = m - i;
  //    #else
  //    if (c < ASIZE) bc[c] = m - i;
  //    #endif
  //  }
  bind(BCLOOP);
  (this->*needle_load_1chr)(ch1, Address(tmp3), noreg);
  add(tmp3, tmp3, needle_chr_size);
  if (!needle_isL) {
    // ae == StrIntrinsicNode::UU
    mv(tmp6, ASIZE);
    bgeu(ch1, tmp6, BCSKIP);
  }
  add(tmp4, sp, ch1);
  sb(ch2, Address(tmp4)); // store skip offset to BC offset table

  bind(BCSKIP);
  sub(ch2, ch2, 1); // for next pattern element, skip distance -1
  bgtz(ch2, BCLOOP);

  // tmp6: pattern end, address after needle
  shadd(tmp6, needle_len, needle, tmp6, needle_chr_shift);
  if (needle_isL == haystack_isL) {
    // load last 8 bytes (8 LL / 4 UU symbols)
    ld(tmp6, Address(tmp6, -wordSize));
  } else {
    // UL: search a UTF-16 source for a Latin1 pattern
    lwu(tmp6, Address(tmp6, -wordSize / 2)); // load last 4 bytes (4 symbols)
    // convert Latin1 to UTF, e.g. 0x0000abcd -> 0x0a0b0c0d
    // We have to wait until the load completes, but this is still faster than
    // per-character loads and checks.
    srli(tmp3, tmp6, BitsPerByte * (wordSize / 2 - needle_chr_size)); // pattern[m-1], e.g. 0x0000000a
    slli(ch2, tmp6, XLEN - 24);
    srli(ch2, ch2, XLEN - 8); // pattern[m-2], 0x0000000b
    slli(ch1, tmp6, XLEN - 16);
    srli(ch1, ch1, XLEN - 8); // pattern[m-3], 0x0000000c
    andi(tmp6, tmp6, 0xff); // pattern[m-4], 0x0000000d
    slli(ch2, ch2, 16);
    orr(ch2, ch2, ch1); // 0x00000b0c
    slli(result, tmp3, 48); // use result as temp register
    orr(tmp6, tmp6, result); // 0x0a00000d
    slli(result, ch2, 16);
    orr(tmp6, tmp6, result); // UTF-16: 0x0a0b0c0d
  }

  // i = m - 1;
  // skipch = src[j + i];
  // if (skipch == pattern[m - 1])
  //   for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k);
  // else
  //   advance j using the bad char offset table
  bind(BMLOOPSTR2);
  // compare pattern to source string backward
  shadd(result, nlen_tmp, haystack, result, haystack_chr_shift);
  (this->*haystack_load_1chr)(skipch, Address(result), noreg);
  sub(nlen_tmp, nlen_tmp, firstStep); // nlen_tmp is positive here, because needle_len >= 8
  if (needle_isL == haystack_isL) {
    // Re-init tmp3. It's free because it executes in parallel with the
    // load above. The alternative is to initialize it before the loop, but that
    // would hurt performance on in-order systems with 2 or more ld/st pipelines.
    srli(tmp3, tmp6, BitsPerByte * (wordSize - needle_chr_size)); // UU/LL: pattern[m-1]
  }
  if (!isLL) { // UU/UL case
    slli(ch2, nlen_tmp, 1); // offsets in bytes
  }
  bne(tmp3, skipch, BMSKIP); // if not equal, skipch is bad char
  add(result, haystack, isLL ? nlen_tmp : ch2);
  // load 8 bytes from source string
  // if isLL is false then read granularity can be 2
  load_long_misaligned(ch2, Address(result), ch1, isLL ? 1 : 2); // can use ch1 as temp register here as it will be trashed by next mv anyway
  mv(ch1, tmp6);
  if (isLL) {
    j(BMLOOPSTR1_AFTER_LOAD);
  } else {
    sub(nlen_tmp, nlen_tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
    j(BMLOOPSTR1_CMP);
  }

  bind(BMLOOPSTR1);
  shadd(ch1, nlen_tmp, needle, ch1, needle_chr_shift);
  (this->*needle_load_1chr)(ch1, Address(ch1), noreg);
  shadd(ch2, nlen_tmp, haystack, ch2, haystack_chr_shift);
  (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);

  bind(BMLOOPSTR1_AFTER_LOAD);
  sub(nlen_tmp, nlen_tmp, 1);
  bltz(nlen_tmp, BMLOOPSTR1_LASTCMP);

  bind(BMLOOPSTR1_CMP);
  beq(ch1, ch2, BMLOOPSTR1);

  bind(BMSKIP);
  if (!isLL) {
    // if we've met a UTF symbol while searching the Latin1 pattern, then we can
    // skip needle_len symbols
    if (needle_isL != haystack_isL) {
      mv(result_tmp, needle_len);
    } else {
      mv(result_tmp, 1);
    }
    mv(t0, ASIZE);
    bgeu(skipch, t0, BMADV);
  }
  add(result_tmp, sp, skipch);
  lbu(result_tmp, Address(result_tmp)); // load skip offset

  bind(BMADV);
  sub(nlen_tmp, needle_len, 1);
  // move haystack after bad char skip offset
  shadd(haystack, result_tmp, haystack, result, haystack_chr_shift);
  ble(haystack, haystack_end, BMLOOPSTR2);
  add(sp, sp, ASIZE);
  j(NOMATCH);

  bind(BMLOOPSTR1_LASTCMP);
  bne(ch1, ch2, BMSKIP);

  bind(BMMATCH);
  sub(result, haystack, orig_haystack);
  if (!haystack_isL) {
    srli(result, result, 1);
  }
  add(sp, sp, ASIZE);
  j(DONE);

  bind(LINEARSTUB);
  sub(t0, needle_len, 16); // small patterns should still be handled by the simple algorithm
  bltz(t0, LINEARSEARCH);
  mv(result, zr);
  RuntimeAddress stub = nullptr;
  if (isLL) {
    stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ll());
    assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
  } else if (needle_isL) {
    stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ul());
    assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
  } else {
    stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_uu());
    assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
  }
  address call = trampoline_call(stub);
  if (call == nullptr) {
    DEBUG_ONLY(reset_labels(LINEARSEARCH, DONE, NOMATCH));
    ciEnv::current()->record_failure("CodeCache is full");
    return;
  }
  j(DONE);

  bind(NOMATCH);
  mv(result, -1);
  j(DONE);

  bind(LINEARSEARCH);
  string_indexof_linearscan(haystack, needle, haystack_len, needle_len, tmp1, tmp2, tmp3, tmp4, -1, result, ae);

  bind(DONE);
  BLOCK_COMMENT("} string_indexof");
}

// string_indexof
// result: x10
// src: x11
// src_count: x12
// pattern: x13
// pattern_count: x14 or 1/2/3/4
void C2_MacroAssembler::string_indexof_linearscan(Register haystack, Register needle,
                                                  Register haystack_len, Register needle_len,
                                                  Register tmp1, Register tmp2,
                                                  Register tmp3, Register tmp4,
                                                  int needle_con_cnt, Register result, int ae)
{
  // Note:
  // needle_con_cnt > 0 means the needle_len register is invalid and the needle length is constant;
  // for UU/LL: needle_con_cnt is in [1, 4]; for UL: needle_con_cnt = 1
  assert(needle_con_cnt <= 4, "Invalid needle constant count");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  Register ch1 = t0;
  Register ch2 = t1;
  Register hlen_neg = haystack_len, nlen_neg = needle_len;
  Register nlen_tmp = tmp1, hlen_tmp = tmp2, result_tmp = tmp4;

  bool isLL = ae == StrIntrinsicNode::LL;

  bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int needle_chr_shift = needle_isL ? 0 : 1;
  int haystack_chr_shift = haystack_isL ? 0 : 1;
  int needle_chr_size = needle_isL ? 1 : 2;
  int haystack_chr_size = haystack_isL ? 1 : 2;

  load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu :
                              (load_chr_insn)&MacroAssembler::lhu;
  load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu :
                                (load_chr_insn)&MacroAssembler::lhu;
  load_chr_insn load_2chr = isLL ? (load_chr_insn)&MacroAssembler::lhu : (load_chr_insn)&MacroAssembler::lwu;
  load_chr_insn load_4chr = isLL ? (load_chr_insn)&MacroAssembler::lwu : (load_chr_insn)&MacroAssembler::ld;

  Label DO1, DO2, DO3, MATCH, NOMATCH, DONE;

  Register first = tmp3;

  if (needle_con_cnt == -1) {
    Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

    sub(t0, needle_len, needle_isL == haystack_isL ? 4 : 2);
    bltz(t0, DOSHORT);

    (this->*needle_load_1chr)(first, Address(needle), noreg);
    slli(t0, needle_len, needle_chr_shift);
    add(needle, needle, t0);
    neg(nlen_neg, t0);
    slli(t0, result_tmp, haystack_chr_shift);
    add(haystack, haystack, t0);
    neg(hlen_neg, t0);

    bind(FIRST_LOOP);
    add(t0, haystack, hlen_neg);
    (this->*haystack_load_1chr)(ch2, Address(t0), noreg);
    beq(first, ch2, STR1_LOOP);

    bind(STR2_NEXT);
    add(hlen_neg, hlen_neg, haystack_chr_size);
    blez(hlen_neg, FIRST_LOOP);
    j(NOMATCH);

    bind(STR1_LOOP);
    add(nlen_tmp, nlen_neg, needle_chr_size);
    add(hlen_tmp, hlen_neg, haystack_chr_size);
    bgez(nlen_tmp, MATCH);

    bind(STR1_NEXT);
    add(ch1, needle, nlen_tmp);
    (this->*needle_load_1chr)(ch1, Address(ch1), noreg);
    add(ch2, haystack, hlen_tmp);
    (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
    bne(ch1, ch2, STR2_NEXT);
    add(nlen_tmp, nlen_tmp, needle_chr_size);
    add(hlen_tmp, hlen_tmp, haystack_chr_size);
    bltz(nlen_tmp, STR1_NEXT);
    j(MATCH);

    bind(DOSHORT);
    if (needle_isL == haystack_isL) {
      sub(t0, needle_len, 2);
      bltz(t0, DO1);
      bgtz(t0, DO3);
    }
  }

  if (needle_con_cnt == 4) {
    Label CH1_LOOP;
    (this->*load_4chr)(ch1, Address(needle), noreg);
    sub(result_tmp, haystack_len, 4);
    slli(tmp3, result_tmp, haystack_chr_shift); // result as tmp
    add(haystack, haystack, tmp3);
    neg(hlen_neg, tmp3);
    if (AvoidUnalignedAccesses) {
      // preload the first value, then read one character per iteration instead
      // of four, shifting the previous ch2 right by the character size in bits
      add(tmp3, haystack, hlen_neg);
      (this->*load_4chr)(ch2, Address(tmp3), noreg);
      if (isLL) {
        // need to erase the most significant byte in the 32-bit value of ch2
        slli(ch2, ch2, 40);
        srli(ch2, ch2, 32);
      } else {
        slli(ch2, ch2, 16); // the 2 most significant bytes are erased by this operation
      }
    }

    bind(CH1_LOOP);
    add(tmp3, haystack, hlen_neg);
    if (AvoidUnalignedAccesses) {
      srli(ch2, ch2, isLL ? 8 : 16);
      (this->*haystack_load_1chr)(tmp3, Address(tmp3, isLL ? 3 : 6), noreg);
      slli(tmp3, tmp3, isLL ? 24 : 48);
      add(ch2, ch2, tmp3);
    } else {
      (this->*load_4chr)(ch2, Address(tmp3), noreg);
    }
    beq(ch1, ch2, MATCH);
    add(hlen_neg, hlen_neg, haystack_chr_size);
    blez(hlen_neg, CH1_LOOP);
    j(NOMATCH);
  }

  if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 2) {
    Label CH1_LOOP;
    BLOCK_COMMENT("string_indexof DO2 {");
    bind(DO2);
    (this->*load_2chr)(ch1, Address(needle), noreg);
    if (needle_con_cnt == 2) {
      sub(result_tmp, haystack_len, 2);
    }
    slli(tmp3, result_tmp, haystack_chr_shift);
    add(haystack, haystack, tmp3);
    neg(hlen_neg, tmp3);
    if (AvoidUnalignedAccesses) {
      // preload the first value, then read one character per iteration instead
      // of two, shifting the previous ch2 right by the character size in bits
      add(tmp3, haystack, hlen_neg);
      (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg);
      slli(ch2, ch2, isLL ? 8 : 16);
    }
    bind(CH1_LOOP);
    add(tmp3, haystack, hlen_neg);
    if (AvoidUnalignedAccesses) {
      srli(ch2, ch2, isLL ? 8 : 16);
      (this->*haystack_load_1chr)(tmp3, Address(tmp3, isLL ? 1 : 2), noreg);
      slli(tmp3, tmp3, isLL ? 8 : 16);
      add(ch2, ch2, tmp3);
    } else {
      (this->*load_2chr)(ch2, Address(tmp3), noreg);
    }
    beq(ch1, ch2, MATCH);
    add(hlen_neg, hlen_neg, haystack_chr_size);
    blez(hlen_neg, CH1_LOOP);
    j(NOMATCH);
    BLOCK_COMMENT("} string_indexof DO2");
  }

  if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 3) {
    Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
    BLOCK_COMMENT("string_indexof DO3 {");

    bind(DO3);
    (this->*load_2chr)(first, Address(needle), noreg);
    (this->*needle_load_1chr)(ch1, Address(needle, 2 * needle_chr_size), noreg);
    if (needle_con_cnt == 3) {
      sub(result_tmp, haystack_len, 3);
    }
    slli(hlen_tmp, result_tmp, haystack_chr_shift);
    add(haystack, haystack, hlen_tmp);
    neg(hlen_neg, hlen_tmp);

    bind(FIRST_LOOP);
    add(ch2, haystack, hlen_neg);
    if (AvoidUnalignedAccesses) {
      // we need a temp register; hlen_tmp is safe to use here, as it is a synonym for tmp2
      (this->*haystack_load_1chr)(tmp2, Address(ch2, isLL ? 1 : 2), noreg);
      (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
      slli(tmp2, tmp2, isLL ? 8 : 16);
      add(ch2, ch2, tmp2);
    } else {
      (this->*load_2chr)(ch2, Address(ch2), noreg);
    }
    beq(first, ch2, STR1_LOOP);

    bind(STR2_NEXT);
    add(hlen_neg, hlen_neg, haystack_chr_size);
    blez(hlen_neg, FIRST_LOOP);
    j(NOMATCH);

    bind(STR1_LOOP);
    add(hlen_tmp, hlen_neg, 2 * haystack_chr_size);
    add(ch2, haystack, hlen_tmp);
    (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
    bne(ch1, ch2, STR2_NEXT);
    j(MATCH);
    BLOCK_COMMENT("} string_indexof DO3");
  }

  if (needle_con_cnt == -1 || needle_con_cnt == 1) {
    Label DO1_LOOP;

    BLOCK_COMMENT("string_indexof DO1 {");
    bind(DO1);
    (this->*needle_load_1chr)(ch1, Address(needle), noreg);
    sub(result_tmp, haystack_len, 1);
    slli(tmp3, result_tmp, haystack_chr_shift);
    add(haystack, haystack, tmp3);
    neg(hlen_neg, tmp3);

    bind(DO1_LOOP);
    add(tmp3, haystack, hlen_neg);
    (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg);
    beq(ch1, ch2, MATCH);
    add(hlen_neg, hlen_neg, haystack_chr_size);
    blez(hlen_neg, DO1_LOOP);
    BLOCK_COMMENT("} string_indexof DO1");
  }

  bind(NOMATCH);
  mv(result, -1);
  j(DONE);

  bind(MATCH);
  srai(t0, hlen_neg, haystack_chr_shift);
  add(result, result_tmp, t0);

  bind(DONE);
}
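
// A minimal C sketch of the negative-offset scan the cases above share
// (illustrative pseudocode, not the exact generated sequence): the haystack
// pointer is advanced to the last feasible match position and the index runs
// from a negative offset up to zero, so a single counter serves as both the
// loop condition and the address offset, and the match position falls out as
// result_tmp + hlen_neg / chr_size.
//
//   const char* base = haystack + (haystack_len - needle_len); // last start
//   ptrdiff_t neg = -(haystack_len - needle_len);              // hlen_neg
//   do {
//     if (base[neg] == first_char) { /* compare the remaining characters */ }
//     ++neg;
//   } while (neg <= 0);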

// Compare strings.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
                                       Register cnt1, Register cnt2, Register result,
                                       Register tmp1, Register tmp2, Register tmp3,
                                       int ae)
{
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
        DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
        SHORT_LOOP_START, TAIL_CHECK, L;

  const int STUB_THRESHOLD = 64 + 8;
  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  // for L strings, 1 byte for 1 character
  // for U strings, 2 bytes for 1 character
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize / 2;

  load_chr_insn str1_load_chr = str1_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu;
  load_chr_insn str2_load_chr = str2_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; however, the result is always in characters.
  if (!str1_isL) {
    sraiw(cnt1, cnt1, 1);
  }
  if (!str2_isL) {
    sraiw(cnt2, cnt2, 1);
  }

  // Compute the minimum of the string lengths and save the difference in result.
  sub(result, cnt1, cnt2);
  bgt(cnt1, cnt2, L);
  mv(cnt2, cnt1);
  bind(L);

  // A very short string
  mv(t0, minCharsInWord);
  ble(cnt2, t0, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      // check if str1 and str2 are the same pointer
      beq(str1, str2, DONE);
      // load 8 bytes once to compare
      ld(tmp1, Address(str1));
      ld(tmp2, Address(str2));
      mv(t0, STUB_THRESHOLD);
      bge(cnt2, t0, STUB);
      sub(cnt2, cnt2, minCharsInWord);
      beqz(cnt2, TAIL_CHECK);
      // convert cnt2 from characters to bytes
      if (!str1_isL) {
        slli(cnt2, cnt2, 1);
      }
      add(str2, str2, cnt2);
      add(str1, str1, cnt2);
      sub(cnt2, zr, cnt2);
    } else if (isLU) { // LU case
      lwu(tmp1, Address(str1));
      ld(tmp2, Address(str2));
      mv(t0, STUB_THRESHOLD);
      bge(cnt2, t0, STUB);
      addi(cnt2, cnt2, -4);
      add(str1, str1, cnt2);
      sub(cnt1, zr, cnt2);
      slli(cnt2, cnt2, 1);
      add(str2, str2, cnt2);
      inflate_lo32(tmp3, tmp1);
      mv(tmp1, tmp3);
      sub(cnt2, zr, cnt2);
      addi(cnt1, cnt1, 4);
    } else { // UL case
      ld(tmp1, Address(str1));
      lwu(tmp2, Address(str2));
      mv(t0, STUB_THRESHOLD);
      bge(cnt2, t0, STUB);
      addi(cnt2, cnt2, -4);
      slli(t0, cnt2, 1);
      sub(cnt1, zr, t0);
      add(str1, str1, t0);
      add(str2, str2, cnt2);
      inflate_lo32(tmp3, tmp2);
      mv(tmp2, tmp3);
      sub(cnt2, zr, cnt2);
      addi(cnt1, cnt1, 8);
    }
    addi(cnt2, cnt2, isUL ? 4 : 8);
    bne(tmp1, tmp2, DIFFERENCE);
    bgez(cnt2, TAIL);

    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) { // LL or UU
      add(t0, str1, cnt2);
      ld(tmp1, Address(t0));
      add(t0, str2, cnt2);
      ld(tmp2, Address(t0));
      addi(cnt2, cnt2, 8);
    } else if (isLU) { // LU case
      add(t0, str1, cnt1);
      lwu(tmp1, Address(t0));
      add(t0, str2, cnt2);
      ld(tmp2, Address(t0));
      addi(cnt1, cnt1, 4);
      inflate_lo32(tmp3, tmp1);
      mv(tmp1, tmp3);
      addi(cnt2, cnt2, 8);
    } else { // UL case
      add(t0, str2, cnt2);
      lwu(tmp2, Address(t0));
      add(t0, str1, cnt1);
      ld(tmp1, Address(t0));
      inflate_lo32(tmp3, tmp2);
      mv(tmp2, tmp3);
      addi(cnt1, cnt1, 8);
      addi(cnt2, cnt2, 4);
    }
    bne(tmp1, tmp2, DIFFERENCE);
    bltz(cnt2, NEXT_WORD);
    bind(TAIL);
    if (str1_isL == str2_isL) { // LL or UU
      load_long_misaligned(tmp1, Address(str1), tmp3, isLL ? 1 : 2);
      load_long_misaligned(tmp2, Address(str2), tmp3, isLL ? 1 : 2);
    } else if (isLU) { // LU case
      load_int_misaligned(tmp1, Address(str1), tmp3, false);
      load_long_misaligned(tmp2, Address(str2), tmp3, 2);
      inflate_lo32(tmp3, tmp1);
      mv(tmp1, tmp3);
    } else { // UL case
      load_int_misaligned(tmp2, Address(str2), tmp3, false);
      load_long_misaligned(tmp1, Address(str1), tmp3, 2);
      inflate_lo32(tmp3, tmp2);
      mv(tmp2, tmp3);
    }
    bind(TAIL_CHECK);
    beq(tmp1, tmp2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFFERENCE);
    xorr(tmp3, tmp1, tmp2);
    ctzc_bit(result, tmp3, isLL); // count zeros from lsb to msb
    srl(tmp1, tmp1, result);
    srl(tmp2, tmp2, result);
    if (isLL) {
      andi(tmp1, tmp1, 0xFF);
      andi(tmp2, tmp2, 0xFF);
    } else {
      andi(tmp1, tmp1, 0xFFFF);
      andi(tmp2, tmp2, 0xFFFF);
    }
    sub(result, tmp1, tmp2);
    j(DONE);
  }

  bind(STUB);
  RuntimeAddress stub = nullptr;
  switch (ae) {
    case StrIntrinsicNode::LL:
      stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LL());
      break;
    case StrIntrinsicNode::UU:
      stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UU());
      break;
    case StrIntrinsicNode::LU:
      stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LU());
      break;
    case StrIntrinsicNode::UL:
      stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UL());
      break;
    default:
      ShouldNotReachHere();
  }
  assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
  address call = trampoline_call(stub);
  if (call == nullptr) {
    DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
    ciEnv::current()->record_failure("CodeCache is full");
    return;
  }
  j(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  beqz(cnt2, DONE);
  // Arrange the code so that branches are mostly resolved while loading, and the
  // next characters are loaded while the previous ones are being compared.
  (this->*str1_load_chr)(tmp1, Address(str1), t0);
  addi(str1, str1, str1_chr_size);
  addi(cnt2, cnt2, -1);
  beqz(cnt2, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(str2), t0);
  addi(str2, str2, str2_chr_size);
  j(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  addi(cnt2, cnt2, -1);
  beqz(cnt2, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(str1), t0);
  addi(str1, str1, str1_chr_size);
  (this->*str2_load_chr)(t0, Address(str2), t0);
  addi(str2, str2, str2_chr_size);
  bne(tmp1, cnt1, SHORT_LOOP_TAIL);
  addi(cnt2, cnt2, -1);
  beqz(cnt2, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(str1), t0);
  addi(str1, str1, str1_chr_size);
  (this->*str2_load_chr)(cnt1, Address(str2), t0);
  addi(str2, str2, str2_chr_size);
  beq(tmp2, t0, SHORT_LOOP);
  sub(result, tmp2, t0);
  j(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  j(DONE);
  bind(SHORT_LAST2);
  beq(tmp2, t0, DONE);
  sub(result, tmp2, t0);
  j(DONE);

  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(str2), t0);
  addi(str2, str2, str2_chr_size);
  bind(SHORT_LAST);
  beq(tmp1, cnt1, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}
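
// Sketch of the DIFFERENCE step above, assuming little-endian loads: the first
// differing character is located with a trailing-zero count on the XOR of the
// two words (rounded down to a character boundary), then both words are
// shifted so that character sits in the low lane before subtracting.
//
//   uint64_t diff = w1 ^ w2;                  // non-zero: some lane differs
//   int bit = count_trailing_zeros(diff) & ~(char_bits - 1);
//   int c1 = (int)((w1 >> bit) & char_mask);  // char_mask is 0xff or 0xffff
//   int c2 = (int)((w2 >> bit) & char_mask);
//   return c1 - c2;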

void C2_MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
                                      Register tmp4, Register tmp5, Register tmp6, Register result,
                                      Register cnt1, int elem_size) {
  Label DONE, SAME, NEXT_DWORD, SHORT, TAIL, TAIL2, IS_TMP5_ZR;
  Register tmp1 = t0;
  Register tmp2 = t1;
  Register cnt2 = tmp2;  // cnt2 only used in array length compare
  Register elem_per_word = tmp6;
  int log_elem_size = exact_log2(elem_size);
  int length_offset = arrayOopDesc::length_offset_in_bytes();
  int base_offset   = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);

  assert(elem_size == 1 || elem_size == 2, "must be char or byte");
  assert_different_registers(a1, a2, result, cnt1, t0, t1, tmp3, tmp4, tmp5, tmp6);
  mv(elem_per_word, wordSize / elem_size);

  BLOCK_COMMENT("arrays_equals {");

  // if (a1 == a2), return true
  beq(a1, a2, SAME);

  mv(result, false);
  beqz(a1, DONE);
  beqz(a2, DONE);
  lwu(cnt1, Address(a1, length_offset));
  lwu(cnt2, Address(a2, length_offset));
  bne(cnt2, cnt1, DONE);
  beqz(cnt1, SAME);

  slli(tmp5, cnt1, 3 + log_elem_size);
  sub(tmp5, zr, tmp5);
  add(a1, a1, base_offset);
  add(a2, a2, base_offset);
  ld(tmp3, Address(a1, 0));
  ld(tmp4, Address(a2, 0));
  ble(cnt1, elem_per_word, SHORT); // short or same

  // Main 16 byte comparison loop with 2 exits
  bind(NEXT_DWORD); {
    ld(tmp1, Address(a1, wordSize));
    ld(tmp2, Address(a2, wordSize));
    sub(cnt1, cnt1, 2 * wordSize / elem_size);
    blez(cnt1, TAIL);
    bne(tmp3, tmp4, DONE);
    ld(tmp3, Address(a1, 2 * wordSize));
    ld(tmp4, Address(a2, 2 * wordSize));
    add(a1, a1, 2 * wordSize);
    add(a2, a2, 2 * wordSize);
    ble(cnt1, elem_per_word, TAIL2);
  } beq(tmp1, tmp2, NEXT_DWORD);
  j(DONE);

  bind(TAIL);
  xorr(tmp4, tmp3, tmp4);
  xorr(tmp2, tmp1, tmp2);
  sll(tmp2, tmp2, tmp5);
  orr(tmp5, tmp4, tmp2);
  j(IS_TMP5_ZR);

  bind(TAIL2);
  bne(tmp1, tmp2, DONE);

  bind(SHORT);
  xorr(tmp4, tmp3, tmp4);
  sll(tmp5, tmp4, tmp5);

  bind(IS_TMP5_ZR);
  bnez(tmp5, DONE);

  bind(SAME);
  mv(result, true);
  // That's it.
  bind(DONE);

  BLOCK_COMMENT("} arrays_equals");
}
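
// A note on the tail masking above (a sketch of the idea, not a cycle-exact
// trace): tmp5 is preset to -(cnt1 << (3 + log_elem_size)), i.e. minus the
// total number of element bits. RV64 register shifts use only the low 6 bits
// of the shift amount, so shifting left by tmp5 shifts by
// (64 - total_bits % 64) % 64, which is exactly the number of garbage bits at
// the top of the final, partially-filled word. Shifting the XOR of the last
// words left by that amount drops the bytes beyond the logical end of the
// arrays, so they cannot cause a spurious mismatch. For example, 11 chars make
// 176 bits; -176 mod 64 = 16, and the 16 garbage bits of the last word are
// shifted out.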

// Compare Strings

// For Strings we're passed the address of the first characters in a1 and a2
// and the length in cnt1.
// There are two implementations. For arrays >= 8 bytes, all comparisons
// (on hw supporting unaligned access: including the final one, which may
// overlap) are performed 8 bytes at a time.
// For strings < 8 bytes (and for tails of long strings when
// AvoidUnalignedAccesses is true), we compare a word, then a halfword, and
// then a byte.

void C2_MacroAssembler::string_equals(Register a1, Register a2,
                                      Register result, Register cnt1)
{
  Label SAME, DONE, SHORT, NEXT_WORD;
  Register tmp1 = t0;
  Register tmp2 = t1;

  assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2);

  BLOCK_COMMENT("string_equals {");

  beqz(cnt1, SAME);
  mv(result, false);

  // Check for short strings, i.e. smaller than wordSize.
  sub(cnt1, cnt1, wordSize);
  bltz(cnt1, SHORT);

  // Main 8 byte comparison loop.
  bind(NEXT_WORD); {
    ld(tmp1, Address(a1, 0));
    add(a1, a1, wordSize);
    ld(tmp2, Address(a2, 0));
    add(a2, a2, wordSize);
    sub(cnt1, cnt1, wordSize);
    bne(tmp1, tmp2, DONE);
  } bgez(cnt1, NEXT_WORD);

  if (!AvoidUnalignedAccesses) {
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
    // length == 4.
    add(tmp1, a1, cnt1);
    ld(tmp1, Address(tmp1, 0));
    add(tmp2, a2, cnt1);
    ld(tmp2, Address(tmp2, 0));
    bne(tmp1, tmp2, DONE);
    j(SAME);
  } else {
    add(tmp1, cnt1, wordSize);
    beqz(tmp1, SAME);
  }

  bind(SHORT);
  Label TAIL03, TAIL01;

  // 0-7 bytes left.
  test_bit(tmp1, cnt1, 2);
  beqz(tmp1, TAIL03);
  {
    lwu(tmp1, Address(a1, 0));
    add(a1, a1, 4);
    lwu(tmp2, Address(a2, 0));
    add(a2, a2, 4);
    bne(tmp1, tmp2, DONE);
  }

  bind(TAIL03);
  // 0-3 bytes left.
  test_bit(tmp1, cnt1, 1);
  beqz(tmp1, TAIL01);
  {
    lhu(tmp1, Address(a1, 0));
    add(a1, a1, 2);
    lhu(tmp2, Address(a2, 0));
    add(a2, a2, 2);
    bne(tmp1, tmp2, DONE);
  }

  bind(TAIL01);
  // 0-1 bytes left.
  test_bit(tmp1, cnt1, 0);
  beqz(tmp1, SAME);
  {
    lbu(tmp1, Address(a1, 0));
    lbu(tmp2, Address(a2, 0));
    bne(tmp1, tmp2, DONE);
  }

  // Arrays are equal.
  bind(SAME);
  mv(result, true);

  // That's it.
  bind(DONE);
  BLOCK_COMMENT("} string_equals");
}

// jdk.internal.util.ArraysSupport.vectorizedHashCode
void C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
                                        Register tmp1, Register tmp2, Register tmp3,
                                        Register tmp4, Register tmp5, Register tmp6,
                                        BasicType eltype)
{
  assert_different_registers(ary, cnt, result, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, t0, t1);

  const int elsize = arrays_hashcode_elsize(eltype);
  const int chunks_end_shift = exact_log2(elsize);

  switch (eltype) {
  case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
  case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
  case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
  case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
  case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
  default:
    ShouldNotReachHere();
  }

  const int stride = 4;
  const Register pow31_4 = tmp1;
  const Register pow31_3 = tmp2;
  const Register pow31_2 = tmp3;
  const Register chunks  = tmp4;
  const Register chunks_end = chunks;

  Label DONE, TAIL, TAIL_LOOP, WIDE_LOOP;

  // result has a value initially

  beqz(cnt, DONE);

  andi(chunks, cnt, ~(stride-1));
  beqz(chunks, TAIL);

  mv(pow31_4, 923521);           // [31^^4]
  mv(pow31_3,  29791);           // [31^^3]
  mv(pow31_2,    961);           // [31^^2]

  slli(chunks_end, chunks, chunks_end_shift);
  add(chunks_end, ary, chunks_end);
  andi(cnt, cnt, stride-1);      // don't forget about tail!

  bind(WIDE_LOOP);
  mulw(result, result, pow31_4); // 31^^4 * h
  arrays_hashcode_elload(t0,   Address(ary, 0 * elsize), eltype);
  arrays_hashcode_elload(t1,   Address(ary, 1 * elsize), eltype);
  arrays_hashcode_elload(tmp5, Address(ary, 2 * elsize), eltype);
  arrays_hashcode_elload(tmp6, Address(ary, 3 * elsize), eltype);
  mulw(t0, t0, pow31_3);         // 31^^3 * ary[i+0]
  addw(result, result, t0);
  mulw(t1, t1, pow31_2);         // 31^^2 * ary[i+1]
  addw(result, result, t1);
  slli(t0, tmp5, 5);             // optimize 31^^1 * ary[i+2]
  subw(tmp5, t0, tmp5);          // with ary[i+2]<<5 - ary[i+2]
  addw(result, result, tmp5);
  addw(result, result, tmp6);    // 31^^4 * h + 31^^3 * ary[i+0] + 31^^2 * ary[i+1]
                                 //           + 31^^1 * ary[i+2] + 31^^0 * ary[i+3]
  addi(ary, ary, elsize * stride);
  bne(ary, chunks_end, WIDE_LOOP);
  beqz(cnt, DONE);

  bind(TAIL);
  slli(chunks_end, cnt, chunks_end_shift);
  add(chunks_end, ary, chunks_end);

  bind(TAIL_LOOP);
  arrays_hashcode_elload(t0, Address(ary), eltype);
  slli(t1, result, 5);           // optimize 31 * result
  subw(result, t1, result);      // with result<<5 - result
  addw(result, result, t0);
  addi(ary, ary, elsize);
  bne(ary, chunks_end, TAIL_LOOP);

  bind(DONE);
  BLOCK_COMMENT("} // arrays_hashcode");
}
1539 
1540 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
1541   switch (eltype) {
1542   case T_BOOLEAN: return sizeof(jboolean);
1543   case T_BYTE:    return sizeof(jbyte);
1544   case T_SHORT:   return sizeof(jshort);
1545   case T_CHAR:    return sizeof(jchar);
1546   case T_INT:     return sizeof(jint);
1547   default:
1548     ShouldNotReachHere();
1549     return -1;
1550   }
1551 }
1552 
1553 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
1554   switch (eltype) {
1555   // T_BOOLEAN used as surrogate for unsigned byte
1556   case T_BOOLEAN: lbu(dst, src);   break;
1557   case T_BYTE:     lb(dst, src);   break;
1558   case T_SHORT:    lh(dst, src);   break;
1559   case T_CHAR:    lhu(dst, src);   break;
1560   case T_INT:      lw(dst, src);   break;
1561   default:
1562     ShouldNotReachHere();
1563   }
1564 }
1565 
1566 typedef void (Assembler::*conditional_branch_insn)(Register op1, Register op2, Label& label, bool is_far);
1567 typedef void (MacroAssembler::*float_conditional_branch_insn)(FloatRegister op1, FloatRegister op2, Label& label,
1568                                                               bool is_far, bool is_unordered);
1569 
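     // The branch tables below are indexed by the BoolTest::mask value of the
     // comparison; the second block of entries covers the unsigned variants
     // (resp. the double variants in the float table).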
1570 static conditional_branch_insn conditional_branches[] =
1571 {
1572   /* SHORT branches */
1573   (conditional_branch_insn)&MacroAssembler::beq,
1574   (conditional_branch_insn)&MacroAssembler::bgt,
1575   nullptr, // BoolTest::overflow
1576   (conditional_branch_insn)&MacroAssembler::blt,
1577   (conditional_branch_insn)&MacroAssembler::bne,
1578   (conditional_branch_insn)&MacroAssembler::ble,
1579   nullptr, // BoolTest::no_overflow
1580   (conditional_branch_insn)&MacroAssembler::bge,
1581 
1582   /* UNSIGNED branches */
1583   (conditional_branch_insn)&MacroAssembler::beq,
1584   (conditional_branch_insn)&MacroAssembler::bgtu,
1585   nullptr,
1586   (conditional_branch_insn)&MacroAssembler::bltu,
1587   (conditional_branch_insn)&MacroAssembler::bne,
1588   (conditional_branch_insn)&MacroAssembler::bleu,
1589   nullptr,
1590   (conditional_branch_insn)&MacroAssembler::bgeu
1591 };
1592 
1593 static float_conditional_branch_insn float_conditional_branches[] =
1594 {
1595   /* FLOAT SHORT branches */
1596   (float_conditional_branch_insn)&MacroAssembler::float_beq,
1597   (float_conditional_branch_insn)&MacroAssembler::float_bgt,
1598   nullptr,  // BoolTest::overflow
1599   (float_conditional_branch_insn)&MacroAssembler::float_blt,
1600   (float_conditional_branch_insn)&MacroAssembler::float_bne,
1601   (float_conditional_branch_insn)&MacroAssembler::float_ble,
1602   nullptr, // BoolTest::no_overflow
1603   (float_conditional_branch_insn)&MacroAssembler::float_bge,
1604 
1605   /* DOUBLE SHORT branches */
1606   (float_conditional_branch_insn)&MacroAssembler::double_beq,
1607   (float_conditional_branch_insn)&MacroAssembler::double_bgt,
1608   nullptr,
1609   (float_conditional_branch_insn)&MacroAssembler::double_blt,
1610   (float_conditional_branch_insn)&MacroAssembler::double_bne,
1611   (float_conditional_branch_insn)&MacroAssembler::double_ble,
1612   nullptr,
1613   (float_conditional_branch_insn)&MacroAssembler::double_bge
1614 };
1615 
1616 void C2_MacroAssembler::cmp_branch(int cmpFlag, Register op1, Register op2, Label& label, bool is_far) {
1617   assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(conditional_branches) / sizeof(conditional_branches[0])),
1618          "invalid conditional branch index");
1619   (this->*conditional_branches[cmpFlag])(op1, op2, label, is_far);
1620 }
1621 
1622 // This function should only be used by C2. For an unordered-greater test, flip the unordered
1623 // bit: C2 uses unordered-lesser instead of unordered-greater, and finally commutes the result bits in do_one_bytecode().
1624 void C2_MacroAssembler::float_cmp_branch(int cmpFlag, FloatRegister op1, FloatRegister op2, Label& label, bool is_far) {
1625   assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(float_conditional_branches) / sizeof(float_conditional_branches[0])),
1626          "invalid float conditional branch index");
1627   int booltest_flag = cmpFlag & ~(C2_MacroAssembler::double_branch_mask);
1628   (this->*float_conditional_branches[cmpFlag])(op1, op2, label, is_far,
1629     !(booltest_flag == BoolTest::ge || booltest_flag == BoolTest::gt));
1630 }
1631 
1632 void C2_MacroAssembler::enc_cmpUEqNeLeGt_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
1633   switch (cmpFlag) {
1634     case BoolTest::eq:
1635     case BoolTest::le:
1636       beqz(op1, L, is_far);
1637       break;
1638     case BoolTest::ne:
1639     case BoolTest::gt:
1640       bnez(op1, L, is_far);
1641       break;
1642     default:
1643       ShouldNotReachHere();
1644   }
1645 }
1646 
1647 void C2_MacroAssembler::enc_cmpEqNe_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
1648   switch (cmpFlag) {
1649     case BoolTest::eq:
1650       beqz(op1, L, is_far);
1651       break;
1652     case BoolTest::ne:
1653       bnez(op1, L, is_far);
1654       break;
1655     default:
1656       ShouldNotReachHere();
1657   }
1658 }
1659 
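     // Conditional move: dst = src iff the condition given by cmpFlag holds.
     // Implemented by branching over the move with the negated condition
     // (cmpFlag ^ (1 << neg_cond_bits)).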
1660 void C2_MacroAssembler::enc_cmove(int cmpFlag, Register op1, Register op2, Register dst, Register src) {
1661   Label L;
1662   cmp_branch(cmpFlag ^ (1 << neg_cond_bits), op1, op2, L);
1663   mv(dst, src);
1664   bind(L);
1665 }
1666 
1667 // Set dst to NaN if any NaN input.
1668 void C2_MacroAssembler::minmax_fp(FloatRegister dst, FloatRegister src1, FloatRegister src2,
1669                                   bool is_double, bool is_min) {
1670   assert_different_registers(dst, src1, src2);
1671 
1672   Label Done, Compare;
1673 
1674   is_double ? fclass_d(t0, src1)
1675             : fclass_s(t0, src1);
1676   is_double ? fclass_d(t1, src2)
1677             : fclass_s(t1, src2);
1678   orr(t0, t0, t1);
1679   andi(t0, t0, fclass_mask::nan); // if src1 or src2 is quiet or signaling NaN then return NaN
1680   beqz(t0, Compare);
1681   is_double ? fadd_d(dst, src1, src2)
1682             : fadd_s(dst, src1, src2);
1683   j(Done);
1684 
1685   bind(Compare);
1686   if (is_double) {
1687     is_min ? fmin_d(dst, src1, src2)
1688            : fmax_d(dst, src1, src2);
1689   } else {
1690     is_min ? fmin_s(dst, src1, src2)
1691            : fmax_s(dst, src1, src2);
1692   }
1693 
1694   bind(Done);
1695 }
1696 
1697 // According to Java SE specification, for floating-point round operations, if
1698 // the input is NaN, +/-infinity, or +/-0, the same input is returned as the
1699 // rounded result; this differs from the behavior of RISC-V fcvt instructions (which
1700 // round out-of-range values to the nearest max or min value), therefore special
1701 // handling is needed for NaN, +/-Infinity and +/-0.
1702 void C2_MacroAssembler::round_double_mode(FloatRegister dst, FloatRegister src, int round_mode,
1703                                           Register tmp1, Register tmp2, Register tmp3) {
1704 
1705   assert_different_registers(dst, src);
1706   assert_different_registers(tmp1, tmp2, tmp3);
1707 
1708   // Set rounding mode for conversions
1709   // Here we use the same rounding mode for both conversions.
1710   // The mode for long->double would only matter if the long value were not exactly
1711   // representable as a double; since it is itself the result of a double->long conversion, it always is.
1712   RoundingMode rm;
1713   switch (round_mode) {
1714     case RoundDoubleModeNode::rmode_ceil:
1715       rm = RoundingMode::rup;
1716       break;
1717     case RoundDoubleModeNode::rmode_floor:
1718       rm = RoundingMode::rdn;
1719       break;
1720     case RoundDoubleModeNode::rmode_rint:
1721       rm = RoundingMode::rne;
1722       break;
1723     default:
1724       ShouldNotReachHere();
1725   }
1726 
1727   // tmp1 - holds the double converted to a long
1728   // tmp2 - holds the constant used for the overflow comparison
1729   // tmp3 - holds the normalized result of the double->long conversion
1730   Label done, bad_val;
1731 
1732   // Conversion from double to long
1733   fcvt_l_d(tmp1, src, rm);
1734 
1735   // Generate constant (tmp2)
1736   // tmp2 = 100...0000
1737   addi(tmp2, zr, 1);
1738   slli(tmp2, tmp2, 63);
1739 
1740   // Prepare converted long (tmp1):
1741   // when the conversion overflowed, we got
1742   // tmp1 = 011...1111 or 100...0000.
1743   // Normalize either pattern to: tmp3 = 100...0000
1744   addi(tmp3, tmp1, 1);
1745   andi(tmp3, tmp3, -2);
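       // (adding 1 turns 011...1111 into 100...0000, and clearing the low bit
       //  maps 100...0000 to itself, so both overflow patterns equal tmp2)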
1746   beq(tmp3, tmp2, bad_val);
1747 
1748   // Conversion from long to double
1749   fcvt_d_l(dst, tmp1, rm);
1750   // Add sign of input value to result for +/- 0 cases
1751   fsgnj_d(dst, dst, src);
1752   j(done);
1753 
1754   // On conversion overflow, return src unchanged
1755   bind(bad_val);
1756   fmv_d(dst, src);
1757 
1758   bind(done);
1759 }
1760 
1761 // According to Java SE specification, for floating-point signum operations, if
1762 // the input is NaN or +/-0.0, the same value is returned;
1763 // otherwise, +/-1.0 is returned, carrying the sign of the input.
1764 // one - a floating-point 1.0 (supplied by the matching rule)
1765 // bool is_double - selects double (vs. single) precision operations.
1766 void C2_MacroAssembler::signum_fp(FloatRegister dst, FloatRegister one, bool is_double) {
1767   Label done;
1768 
1769   is_double ? fclass_d(t0, dst)
1770             : fclass_s(t0, dst);
1771 
1772   // check if input is -0, +0, signaling NaN or quiet NaN
1773   andi(t0, t0, fclass_mask::zero | fclass_mask::nan);
1774 
1775   bnez(t0, done);
1776 
1777   // use floating-point 1.0 with a sign of input
1778   is_double ? fsgnj_d(dst, one, dst)
1779             : fsgnj_s(dst, one, dst);
1780 
1781   bind(done);
1782 }
1783 
1784 static void float16_to_float_slow_path(C2_MacroAssembler& masm, C2GeneralStub<FloatRegister, Register, Register>& stub) {
1785 #define __ masm.
1786   FloatRegister dst = stub.data<0>();
1787   Register src = stub.data<1>();
1788   Register tmp = stub.data<2>();
1789   __ bind(stub.entry());
1790 
1791   // following instructions mainly focus on NaN, as riscv does not handle
1792   // NaN well with fcvt, but the code also works for Inf at the same time.
1793 
1794   // construct a NaN in 32 bits from the NaN in 16 bits;
1795   // the payloads of non-canonical NaNs need to be preserved.
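       // binary16 is laid out as 1|5|10 (sign|exponent|fraction) and binary32
       // as 1|8|23, so shifting left by 13 moves the 10 half fraction bits to
       // the top of the 23-bit float fraction field, and or-ing 0x7f800000
       // sets the float exponent to all-ones (NaN/Inf).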
1796   __ mv(tmp, 0x7f800000);
1797   // sign-bit was already set via sign-extension if necessary.
1798   __ slli(t0, src, 13);
1799   __ orr(tmp, t0, tmp);
1800   __ fmv_w_x(dst, tmp);
1801 
1802   __ j(stub.continuation());
1803 #undef __
1804 }
1805 
1806 // j.l.Float.float16ToFloat
1807 void C2_MacroAssembler::float16_to_float(FloatRegister dst, Register src, Register tmp) {
1808   auto stub = C2CodeStub::make<FloatRegister, Register, Register>(dst, src, tmp, 20, float16_to_float_slow_path);
1809 
1810   // in riscv, NaN needs special processing, as fcvt does not convert it correctly;
1811   // Inf, by contrast, is handled correctly by fcvt and needs no special processing.
1812   // still, we let the slow path process NaN and Inf together, as both are
1813   // rare cases, and making the slow path handle only the NaN case would
1814   // sacrifice performance for the normal cases,
1815   // i.e. the non-NaN and non-Inf cases.
1816 
1817   // check whether it's a NaN or +/- Inf.
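       // (0x7c00 is the half-precision exponent mask: an all-ones exponent
       //  means NaN when the fraction is non-zero, and +/-Inf when it is zero)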
1818   mv(t0, 0x7c00);
1819   andr(tmp, src, t0);
1820   // jump to stub processing NaN and Inf cases.
1821   beq(t0, tmp, stub->entry());
1822 
1823   // non-NaN or non-Inf cases, just use built-in instructions.
1824   fmv_h_x(dst, src);
1825   fcvt_s_h(dst, dst);
1826 
1827   bind(stub->continuation());
1828 }
1829 
1830 static void float_to_float16_slow_path(C2_MacroAssembler& masm, C2GeneralStub<Register, FloatRegister, Register>& stub) {
1831 #define __ masm.
1832   Register dst = stub.data<0>();
1833   FloatRegister src = stub.data<1>();
1834   Register tmp = stub.data<2>();
1835   __ bind(stub.entry());
1836 
1837   __ fmv_x_w(dst, src);
1838 
1839   // preserve the payloads of non-canonical NaNs.
1840   __ srai(dst, dst, 13);
1841   // preserve the sign bit.
1842   __ srai(tmp, dst, 13);
1843   __ slli(tmp, tmp, 10);
1844   __ mv(t0, 0x3ff);
1845   __ orr(tmp, tmp, t0);
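       // tmp now holds copies of the sign bit from bit 15 upward and ones in
       // bits 14..0: the and below keeps the (all-ones) NaN exponent bits and
       // the top 10 payload bits, and places the sign at bit 15 of the result.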
1846 
1847   // get the result by merging sign bit and payloads of preserved non-canonical NaNs.
1848   __ andr(dst, dst, tmp);
1849 
1850   __ j(stub.continuation());
1851 #undef __
1852 }
1853 
1854 // j.l.Float.floatToFloat16
1855 void C2_MacroAssembler::float_to_float16(Register dst, FloatRegister src, FloatRegister ftmp, Register xtmp) {
1856   auto stub = C2CodeStub::make<Register, FloatRegister, Register>(dst, src, xtmp, 130, float_to_float16_slow_path);
1857 
1858   // in riscv, NaN needs special processing, as fcvt does not convert it correctly.
1859 
1860   // check whether it's a NaN.
1861   // use feq instead of fclass as a performance optimization.
1862   feq_s(t0, src, src);
1863   // jump to stub processing NaN cases.
1864   beqz(t0, stub->entry());
1865 
1866   // non-NaN cases, just use built-in instructions.
1867   fcvt_h_s(ftmp, src);
1868   fmv_x_h(dst, ftmp);
1869 
1870   bind(stub->continuation());
1871 }
1872 
1873 void C2_MacroAssembler::signum_fp_v(VectorRegister dst, VectorRegister one, BasicType bt, int vlen) {
1874   vsetvli_helper(bt, vlen);
1875 
1876   // check if input is -0, +0, signaling NaN or quiet NaN
1877   vfclass_v(v0, dst);
1878   mv(t0, fclass_mask::zero | fclass_mask::nan);
1879   vand_vx(v0, v0, t0);
1880   vmseq_vi(v0, v0, 0);
1881 
1882   // use floating-point 1.0 with a sign of input
1883   vfsgnj_vv(dst, one, dst, v0_t);
1884 }
1885 
1886 void C2_MacroAssembler::compress_bits_v(Register dst, Register src, Register mask, bool is_long) {
1887   Assembler::SEW sew = is_long ? Assembler::e64 : Assembler::e32;
1888   // intrinsic is enabled when MaxVectorSize >= 16
1889   Assembler::LMUL lmul = is_long ? Assembler::m4 : Assembler::m2;
1890   long len = is_long ? 64 : 32;
1891 
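       // Strategy: expand the bits of src into one byte per bit, vcompress
       // those bytes under the mask, then convert the result bytes back into
       // bits via vmseq. E.g. (32-bit case) src = 0b1010, mask = 0b0110: the
       // set mask bits select src bits 1 and 2, giving dst = 0b01.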
1892   // load the src data (in bits) to be compressed.
1893   vsetivli(x0, 1, sew, Assembler::m1);
1894   vmv_s_x(v0, src);
1895   // reset the src data (in bytes) to zero.
1896   mv(t0, len);
1897   vsetvli(x0, t0, Assembler::e8, lmul);
1898   vmv_v_i(v4, 0);
1899   // convert the src data from bits to bytes.
1900   vmerge_vim(v4, v4, 1); // v0 as the implicit mask register
1901   // reset the dst data (in bytes) to zero.
1902   vmv_v_i(v8, 0);
1903   // load the mask data (in bits).
1904   vsetivli(x0, 1, sew, Assembler::m1);
1905   vmv_s_x(v0, mask);
1906   // compress the src data (in bytes) to dst (in bytes).
1907   vsetvli(x0, t0, Assembler::e8, lmul);
1908   vcompress_vm(v8, v4, v0);
1909   // convert the dst data from bytes to bits.
1910   vmseq_vi(v0, v8, 1);
1911   // store result back.
1912   vsetivli(x0, 1, sew, Assembler::m1);
1913   vmv_x_s(dst, v0);
1914 }
1915 
1916 void C2_MacroAssembler::compress_bits_i_v(Register dst, Register src, Register mask) {
1917   compress_bits_v(dst, src, mask, /* is_long */ false);
1918 }
1919 
1920 void C2_MacroAssembler::compress_bits_l_v(Register dst, Register src, Register mask) {
1921   compress_bits_v(dst, src, mask, /* is_long */ true);
1922 }
1923 
1924 void C2_MacroAssembler::expand_bits_v(Register dst, Register src, Register mask, bool is_long) {
1925   Assembler::SEW sew = is_long ? Assembler::e64 : Assembler::e32;
1926   // intrinsic is enabled when MaxVectorSize >= 16
1927   Assembler::LMUL lmul = is_long ? Assembler::m4 : Assembler::m2;
1928   long len = is_long ? 64 : 32;
1929 
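       // Strategy: viota numbers the set mask bits; vrgather then scatters the
       // low bits of src to those positions. E.g. (32-bit case) src = 0b0011,
       // mask = 0b0110: src bits 0 and 1 land at mask bit positions 1 and 2,
       // giving dst = 0b0110.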
1930   // load the src data (in bits) to be expanded.
1931   vsetivli(x0, 1, sew, Assembler::m1);
1932   vmv_s_x(v0, src);
1933   // reset the src data (in bytes) to zero.
1934   mv(t0, len);
1935   vsetvli(x0, t0, Assembler::e8, lmul);
1936   vmv_v_i(v4, 0);
1937   // convert the src data from bits to bytes.
1938   vmerge_vim(v4, v4, 1); // v0 as implicit mask register
1939   // reset the dst data (in bytes) to zero.
1940   vmv_v_i(v12, 0);
1941   // load the mask data (in bits).
1942   vsetivli(x0, 1, sew, Assembler::m1);
1943   vmv_s_x(v0, mask);
1944   // expand the src data (in bytes) to dst (in bytes).
1945   vsetvli(x0, t0, Assembler::e8, lmul);
1946   viota_m(v8, v0);
1947   vrgather_vv(v12, v4, v8, VectorMask::v0_t); // v0 as implicit mask register
1948   // convert the dst data from bytes to bits.
1949   vmseq_vi(v0, v12, 1);
1950   // store result back.
1951   vsetivli(x0, 1, sew, Assembler::m1);
1952   vmv_x_s(dst, v0);
1953 }
1954 
1955 void C2_MacroAssembler::expand_bits_i_v(Register dst, Register src, Register mask) {
1956   expand_bits_v(dst, src, mask, /* is_long */ false);
1957 }
1958 
1959 void C2_MacroAssembler::expand_bits_l_v(Register dst, Register src, Register mask) {
1960   expand_bits_v(dst, src, mask, /* is_long */ true);
1961 }
1962 
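     // Compares cnt elements at a1 and a2 (bytes when islatin, 16-bit chars
     // otherwise). Branches to DONE at the first mismatch, with tmp2 holding
     // the mismatch index within the current chunk and a1/a2 still pointing
     // at that chunk; falls through with result = true when all elements match.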
1963 void C2_MacroAssembler::element_compare(Register a1, Register a2, Register result, Register cnt, Register tmp1, Register tmp2,
1964                                         VectorRegister vr1, VectorRegister vr2, VectorRegister vrs, bool islatin, Label &DONE) {
1965   Label loop;
1966   Assembler::SEW sew = islatin ? Assembler::e8 : Assembler::e16;
1967 
1968   bind(loop);
1969   vsetvli(tmp1, cnt, sew, Assembler::m2);
1970   vlex_v(vr1, a1, sew);
1971   vlex_v(vr2, a2, sew);
1972   vmsne_vv(vrs, vr1, vr2);
1973   vfirst_m(tmp2, vrs);
1974   bgez(tmp2, DONE);
1975   sub(cnt, cnt, tmp1);
1976   if (!islatin) {
1977     slli(tmp1, tmp1, 1); // get byte counts
1978   }
1979   add(a1, a1, tmp1);
1980   add(a2, a2, tmp1);
1981   bnez(cnt, loop);
1982 
1983   mv(result, true);
1984 }
1985 
1986 void C2_MacroAssembler::string_equals_v(Register a1, Register a2, Register result, Register cnt) {
1987   Label DONE;
1988   Register tmp1 = t0;
1989   Register tmp2 = t1;
1990 
1991   BLOCK_COMMENT("string_equals_v {");
1992 
1993   mv(result, false);
1994 
1995   element_compare(a1, a2, result, cnt, tmp1, tmp2, v2, v4, v2, true, DONE);
1996 
1997   bind(DONE);
1998   BLOCK_COMMENT("} string_equals_v");
1999 }
2000 
2001 // used by C2 ClearArray patterns.
2002 // base: Address of a buffer to be zeroed
2003 // cnt: Count in HeapWords
2004 //
2005 // base, cnt, v4, v5, v6, v7 and t0 are clobbered.
2006 void C2_MacroAssembler::clear_array_v(Register base, Register cnt) {
2007   Label loop;
2008 
2009   // making zero words
2010   vsetvli(t0, cnt, Assembler::e64, Assembler::m4);
2011   vxor_vv(v4, v4, v4);
2012 
2013   bind(loop);
2014   vsetvli(t0, cnt, Assembler::e64, Assembler::m4);
2015   vse64_v(v4, base);
2016   sub(cnt, cnt, t0);
2017   shadd(base, t0, base, t0, 3);
2018   bnez(cnt, loop);
2019 }
2020 
2021 void C2_MacroAssembler::arrays_equals_v(Register a1, Register a2, Register result,
2022                                         Register cnt1, int elem_size) {
2023   Label DONE;
2024   Register tmp1 = t0;
2025   Register tmp2 = t1;
2026   Register cnt2 = tmp2;
2027   int length_offset = arrayOopDesc::length_offset_in_bytes();
2028   int base_offset = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
2029 
2030   BLOCK_COMMENT("arrays_equals_v {");
2031 
2032   // if (a1 == a2), return true
2033   mv(result, true);
2034   beq(a1, a2, DONE);
2035 
2036   mv(result, false);
2037   // if a1 == null or a2 == null, return false
2038   beqz(a1, DONE);
2039   beqz(a2, DONE);
2040   // if (a1.length != a2.length), return false
2041   lwu(cnt1, Address(a1, length_offset));
2042   lwu(cnt2, Address(a2, length_offset));
2043   bne(cnt1, cnt2, DONE);
2044 
2045   la(a1, Address(a1, base_offset));
2046   la(a2, Address(a2, base_offset));
2047 
2048   element_compare(a1, a2, result, cnt1, tmp1, tmp2, v2, v4, v2, elem_size == 1, DONE);
2049 
2050   bind(DONE);
2051 
2052   BLOCK_COMMENT("} arrays_equals_v");
2053 }
2054 
2055 void C2_MacroAssembler::string_compare_v(Register str1, Register str2, Register cnt1, Register cnt2,
2056                                          Register result, Register tmp1, Register tmp2, int encForm) {
2057   Label DIFFERENCE, DONE, L, loop;
2058   bool encLL = encForm == StrIntrinsicNode::LL;
2059   bool encLU = encForm == StrIntrinsicNode::LU;
2060   bool encUL = encForm == StrIntrinsicNode::UL;
2061 
2062   bool str1_isL = encLL || encLU;
2063   bool str2_isL = encLL || encUL;
2064 
2065   int minCharsInWord = encLL ? wordSize : wordSize / 2;
2066 
2067   BLOCK_COMMENT("string_compare {");
2068 
2069   // for Latin strings, 1 byte for 1 character
2070   // for UTF16 strings, 2 bytes for 1 character
2071   if (!str1_isL)
2072     sraiw(cnt1, cnt1, 1);
2073   if (!str2_isL)
2074     sraiw(cnt2, cnt2, 1);
2075 
2076   // if the strings are equal up to the minimum length, the length difference is the result;
2077   // save the minimum of the string lengths in cnt2.
2078   sub(result, cnt1, cnt2);
2079   bgt(cnt1, cnt2, L);
2080   mv(cnt2, cnt1);
2081   bind(L);
2082 
2083   if (str1_isL == str2_isL) { // LL or UU
2084     element_compare(str1, str2, zr, cnt2, tmp1, tmp2, v2, v4, v2, encLL, DIFFERENCE);
2085     j(DONE);
2086   } else { // LU or UL
2087     Register strL = encLU ? str1 : str2;
2088     Register strU = encLU ? str2 : str1;
2089     VectorRegister vstr1 = encLU ? v8 : v4;
2090     VectorRegister vstr2 = encLU ? v4 : v8;
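         // Load a chunk of the Latin-1 operand as bytes, zero-extend it to
         // 16-bit chars, and compare it against the corresponding chunk of
         // the UTF-16 operand.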
2091 
2092     bind(loop);
2093     vsetvli(tmp1, cnt2, Assembler::e8, Assembler::m2);
2094     vle8_v(vstr1, strL);
2095     vsetvli(tmp1, cnt2, Assembler::e16, Assembler::m4);
2096     vzext_vf2(vstr2, vstr1);
2097     vle16_v(vstr1, strU);
2098     vmsne_vv(v4, vstr2, vstr1);
2099     vfirst_m(tmp2, v4);
2100     bgez(tmp2, DIFFERENCE);
2101     sub(cnt2, cnt2, tmp1);
2102     add(strL, strL, tmp1);
2103     shadd(strU, tmp1, strU, tmp1, 1);
2104     bnez(cnt2, loop);
2105     j(DONE);
2106   }
2107 
2108   bind(DIFFERENCE);
2109   slli(tmp1, tmp2, 1);
2110   add(str1, str1, str1_isL ? tmp2 : tmp1);
2111   add(str2, str2, str2_isL ? tmp2 : tmp1);
2112   str1_isL ? lbu(tmp1, Address(str1, 0)) : lhu(tmp1, Address(str1, 0));
2113   str2_isL ? lbu(tmp2, Address(str2, 0)) : lhu(tmp2, Address(str2, 0));
2114   sub(result, tmp1, tmp2);
2115 
2116   bind(DONE);
2117 }
2118 
2119 void C2_MacroAssembler::byte_array_inflate_v(Register src, Register dst, Register len, Register tmp) {
2120   Label loop;
2121   assert_different_registers(src, dst, len, tmp, t0);
2122 
2123   BLOCK_COMMENT("byte_array_inflate_v {");
2124   bind(loop);
2125   vsetvli(tmp, len, Assembler::e8, Assembler::m2);
2126   vle8_v(v6, src);
2127   vsetvli(t0, len, Assembler::e16, Assembler::m4);
2128   vzext_vf2(v4, v6);
2129   vse16_v(v4, dst);
2130   sub(len, len, tmp);
2131   add(src, src, tmp);
2132   shadd(dst, tmp, dst, tmp, 1);
2133   bnez(len, loop);
2134   BLOCK_COMMENT("} byte_array_inflate_v");
2135 }
2136 
2137 // Compress char[] array to byte[].
2138 // Intrinsic for java.lang.StringUTF16.compress(char[] src, int srcOff, byte[] dst, int dstOff, int len)
2139 // result: the array length if every element in the array can be encoded;
2140 // otherwise, the index of the first non-latin1 (> 0xff) character.
2141 void C2_MacroAssembler::char_array_compress_v(Register src, Register dst, Register len,
2142                                               Register result, Register tmp) {
2143   encode_iso_array_v(src, dst, len, result, tmp, false);
2144 }
2145 
2146 // Intrinsic for
2147 //
2148 // - sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray
2149 //     return the number of characters copied.
2150 // - java/lang/StringUTF16.compress
2151 //     return index of non-latin1 character if copy fails, otherwise 'len'.
2152 //
2153 // This version always returns the number of characters copied. A successful
2154 // copy will complete with the post-condition: 'res' == 'len', while an
2155 // unsuccessful copy will exit with the post-condition: 0 <= 'res' < 'len'.
2156 //
2157 // Clobbers: src, dst, len, result, t0
2158 void C2_MacroAssembler::encode_iso_array_v(Register src, Register dst, Register len,
2159                                            Register result, Register tmp, bool ascii) {
2160   Label loop, fail, done;
2161 
2162   BLOCK_COMMENT("encode_iso_array_v {");
2163   mv(result, 0);
2164 
2165   bind(loop);
2166   mv(tmp, ascii ? 0x7f : 0xff);
2167   vsetvli(t0, len, Assembler::e16, Assembler::m2);
2168   vle16_v(v2, src);
2169 
2170   vmsgtu_vx(v1, v2, tmp);
2171   vfirst_m(tmp, v1);
2172   vmsbf_m(v0, v1);
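       // v0 now covers exactly the lanes before the first too-large char, so
       // only encodable chars are narrowed and stored below.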
2173   // compress char to byte
2174   vsetvli(t0, len, Assembler::e8);
2175   vncvt_x_x_w(v1, v2, Assembler::v0_t);
2176   vse8_v(v1, dst, Assembler::v0_t);
2177 
2178   // fail if char > 0x7f/0xff
2179   bgez(tmp, fail);
2180   add(result, result, t0);
2181   add(dst, dst, t0);
2182   sub(len, len, t0);
2183   shadd(src, t0, src, t0, 1);
2184   bnez(len, loop);
2185   j(done);
2186 
2187   bind(fail);
2188   add(result, result, tmp);
2189 
2190   bind(done);
2191   BLOCK_COMMENT("} encode_iso_array_v");
2192 }
2193 
2194 void C2_MacroAssembler::count_positives_v(Register ary, Register len, Register result, Register tmp) {
2195   Label LOOP, SET_RESULT, DONE;
2196 
2197   BLOCK_COMMENT("count_positives_v {");
2198   assert_different_registers(ary, len, result, tmp);
2199 
2200   mv(result, zr);
2201 
2202   bind(LOOP);
2203   vsetvli(t0, len, Assembler::e8, Assembler::m4);
2204   vle8_v(v4, ary);
2205   vmslt_vx(v4, v4, zr);
2206   vfirst_m(tmp, v4);
2207   bgez(tmp, SET_RESULT);
2208   // if tmp == -1, all bytes are positive
2209   add(result, result, t0);
2210 
2211   sub(len, len, t0);
2212   add(ary, ary, t0);
2213   bnez(len, LOOP);
2214   j(DONE);
2215 
2216   // add remaining positive bytes count
2217   bind(SET_RESULT);
2218   add(result, result, tmp);
2219 
2220   bind(DONE);
2221   BLOCK_COMMENT("} count_positives_v");
2222 }
2223 
2224 void C2_MacroAssembler::string_indexof_char_v(Register str1, Register cnt1,
2225                                               Register ch, Register result,
2226                                               Register tmp1, Register tmp2,
2227                                               bool isL) {
2228   mv(result, zr);
2229 
2230   Label loop, MATCH, DONE;
2231   Assembler::SEW sew = isL ? Assembler::e8 : Assembler::e16;
2232   bind(loop);
2233   vsetvli(tmp1, cnt1, sew, Assembler::m4);
2234   vlex_v(v4, str1, sew);
2235   vmseq_vx(v4, v4, ch);
2236   vfirst_m(tmp2, v4);
2237   bgez(tmp2, MATCH); // if equal, return index
2238 
2239   add(result, result, tmp1);
2240   sub(cnt1, cnt1, tmp1);
2241   if (!isL) slli(tmp1, tmp1, 1);
2242   add(str1, str1, tmp1);
2243   bnez(cnt1, loop);
2244 
2245   mv(result, -1);
2246   j(DONE);
2247 
2248   bind(MATCH);
2249   add(result, result, tmp2);
2250 
2251   bind(DONE);
2252 }
2253 
2254 // Set dst to NaN if any NaN input.
2255 void C2_MacroAssembler::minmax_fp_v(VectorRegister dst, VectorRegister src1, VectorRegister src2,
2256                                     BasicType bt, bool is_min, int vector_length) {
2257   assert_different_registers(dst, src1, src2);
2258 
2259   vsetvli_helper(bt, vector_length);
2260 
2261   is_min ? vfmin_vv(dst, src1, src2)
2262          : vfmax_vv(dst, src1, src2);
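       // Patch lanes where an input is NaN: adding a NaN operand to itself
       // yields the canonical NaN, overriding the vfmin/vfmax result under
       // the v0 mask.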
2263 
2264   vmfne_vv(v0,  src1, src1);
2265   vfadd_vv(dst, src1, src1, Assembler::v0_t);
2266   vmfne_vv(v0,  src2, src2);
2267   vfadd_vv(dst, src2, src2, Assembler::v0_t);
2268 }
2269 
2270 // Set dst to NaN if any NaN input.
2271 // The destination vector register elements corresponding to masked-off elements
2272 // are handled with a mask-undisturbed policy.
2273 void C2_MacroAssembler::minmax_fp_masked_v(VectorRegister dst, VectorRegister src1, VectorRegister src2,
2274                                            VectorRegister vmask, VectorRegister tmp1, VectorRegister tmp2,
2275                                            BasicType bt, bool is_min, int vector_length) {
2276   assert_different_registers(src1, src2, tmp1, tmp2);
2277   vsetvli_helper(bt, vector_length);
2278 
2279   // Check vector elements of src1 and src2 for NaN.
2280   vmfeq_vv(tmp1, src1, src1);
2281   vmfeq_vv(tmp2, src2, src2);
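       // tmp1/tmp2 now flag the non-NaN lanes of src1/src2: masked lanes with
       // a NaN source get NaN via self-addition below, while lanes where both
       // sources are numbers get the vfmin/vfmax result.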
2282 
2283   vmandn_mm(v0, vmask, tmp1);
2284   vfadd_vv(dst, src1, src1, Assembler::v0_t);
2285   vmandn_mm(v0, vmask, tmp2);
2286   vfadd_vv(dst, src2, src2, Assembler::v0_t);
2287 
2288   vmand_mm(tmp2, tmp1, tmp2);
2289   vmand_mm(v0, vmask, tmp2);
2290   is_min ? vfmin_vv(dst, src1, src2, Assembler::v0_t)
2291          : vfmax_vv(dst, src1, src2, Assembler::v0_t);
2292 }
2293 
2294 // Set dst to NaN if any NaN input.
2295 void C2_MacroAssembler::reduce_minmax_fp_v(FloatRegister dst,
2296                                            FloatRegister src1, VectorRegister src2,
2297                                            VectorRegister tmp1, VectorRegister tmp2,
2298                                            bool is_double, bool is_min, int vector_length, VectorMask vm) {
2299   assert_different_registers(dst, src1);
2300   assert_different_registers(src2, tmp1, tmp2);
2301 
2302   Label L_done, L_NaN_1, L_NaN_2;
2303   // Set dst to src1 if src1 is NaN
2304   is_double ? feq_d(t0, src1, src1)
2305             : feq_s(t0, src1, src1);
2306   beqz(t0, L_NaN_2);
2307 
2308   vsetvli_helper(is_double ? T_DOUBLE : T_FLOAT, vector_length);
2309   vfmv_s_f(tmp2, src1);
2310 
2311   is_min ? vfredmin_vs(tmp1, src2, tmp2, vm)
2312          : vfredmax_vs(tmp1, src2, tmp2, vm);
2313   vfmv_f_s(dst, tmp1);
2314 
2315   // Checking NaNs in src2
2316   vmfne_vv(tmp1, src2, src2, vm);
2317   vcpop_m(t0, tmp1, vm);
2318   beqz(t0, L_done);
2319 
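       // Fall through when src2 contains a NaN: the unordered reduction sum
       // below propagates NaN into dst.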
2320   bind(L_NaN_1);
2321   vfredusum_vs(tmp1, src2, tmp2, vm);
2322   vfmv_f_s(dst, tmp1);
2323   j(L_done);
2324 
2325   bind(L_NaN_2);
2326   is_double ? fmv_d(dst, src1)
2327             : fmv_s(dst, src1);
2328   bind(L_done);
2329 }
2330 
2331 bool C2_MacroAssembler::in_scratch_emit_size() {
2332   if (ciEnv::current()->task() != nullptr) {
2333     PhaseOutput* phase_output = Compile::current()->output();
2334     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2335       return true;
2336     }
2337   }
2338   return MacroAssembler::in_scratch_emit_size();
2339 }
2340 
2341 void C2_MacroAssembler::reduce_integral_v(Register dst, Register src1,
2342                                           VectorRegister src2, VectorRegister tmp,
2343                                           int opc, BasicType bt, int vector_length, VectorMask vm) {
2344   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2345   vsetvli_helper(bt, vector_length);
2346   vmv_s_x(tmp, src1);
2347   switch (opc) {
2348     case Op_AddReductionVI:
2349     case Op_AddReductionVL:
2350       vredsum_vs(tmp, src2, tmp, vm);
2351       break;
2352     case Op_AndReductionV:
2353       vredand_vs(tmp, src2, tmp, vm);
2354       break;
2355     case Op_OrReductionV:
2356       vredor_vs(tmp, src2, tmp, vm);
2357       break;
2358     case Op_XorReductionV:
2359       vredxor_vs(tmp, src2, tmp, vm);
2360       break;
2361     case Op_MaxReductionV:
2362       vredmax_vs(tmp, src2, tmp, vm);
2363       break;
2364     case Op_MinReductionV:
2365       vredmin_vs(tmp, src2, tmp, vm);
2366       break;
2367     default:
2368       ShouldNotReachHere();
2369   }
2370   vmv_x_s(dst, tmp);
2371 }
2372 
2373 // Set vl and vtype for full and partial vector operations.
2374 // (vma = mu, vta = tu, vill = false)
2375 void C2_MacroAssembler::vsetvli_helper(BasicType bt, int vector_length, LMUL vlmul, Register tmp) {
2376   Assembler::SEW sew = Assembler::elemtype_to_sew(bt);
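       // Three ways to set the AVL: vsetivli takes a 5-bit immediate (<= 31);
       // requesting exactly the maximum vector length for this element width
       // can pass x0 with a non-x0 rd (vl = VLMAX); otherwise the length goes
       // through a scratch register.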
2377   if (vector_length <= 31) {
2378     vsetivli(tmp, vector_length, sew, vlmul);
2379   } else if (vector_length == (MaxVectorSize / type2aelembytes(bt))) {
2380     vsetvli(tmp, x0, sew, vlmul);
2381   } else {
2382     mv(tmp, vector_length);
2383     vsetvli(tmp, tmp, sew, vlmul);
2384   }
2385 }
2386 
2387 void C2_MacroAssembler::compare_integral_v(VectorRegister vd, VectorRegister src1, VectorRegister src2,
2388                                            int cond, BasicType bt, int vector_length, VectorMask vm) {
2389   assert(is_integral_type(bt), "unsupported element type");
2390   assert(vm == Assembler::v0_t ? vd != v0 : true, "should be different registers");
2391   vsetvli_helper(bt, vector_length);
2392   vmclr_m(vd);
2393   switch (cond) {
2394     case BoolTest::eq: vmseq_vv(vd, src1, src2, vm); break;
2395     case BoolTest::ne: vmsne_vv(vd, src1, src2, vm); break;
2396     case BoolTest::le: vmsle_vv(vd, src1, src2, vm); break;
2397     case BoolTest::ge: vmsge_vv(vd, src1, src2, vm); break;
2398     case BoolTest::lt: vmslt_vv(vd, src1, src2, vm); break;
2399     case BoolTest::gt: vmsgt_vv(vd, src1, src2, vm); break;
2400     default:
2401       assert(false, "unsupported compare condition");
2402       ShouldNotReachHere();
2403   }
2404 }
2405 
2406 void C2_MacroAssembler::compare_fp_v(VectorRegister vd, VectorRegister src1, VectorRegister src2,
2407                                      int cond, BasicType bt, int vector_length, VectorMask vm) {
2408   assert(is_floating_point_type(bt), "unsupported element type");
2409   assert(vm == Assembler::v0_t ? vd != v0 : true, "should be different registers");
2410   vsetvli_helper(bt, vector_length);
2411   vmclr_m(vd);
2412   switch (cond) {
2413     case BoolTest::eq: vmfeq_vv(vd, src1, src2, vm); break;
2414     case BoolTest::ne: vmfne_vv(vd, src1, src2, vm); break;
2415     case BoolTest::le: vmfle_vv(vd, src1, src2, vm); break;
2416     case BoolTest::ge: vmfge_vv(vd, src1, src2, vm); break;
2417     case BoolTest::lt: vmflt_vv(vd, src1, src2, vm); break;
2418     case BoolTest::gt: vmfgt_vv(vd, src1, src2, vm); break;
2419     default:
2420       assert(false, "unsupported compare condition");
2421       ShouldNotReachHere();
2422   }
2423 }
2424 
2425 void C2_MacroAssembler::integer_extend_v(VectorRegister dst, BasicType dst_bt, int vector_length,
2426                                          VectorRegister src, BasicType src_bt) {
2427   assert(type2aelembytes(dst_bt) > type2aelembytes(src_bt) && type2aelembytes(dst_bt) <= 8 && type2aelembytes(src_bt) <= 4, "invalid element size");
2428   assert(dst_bt != T_FLOAT && dst_bt != T_DOUBLE && src_bt != T_FLOAT && src_bt != T_DOUBLE, "unsupported element type");
2429   // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#52-vector-operands
2430   // The destination EEW is greater than the source EEW, the source EMUL is at least 1,
2431   // and the overlap is in the highest-numbered part of the destination register group.
2432   // Since LMUL=1, vd and vs cannot be the same.
2433   assert_different_registers(dst, src);
2434 
2435   vsetvli_helper(dst_bt, vector_length);
2436   if (src_bt == T_BYTE) {
2437     switch (dst_bt) {
2438     case T_SHORT:
2439       vsext_vf2(dst, src);
2440       break;
2441     case T_INT:
2442       vsext_vf4(dst, src);
2443       break;
2444     case T_LONG:
2445       vsext_vf8(dst, src);
2446       break;
2447     default:
2448       ShouldNotReachHere();
2449     }
2450   } else if (src_bt == T_SHORT) {
2451     if (dst_bt == T_INT) {
2452       vsext_vf2(dst, src);
2453     } else {
2454       vsext_vf4(dst, src);
2455     }
2456   } else if (src_bt == T_INT) {
2457     vsext_vf2(dst, src);
2458   }
2459 }
2460 
2461 // Vector narrow from src to dst with specified element sizes.
2462 // High part of dst vector will be filled with zero.
2463 void C2_MacroAssembler::integer_narrow_v(VectorRegister dst, BasicType dst_bt, int vector_length,
2464                                          VectorRegister src, BasicType src_bt) {
2465   assert(type2aelembytes(dst_bt) < type2aelembytes(src_bt) && type2aelembytes(dst_bt) <= 4 && type2aelembytes(src_bt) <= 8, "invalid element size");
2466   assert(dst_bt != T_FLOAT && dst_bt != T_DOUBLE && src_bt != T_FLOAT && src_bt != T_DOUBLE, "unsupported element type");
2467   mv(t0, vector_length);
2468   if (src_bt == T_LONG) {
2469     // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#117-vector-narrowing-integer-right-shift-instructions
2470     // Future extensions might add support for versions that narrow to a destination that is 1/4 the width of the source.
2471     // So we can currently only scale down by 1/2 the width at a time.
2472     vsetvli(t0, t0, Assembler::e32, Assembler::mf2);
2473     vncvt_x_x_w(dst, src);
2474     if (dst_bt == T_SHORT || dst_bt == T_BYTE) {
2475       vsetvli(t0, t0, Assembler::e16, Assembler::mf2);
2476       vncvt_x_x_w(dst, dst);
2477       if (dst_bt == T_BYTE) {
2478         vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
2479         vncvt_x_x_w(dst, dst);
2480       }
2481     }
2482   } else if (src_bt == T_INT) {
2483     // T_SHORT
2484     vsetvli(t0, t0, Assembler::e16, Assembler::mf2);
2485     vncvt_x_x_w(dst, src);
2486     if (dst_bt == T_BYTE) {
2487       vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
2488       vncvt_x_x_w(dst, dst);
2489     }
2490   } else if (src_bt == T_SHORT) {
2491     vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
2492     vncvt_x_x_w(dst, src);
2493   }
2494 }
2495 
2496 #define VFCVT_SAFE(VFLOATCVT)                                                      \
2497 void C2_MacroAssembler::VFLOATCVT##_safe(VectorRegister dst, VectorRegister src) { \
2498   assert_different_registers(dst, src);                                            \
2499   vxor_vv(dst, dst, dst);                                                          \
2500   vmfeq_vv(v0, src, src);                                                          \
2501   VFLOATCVT(dst, src, Assembler::v0_t);                                            \
2502 }
2503 
2504 VFCVT_SAFE(vfcvt_rtz_x_f_v);
2505 
2506 #undef VFCVT_SAFE
2507 
2508 // Extract a scalar element from a vector at position 'idx'.
2509 // The input elements in src are expected to be of integral type.
2510 void C2_MacroAssembler::extract_v(Register dst, VectorRegister src, BasicType bt,
2511                                   int idx, VectorRegister tmp) {
2512   assert(is_integral_type(bt), "unsupported element type");
2513   assert(idx >= 0, "idx cannot be negative");
2514   // Only need the first element after vector slidedown
2515   vsetvli_helper(bt, 1);
2516   if (idx == 0) {
2517     vmv_x_s(dst, src);
2518   } else if (idx <= 31) {
2519     vslidedown_vi(tmp, src, idx);
2520     vmv_x_s(dst, tmp);
2521   } else {
2522     mv(t0, idx);
2523     vslidedown_vx(tmp, src, t0);
2524     vmv_x_s(dst, tmp);
2525   }
2526 }
2527 
2528 // Extract a scalar element from a vector at position 'idx'.
2529 // The input elements in src are expected to be of floating point type.
2530 void C2_MacroAssembler::extract_fp_v(FloatRegister dst, VectorRegister src, BasicType bt,
2531                                      int idx, VectorRegister tmp) {
2532   assert(is_floating_point_type(bt), "unsupported element type");
2533   assert(idx >= 0, "idx cannot be negative");
2534   // Only need the first element after vector slidedown
2535   vsetvli_helper(bt, 1);
2536   if (idx == 0) {
2537     vfmv_f_s(dst, src);
2538   } else if (idx <= 31) {
2539     vslidedown_vi(tmp, src, idx);
2540     vfmv_f_s(dst, tmp);
2541   } else {
2542     mv(t0, idx);
2543     vslidedown_vx(tmp, src, t0);
2544     vfmv_f_s(dst, tmp);
2545   }
2546 }