1 /*
   2  * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2020, 2022, Huawei Technologies Co., Ltd. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/assembler.hpp"
  28 #include "asm/assembler.inline.hpp"
  29 #include "opto/c2_MacroAssembler.hpp"
  30 #include "opto/compile.hpp"
  31 #include "opto/intrinsicnode.hpp"
  32 #include "opto/output.hpp"
  33 #include "opto/subnode.hpp"
  34 #include "runtime/stubRoutines.hpp"
  35 
  36 #ifdef PRODUCT
  37 #define BLOCK_COMMENT(str) /* nothing */
  38 #define STOP(error) stop(error)
  39 #else
  40 #define BLOCK_COMMENT(str) block_comment(str)
  41 #define STOP(error) block_comment(error); stop(error)
  42 #endif
  43 
  44 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  45 
  46 void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg,
  47                                   Register tmp1Reg, Register tmp2Reg, Register tmp3Reg) {
  // Use a flag register (t1) to indicate the fast_lock result: zero for success; non-zero for failure.
  49   Register flag = t1;
  50   Register oop = objectReg;
  51   Register box = boxReg;
  52   Register disp_hdr = tmp1Reg;
  53   Register tmp = tmp2Reg;
  54   Label cont;
  55   Label object_has_monitor;
  56   Label count, no_count;
  57 
  58   assert_different_registers(oop, box, tmp, disp_hdr, flag, tmp3Reg, t0);
  59 
  60   // Load markWord from object into displaced_header.
  61   ld(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));
  62 
  63   if (DiagnoseSyncOnValueBasedClasses != 0) {
  64     load_klass(flag, oop);
  65     lwu(flag, Address(flag, Klass::access_flags_offset()));
  66     test_bit(flag, flag, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS));
  67     bnez(flag, cont, true /* is_far */);
  68   }
  69 
  70   // Check for existing monitor
  71   test_bit(t0, disp_hdr, exact_log2(markWord::monitor_value));
  72   bnez(t0, object_has_monitor);
  73 
  74   if (LockingMode == LM_MONITOR) {
  75     mv(flag, 1); // Set non-zero flag to indicate 'failure' -> take slow-path
  76     j(cont);
  77   } else if (LockingMode == LM_LEGACY) {
  78     // Set tmp to be (markWord of object | UNLOCK_VALUE).
  79     ori(tmp, disp_hdr, markWord::unlocked_value);
  80 
  81     // Initialize the box. (Must happen before we update the object mark!)
  82     sd(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
  83 
  84     // Compare object markWord with an unlocked value (tmp) and if
  85     // equal exchange the stack address of our box with object markWord.
  86     // On failure disp_hdr contains the possibly locked markWord.
  87     cmpxchg(/*memory address*/oop, /*expected value*/tmp, /*new value*/box, Assembler::int64, Assembler::aq,
  88             Assembler::rl, /*result*/disp_hdr);
  89     mv(flag, zr);
  90     beq(disp_hdr, tmp, cont); // prepare zero flag and goto cont if we won the cas
  91 
  92     assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
  93 
    // If the compare-and-exchange succeeded, then we found an unlocked
    // object, have now locked it, and will continue at label cont.
    // Otherwise we did not see an unlocked object, so try the fast
    // recursive case.
  97 
  98     // Check if the owner is self by comparing the value in the
  99     // markWord of object (disp_hdr) with the stack pointer.
 100     sub(disp_hdr, disp_hdr, sp);
 101     mv(tmp, (intptr_t) (~(os::vm_page_size()-1) | (uintptr_t)markWord::lock_mask_in_place));
 102     // If (mark & lock_mask) == 0 and mark - sp < page_size, we are stack-locking and goto cont,
 103     // hence we can store 0 as the displaced header in the box, which indicates that it is a
 104     // recursive lock.
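    // (For example, with a 4K page the mask above is ~0xfff | 0x3; since sp
    // is at least 16-byte aligned, the single andr below checks both
    // conditions at once.)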
 105     andr(tmp/*==0?*/, disp_hdr, tmp);
 106     sd(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 107     mv(flag, tmp); // we can use the value of tmp as the result here
 108     j(cont);
 109   } else {
 110     assert(LockingMode == LM_LIGHTWEIGHT, "");
 111     Label slow;
 112     lightweight_lock(oop, disp_hdr, tmp, tmp3Reg, slow);
 113 
 114     // Indicate success on completion.
 115     mv(flag, zr);
 116     j(count);
 117     bind(slow);
 118     mv(flag, 1); // Set non-zero flag to indicate 'failure' -> take slow-path
 119     j(no_count);
 120   }
 121 
 122   // Handle existing monitor.
 123   bind(object_has_monitor);
 124   // The object's monitor m is unlocked iff m->owner == nullptr,
 125   // otherwise m->owner may contain a thread or a stack address.
 126   //
 127   // Try to CAS m->owner from null to current thread.
 128   add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset()) - markWord::monitor_value));
 129   cmpxchg(/*memory address*/tmp, /*expected value*/zr, /*new value*/xthread, Assembler::int64, Assembler::aq,
 130           Assembler::rl, /*result*/flag); // cas succeeds if flag == zr(expected)
 131 
 132   if (LockingMode != LM_LIGHTWEIGHT) {
 133     // Store a non-null value into the box to avoid looking like a re-entrant
 134     // lock. The fast-path monitor unlock code checks for
 135     // markWord::monitor_value so use markWord::unused_mark which has the
 136     // relevant bit set, and also matches ObjectSynchronizer::slow_enter.
 137     mv(tmp, (address)markWord::unused_mark().value());
 138     sd(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 139   }
 140 
 141   beqz(flag, cont); // CAS success means locking succeeded
 142 
  bne(flag, xthread, cont); // if the owner is not the current thread, take the slow path (flag is non-zero)
 144 
 145   // Recursive lock case
 146   mv(flag, zr);
 147   increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1, t0, tmp);
 148 
 149   bind(cont);
 150   // zero flag indicates success
 151   // non-zero flag indicates failure
 152   bnez(flag, no_count);
 153 
 154   bind(count);
 155   increment(Address(xthread, JavaThread::held_monitor_count_offset()), 1, t0, tmp);
 156 
 157   bind(no_count);
 158 }
 159 
 160 void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg,
 161                                     Register tmp1Reg, Register tmp2Reg) {
  // Use a flag register (t1) to indicate the fast_unlock result: zero for success; non-zero for failure.
 163   Register flag = t1;
 164   Register oop = objectReg;
 165   Register box = boxReg;
 166   Register disp_hdr = tmp1Reg;
 167   Register tmp = tmp2Reg;
 168   Label cont;
 169   Label object_has_monitor;
 170   Label count, no_count;
 171 
 172   assert_different_registers(oop, box, tmp, disp_hdr, flag, t0);
 173 
 174   if (LockingMode == LM_LEGACY) {
 175     // Find the lock address and load the displaced header from the stack.
 176     ld(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 177 
 178     // If the displaced header is 0, we have a recursive unlock.
 179     mv(flag, disp_hdr);
 180     beqz(disp_hdr, cont);
 181   }
 182 
 183   // Handle existing monitor.
 184   ld(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
 185   test_bit(t0, tmp, exact_log2(markWord::monitor_value));
 186   bnez(t0, object_has_monitor);
 187 
 188   if (LockingMode == LM_MONITOR) {
 189     mv(flag, 1); // Set non-zero flag to indicate 'failure' -> take slow path
 190     j(cont);
 191   } else if (LockingMode == LM_LEGACY) {
    // Check if it is still a lightweight lock: this is true if we
    // see the stack address of the basicLock in the markWord of the
    // object.
 195 
 196     cmpxchg(/*memory address*/oop, /*expected value*/box, /*new value*/disp_hdr, Assembler::int64, Assembler::relaxed,
 197             Assembler::rl, /*result*/tmp);
 198     xorr(flag, box, tmp); // box == tmp if cas succeeds
 199     j(cont);
 200   } else {
 201     assert(LockingMode == LM_LIGHTWEIGHT, "");
 202     Label slow;
 203     lightweight_unlock(oop, tmp, box, disp_hdr, slow);
 204 
 205     // Indicate success on completion.
 206     mv(flag, zr);
 207     j(count);
 208     bind(slow);
 209     mv(flag, 1); // Set non-zero flag to indicate 'failure' -> take slow path
 210     j(no_count);
 211   }
 212 
 213   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
 214 
 215   // Handle existing monitor.
 216   bind(object_has_monitor);
 217   STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
 218   add(tmp, tmp, -(int)markWord::monitor_value); // monitor
 219 
 220   if (LockingMode == LM_LIGHTWEIGHT) {
 221     // If the owner is anonymous, we need to fix it -- in an outline stub.
 222     Register tmp2 = disp_hdr;
 223     ld(tmp2, Address(tmp, ObjectMonitor::owner_offset()));
 224     test_bit(t0, tmp2, exact_log2(ObjectMonitor::ANONYMOUS_OWNER));
 225     C2HandleAnonOMOwnerStub* stub = new (Compile::current()->comp_arena()) C2HandleAnonOMOwnerStub(tmp, tmp2);
 226     Compile::current()->output()->add_stub(stub);
 227     bnez(t0, stub->entry(), /* is_far */ true);
 228     bind(stub->continuation());
 229   }
 230 
 231   ld(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
 232 
 233   Label notRecursive;
 234   beqz(disp_hdr, notRecursive); // Will be 0 if not recursive.
 235 
 236   // Recursive lock
 237   addi(disp_hdr, disp_hdr, -1);
 238   sd(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
 239   mv(flag, zr);
 240   j(cont);
 241 
 242   bind(notRecursive);
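  // The monitor can only be released here if no thread is queued on the
  // EntryList or cxq; otherwise we must take the slow path, which is
  // responsible for waking a successor.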
 243   ld(flag, Address(tmp, ObjectMonitor::EntryList_offset()));
 244   ld(disp_hdr, Address(tmp, ObjectMonitor::cxq_offset()));
 245   orr(flag, flag, disp_hdr); // Will be 0 if both are 0.
 246   bnez(flag, cont);
  // Set owner to null; this needs release semantics, so fence LoadStore|StoreStore before the store
 248   la(tmp, Address(tmp, ObjectMonitor::owner_offset()));
 249   membar(MacroAssembler::LoadStore | MacroAssembler::StoreStore);
 250   sd(zr, Address(tmp)); // set unowned
 251 
 252   bind(cont);
 253   // zero flag indicates success
 254   // non-zero flag indicates failure
 255   bnez(flag, no_count);
 256 
 257   bind(count);
 258   decrement(Address(xthread, JavaThread::held_monitor_count_offset()), 1, t0, tmp);
 259 
 260   bind(no_count);
 261 }
 262 
 263 // short string
 264 // StringUTF16.indexOfChar
 265 // StringLatin1.indexOfChar
 266 void C2_MacroAssembler::string_indexof_char_short(Register str1, Register cnt1,
 267                                                   Register ch, Register result,
 268                                                   bool isL)
 269 {
 270   Register ch1 = t0;
 271   Register index = t1;
 272 
 273   BLOCK_COMMENT("string_indexof_char_short {");
 274 
 275   Label LOOP, LOOP1, LOOP4, LOOP8;
 276   Label MATCH,  MATCH1, MATCH2, MATCH3,
 277         MATCH4, MATCH5, MATCH6, MATCH7, NOMATCH;
 278 
 279   mv(result, -1);
 280   mv(index, zr);
 281 
 282   bind(LOOP);
 283   addi(t0, index, 8);
 284   ble(t0, cnt1, LOOP8);
 285   addi(t0, index, 4);
 286   ble(t0, cnt1, LOOP4);
 287   j(LOOP1);
 288 
 289   bind(LOOP8);
 290   isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0));
 291   beq(ch, ch1, MATCH);
 292   isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2));
 293   beq(ch, ch1, MATCH1);
 294   isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4));
 295   beq(ch, ch1, MATCH2);
 296   isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6));
 297   beq(ch, ch1, MATCH3);
 298   isL ? lbu(ch1, Address(str1, 4)) : lhu(ch1, Address(str1, 8));
 299   beq(ch, ch1, MATCH4);
 300   isL ? lbu(ch1, Address(str1, 5)) : lhu(ch1, Address(str1, 10));
 301   beq(ch, ch1, MATCH5);
 302   isL ? lbu(ch1, Address(str1, 6)) : lhu(ch1, Address(str1, 12));
 303   beq(ch, ch1, MATCH6);
 304   isL ? lbu(ch1, Address(str1, 7)) : lhu(ch1, Address(str1, 14));
 305   beq(ch, ch1, MATCH7);
 306   addi(index, index, 8);
 307   addi(str1, str1, isL ? 8 : 16);
 308   blt(index, cnt1, LOOP);
 309   j(NOMATCH);
 310 
 311   bind(LOOP4);
 312   isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0));
 313   beq(ch, ch1, MATCH);
 314   isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2));
 315   beq(ch, ch1, MATCH1);
 316   isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4));
 317   beq(ch, ch1, MATCH2);
 318   isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6));
 319   beq(ch, ch1, MATCH3);
 320   addi(index, index, 4);
 321   addi(str1, str1, isL ? 4 : 8);
 322   bge(index, cnt1, NOMATCH);
 323 
 324   bind(LOOP1);
 325   isL ? lbu(ch1, Address(str1)) : lhu(ch1, Address(str1));
 326   beq(ch, ch1, MATCH);
 327   addi(index, index, 1);
 328   addi(str1, str1, isL ? 1 : 2);
 329   blt(index, cnt1, LOOP1);
 330   j(NOMATCH);
 331 
 332   bind(MATCH1);
 333   addi(index, index, 1);
 334   j(MATCH);
 335 
 336   bind(MATCH2);
 337   addi(index, index, 2);
 338   j(MATCH);
 339 
 340   bind(MATCH3);
 341   addi(index, index, 3);
 342   j(MATCH);
 343 
 344   bind(MATCH4);
 345   addi(index, index, 4);
 346   j(MATCH);
 347 
 348   bind(MATCH5);
 349   addi(index, index, 5);
 350   j(MATCH);
 351 
 352   bind(MATCH6);
 353   addi(index, index, 6);
 354   j(MATCH);
 355 
 356   bind(MATCH7);
 357   addi(index, index, 7);
 358 
 359   bind(MATCH);
 360   mv(result, index);
 361   bind(NOMATCH);
 362   BLOCK_COMMENT("} string_indexof_char_short");
 363 }
 364 
 365 // StringUTF16.indexOfChar
 366 // StringLatin1.indexOfChar
 367 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
 368                                             Register ch, Register result,
 369                                             Register tmp1, Register tmp2,
 370                                             Register tmp3, Register tmp4,
 371                                             bool isL)
 372 {
 373   Label CH1_LOOP, HIT, NOMATCH, DONE, DO_LONG;
 374   Register ch1 = t0;
 375   Register orig_cnt = t1;
 376   Register mask1 = tmp3;
 377   Register mask2 = tmp2;
 378   Register match_mask = tmp1;
 379   Register trailing_char = tmp4;
 380   Register unaligned_elems = tmp4;
 381 
 382   BLOCK_COMMENT("string_indexof_char {");
 383   beqz(cnt1, NOMATCH);
 384 
 385   addi(t0, cnt1, isL ? -32 : -16);
 386   bgtz(t0, DO_LONG);
 387   string_indexof_char_short(str1, cnt1, ch, result, isL);
 388   j(DONE);
 389 
 390   bind(DO_LONG);
 391   mv(orig_cnt, cnt1);
 392   if (AvoidUnalignedAccesses) {
 393     Label ALIGNED;
 394     andi(unaligned_elems, str1, 0x7);
 395     beqz(unaligned_elems, ALIGNED);
 396     sub(unaligned_elems, unaligned_elems, 8);
 397     neg(unaligned_elems, unaligned_elems);
 398     if (!isL) {
 399       srli(unaligned_elems, unaligned_elems, 1);
 400     }
 401     // do unaligned part per element
 402     string_indexof_char_short(str1, unaligned_elems, ch, result, isL);
 403     bgez(result, DONE);
 404     mv(orig_cnt, cnt1);
 405     sub(cnt1, cnt1, unaligned_elems);
 406     bind(ALIGNED);
 407   }
 408 
 409   // duplicate ch
 410   if (isL) {
 411     slli(ch1, ch, 8);
 412     orr(ch, ch1, ch);
 413   }
 414   slli(ch1, ch, 16);
 415   orr(ch, ch1, ch);
 416   slli(ch1, ch, 32);
 417   orr(ch, ch1, ch);
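  // ch now holds the search character replicated into every byte (Latin1)
  // or halfword (UTF-16) of the register, e.g. 0x61 -> 0x6161616161616161
  // in the Latin1 case.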
 418 
 419   if (!isL) {
 420     slli(cnt1, cnt1, 1);
 421   }
 422 
 423   uint64_t mask0101 = UCONST64(0x0101010101010101);
 424   uint64_t mask0001 = UCONST64(0x0001000100010001);
 425   mv(mask1, isL ? mask0101 : mask0001);
 426   uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f);
 427   uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff);
 428   mv(mask2, isL ? mask7f7f : mask7fff);
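  // These masks feed the classic SWAR zero-byte test used by
  // compute_match_mask on x = chunk ^ ch: (x - mask1) & ~(x | mask2) sets
  // the high bit of the first zero byte (halfword for UTF-16) of x, i.e.
  // of the first position matching the search character; higher bits may
  // be spurious, but only the lowest set bit is consumed.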
 429 
 430   bind(CH1_LOOP);
 431   ld(ch1, Address(str1));
 432   addi(str1, str1, 8);
 433   addi(cnt1, cnt1, -8);
 434   compute_match_mask(ch1, ch, match_mask, mask1, mask2);
 435   bnez(match_mask, HIT);
 436   bgtz(cnt1, CH1_LOOP);
 437   j(NOMATCH);
 438 
 439   bind(HIT);
 440   ctzc_bit(trailing_char, match_mask, isL, ch1, result);
 441   srli(trailing_char, trailing_char, 3);
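  // trailing_char is now the byte offset of the first match within the
  // 8-byte chunk: ctzc_bit counted the zero bits below the match bit and
  // the shift by 3 converts bits to bytes.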
 442   addi(cnt1, cnt1, 8);
 443   ble(cnt1, trailing_char, NOMATCH);
 444   // match case
 445   if (!isL) {
 446     srli(cnt1, cnt1, 1);
 447     srli(trailing_char, trailing_char, 1);
 448   }
 449 
 450   sub(result, orig_cnt, cnt1);
 451   add(result, result, trailing_char);
 452   j(DONE);
 453 
 454   bind(NOMATCH);
 455   mv(result, -1);
 456 
 457   bind(DONE);
 458   BLOCK_COMMENT("} string_indexof_char");
 459 }
 460 
 461 typedef void (MacroAssembler::* load_chr_insn)(Register rd, const Address &adr, Register temp);
 462 
 463 // Search for needle in haystack and return index or -1
 464 // x10: result
 465 // x11: haystack
 466 // x12: haystack_len
 467 // x13: needle
 468 // x14: needle_len
 469 void C2_MacroAssembler::string_indexof(Register haystack, Register needle,
 470                                        Register haystack_len, Register needle_len,
 471                                        Register tmp1, Register tmp2,
 472                                        Register tmp3, Register tmp4,
 473                                        Register tmp5, Register tmp6,
 474                                        Register result, int ae)
 475 {
 476   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
 477 
 478   Label LINEARSEARCH, LINEARSTUB, DONE, NOMATCH;
 479 
 480   Register ch1 = t0;
 481   Register ch2 = t1;
 482   Register nlen_tmp = tmp1; // needle len tmp
 483   Register hlen_tmp = tmp2; // haystack len tmp
 484   Register result_tmp = tmp4;
 485 
 486   bool isLL = ae == StrIntrinsicNode::LL;
 487 
 488   bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
 489   bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
 490   int needle_chr_shift = needle_isL ? 0 : 1;
 491   int haystack_chr_shift = haystack_isL ? 0 : 1;
 492   int needle_chr_size = needle_isL ? 1 : 2;
 493   int haystack_chr_size = haystack_isL ? 1 : 2;
 494   load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu :
 495                               (load_chr_insn)&MacroAssembler::lhu;
 496   load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu :
 497                                 (load_chr_insn)&MacroAssembler::lhu;
 498 
 499   BLOCK_COMMENT("string_indexof {");
 500 
 501   // Note, inline_string_indexOf() generates checks:
 502   // if (pattern.count > src.count) return -1;
 503   // if (pattern.count == 0) return 0;
 504 
 505   // We have two strings, a source string in haystack, haystack_len and a pattern string
 506   // in needle, needle_len. Find the first occurrence of pattern in source or return -1.
 507 
 508   // For larger pattern and source we use a simplified Boyer Moore algorithm.
 509   // With a small pattern and source we use linear scan.
 510 
  // needle_len >= 8 && needle_len < 256 && needle_len < haystack_len/4: use the BMH algorithm.
 512   sub(result_tmp, haystack_len, needle_len);
 513   // needle_len < 8, use linear scan
 514   sub(t0, needle_len, 8);
 515   bltz(t0, LINEARSEARCH);
 516   // needle_len >= 256, use linear scan
 517   sub(t0, needle_len, 256);
 518   bgez(t0, LINEARSTUB);
 519   // needle_len >= haystack_len/4, use linear scan
 520   srli(t0, haystack_len, 2);
 521   bge(needle_len, t0, LINEARSTUB);
 522 
  // Boyer-Moore-Horspool introduction:
  // The Boyer-Moore algorithm is based on the description here:
  //
  // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  //
  // This describes an algorithm with 2 shift rules: the 'Bad Character'
  // rule and the 'Good Suffix' rule.
 530   //
 531   // These rules are essentially heuristics for how far we can shift the
 532   // pattern along the search string.
 533   //
 534   // The implementation here uses the 'Bad Character' rule only because of the
 535   // complexity of initialisation for the 'Good Suffix' rule.
 536   //
 537   // This is also known as the Boyer-Moore-Horspool algorithm:
 538   //
 539   // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
 540   //
 541   // #define ASIZE 256
 542   //
 543   //    int bm(unsigned char *pattern, int m, unsigned char *src, int n) {
 544   //      int i, j;
 545   //      unsigned c;
 546   //      unsigned char bc[ASIZE];
 547   //
 548   //      /* Preprocessing */
 549   //      for (i = 0; i < ASIZE; ++i)
 550   //        bc[i] = m;
 551   //      for (i = 0; i < m - 1; ) {
 552   //        c = pattern[i];
 553   //        ++i;
  //        // c < 256 for Latin1 string, so no need for a branch
 555   //        #ifdef PATTERN_STRING_IS_LATIN1
 556   //        bc[c] = m - i;
 557   //        #else
 558   //        if (c < ASIZE) bc[c] = m - i;
 559   //        #endif
 560   //      }
 561   //
  //      /* Searching */
  //      j = 0;
  //      while (j <= n - m) {
  //        c = src[j + m - 1];
  //        if (pattern[m - 1] == c) {
  //          int k;
  //          for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k);
  //          if (k < 0) return j;
  //        }
  //        // c < 256 for Latin1 string, so no need for a branch
  //        #ifdef SOURCE_STRING_IS_LATIN1_AND_PATTERN_STRING_IS_LATIN1
  //        // LL case: (c < 256) always true. Remove branch
  //        j += bc[c];
  //        #endif
  //        #ifdef SOURCE_STRING_IS_UTF_AND_PATTERN_STRING_IS_UTF
  //        // UU case: need if (c < ASIZE) check. Skip 1 character if not.
  //        if (c < ASIZE)
  //          j += bc[c];
  //        else
  //          j += 1;
  //        #endif
  //        #ifdef SOURCE_IS_UTF_AND_PATTERN_IS_LATIN1
  //        // UL case: need if (c < ASIZE) check. Skip <pattern length> if not.
  //        if (c < ASIZE)
  //          j += bc[c];
  //        else
  //          j += m;
  //        #endif
  //      }
  //      return -1;
  //    }
 592 
  // temp registers: t0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, result
 594   Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
 595         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
 596 
 597   Register haystack_end = haystack_len;
 598   Register skipch = tmp2;
 599 
  // pattern length is >= 8, so we can read at least 1 register for cases
  // when UTF->Latin1 conversion is not needed (8 LL or 4 UU symbols) and
  // half a register for the UL case. We'll re-read the last character in
  // the inner pre-loop code to have a single outer pre-loop load.
 604   const int firstStep = isLL ? 7 : 3;
 605 
 606   const int ASIZE = 256;
  const int STORE_BYTES = 8; // 8 bytes stored per instruction (sd)
 608 
 609   sub(sp, sp, ASIZE);
 610 
 611   // init BC offset table with default value: needle_len
 612   slli(t0, needle_len, 8);
 613   orr(t0, t0, needle_len); // [63...16][needle_len][needle_len]
 614   slli(tmp1, t0, 16);
 615   orr(t0, tmp1, t0); // [63...32][needle_len][needle_len][needle_len][needle_len]
 616   slli(tmp1, t0, 32);
 617   orr(tmp5, tmp1, t0); // tmp5: 8 elements [needle_len]
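  // tmp5 now holds needle_len replicated into all 8 bytes; the loop below
  // stores it 4 doublewords (32 bytes) per iteration, filling the
  // 256-byte table in 8 iterations.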
 618 
 619   mv(ch1, sp);  // ch1 is t0
 620   mv(tmp6, ASIZE / STORE_BYTES); // loop iterations
 621 
 622   bind(BM_INIT_LOOP);
 623   // for (i = 0; i < ASIZE; ++i)
 624   //   bc[i] = m;
 625   for (int i = 0; i < 4; i++) {
 626     sd(tmp5, Address(ch1, i * wordSize));
 627   }
 628   add(ch1, ch1, 32);
 629   sub(tmp6, tmp6, 4);
 630   bgtz(tmp6, BM_INIT_LOOP);
 631 
 632   sub(nlen_tmp, needle_len, 1); // m - 1, index of the last element in pattern
 633   Register orig_haystack = tmp5;
 634   mv(orig_haystack, haystack);
 635   // result_tmp = tmp4
 636   shadd(haystack_end, result_tmp, haystack, haystack_end, haystack_chr_shift);
 637   sub(ch2, needle_len, 1); // bc offset init value, ch2 is t1
 638   mv(tmp3, needle);
 639 
 640   //  for (i = 0; i < m - 1; ) {
 641   //    c = pattern[i];
 642   //    ++i;
  //    // c < 256 for Latin1 string, so no need for a branch
 644   //    #ifdef PATTERN_STRING_IS_LATIN1
 645   //    bc[c] = m - i;
 646   //    #else
 647   //    if (c < ASIZE) bc[c] = m - i;
 648   //    #endif
 649   //  }
 650   bind(BCLOOP);
 651   (this->*needle_load_1chr)(ch1, Address(tmp3), noreg);
 652   add(tmp3, tmp3, needle_chr_size);
 653   if (!needle_isL) {
 654     // ae == StrIntrinsicNode::UU
 655     mv(tmp6, ASIZE);
 656     bgeu(ch1, tmp6, BCSKIP);
 657   }
 658   add(tmp4, sp, ch1);
 659   sb(ch2, Address(tmp4)); // store skip offset to BC offset table
 660 
 661   bind(BCSKIP);
 662   sub(ch2, ch2, 1); // for next pattern element, skip distance -1
 663   bgtz(ch2, BCLOOP);
 664 
 665   // tmp6: pattern end, address after needle
 666   shadd(tmp6, needle_len, needle, tmp6, needle_chr_shift);
 667   if (needle_isL == haystack_isL) {
    // load last 8 bytes (8 LL / 4 UU symbols)
 669     ld(tmp6, Address(tmp6, -wordSize));
 670   } else {
    // UL: from UTF-16 (source) search Latin1 (pattern)
    lwu(tmp6, Address(tmp6, -wordSize / 2)); // load last 4 bytes (4 symbols)
    // convert Latin1 to UTF-16, e.g. 0x0000abcd -> 0x0a0b0c0d
    // We'll have to wait until the load completes, but it's still faster
    // than per-character loads and checks
 675     srli(tmp3, tmp6, BitsPerByte * (wordSize / 2 - needle_chr_size)); // pattern[m-1], eg:0x0000000a
 676     slli(ch2, tmp6, XLEN - 24);
 677     srli(ch2, ch2, XLEN - 8); // pattern[m-2], 0x0000000b
 678     slli(ch1, tmp6, XLEN - 16);
 679     srli(ch1, ch1, XLEN - 8); // pattern[m-3], 0x0000000c
 680     andi(tmp6, tmp6, 0xff); // pattern[m-4], 0x0000000d
 681     slli(ch2, ch2, 16);
 682     orr(ch2, ch2, ch1); // 0x00000b0c
 683     slli(result, tmp3, 48); // use result as temp register
 684     orr(tmp6, tmp6, result); // 0x0a00000d
 685     slli(result, ch2, 16);
 686     orr(tmp6, tmp6, result); // UTF-16:0x0a0b0c0d
 687   }
 688 
  // i = m - 1;
  // skipch = src[j + i];
  // if (skipch == pattern[m - 1])
  //   for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k);
  // else
  //   advance j using the bad char offset table
 695   bind(BMLOOPSTR2);
 696   // compare pattern to source string backward
 697   shadd(result, nlen_tmp, haystack, result, haystack_chr_shift);
 698   (this->*haystack_load_1chr)(skipch, Address(result), noreg);
 699   sub(nlen_tmp, nlen_tmp, firstStep); // nlen_tmp is positive here, because needle_len >= 8
 700   if (needle_isL == haystack_isL) {
    // re-init tmp3. It's free because it executes in parallel with the
    // load above. The alternative is to initialize it before the loop,
    // but that would hurt performance on in-order systems with 2 or more
    // ld/st pipelines
 704     srli(tmp3, tmp6, BitsPerByte * (wordSize - needle_chr_size)); // UU/LL: pattern[m-1]
 705   }
 706   if (!isLL) { // UU/UL case
 707     slli(ch2, nlen_tmp, 1); // offsets in bytes
 708   }
 709   bne(tmp3, skipch, BMSKIP); // if not equal, skipch is bad char
 710   add(result, haystack, isLL ? nlen_tmp : ch2);
 711   // load 8 bytes from source string
 712   // if isLL is false then read granularity can be 2
 713   load_long_misaligned(ch2, Address(result), ch1, isLL ? 1 : 2); // can use ch1 as temp register here as it will be trashed by next mv anyway
 714   mv(ch1, tmp6);
 715   if (isLL) {
 716     j(BMLOOPSTR1_AFTER_LOAD);
 717   } else {
 718     sub(nlen_tmp, nlen_tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
 719     j(BMLOOPSTR1_CMP);
 720   }
 721 
 722   bind(BMLOOPSTR1);
 723   shadd(ch1, nlen_tmp, needle, ch1, needle_chr_shift);
 724   (this->*needle_load_1chr)(ch1, Address(ch1), noreg);
 725   shadd(ch2, nlen_tmp, haystack, ch2, haystack_chr_shift);
 726   (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
 727 
 728   bind(BMLOOPSTR1_AFTER_LOAD);
 729   sub(nlen_tmp, nlen_tmp, 1);
 730   bltz(nlen_tmp, BMLOOPSTR1_LASTCMP);
 731 
 732   bind(BMLOOPSTR1_CMP);
 733   beq(ch1, ch2, BMLOOPSTR1);
 734 
 735   bind(BMSKIP);
 736   if (!isLL) {
    // if we've met a UTF symbol while searching the Latin1 pattern, then
    // we can skip needle_len symbols
 739     if (needle_isL != haystack_isL) {
 740       mv(result_tmp, needle_len);
 741     } else {
 742       mv(result_tmp, 1);
 743     }
 744     mv(t0, ASIZE);
 745     bgeu(skipch, t0, BMADV);
 746   }
 747   add(result_tmp, sp, skipch);
 748   lbu(result_tmp, Address(result_tmp)); // load skip offset
 749 
 750   bind(BMADV);
 751   sub(nlen_tmp, needle_len, 1);
 752   // move haystack after bad char skip offset
 753   shadd(haystack, result_tmp, haystack, result, haystack_chr_shift);
 754   ble(haystack, haystack_end, BMLOOPSTR2);
 755   add(sp, sp, ASIZE);
 756   j(NOMATCH);
 757 
 758   bind(BMLOOPSTR1_LASTCMP);
 759   bne(ch1, ch2, BMSKIP);
 760 
 761   bind(BMMATCH);
 762   sub(result, haystack, orig_haystack);
 763   if (!haystack_isL) {
 764     srli(result, result, 1);
 765   }
 766   add(sp, sp, ASIZE);
 767   j(DONE);
 768 
 769   bind(LINEARSTUB);
  sub(t0, needle_len, 16); // small patterns should still be handled by the simple algorithm
 771   bltz(t0, LINEARSEARCH);
 772   mv(result, zr);
 773   RuntimeAddress stub = nullptr;
 774   if (isLL) {
 775     stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ll());
 776     assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
 777   } else if (needle_isL) {
 778     stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ul());
 779     assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
 780   } else {
 781     stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_uu());
 782     assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
 783   }
 784   address call = trampoline_call(stub);
 785   if (call == nullptr) {
 786     DEBUG_ONLY(reset_labels(LINEARSEARCH, DONE, NOMATCH));
 787     ciEnv::current()->record_failure("CodeCache is full");
 788     return;
 789   }
 790   j(DONE);
 791 
 792   bind(NOMATCH);
 793   mv(result, -1);
 794   j(DONE);
 795 
 796   bind(LINEARSEARCH);
 797   string_indexof_linearscan(haystack, needle, haystack_len, needle_len, tmp1, tmp2, tmp3, tmp4, -1, result, ae);
 798 
 799   bind(DONE);
 800   BLOCK_COMMENT("} string_indexof");
 801 }
 802 
// string_indexof_linearscan
 804 // result: x10
 805 // src: x11
 806 // src_count: x12
 807 // pattern: x13
 808 // pattern_count: x14 or 1/2/3/4
 809 void C2_MacroAssembler::string_indexof_linearscan(Register haystack, Register needle,
 810                                                Register haystack_len, Register needle_len,
 811                                                Register tmp1, Register tmp2,
 812                                                Register tmp3, Register tmp4,
 813                                                int needle_con_cnt, Register result, int ae)
 814 {
  // Note:
  // needle_con_cnt > 0 means the needle length is constant and the
  // needle_len register is unused;
  // for UU/LL: needle_con_cnt is in [1, 4]; for UL: needle_con_cnt == 1
 818   assert(needle_con_cnt <= 4, "Invalid needle constant count");
 819   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
 820 
 821   Register ch1 = t0;
 822   Register ch2 = t1;
 823   Register hlen_neg = haystack_len, nlen_neg = needle_len;
 824   Register nlen_tmp = tmp1, hlen_tmp = tmp2, result_tmp = tmp4;
 825 
 826   bool isLL = ae == StrIntrinsicNode::LL;
 827 
 828   bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
 829   bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
 830   int needle_chr_shift = needle_isL ? 0 : 1;
 831   int haystack_chr_shift = haystack_isL ? 0 : 1;
 832   int needle_chr_size = needle_isL ? 1 : 2;
 833   int haystack_chr_size = haystack_isL ? 1 : 2;
 834 
 835   load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu :
 836                               (load_chr_insn)&MacroAssembler::lhu;
 837   load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu :
 838                                 (load_chr_insn)&MacroAssembler::lhu;
 839   load_chr_insn load_2chr = isLL ? (load_chr_insn)&MacroAssembler::lhu : (load_chr_insn)&MacroAssembler::lwu;
 840   load_chr_insn load_4chr = isLL ? (load_chr_insn)&MacroAssembler::lwu : (load_chr_insn)&MacroAssembler::ld;
 841 
 842   Label DO1, DO2, DO3, MATCH, NOMATCH, DONE;
 843 
 844   Register first = tmp3;
 845 
 846   if (needle_con_cnt == -1) {
 847     Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
 848 
 849     sub(t0, needle_len, needle_isL == haystack_isL ? 4 : 2);
 850     bltz(t0, DOSHORT);
 851 
 852     (this->*needle_load_1chr)(first, Address(needle), noreg);
 853     slli(t0, needle_len, needle_chr_shift);
 854     add(needle, needle, t0);
 855     neg(nlen_neg, t0);
 856     slli(t0, result_tmp, haystack_chr_shift);
 857     add(haystack, haystack, t0);
 858     neg(hlen_neg, t0);
 859 
 860     bind(FIRST_LOOP);
 861     add(t0, haystack, hlen_neg);
 862     (this->*haystack_load_1chr)(ch2, Address(t0), noreg);
 863     beq(first, ch2, STR1_LOOP);
 864 
 865     bind(STR2_NEXT);
 866     add(hlen_neg, hlen_neg, haystack_chr_size);
 867     blez(hlen_neg, FIRST_LOOP);
 868     j(NOMATCH);
 869 
 870     bind(STR1_LOOP);
 871     add(nlen_tmp, nlen_neg, needle_chr_size);
 872     add(hlen_tmp, hlen_neg, haystack_chr_size);
 873     bgez(nlen_tmp, MATCH);
 874 
 875     bind(STR1_NEXT);
 876     add(ch1, needle, nlen_tmp);
 877     (this->*needle_load_1chr)(ch1, Address(ch1), noreg);
 878     add(ch2, haystack, hlen_tmp);
 879     (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
 880     bne(ch1, ch2, STR2_NEXT);
 881     add(nlen_tmp, nlen_tmp, needle_chr_size);
 882     add(hlen_tmp, hlen_tmp, haystack_chr_size);
 883     bltz(nlen_tmp, STR1_NEXT);
 884     j(MATCH);
 885 
 886     bind(DOSHORT);
 887     if (needle_isL == haystack_isL) {
 888       sub(t0, needle_len, 2);
 889       bltz(t0, DO1);
 890       bgtz(t0, DO3);
 891     }
 892   }
 893 
 894   if (needle_con_cnt == 4) {
 895     Label CH1_LOOP;
 896     (this->*load_4chr)(ch1, Address(needle), noreg);
 897     sub(result_tmp, haystack_len, 4);
 898     slli(tmp3, result_tmp, haystack_chr_shift); // result as tmp
 899     add(haystack, haystack, tmp3);
 900     neg(hlen_neg, tmp3);
 901     if (AvoidUnalignedAccesses) {
      // preload the first value, then read 1 character per iteration
      // instead of four, shifting the previous ch2 right by the character
      // size in bits
 904       add(tmp3, haystack, hlen_neg);
 905       (this->*load_4chr)(ch2, Address(tmp3), noreg);
 906       if (isLL) {
        // need to erase the most significant byte in the 32-bit value of ch2
 908         slli(ch2, ch2, 40);
 909         srli(ch2, ch2, 32);
 910       } else {
 911         slli(ch2, ch2, 16); // 2 most significant bytes will be erased by this operation
 912       }
 913     }
 914 
 915     bind(CH1_LOOP);
 916     add(tmp3, haystack, hlen_neg);
 917     if (AvoidUnalignedAccesses) {
 918       srli(ch2, ch2, isLL ? 8 : 16);
 919       (this->*haystack_load_1chr)(tmp3, Address(tmp3, isLL ? 3 : 6), noreg);
 920       slli(tmp3, tmp3, isLL ? 24 : 48);
 921       add(ch2, ch2, tmp3);
 922     } else {
 923       (this->*load_4chr)(ch2, Address(tmp3), noreg);
 924     }
 925     beq(ch1, ch2, MATCH);
 926     add(hlen_neg, hlen_neg, haystack_chr_size);
 927     blez(hlen_neg, CH1_LOOP);
 928     j(NOMATCH);
 929   }
 930 
 931   if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 2) {
 932     Label CH1_LOOP;
 933     BLOCK_COMMENT("string_indexof DO2 {");
 934     bind(DO2);
 935     (this->*load_2chr)(ch1, Address(needle), noreg);
 936     if (needle_con_cnt == 2) {
 937       sub(result_tmp, haystack_len, 2);
 938     }
 939     slli(tmp3, result_tmp, haystack_chr_shift);
 940     add(haystack, haystack, tmp3);
 941     neg(hlen_neg, tmp3);
 942     if (AvoidUnalignedAccesses) {
      // preload the first value, then read 1 character per iteration
      // instead of two, shifting the previous ch2 right by the character
      // size in bits
 945       add(tmp3, haystack, hlen_neg);
 946       (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg);
 947       slli(ch2, ch2, isLL ? 8 : 16);
 948     }
 949     bind(CH1_LOOP);
 950     add(tmp3, haystack, hlen_neg);
 951     if (AvoidUnalignedAccesses) {
 952       srli(ch2, ch2, isLL ? 8 : 16);
 953       (this->*haystack_load_1chr)(tmp3, Address(tmp3, isLL ? 1 : 2), noreg);
 954       slli(tmp3, tmp3, isLL ? 8 : 16);
 955       add(ch2, ch2, tmp3);
 956     } else {
 957       (this->*load_2chr)(ch2, Address(tmp3), noreg);
 958     }
 959     beq(ch1, ch2, MATCH);
 960     add(hlen_neg, hlen_neg, haystack_chr_size);
 961     blez(hlen_neg, CH1_LOOP);
 962     j(NOMATCH);
 963     BLOCK_COMMENT("} string_indexof DO2");
 964   }
 965 
 966   if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 3) {
 967     Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
 968     BLOCK_COMMENT("string_indexof DO3 {");
 969 
 970     bind(DO3);
 971     (this->*load_2chr)(first, Address(needle), noreg);
 972     (this->*needle_load_1chr)(ch1, Address(needle, 2 * needle_chr_size), noreg);
 973     if (needle_con_cnt == 3) {
 974       sub(result_tmp, haystack_len, 3);
 975     }
 976     slli(hlen_tmp, result_tmp, haystack_chr_shift);
 977     add(haystack, haystack, hlen_tmp);
 978     neg(hlen_neg, hlen_tmp);
 979 
 980     bind(FIRST_LOOP);
 981     add(ch2, haystack, hlen_neg);
 982     if (AvoidUnalignedAccesses) {
 983       (this->*haystack_load_1chr)(tmp2, Address(ch2, isLL ? 1 : 2), noreg); // we need a temp register, we can safely use hlen_tmp here, which is a synonym for tmp2
 984       (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
 985       slli(tmp2, tmp2, isLL ? 8 : 16);
 986       add(ch2, ch2, tmp2);
 987     } else {
 988       (this->*load_2chr)(ch2, Address(ch2), noreg);
 989     }
 990     beq(first, ch2, STR1_LOOP);
 991 
 992     bind(STR2_NEXT);
 993     add(hlen_neg, hlen_neg, haystack_chr_size);
 994     blez(hlen_neg, FIRST_LOOP);
 995     j(NOMATCH);
 996 
 997     bind(STR1_LOOP);
 998     add(hlen_tmp, hlen_neg, 2 * haystack_chr_size);
 999     add(ch2, haystack, hlen_tmp);
1000     (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
1001     bne(ch1, ch2, STR2_NEXT);
1002     j(MATCH);
1003     BLOCK_COMMENT("} string_indexof DO3");
1004   }
1005 
1006   if (needle_con_cnt == -1 || needle_con_cnt == 1) {
1007     Label DO1_LOOP;
1008 
1009     BLOCK_COMMENT("string_indexof DO1 {");
1010     bind(DO1);
1011     (this->*needle_load_1chr)(ch1, Address(needle), noreg);
1012     sub(result_tmp, haystack_len, 1);
1013     slli(tmp3, result_tmp, haystack_chr_shift);
1014     add(haystack, haystack, tmp3);
1015     neg(hlen_neg, tmp3);
1016 
1017     bind(DO1_LOOP);
1018     add(tmp3, haystack, hlen_neg);
1019     (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg);
1020     beq(ch1, ch2, MATCH);
1021     add(hlen_neg, hlen_neg, haystack_chr_size);
1022     blez(hlen_neg, DO1_LOOP);
1023     BLOCK_COMMENT("} string_indexof DO1");
1024   }
1025 
1026   bind(NOMATCH);
1027   mv(result, -1);
1028   j(DONE);
1029 
1030   bind(MATCH);
1031   srai(t0, hlen_neg, haystack_chr_shift);
1032   add(result, result_tmp, t0);
1033 
1034   bind(DONE);
1035 }
1036 
1037 // Compare strings.
1038 void C2_MacroAssembler::string_compare(Register str1, Register str2,
1039                                        Register cnt1, Register cnt2, Register result,
1040                                        Register tmp1, Register tmp2, Register tmp3,
1041                                        int ae)
1042 {
1043   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
1044         DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
1045         SHORT_LOOP_START, TAIL_CHECK, L;
1046 
1047   const int STUB_THRESHOLD = 64 + 8;
1048   bool isLL = ae == StrIntrinsicNode::LL;
1049   bool isLU = ae == StrIntrinsicNode::LU;
1050   bool isUL = ae == StrIntrinsicNode::UL;
1051 
1052   bool str1_isL = isLL || isLU;
1053   bool str2_isL = isLL || isUL;
1054 
1055   // for L strings, 1 byte for 1 character
1056   // for U strings, 2 bytes for 1 character
1057   int str1_chr_size = str1_isL ? 1 : 2;
1058   int str2_chr_size = str2_isL ? 1 : 2;
1059   int minCharsInWord = isLL ? wordSize : wordSize / 2;
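  // minCharsInWord is the number of characters that fit into one 8-byte
  // load: 8 when both strings are Latin1, otherwise 4 (UTF-16 characters
  // are 2 bytes each).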
1060 
1061   load_chr_insn str1_load_chr = str1_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu;
1062   load_chr_insn str2_load_chr = str2_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu;
1063 
1064   BLOCK_COMMENT("string_compare {");
1065 
  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; however, the result is always in characters.
1068   if (!str1_isL) {
1069     sraiw(cnt1, cnt1, 1);
1070   }
1071   if (!str2_isL) {
1072     sraiw(cnt2, cnt2, 1);
1073   }
1074 
1075   // Compute the minimum of the string lengths and save the difference in result.
1076   sub(result, cnt1, cnt2);
1077   bgt(cnt1, cnt2, L);
1078   mv(cnt2, cnt1);
1079   bind(L);
1080 
1081   // A very short string
1082   mv(t0, minCharsInWord);
1083   ble(cnt2, t0, SHORT_STRING);
1084 
1085   // Compare longwords
1086   // load first parts of strings and finish initialization while loading
1087   {
1088     if (str1_isL == str2_isL) { // LL or UU
      // check if str1 and str2 are the same pointer
1090       beq(str1, str2, DONE);
      // load the first 8 bytes of each string to compare
1092       ld(tmp1, Address(str1));
1093       ld(tmp2, Address(str2));
1094       mv(t0, STUB_THRESHOLD);
1095       bge(cnt2, t0, STUB);
1096       sub(cnt2, cnt2, minCharsInWord);
1097       beqz(cnt2, TAIL_CHECK);
1098       // convert cnt2 from characters to bytes
1099       if (!str1_isL) {
1100         slli(cnt2, cnt2, 1);
1101       }
1102       add(str2, str2, cnt2);
1103       add(str1, str1, cnt2);
1104       sub(cnt2, zr, cnt2);
1105     } else if (isLU) { // LU case
1106       lwu(tmp1, Address(str1));
1107       ld(tmp2, Address(str2));
1108       mv(t0, STUB_THRESHOLD);
1109       bge(cnt2, t0, STUB);
1110       addi(cnt2, cnt2, -4);
1111       add(str1, str1, cnt2);
1112       sub(cnt1, zr, cnt2);
1113       slli(cnt2, cnt2, 1);
1114       add(str2, str2, cnt2);
1115       inflate_lo32(tmp3, tmp1);
1116       mv(tmp1, tmp3);
1117       sub(cnt2, zr, cnt2);
1118       addi(cnt1, cnt1, 4);
1119     } else { // UL case
1120       ld(tmp1, Address(str1));
1121       lwu(tmp2, Address(str2));
1122       mv(t0, STUB_THRESHOLD);
1123       bge(cnt2, t0, STUB);
1124       addi(cnt2, cnt2, -4);
1125       slli(t0, cnt2, 1);
1126       sub(cnt1, zr, t0);
1127       add(str1, str1, t0);
1128       add(str2, str2, cnt2);
1129       inflate_lo32(tmp3, tmp2);
1130       mv(tmp2, tmp3);
1131       sub(cnt2, zr, cnt2);
1132       addi(cnt1, cnt1, 8);
1133     }
1134     addi(cnt2, cnt2, isUL ? 4 : 8);
1135     bne(tmp1, tmp2, DIFFERENCE);
1136     bgez(cnt2, TAIL);
1137 
1138     // main loop
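    // The strings were advanced to their last words above; cnt2 (and cnt1
    // in the mixed-encoding cases) now hold negative byte offsets counting
    // up toward zero, so a single bltz serves as the loop exit test.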
1139     bind(NEXT_WORD);
1140     if (str1_isL == str2_isL) { // LL or UU
1141       add(t0, str1, cnt2);
1142       ld(tmp1, Address(t0));
1143       add(t0, str2, cnt2);
1144       ld(tmp2, Address(t0));
1145       addi(cnt2, cnt2, 8);
1146     } else if (isLU) { // LU case
1147       add(t0, str1, cnt1);
1148       lwu(tmp1, Address(t0));
1149       add(t0, str2, cnt2);
1150       ld(tmp2, Address(t0));
1151       addi(cnt1, cnt1, 4);
1152       inflate_lo32(tmp3, tmp1);
1153       mv(tmp1, tmp3);
1154       addi(cnt2, cnt2, 8);
1155     } else { // UL case
1156       add(t0, str2, cnt2);
1157       lwu(tmp2, Address(t0));
1158       add(t0, str1, cnt1);
1159       ld(tmp1, Address(t0));
1160       inflate_lo32(tmp3, tmp2);
1161       mv(tmp2, tmp3);
1162       addi(cnt1, cnt1, 8);
1163       addi(cnt2, cnt2, 4);
1164     }
1165     bne(tmp1, tmp2, DIFFERENCE);
1166     bltz(cnt2, NEXT_WORD);
1167     bind(TAIL);
1168     if (str1_isL == str2_isL) { // LL or UU
1169       load_long_misaligned(tmp1, Address(str1), tmp3, isLL ? 1 : 2);
1170       load_long_misaligned(tmp2, Address(str2), tmp3, isLL ? 1 : 2);
1171     } else if (isLU) { // LU case
1172       load_int_misaligned(tmp1, Address(str1), tmp3, false);
1173       load_long_misaligned(tmp2, Address(str2), tmp3, 2);
1174       inflate_lo32(tmp3, tmp1);
1175       mv(tmp1, tmp3);
1176     } else { // UL case
1177       load_int_misaligned(tmp2, Address(str2), tmp3, false);
1178       load_long_misaligned(tmp1, Address(str1), tmp3, 2);
1179       inflate_lo32(tmp3, tmp2);
1180       mv(tmp2, tmp3);
1181     }
1182     bind(TAIL_CHECK);
1183     beq(tmp1, tmp2, DONE);
1184 
    // Find the first differing characters in the longwords and
    // compute their difference.
1187     bind(DIFFERENCE);
1188     xorr(tmp3, tmp1, tmp2);
1189     ctzc_bit(result, tmp3, isLL); // count zero from lsb to msb
1190     srl(tmp1, tmp1, result);
1191     srl(tmp2, tmp2, result);
1192     if (isLL) {
1193       andi(tmp1, tmp1, 0xFF);
1194       andi(tmp2, tmp2, 0xFF);
1195     } else {
1196       andi(tmp1, tmp1, 0xFFFF);
1197       andi(tmp2, tmp2, 0xFFFF);
1198     }
1199     sub(result, tmp1, tmp2);
1200     j(DONE);
1201   }
1202 
1203   bind(STUB);
1204   RuntimeAddress stub = nullptr;
1205   switch (ae) {
1206     case StrIntrinsicNode::LL:
1207       stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LL());
1208       break;
1209     case StrIntrinsicNode::UU:
1210       stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UU());
1211       break;
1212     case StrIntrinsicNode::LU:
1213       stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LU());
1214       break;
1215     case StrIntrinsicNode::UL:
1216       stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UL());
1217       break;
1218     default:
1219       ShouldNotReachHere();
1220   }
1221   assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
1222   address call = trampoline_call(stub);
1223   if (call == nullptr) {
1224     DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
1225     ciEnv::current()->record_failure("CodeCache is full");
1226     return;
1227   }
1228   j(DONE);
1229 
1230   bind(SHORT_STRING);
1231   // Is the minimum length zero?
1232   beqz(cnt2, DONE);
  // arrange code to do most branches while loading, and to load the next
  // characters while comparing the previous ones
1235   (this->*str1_load_chr)(tmp1, Address(str1), t0);
1236   addi(str1, str1, str1_chr_size);
1237   addi(cnt2, cnt2, -1);
1238   beqz(cnt2, SHORT_LAST_INIT);
1239   (this->*str2_load_chr)(cnt1, Address(str2), t0);
1240   addi(str2, str2, str2_chr_size);
1241   j(SHORT_LOOP_START);
1242   bind(SHORT_LOOP);
1243   addi(cnt2, cnt2, -1);
1244   beqz(cnt2, SHORT_LAST);
1245   bind(SHORT_LOOP_START);
1246   (this->*str1_load_chr)(tmp2, Address(str1), t0);
1247   addi(str1, str1, str1_chr_size);
1248   (this->*str2_load_chr)(t0, Address(str2), t0);
1249   addi(str2, str2, str2_chr_size);
1250   bne(tmp1, cnt1, SHORT_LOOP_TAIL);
1251   addi(cnt2, cnt2, -1);
1252   beqz(cnt2, SHORT_LAST2);
1253   (this->*str1_load_chr)(tmp1, Address(str1), t0);
1254   addi(str1, str1, str1_chr_size);
1255   (this->*str2_load_chr)(cnt1, Address(str2), t0);
1256   addi(str2, str2, str2_chr_size);
1257   beq(tmp2, t0, SHORT_LOOP);
1258   sub(result, tmp2, t0);
1259   j(DONE);
1260   bind(SHORT_LOOP_TAIL);
1261   sub(result, tmp1, cnt1);
1262   j(DONE);
1263   bind(SHORT_LAST2);
1264   beq(tmp2, t0, DONE);
1265   sub(result, tmp2, t0);
1266 
1267   j(DONE);
1268   bind(SHORT_LAST_INIT);
1269   (this->*str2_load_chr)(cnt1, Address(str2), t0);
1270   addi(str2, str2, str2_chr_size);
1271   bind(SHORT_LAST);
1272   beq(tmp1, cnt1, DONE);
1273   sub(result, tmp1, cnt1);
1274 
1275   bind(DONE);
1276 
1277   BLOCK_COMMENT("} string_compare");
1278 }
1279 
1280 void C2_MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
1281                                       Register tmp4, Register tmp5, Register tmp6, Register result,
1282                                       Register cnt1, int elem_size) {
1283   Label DONE, SAME, NEXT_DWORD, SHORT, TAIL, TAIL2, IS_TMP5_ZR;
1284   Register tmp1 = t0;
1285   Register tmp2 = t1;
1286   Register cnt2 = tmp2;  // cnt2 only used in array length compare
1287   Register elem_per_word = tmp6;
1288   int log_elem_size = exact_log2(elem_size);
1289   int length_offset = arrayOopDesc::length_offset_in_bytes();
1290   int base_offset   = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
1291 
1292   assert(elem_size == 1 || elem_size == 2, "must be char or byte");
1293   assert_different_registers(a1, a2, result, cnt1, t0, t1, tmp3, tmp4, tmp5, tmp6);
1294   mv(elem_per_word, wordSize / elem_size);
1295 
1296   BLOCK_COMMENT("arrays_equals {");
1297 
1298   // if (a1 == a2), return true
1299   beq(a1, a2, SAME);
1300 
1301   mv(result, false);
1302   beqz(a1, DONE);
1303   beqz(a2, DONE);
1304   lwu(cnt1, Address(a1, length_offset));
1305   lwu(cnt2, Address(a2, length_offset));
1306   bne(cnt2, cnt1, DONE);
1307   beqz(cnt1, SAME);
1308 
1309   slli(tmp5, cnt1, 3 + log_elem_size);
1310   sub(tmp5, zr, tmp5);
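  // tmp5 = -(array length in bits). Only the low 6 bits of a shift amount
  // are used, so shifting left by tmp5 discards the garbage bits beyond
  // the last valid element when a partial word is compared below.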
1311   add(a1, a1, base_offset);
1312   add(a2, a2, base_offset);
1313   ld(tmp3, Address(a1, 0));
1314   ld(tmp4, Address(a2, 0));
1315   ble(cnt1, elem_per_word, SHORT); // short or same
1316 
1317   // Main 16 byte comparison loop with 2 exits
1318   bind(NEXT_DWORD); {
1319     ld(tmp1, Address(a1, wordSize));
1320     ld(tmp2, Address(a2, wordSize));
1321     sub(cnt1, cnt1, 2 * wordSize / elem_size);
1322     blez(cnt1, TAIL);
1323     bne(tmp3, tmp4, DONE);
1324     ld(tmp3, Address(a1, 2 * wordSize));
1325     ld(tmp4, Address(a2, 2 * wordSize));
1326     add(a1, a1, 2 * wordSize);
1327     add(a2, a2, 2 * wordSize);
1328     ble(cnt1, elem_per_word, TAIL2);
1329   } beq(tmp1, tmp2, NEXT_DWORD);
1330   j(DONE);
1331 
1332   bind(TAIL);
1333   xorr(tmp4, tmp3, tmp4);
1334   xorr(tmp2, tmp1, tmp2);
1335   sll(tmp2, tmp2, tmp5);
1336   orr(tmp5, tmp4, tmp2);
1337   j(IS_TMP5_ZR);
1338 
1339   bind(TAIL2);
1340   bne(tmp1, tmp2, DONE);
1341 
1342   bind(SHORT);
1343   xorr(tmp4, tmp3, tmp4);
1344   sll(tmp5, tmp4, tmp5);
1345 
1346   bind(IS_TMP5_ZR);
1347   bnez(tmp5, DONE);
1348 
1349   bind(SAME);
1350   mv(result, true);
1351   // That's it.
1352   bind(DONE);
1353 
1354   BLOCK_COMMENT("} array_equals");
1355 }
1356 
1357 // Compare Strings
1358 
1359 // For Strings we're passed the address of the first characters in a1
1360 // and a2 and the length in cnt1.
// There are two implementations.  For arrays >= 8 bytes, all
// comparisons (on hw supporting unaligned access, including the final
// one, which may overlap) are performed 8 bytes at a time.
// For strings < 8 bytes (and for tails of long strings when
// AvoidUnalignedAccesses is true), we compare a word, then a halfword,
// and then a byte.
1367 
1368 void C2_MacroAssembler::string_equals(Register a1, Register a2,
1369                                       Register result, Register cnt1)
1370 {
1371   Label SAME, DONE, SHORT, NEXT_WORD;
1372   Register tmp1 = t0;
1373   Register tmp2 = t1;
1374 
1375   assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2);
1376 
1377   BLOCK_COMMENT("string_equals {");
1378 
1379   beqz(cnt1, SAME);
1380   mv(result, false);
1381 
1382   // Check for short strings, i.e. smaller than wordSize.
1383   sub(cnt1, cnt1, wordSize);
1384   bltz(cnt1, SHORT);
1385 
1386   // Main 8 byte comparison loop.
1387   bind(NEXT_WORD); {
1388     ld(tmp1, Address(a1, 0));
1389     add(a1, a1, wordSize);
1390     ld(tmp2, Address(a2, 0));
1391     add(a2, a2, wordSize);
1392     sub(cnt1, cnt1, wordSize);
1393     bne(tmp1, tmp2, DONE);
1394   } bgez(cnt1, NEXT_WORD);
1395 
1396   if (!AvoidUnalignedAccesses) {
1397     // Last longword.  In the case where length == 4 we compare the
1398     // same longword twice, but that's still faster than another
1399     // conditional branch.
1400     // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
1401     // length == 4.
1402     add(tmp1, a1, cnt1);
1403     ld(tmp1, Address(tmp1, 0));
1404     add(tmp2, a2, cnt1);
1405     ld(tmp2, Address(tmp2, 0));
1406     bne(tmp1, tmp2, DONE);
1407     j(SAME);
1408   } else {
1409     add(tmp1, cnt1, wordSize);
1410     beqz(tmp1, SAME);
1411   }
1412 
1413   bind(SHORT);
1414   Label TAIL03, TAIL01;
1415 
1416   // 0-7 bytes left.
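  // cnt1 is negative here (remaining bytes - wordSize), but its low three
  // bits equal the remaining byte count, so the bit tests below still
  // select the right tail sizes.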
1417   test_bit(tmp1, cnt1, 2);
1418   beqz(tmp1, TAIL03);
1419   {
1420     lwu(tmp1, Address(a1, 0));
1421     add(a1, a1, 4);
1422     lwu(tmp2, Address(a2, 0));
1423     add(a2, a2, 4);
1424     bne(tmp1, tmp2, DONE);
1425   }
1426 
1427   bind(TAIL03);
1428   // 0-3 bytes left.
1429   test_bit(tmp1, cnt1, 1);
1430   beqz(tmp1, TAIL01);
1431   {
1432     lhu(tmp1, Address(a1, 0));
1433     add(a1, a1, 2);
1434     lhu(tmp2, Address(a2, 0));
1435     add(a2, a2, 2);
1436     bne(tmp1, tmp2, DONE);
1437   }
1438 
1439   bind(TAIL01);
1440   // 0-1 bytes left.
1441   test_bit(tmp1, cnt1, 0);
1442   beqz(tmp1, SAME);
1443   {
1444     lbu(tmp1, Address(a1, 0));
1445     lbu(tmp2, Address(a2, 0));
1446     bne(tmp1, tmp2, DONE);
1447   }
1448 
1449   // Arrays are equal.
1450   bind(SAME);
1451   mv(result, true);
1452 
1453   // That's it.
1454   bind(DONE);
1455   BLOCK_COMMENT("} string_equals");
1456 }
1457 
1458 // jdk.internal.util.ArraysSupport.vectorizedHashCode
1459 void C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
1460                                         Register tmp1, Register tmp2, Register tmp3,
1461                                         Register tmp4, Register tmp5, Register tmp6,
1462                                         BasicType eltype)
1463 {
1464   assert_different_registers(ary, cnt, result, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, t0, t1);
1465 
1466   const int elsize = arrays_hashcode_elsize(eltype);
1467   const int chunks_end_shift = exact_log2(elsize);
1468 
1469   switch (eltype) {
1470   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
1471   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
1472   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
1473   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
1474   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
1475   default:
1476     ShouldNotReachHere();
1477   }
1478 
1479   const int stride = 4;
1480   const Register pow31_4 = tmp1;
1481   const Register pow31_3 = tmp2;
1482   const Register pow31_2 = tmp3;
1483   const Register chunks  = tmp4;
1484   const Register chunks_end = chunks;
1485 
1486   Label DONE, TAIL, TAIL_LOOP, WIDE_LOOP;
1487 
  // result already contains the caller-supplied initial hash value
1489 
1490   beqz(cnt, DONE);
1491 
1492   andi(chunks, cnt, ~(stride-1));
1493   beqz(chunks, TAIL);
1494 
1495   mv(pow31_4, 923521);           // [31^^4]
1496   mv(pow31_3,  29791);           // [31^^3]
1497   mv(pow31_2,    961);           // [31^^2]
1498 
1499   slli(chunks_end, chunks, chunks_end_shift);
1500   add(chunks_end, ary, chunks_end);
1501   andi(cnt, cnt, stride-1);      // don't forget about tail!
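  // The wide loop below is the usual h = 31 * h + ary[i] recurrence
  // unrolled by 4:
  // h = 31^4 * h + 31^3 * ary[i] + 31^2 * ary[i+1] + 31 * ary[i+2] + ary[i+3]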
1502 
1503   bind(WIDE_LOOP);
1504   mulw(result, result, pow31_4); // 31^^4 * h
1505   arrays_hashcode_elload(t0,   Address(ary, 0 * elsize), eltype);
1506   arrays_hashcode_elload(t1,   Address(ary, 1 * elsize), eltype);
1507   arrays_hashcode_elload(tmp5, Address(ary, 2 * elsize), eltype);
1508   arrays_hashcode_elload(tmp6, Address(ary, 3 * elsize), eltype);
1509   mulw(t0, t0, pow31_3);         // 31^^3 * ary[i+0]
1510   addw(result, result, t0);
1511   mulw(t1, t1, pow31_2);         // 31^^2 * ary[i+1]
1512   addw(result, result, t1);
1513   slli(t0, tmp5, 5);             // optimize 31^^1 * ary[i+2]
1514   subw(tmp5, t0, tmp5);          // with ary[i+2]<<5 - ary[i+2]
1515   addw(result, result, tmp5);
1516   addw(result, result, tmp6);    // 31^^4 * h + 31^^3 * ary[i+0] + 31^^2 * ary[i+1]
1517                                  //           + 31^^1 * ary[i+2] + 31^^0 * ary[i+3]
1518   addi(ary, ary, elsize * stride);
1519   bne(ary, chunks_end, WIDE_LOOP);
1520   beqz(cnt, DONE);
1521 
1522   bind(TAIL);
1523   slli(chunks_end, cnt, chunks_end_shift);
1524   add(chunks_end, ary, chunks_end);
1525 
1526   bind(TAIL_LOOP);
1527   arrays_hashcode_elload(t0, Address(ary), eltype);
1528   slli(t1, result, 5);           // optimize 31 * result
1529   subw(result, t1, result);      // with result<<5 - result
1530   addw(result, result, t0);
1531   addi(ary, ary, elsize);
1532   bne(ary, chunks_end, TAIL_LOOP);
1533 
1534   bind(DONE);
1535   BLOCK_COMMENT("} // arrays_hashcode");
1536 }
1537 
1538 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
1539   switch (eltype) {
1540   case T_BOOLEAN: return sizeof(jboolean);
1541   case T_BYTE:    return sizeof(jbyte);
1542   case T_SHORT:   return sizeof(jshort);
1543   case T_CHAR:    return sizeof(jchar);
1544   case T_INT:     return sizeof(jint);
1545   default:
1546     ShouldNotReachHere();
1547     return -1;
1548   }
1549 }
1550 
1551 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
1552   switch (eltype) {
1553   // T_BOOLEAN used as surrogate for unsigned byte
1554   case T_BOOLEAN: lbu(dst, src);   break;
1555   case T_BYTE:     lb(dst, src);   break;
1556   case T_SHORT:    lh(dst, src);   break;
1557   case T_CHAR:    lhu(dst, src);   break;
1558   case T_INT:      lw(dst, src);   break;
1559   default:
1560     ShouldNotReachHere();
1561   }
1562 }
1563 
1564 typedef void (Assembler::*conditional_branch_insn)(Register op1, Register op2, Label& label, bool is_far);
1565 typedef void (MacroAssembler::*float_conditional_branch_insn)(FloatRegister op1, FloatRegister op2, Label& label,
1566                                                               bool is_far, bool is_unordered);
1567 
1568 static conditional_branch_insn conditional_branches[] =
1569 {
1570   /* SHORT branches */
1571   (conditional_branch_insn)&MacroAssembler::beq,
1572   (conditional_branch_insn)&MacroAssembler::bgt,
1573   nullptr, // BoolTest::overflow
1574   (conditional_branch_insn)&MacroAssembler::blt,
1575   (conditional_branch_insn)&MacroAssembler::bne,
1576   (conditional_branch_insn)&MacroAssembler::ble,
1577   nullptr, // BoolTest::no_overflow
1578   (conditional_branch_insn)&MacroAssembler::bge,
1579 
1580   /* UNSIGNED branches */
1581   (conditional_branch_insn)&MacroAssembler::beq,
1582   (conditional_branch_insn)&MacroAssembler::bgtu,
1583   nullptr,
1584   (conditional_branch_insn)&MacroAssembler::bltu,
1585   (conditional_branch_insn)&MacroAssembler::bne,
1586   (conditional_branch_insn)&MacroAssembler::bleu,
1587   nullptr,
1588   (conditional_branch_insn)&MacroAssembler::bgeu
1589 };
1590 
1591 static float_conditional_branch_insn float_conditional_branches[] =
1592 {
1593   /* FLOAT SHORT branches */
1594   (float_conditional_branch_insn)&MacroAssembler::float_beq,
1595   (float_conditional_branch_insn)&MacroAssembler::float_bgt,
1596   nullptr,  // BoolTest::overflow
1597   (float_conditional_branch_insn)&MacroAssembler::float_blt,
1598   (float_conditional_branch_insn)&MacroAssembler::float_bne,
1599   (float_conditional_branch_insn)&MacroAssembler::float_ble,
1600   nullptr, // BoolTest::no_overflow
1601   (float_conditional_branch_insn)&MacroAssembler::float_bge,
1602 
1603   /* DOUBLE SHORT branches */
1604   (float_conditional_branch_insn)&MacroAssembler::double_beq,
1605   (float_conditional_branch_insn)&MacroAssembler::double_bgt,
1606   nullptr,
1607   (float_conditional_branch_insn)&MacroAssembler::double_blt,
1608   (float_conditional_branch_insn)&MacroAssembler::double_bne,
1609   (float_conditional_branch_insn)&MacroAssembler::double_ble,
1610   nullptr,
1611   (float_conditional_branch_insn)&MacroAssembler::double_bge
1612 };
1613 
1614 void C2_MacroAssembler::cmp_branch(int cmpFlag, Register op1, Register op2, Label& label, bool is_far) {
1615   assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(conditional_branches) / sizeof(conditional_branches[0])),
1616          "invalid conditional branch index");
1617   (this->*conditional_branches[cmpFlag])(op1, op2, label, is_far);
1618 }
1619 
// This function should only be used by C2. For unordered-greater comparisons the
// unordered bit is flipped: C2 uses unordered-lesser instead of unordered-greater,
// and the result bits are finally commuted in do_one_bytecode().
1622 void C2_MacroAssembler::float_cmp_branch(int cmpFlag, FloatRegister op1, FloatRegister op2, Label& label, bool is_far) {
1623   assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(float_conditional_branches) / sizeof(float_conditional_branches[0])),
1624          "invalid float conditional branch index");
1625   int booltest_flag = cmpFlag & ~(C2_MacroAssembler::double_branch_mask);
  (this->*float_conditional_branches[cmpFlag])(op1, op2, label, is_far,
    /* is_unordered */ booltest_flag != BoolTest::ge && booltest_flag != BoolTest::gt);
1628 }
1629 
1630 void C2_MacroAssembler::enc_cmpUEqNeLeGt_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
1631   switch (cmpFlag) {
1632     case BoolTest::eq:
1633     case BoolTest::le:
1634       beqz(op1, L, is_far);
1635       break;
1636     case BoolTest::ne:
1637     case BoolTest::gt:
1638       bnez(op1, L, is_far);
1639       break;
1640     default:
1641       ShouldNotReachHere();
1642   }
1643 }
1644 
1645 void C2_MacroAssembler::enc_cmpEqNe_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
1646   switch (cmpFlag) {
1647     case BoolTest::eq:
1648       beqz(op1, L, is_far);
1649       break;
1650     case BoolTest::ne:
1651       bnez(op1, L, is_far);
1652       break;
1653     default:
1654       ShouldNotReachHere();
1655   }
1656 }
1657 
1658 void C2_MacroAssembler::enc_cmove(int cmpFlag, Register op1, Register op2, Register dst, Register src) {
1659   Label L;
1660   cmp_branch(cmpFlag ^ (1 << neg_cond_bits), op1, op2, L);
1661   mv(dst, src);
1662   bind(L);
1663 }
1664 
1665 // Set dst to NaN if any NaN input.
1666 void C2_MacroAssembler::minmax_fp(FloatRegister dst, FloatRegister src1, FloatRegister src2,
1667                                   bool is_double, bool is_min) {
1668   assert_different_registers(dst, src1, src2);
1669 
1670   Label Done, Compare;
1671 
1672   is_double ? fclass_d(t0, src1)
1673             : fclass_s(t0, src1);
1674   is_double ? fclass_d(t1, src2)
1675             : fclass_s(t1, src2);
1676   orr(t0, t0, t1);
1677   andi(t0, t0, fclass_mask::nan); // if src1 or src2 is quiet or signaling NaN then return NaN
1678   beqz(t0, Compare);
1679   is_double ? fadd_d(dst, src1, src2)
1680             : fadd_s(dst, src1, src2);
1681   j(Done);
1682 
1683   bind(Compare);
1684   if (is_double) {
1685     is_min ? fmin_d(dst, src1, src2)
1686            : fmax_d(dst, src1, src2);
1687   } else {
1688     is_min ? fmin_s(dst, src1, src2)
1689            : fmax_s(dst, src1, src2);
1690   }
1691 
1692   bind(Done);
1693 }
1694 
// According to the Java SE specification, for floating-point round operations, if
// the input is NaN, +/-Infinity, or +/-0, the same input is returned as the
// rounded result. This differs from the behavior of the RISC-V fcvt instructions
// (which round out-of-range values to the nearest max or min value), so NaN,
// +/-Infinity and +/-0 need special handling.
1700 void C2_MacroAssembler::round_double_mode(FloatRegister dst, FloatRegister src, int round_mode,
1701                                           Register tmp1, Register tmp2, Register tmp3) {
1702 
1703   assert_different_registers(dst, src);
1704   assert_different_registers(tmp1, tmp2, tmp3);
1705 
  // Set the rounding mode for the conversions.
  // We use the same mode for the double->long and long->double conversions.
  // A different mode for the long->double conversion would matter only if the long
  // value were not exactly representable as a double; since we obtained the long
  // value from a double->long conversion, it is definitely representable.
1710   RoundingMode rm;
1711   switch (round_mode) {
1712     case RoundDoubleModeNode::rmode_ceil:
1713       rm = RoundingMode::rup;
1714       break;
1715     case RoundDoubleModeNode::rmode_floor:
1716       rm = RoundingMode::rdn;
1717       break;
1718     case RoundDoubleModeNode::rmode_rint:
1719       rm = RoundingMode::rne;
1720       break;
1721     default:
1722       ShouldNotReachHere();
1723   }
1724 
  // tmp1 - holds the double converted to long
  // tmp2 - holds the constant used for the overflow comparison
  // tmp3 - holds the normalized result of the double->long conversion
1728   Label done, bad_val;
1729 
1730   // Conversion from double to long
1731   fcvt_l_d(tmp1, src, rm);
1732 
1733   // Generate constant (tmp2)
1734   // tmp2 = 100...0000
1735   addi(tmp2, zr, 1);
1736   slli(tmp2, tmp2, 63);
1737 
  // Normalize the converted long (tmp1).
  // If the conversion overflowed, tmp1 is either 011...1111 or 100...0000;
  // map both to tmp3 = 100...0000 so a single comparison detects overflow.
1742   addi(tmp3, tmp1, 1);
1743   andi(tmp3, tmp3, -2);
1744   beq(tmp3, tmp2, bad_val);
1745 
1746   // Conversion from long to double
1747   fcvt_d_l(dst, tmp1, rm);
1748   // Add sign of input value to result for +/- 0 cases
1749   fsgnj_d(dst, dst, src);
1750   j(done);
1751 
  // If the conversion overflowed, return src
1753   bind(bad_val);
1754   fmv_d(dst, src);
1755 
1756   bind(done);
1757 }
1758 
// According to the Java SE specification, for floating-point signum operations,
// if the input is NaN or a +/-0.0 value, return it unchanged;
// otherwise return +/-1.0 with the sign of the input.
// one - provides a floating-point 1.0 (obtained from the matching rule)
// is_double - specifies whether single- or double-precision operations will be used.
1764 void C2_MacroAssembler::signum_fp(FloatRegister dst, FloatRegister one, bool is_double) {
1765   Label done;
1766 
1767   is_double ? fclass_d(t0, dst)
1768             : fclass_s(t0, dst);
1769 
1770   // check if input is -0, +0, signaling NaN or quiet NaN
1771   andi(t0, t0, fclass_mask::zero | fclass_mask::nan);
1772 
1773   bnez(t0, done);
1774 
1775   // use floating-point 1.0 with a sign of input
1776   is_double ? fsgnj_d(dst, one, dst)
1777             : fsgnj_s(dst, one, dst);
1778 
1779   bind(done);
1780 }
1781 
1782 static void float16_to_float_slow_path(C2_MacroAssembler& masm, C2GeneralStub<FloatRegister, Register, Register>& stub) {
1783 #define __ masm.
1784   FloatRegister dst = stub.data<0>();
1785   Register src = stub.data<1>();
1786   Register tmp = stub.data<2>();
1787   __ bind(stub.entry());
1788 
  // The following instructions mainly deal with NaN, as RISC-V fcvt does not
  // handle NaN well, but the same code also works for Inf.

  // Construct a 32-bit NaN from the 16-bit NaN;
  // the payloads of non-canonical NaNs need to be preserved.
1794   __ mv(tmp, 0x7f800000);
1795   // sign-bit was already set via sign-extension if necessary.
1796   __ slli(t0, src, 13);
1797   __ orr(tmp, t0, tmp);
1798   __ fmv_w_x(dst, tmp);
1799 
1800   __ j(stub.continuation());
1801 #undef __
1802 }
1803 
1804 // j.l.Float.float16ToFloat
1805 void C2_MacroAssembler::float16_to_float(FloatRegister dst, Register src, Register tmp) {
1806   auto stub = C2CodeStub::make<FloatRegister, Register, Register>(dst, src, tmp, 20, float16_to_float_slow_path);
1807 
  // On RISC-V, NaN needs special handling as fcvt does not work for it, while
  // Inf is handled correctly by fcvt alone. We nevertheless let the slow path
  // process NaN and Inf together: both are rare cases, and a slow path that
  // handled only NaN would sacrifice performance in the normal, i.e. non-NaN
  // and non-Inf, cases.
1814 
1815   // check whether it's a NaN or +/- Inf.
1816   mv(t0, 0x7c00);
1817   andr(tmp, src, t0);
1818   // jump to stub processing NaN and Inf cases.
1819   beq(t0, tmp, stub->entry());
1820 
1821   // non-NaN or non-Inf cases, just use built-in instructions.
1822   fmv_h_x(dst, src);
1823   fcvt_s_h(dst, dst);
1824 
1825   bind(stub->continuation());
1826 }
1827 
1828 static void float_to_float16_slow_path(C2_MacroAssembler& masm, C2GeneralStub<Register, FloatRegister, Register>& stub) {
1829 #define __ masm.
1830   Register dst = stub.data<0>();
1831   FloatRegister src = stub.data<1>();
1832   Register tmp = stub.data<2>();
1833   __ bind(stub.entry());
1834 
1835   __ fmv_x_w(dst, src);
1836 
1837   // preserve the payloads of non-canonical NaNs.
1838   __ srai(dst, dst, 13);
1839   // preserve the sign bit.
1840   __ srai(tmp, dst, 13);
1841   __ slli(tmp, tmp, 10);
1842   __ mv(t0, 0x3ff);
1843   __ orr(tmp, tmp, t0);
1844 
1845   // get the result by merging sign bit and payloads of preserved non-canonical NaNs.
1846   __ andr(dst, dst, tmp);
1847 
1848   __ j(stub.continuation());
1849 #undef __
1850 }
1851 
1852 // j.l.Float.floatToFloat16
1853 void C2_MacroAssembler::float_to_float16(Register dst, FloatRegister src, FloatRegister ftmp, Register xtmp) {
1854   auto stub = C2CodeStub::make<Register, FloatRegister, Register>(dst, src, xtmp, 130, float_to_float16_slow_path);
1855 
  // On RISC-V, NaN needs special handling as fcvt does not work for it.
1857 
  // Check whether the input is a NaN:
  // use feq instead of fclass as a performance optimization.
1860   feq_s(t0, src, src);
1861   // jump to stub processing NaN cases.
1862   beqz(t0, stub->entry());
1863 
1864   // non-NaN cases, just use built-in instructions.
1865   fcvt_h_s(ftmp, src);
1866   fmv_x_h(dst, ftmp);
1867 
1868   bind(stub->continuation());
1869 }
1870 
1871 void C2_MacroAssembler::signum_fp_v(VectorRegister dst, VectorRegister one, BasicType bt, int vlen) {
1872   vsetvli_helper(bt, vlen);
1873 
1874   // check if input is -0, +0, signaling NaN or quiet NaN
1875   vfclass_v(v0, dst);
1876   mv(t0, fclass_mask::zero | fclass_mask::nan);
1877   vand_vx(v0, v0, t0);
1878   vmseq_vi(v0, v0, 0);
1879 
1880   // use floating-point 1.0 with a sign of input
1881   vfsgnj_vv(dst, one, dst, v0_t);
1882 }
1883 
1884 void C2_MacroAssembler::compress_bits_v(Register dst, Register src, Register mask, bool is_long) {
1885   Assembler::SEW sew = is_long ? Assembler::e64 : Assembler::e32;
1886   // intrinsic is enabled when MaxVectorSize >= 16
1887   Assembler::LMUL lmul = is_long ? Assembler::m4 : Assembler::m2;
1888   long len = is_long ? 64 : 32;
1889 
1890   // load the src data(in bits) to be compressed.
1891   vsetivli(x0, 1, sew, Assembler::m1);
1892   vmv_s_x(v0, src);
1893   // reset the src data(in bytes) to zero.
1894   mv(t0, len);
1895   vsetvli(x0, t0, Assembler::e8, lmul);
1896   vmv_v_i(v4, 0);
1897   // convert the src data from bits to bytes.
1898   vmerge_vim(v4, v4, 1); // v0 as the implicit mask register
1899   // reset the dst data(in bytes) to zero.
1900   vmv_v_i(v8, 0);
1901   // load the mask data(in bits).
1902   vsetivli(x0, 1, sew, Assembler::m1);
1903   vmv_s_x(v0, mask);
1904   // compress the src data(in bytes) to dst(in bytes).
1905   vsetvli(x0, t0, Assembler::e8, lmul);
1906   vcompress_vm(v8, v4, v0);
1907   // convert the dst data from bytes to bits.
1908   vmseq_vi(v0, v8, 1);
1909   // store result back.
1910   vsetivli(x0, 1, sew, Assembler::m1);
1911   vmv_x_s(dst, v0);
1912 }
1913 
1914 void C2_MacroAssembler::compress_bits_i_v(Register dst, Register src, Register mask) {
1915   compress_bits_v(dst, src, mask, /* is_long */ false);
1916 }
1917 
1918 void C2_MacroAssembler::compress_bits_l_v(Register dst, Register src, Register mask) {
1919   compress_bits_v(dst, src, mask, /* is_long */ true);
1920 }
1921 
1922 void C2_MacroAssembler::expand_bits_v(Register dst, Register src, Register mask, bool is_long) {
1923   Assembler::SEW sew = is_long ? Assembler::e64 : Assembler::e32;
1924   // intrinsic is enabled when MaxVectorSize >= 16
1925   Assembler::LMUL lmul = is_long ? Assembler::m4 : Assembler::m2;
1926   long len = is_long ? 64 : 32;
1927 
1928   // load the src data(in bits) to be expanded.
1929   vsetivli(x0, 1, sew, Assembler::m1);
1930   vmv_s_x(v0, src);
1931   // reset the src data(in bytes) to zero.
1932   mv(t0, len);
1933   vsetvli(x0, t0, Assembler::e8, lmul);
1934   vmv_v_i(v4, 0);
1935   // convert the src data from bits to bytes.
1936   vmerge_vim(v4, v4, 1); // v0 as implicit mask register
1937   // reset the dst data(in bytes) to zero.
1938   vmv_v_i(v12, 0);
1939   // load the mask data(in bits).
1940   vsetivli(x0, 1, sew, Assembler::m1);
1941   vmv_s_x(v0, mask);
1942   // expand the src data(in bytes) to dst(in bytes).
1943   vsetvli(x0, t0, Assembler::e8, lmul);
1944   viota_m(v8, v0);
1945   vrgather_vv(v12, v4, v8, VectorMask::v0_t); // v0 as implicit mask register
1946   // convert the dst data from bytes to bits.
1947   vmseq_vi(v0, v12, 1);
1948   // store result back.
1949   vsetivli(x0, 1, sew, Assembler::m1);
1950   vmv_x_s(dst, v0);
1951 }
1952 
1953 void C2_MacroAssembler::expand_bits_i_v(Register dst, Register src, Register mask) {
1954   expand_bits_v(dst, src, mask, /* is_long */ false);
1955 }
1956 
1957 void C2_MacroAssembler::expand_bits_l_v(Register dst, Register src, Register mask) {
1958   expand_bits_v(dst, src, mask, /* is_long */ true);
1959 }
1960 
1961 void C2_MacroAssembler::element_compare(Register a1, Register a2, Register result, Register cnt, Register tmp1, Register tmp2,
1962                                         VectorRegister vr1, VectorRegister vr2, VectorRegister vrs, bool islatin, Label &DONE) {
1963   Label loop;
1964   Assembler::SEW sew = islatin ? Assembler::e8 : Assembler::e16;
1965 
1966   bind(loop);
1967   vsetvli(tmp1, cnt, sew, Assembler::m2);
1968   vlex_v(vr1, a1, sew);
1969   vlex_v(vr2, a2, sew);
1970   vmsne_vv(vrs, vr1, vr2);
1971   vfirst_m(tmp2, vrs);
1972   bgez(tmp2, DONE);
1973   sub(cnt, cnt, tmp1);
1974   if (!islatin) {
1975     slli(tmp1, tmp1, 1); // get byte counts
1976   }
1977   add(a1, a1, tmp1);
1978   add(a2, a2, tmp1);
1979   bnez(cnt, loop);
1980 
1981   mv(result, true);
1982 }
1983 
1984 void C2_MacroAssembler::string_equals_v(Register a1, Register a2, Register result, Register cnt) {
1985   Label DONE;
1986   Register tmp1 = t0;
1987   Register tmp2 = t1;
1988 
1989   BLOCK_COMMENT("string_equals_v {");
1990 
1991   mv(result, false);
1992 
1993   element_compare(a1, a2, result, cnt, tmp1, tmp2, v2, v4, v2, true, DONE);
1994 
1995   bind(DONE);
1996   BLOCK_COMMENT("} string_equals_v");
1997 }
1998 
1999 // used by C2 ClearArray patterns.
2000 // base: Address of a buffer to be zeroed
2001 // cnt: Count in HeapWords
2002 //
2003 // base, cnt, v4, v5, v6, v7 and t0 are clobbered.
2004 void C2_MacroAssembler::clear_array_v(Register base, Register cnt) {
2005   Label loop;
2006 
2007   // making zero words
2008   vsetvli(t0, cnt, Assembler::e64, Assembler::m4);
2009   vxor_vv(v4, v4, v4);
2010 
2011   bind(loop);
2012   vsetvli(t0, cnt, Assembler::e64, Assembler::m4);
2013   vse64_v(v4, base);
2014   sub(cnt, cnt, t0);
2015   shadd(base, t0, base, t0, 3);
2016   bnez(cnt, loop);
2017 }
2018 
2019 void C2_MacroAssembler::arrays_equals_v(Register a1, Register a2, Register result,
2020                                         Register cnt1, int elem_size) {
2021   Label DONE;
2022   Register tmp1 = t0;
2023   Register tmp2 = t1;
2024   Register cnt2 = tmp2;
2025   int length_offset = arrayOopDesc::length_offset_in_bytes();
2026   int base_offset = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
2027 
2028   BLOCK_COMMENT("arrays_equals_v {");
2029 
2030   // if (a1 == a2), return true
2031   mv(result, true);
2032   beq(a1, a2, DONE);
2033 
2034   mv(result, false);
2035   // if a1 == null or a2 == null, return false
2036   beqz(a1, DONE);
2037   beqz(a2, DONE);
2038   // if (a1.length != a2.length), return false
2039   lwu(cnt1, Address(a1, length_offset));
2040   lwu(cnt2, Address(a2, length_offset));
2041   bne(cnt1, cnt2, DONE);
2042 
2043   la(a1, Address(a1, base_offset));
2044   la(a2, Address(a2, base_offset));
2045 
2046   element_compare(a1, a2, result, cnt1, tmp1, tmp2, v2, v4, v2, elem_size == 1, DONE);
2047 
2048   bind(DONE);
2049 
2050   BLOCK_COMMENT("} arrays_equals_v");
2051 }
2052 
2053 void C2_MacroAssembler::string_compare_v(Register str1, Register str2, Register cnt1, Register cnt2,
2054                                          Register result, Register tmp1, Register tmp2, int encForm) {
2055   Label DIFFERENCE, DONE, L, loop;
2056   bool encLL = encForm == StrIntrinsicNode::LL;
2057   bool encLU = encForm == StrIntrinsicNode::LU;
2058   bool encUL = encForm == StrIntrinsicNode::UL;
2059 
2060   bool str1_isL = encLL || encLU;
2061   bool str2_isL = encLL || encUL;
2062 
2063   int minCharsInWord = encLL ? wordSize : wordSize / 2;
2064 
2065   BLOCK_COMMENT("string_compare {");
2066 
2067   // for Latin strings, 1 byte for 1 character
2068   // for UTF16 strings, 2 bytes for 1 character
2069   if (!str1_isL)
2070     sraiw(cnt1, cnt1, 1);
2071   if (!str2_isL)
2072     sraiw(cnt2, cnt2, 1);
2073 
  // Compute the length difference; this is the result if the strings are equal
  // up to the length of the shorter one.
  // Save the minimum of the string lengths in cnt2.
2076   sub(result, cnt1, cnt2);
2077   bgt(cnt1, cnt2, L);
2078   mv(cnt2, cnt1);
2079   bind(L);
2080 
2081   if (str1_isL == str2_isL) { // LL or UU
2082     element_compare(str1, str2, zr, cnt2, tmp1, tmp2, v2, v4, v2, encLL, DIFFERENCE);
2083     j(DONE);
2084   } else { // LU or UL
2085     Register strL = encLU ? str1 : str2;
2086     Register strU = encLU ? str2 : str1;
2087     VectorRegister vstr1 = encLU ? v8 : v4;
2088     VectorRegister vstr2 = encLU ? v4 : v8;
2089 
2090     bind(loop);
2091     vsetvli(tmp1, cnt2, Assembler::e8, Assembler::m2);
2092     vle8_v(vstr1, strL);
2093     vsetvli(tmp1, cnt2, Assembler::e16, Assembler::m4);
2094     vzext_vf2(vstr2, vstr1);
2095     vle16_v(vstr1, strU);
2096     vmsne_vv(v4, vstr2, vstr1);
2097     vfirst_m(tmp2, v4);
2098     bgez(tmp2, DIFFERENCE);
2099     sub(cnt2, cnt2, tmp1);
2100     add(strL, strL, tmp1);
2101     shadd(strU, tmp1, strU, tmp1, 1);
2102     bnez(cnt2, loop);
2103     j(DONE);
2104   }
2105 
2106   bind(DIFFERENCE);
2107   slli(tmp1, tmp2, 1);
2108   add(str1, str1, str1_isL ? tmp2 : tmp1);
2109   add(str2, str2, str2_isL ? tmp2 : tmp1);
2110   str1_isL ? lbu(tmp1, Address(str1, 0)) : lhu(tmp1, Address(str1, 0));
2111   str2_isL ? lbu(tmp2, Address(str2, 0)) : lhu(tmp2, Address(str2, 0));
2112   sub(result, tmp1, tmp2);
2113 
  bind(DONE);
  BLOCK_COMMENT("} string_compare");
}
2116 
2117 void C2_MacroAssembler::byte_array_inflate_v(Register src, Register dst, Register len, Register tmp) {
2118   Label loop;
2119   assert_different_registers(src, dst, len, tmp, t0);
2120 
2121   BLOCK_COMMENT("byte_array_inflate_v {");
2122   bind(loop);
2123   vsetvli(tmp, len, Assembler::e8, Assembler::m2);
2124   vle8_v(v6, src);
2125   vsetvli(t0, len, Assembler::e16, Assembler::m4);
2126   vzext_vf2(v4, v6);
2127   vse16_v(v4, dst);
2128   sub(len, len, tmp);
2129   add(src, src, tmp);
2130   shadd(dst, tmp, dst, tmp, 1);
2131   bnez(len, loop);
2132   BLOCK_COMMENT("} byte_array_inflate_v");
2133 }
2134 
2135 // Compress char[] array to byte[].
2136 // Intrinsic for java.lang.StringUTF16.compress(char[] src, int srcOff, byte[] dst, int dstOff, int len)
2137 // result: the array length if every element in array can be encoded,
2138 // otherwise, the index of first non-latin1 (> 0xff) character.
2139 void C2_MacroAssembler::char_array_compress_v(Register src, Register dst, Register len,
2140                                               Register result, Register tmp) {
2141   encode_iso_array_v(src, dst, len, result, tmp, false);
2142 }
2143 
2144 // Intrinsic for
2145 //
2146 // - sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray
2147 //     return the number of characters copied.
2148 // - java/lang/StringUTF16.compress
2149 //     return index of non-latin1 character if copy fails, otherwise 'len'.
2150 //
// This version always returns the number of characters copied. A successful
// copy will complete with the post-condition: 'result' == 'len', while an
// unsuccessful copy will exit with the post-condition: 0 <= 'result' < 'len'.
2154 //
2155 // Clobbers: src, dst, len, result, t0
2156 void C2_MacroAssembler::encode_iso_array_v(Register src, Register dst, Register len,
2157                                            Register result, Register tmp, bool ascii) {
2158   Label loop, fail, done;
2159 
2160   BLOCK_COMMENT("encode_iso_array_v {");
2161   mv(result, 0);
2162 
2163   bind(loop);
2164   mv(tmp, ascii ? 0x7f : 0xff);
2165   vsetvli(t0, len, Assembler::e16, Assembler::m2);
2166   vle16_v(v2, src);
2167 
2168   vmsgtu_vx(v1, v2, tmp);
2169   vfirst_m(tmp, v1);
2170   vmsbf_m(v0, v1);
2171   // compress char to byte
2172   vsetvli(t0, len, Assembler::e8);
2173   vncvt_x_x_w(v1, v2, Assembler::v0_t);
2174   vse8_v(v1, dst, Assembler::v0_t);
2175 
2176   // fail if char > 0x7f/0xff
2177   bgez(tmp, fail);
2178   add(result, result, t0);
2179   add(dst, dst, t0);
2180   sub(len, len, t0);
2181   shadd(src, t0, src, t0, 1);
2182   bnez(len, loop);
2183   j(done);
2184 
2185   bind(fail);
2186   add(result, result, tmp);
2187 
2188   bind(done);
2189   BLOCK_COMMENT("} encode_iso_array_v");
2190 }
2191 
2192 void C2_MacroAssembler::count_positives_v(Register ary, Register len, Register result, Register tmp) {
2193   Label LOOP, SET_RESULT, DONE;
2194 
2195   BLOCK_COMMENT("count_positives_v {");
2196   assert_different_registers(ary, len, result, tmp);
2197 
2198   mv(result, zr);
2199 
2200   bind(LOOP);
2201   vsetvli(t0, len, Assembler::e8, Assembler::m4);
2202   vle8_v(v4, ary);
2203   vmslt_vx(v4, v4, zr);
2204   vfirst_m(tmp, v4);
2205   bgez(tmp, SET_RESULT);
2206   // if tmp == -1, all bytes are positive
2207   add(result, result, t0);
2208 
2209   sub(len, len, t0);
2210   add(ary, ary, t0);
2211   bnez(len, LOOP);
2212   j(DONE);
2213 
2214   // add remaining positive bytes count
2215   bind(SET_RESULT);
2216   add(result, result, tmp);
2217 
2218   bind(DONE);
2219   BLOCK_COMMENT("} count_positives_v");
2220 }
2221 
2222 void C2_MacroAssembler::string_indexof_char_v(Register str1, Register cnt1,
2223                                               Register ch, Register result,
2224                                               Register tmp1, Register tmp2,
2225                                               bool isL) {
2226   mv(result, zr);
2227 
2228   Label loop, MATCH, DONE;
2229   Assembler::SEW sew = isL ? Assembler::e8 : Assembler::e16;
2230   bind(loop);
2231   vsetvli(tmp1, cnt1, sew, Assembler::m4);
2232   vlex_v(v4, str1, sew);
2233   vmseq_vx(v4, v4, ch);
2234   vfirst_m(tmp2, v4);
2235   bgez(tmp2, MATCH); // if equal, return index
2236 
2237   add(result, result, tmp1);
2238   sub(cnt1, cnt1, tmp1);
2239   if (!isL) slli(tmp1, tmp1, 1);
2240   add(str1, str1, tmp1);
2241   bnez(cnt1, loop);
2242 
2243   mv(result, -1);
2244   j(DONE);
2245 
2246   bind(MATCH);
2247   add(result, result, tmp2);
2248 
2249   bind(DONE);
2250 }
2251 
2252 // Set dst to NaN if any NaN input.
2253 void C2_MacroAssembler::minmax_fp_v(VectorRegister dst, VectorRegister src1, VectorRegister src2,
2254                                     BasicType bt, bool is_min, int vector_length) {
2255   assert_different_registers(dst, src1, src2);
2256 
2257   vsetvli_helper(bt, vector_length);
2258 
2259   is_min ? vfmin_vv(dst, src1, src2)
2260          : vfmax_vv(dst, src1, src2);
2261 
2262   vmfne_vv(v0,  src1, src1);
2263   vfadd_vv(dst, src1, src1, Assembler::v0_t);
2264   vmfne_vv(v0,  src2, src2);
2265   vfadd_vv(dst, src2, src2, Assembler::v0_t);
2266 }
2267 
2268 // Set dst to NaN if any NaN input.
2269 // The destination vector register elements corresponding to masked-off elements
2270 // are handled with a mask-undisturbed policy.
2271 void C2_MacroAssembler::minmax_fp_masked_v(VectorRegister dst, VectorRegister src1, VectorRegister src2,
2272                                            VectorRegister vmask, VectorRegister tmp1, VectorRegister tmp2,
2273                                            BasicType bt, bool is_min, int vector_length) {
2274   assert_different_registers(src1, src2, tmp1, tmp2);
2275   vsetvli_helper(bt, vector_length);
2276 
2277   // Check vector elements of src1 and src2 for NaN.
2278   vmfeq_vv(tmp1, src1, src1);
2279   vmfeq_vv(tmp2, src2, src2);
2280 
2281   vmandn_mm(v0, vmask, tmp1);
2282   vfadd_vv(dst, src1, src1, Assembler::v0_t);
2283   vmandn_mm(v0, vmask, tmp2);
2284   vfadd_vv(dst, src2, src2, Assembler::v0_t);
2285 
2286   vmand_mm(tmp2, tmp1, tmp2);
2287   vmand_mm(v0, vmask, tmp2);
2288   is_min ? vfmin_vv(dst, src1, src2, Assembler::v0_t)
2289          : vfmax_vv(dst, src1, src2, Assembler::v0_t);
2290 }
2291 
2292 // Set dst to NaN if any NaN input.
2293 void C2_MacroAssembler::reduce_minmax_fp_v(FloatRegister dst,
2294                                            FloatRegister src1, VectorRegister src2,
2295                                            VectorRegister tmp1, VectorRegister tmp2,
2296                                            bool is_double, bool is_min, int vector_length, VectorMask vm) {
2297   assert_different_registers(dst, src1);
2298   assert_different_registers(src2, tmp1, tmp2);
2299 
2300   Label L_done, L_NaN_1, L_NaN_2;
2301   // Set dst to src1 if src1 is NaN
2302   is_double ? feq_d(t0, src1, src1)
2303             : feq_s(t0, src1, src1);
2304   beqz(t0, L_NaN_2);
2305 
2306   vsetvli_helper(is_double ? T_DOUBLE : T_FLOAT, vector_length);
2307   vfmv_s_f(tmp2, src1);
2308 
2309   is_min ? vfredmin_vs(tmp1, src2, tmp2, vm)
2310          : vfredmax_vs(tmp1, src2, tmp2, vm);
2311   vfmv_f_s(dst, tmp1);
2312 
2313   // Checking NaNs in src2
2314   vmfne_vv(tmp1, src2, src2, vm);
2315   vcpop_m(t0, tmp1, vm);
2316   beqz(t0, L_done);
2317 
2318   bind(L_NaN_1);
2319   vfredusum_vs(tmp1, src2, tmp2, vm);
2320   vfmv_f_s(dst, tmp1);
2321   j(L_done);
2322 
2323   bind(L_NaN_2);
2324   is_double ? fmv_d(dst, src1)
2325             : fmv_s(dst, src1);
2326   bind(L_done);
2327 }
2328 
2329 bool C2_MacroAssembler::in_scratch_emit_size() {
2330   if (ciEnv::current()->task() != nullptr) {
2331     PhaseOutput* phase_output = Compile::current()->output();
2332     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2333       return true;
2334     }
2335   }
2336   return MacroAssembler::in_scratch_emit_size();
2337 }
2338 
2339 void C2_MacroAssembler::reduce_integral_v(Register dst, Register src1,
2340                                           VectorRegister src2, VectorRegister tmp,
2341                                           int opc, BasicType bt, int vector_length, VectorMask vm) {
2342   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2343   vsetvli_helper(bt, vector_length);
2344   vmv_s_x(tmp, src1);
2345   switch (opc) {
2346     case Op_AddReductionVI:
2347     case Op_AddReductionVL:
2348       vredsum_vs(tmp, src2, tmp, vm);
2349       break;
2350     case Op_AndReductionV:
2351       vredand_vs(tmp, src2, tmp, vm);
2352       break;
2353     case Op_OrReductionV:
2354       vredor_vs(tmp, src2, tmp, vm);
2355       break;
2356     case Op_XorReductionV:
2357       vredxor_vs(tmp, src2, tmp, vm);
2358       break;
2359     case Op_MaxReductionV:
2360       vredmax_vs(tmp, src2, tmp, vm);
2361       break;
2362     case Op_MinReductionV:
2363       vredmin_vs(tmp, src2, tmp, vm);
2364       break;
2365     default:
2366       ShouldNotReachHere();
2367   }
2368   vmv_x_s(dst, tmp);
2369 }
2370 
2371 // Set vl and vtype for full and partial vector operations.
2372 // (vma = mu, vta = tu, vill = false)
2373 void C2_MacroAssembler::vsetvli_helper(BasicType bt, int vector_length, LMUL vlmul, Register tmp) {
2374   Assembler::SEW sew = Assembler::elemtype_to_sew(bt);
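  // vsetivli encodes the requested vector length as a 5-bit immediate,
  // so it can be used directly only for lengths up to 31.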
2375   if (vector_length <= 31) {
2376     vsetivli(tmp, vector_length, sew, vlmul);
2377   } else if (vector_length == (MaxVectorSize / type2aelembytes(bt))) {
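    // Requesting AVL with rs1 == x0 (and rd != x0) sets vl to VLMAX for this SEW/LMUL.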
2378     vsetvli(tmp, x0, sew, vlmul);
2379   } else {
2380     mv(tmp, vector_length);
2381     vsetvli(tmp, tmp, sew, vlmul);
2382   }
2383 }
2384 
2385 void C2_MacroAssembler::compare_integral_v(VectorRegister vd, VectorRegister src1, VectorRegister src2,
2386                                            int cond, BasicType bt, int vector_length, VectorMask vm) {
2387   assert(is_integral_type(bt), "unsupported element type");
2388   assert(vm == Assembler::v0_t ? vd != v0 : true, "should be different registers");
2389   vsetvli_helper(bt, vector_length);
2390   vmclr_m(vd);
2391   switch (cond) {
2392     case BoolTest::eq: vmseq_vv(vd, src1, src2, vm); break;
2393     case BoolTest::ne: vmsne_vv(vd, src1, src2, vm); break;
2394     case BoolTest::le: vmsle_vv(vd, src1, src2, vm); break;
2395     case BoolTest::ge: vmsge_vv(vd, src1, src2, vm); break;
2396     case BoolTest::lt: vmslt_vv(vd, src1, src2, vm); break;
2397     case BoolTest::gt: vmsgt_vv(vd, src1, src2, vm); break;
2398     default:
2399       assert(false, "unsupported compare condition");
2400       ShouldNotReachHere();
2401   }
2402 }
2403 
2404 void C2_MacroAssembler::compare_fp_v(VectorRegister vd, VectorRegister src1, VectorRegister src2,
2405                                      int cond, BasicType bt, int vector_length, VectorMask vm) {
2406   assert(is_floating_point_type(bt), "unsupported element type");
2407   assert(vm == Assembler::v0_t ? vd != v0 : true, "should be different registers");
2408   vsetvli_helper(bt, vector_length);
2409   vmclr_m(vd);
2410   switch (cond) {
2411     case BoolTest::eq: vmfeq_vv(vd, src1, src2, vm); break;
2412     case BoolTest::ne: vmfne_vv(vd, src1, src2, vm); break;
2413     case BoolTest::le: vmfle_vv(vd, src1, src2, vm); break;
2414     case BoolTest::ge: vmfge_vv(vd, src1, src2, vm); break;
2415     case BoolTest::lt: vmflt_vv(vd, src1, src2, vm); break;
2416     case BoolTest::gt: vmfgt_vv(vd, src1, src2, vm); break;
2417     default:
2418       assert(false, "unsupported compare condition");
2419       ShouldNotReachHere();
2420   }
2421 }
2422 
2423 void C2_MacroAssembler::integer_extend_v(VectorRegister dst, BasicType dst_bt, int vector_length,
2424                                          VectorRegister src, BasicType src_bt) {
2425   assert(type2aelembytes(dst_bt) > type2aelembytes(src_bt) && type2aelembytes(dst_bt) <= 8 && type2aelembytes(src_bt) <= 4, "invalid element size");
2426   assert(dst_bt != T_FLOAT && dst_bt != T_DOUBLE && src_bt != T_FLOAT && src_bt != T_DOUBLE, "unsupported element type");
2427   // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#52-vector-operands
2428   // The destination EEW is greater than the source EEW, the source EMUL is at least 1,
2429   // and the overlap is in the highest-numbered part of the destination register group.
2430   // Since LMUL=1, vd and vs cannot be the same.
2431   assert_different_registers(dst, src);
2432 
2433   vsetvli_helper(dst_bt, vector_length);
2434   if (src_bt == T_BYTE) {
2435     switch (dst_bt) {
2436     case T_SHORT:
2437       vsext_vf2(dst, src);
2438       break;
2439     case T_INT:
2440       vsext_vf4(dst, src);
2441       break;
2442     case T_LONG:
2443       vsext_vf8(dst, src);
2444       break;
2445     default:
2446       ShouldNotReachHere();
2447     }
2448   } else if (src_bt == T_SHORT) {
2449     if (dst_bt == T_INT) {
2450       vsext_vf2(dst, src);
2451     } else {
2452       vsext_vf4(dst, src);
2453     }
2454   } else if (src_bt == T_INT) {
2455     vsext_vf2(dst, src);
2456   }
2457 }
2458 
// Narrow the vector elements from src to dst with the specified element sizes.
// The high part of the dst vector will be filled with zeros.
2461 void C2_MacroAssembler::integer_narrow_v(VectorRegister dst, BasicType dst_bt, int vector_length,
2462                                          VectorRegister src, BasicType src_bt) {
2463   assert(type2aelembytes(dst_bt) < type2aelembytes(src_bt) && type2aelembytes(dst_bt) <= 4 && type2aelembytes(src_bt) <= 8, "invalid element size");
2464   assert(dst_bt != T_FLOAT && dst_bt != T_DOUBLE && src_bt != T_FLOAT && src_bt != T_DOUBLE, "unsupported element type");
2465   mv(t0, vector_length);
2466   if (src_bt == T_LONG) {
2467     // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#117-vector-narrowing-integer-right-shift-instructions
2468     // Future extensions might add support for versions that narrow to a destination that is 1/4 the width of the source.
2469     // So we can currently only scale down by 1/2 the width at a time.
2470     vsetvli(t0, t0, Assembler::e32, Assembler::mf2);
2471     vncvt_x_x_w(dst, src);
2472     if (dst_bt == T_SHORT || dst_bt == T_BYTE) {
2473       vsetvli(t0, t0, Assembler::e16, Assembler::mf2);
2474       vncvt_x_x_w(dst, dst);
2475       if (dst_bt == T_BYTE) {
2476         vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
2477         vncvt_x_x_w(dst, dst);
2478       }
2479     }
2480   } else if (src_bt == T_INT) {
2481     // T_SHORT
2482     vsetvli(t0, t0, Assembler::e16, Assembler::mf2);
2483     vncvt_x_x_w(dst, src);
2484     if (dst_bt == T_BYTE) {
2485       vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
2486       vncvt_x_x_w(dst, dst);
2487     }
2488   } else if (src_bt == T_SHORT) {
2489     vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
2490     vncvt_x_x_w(dst, src);
2491   }
2492 }
2493 
2494 #define VFCVT_SAFE(VFLOATCVT)                                                      \
2495 void C2_MacroAssembler::VFLOATCVT##_safe(VectorRegister dst, VectorRegister src) { \
2496   assert_different_registers(dst, src);                                            \
2497   vxor_vv(dst, dst, dst);                                                          \
2498   vmfeq_vv(v0, src, src);                                                          \
2499   VFLOATCVT(dst, src, Assembler::v0_t);                                            \
2500 }
2501 
2502 VFCVT_SAFE(vfcvt_rtz_x_f_v);
2503 
2504 #undef VFCVT_SAFE
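
// The generated vfcvt_rtz_x_f_v_safe first zeroes dst, then converts only the
// lanes where src == src holds (the non-NaN lanes), so NaN inputs become 0,
// matching the Java semantics of casting NaN to an integral type.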
2505 
// Extract a scalar element from a vector at position 'idx'.
// The input elements in src are expected to be of integral type.
2508 void C2_MacroAssembler::extract_v(Register dst, VectorRegister src, BasicType bt,
2509                                   int idx, VectorRegister tmp) {
2510   assert(is_integral_type(bt), "unsupported element type");
2511   assert(idx >= 0, "idx cannot be negative");
2512   // Only need the first element after vector slidedown
2513   vsetvli_helper(bt, 1);
2514   if (idx == 0) {
2515     vmv_x_s(dst, src);
2516   } else if (idx <= 31) {
2517     vslidedown_vi(tmp, src, idx);
2518     vmv_x_s(dst, tmp);
2519   } else {
2520     mv(t0, idx);
2521     vslidedown_vx(tmp, src, t0);
2522     vmv_x_s(dst, tmp);
2523   }
2524 }
2525 
// Extract a scalar element from a vector at position 'idx'.
// The input elements in src are expected to be of floating point type.
2528 void C2_MacroAssembler::extract_fp_v(FloatRegister dst, VectorRegister src, BasicType bt,
2529                                      int idx, VectorRegister tmp) {
2530   assert(is_floating_point_type(bt), "unsupported element type");
2531   assert(idx >= 0, "idx cannot be negative");
2532   // Only need the first element after vector slidedown
2533   vsetvli_helper(bt, 1);
2534   if (idx == 0) {
2535     vfmv_f_s(dst, src);
2536   } else if (idx <= 31) {
2537     vslidedown_vi(tmp, src, idx);
2538     vfmv_f_s(dst, tmp);
2539   } else {
2540     mv(t0, idx);
2541     vslidedown_vx(tmp, src, t0);
2542     vfmv_f_s(dst, tmp);
2543   }
2544 }