/*
 * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2020, 2022, Huawei Technologies Co., Ltd. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/output.hpp"
#include "opto/subnode.hpp"
#include "runtime/stubRoutines.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

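// A summary of the markWord lock bits consulted below (not normative; see
// markWord.hpp for the authoritative layout). The two low-order bits of an
// object's markWord encode its locking state:
//   00 (locked_value)   - stack-locked: the markWord is a pointer to a
//                         BasicLock on the owning thread's stack
//   01 (unlocked_value) - unlocked: the markWord holds the header bits
//   10 (monitor_value)  - inflated: the markWord points to an ObjectMonitor
//   11 (marked_value)   - used by the GC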
void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg,
                                  Register tmp1Reg, Register tmp2Reg, Register tmp3Reg) {
  // Use the flag register (t1) to indicate the fast_lock result: zero for success; non-zero for failure.
  Register flag = t1;
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmp1Reg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;

  assert_different_registers(oop, box, tmp, disp_hdr, flag, tmp3Reg, t0);

  // Load markWord from object into displaced_header.
  ld(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(flag, oop);
    lwu(flag, Address(flag, Klass::access_flags_offset()));
    test_bit(flag, flag, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS));
    bnez(flag, cont, true /* is_far */);
  }

  // Check for existing monitor
  test_bit(t0, disp_hdr, exact_log2(markWord::monitor_value));
  bnez(t0, object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    mv(flag, 1); // Set non-zero flag to indicate 'failure' -> take slow-path
    j(cont);
  } else if (LockingMode == LM_LEGACY) {
    // Set tmp to be (markWord of object | UNLOCK_VALUE).
    ori(tmp, disp_hdr, markWord::unlocked_value);

    // Initialize the box. (Must happen before we update the object mark!)
    sd(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // Compare object markWord with an unlocked value (tmp) and if
    // equal exchange the stack address of our box with object markWord.
    // On failure disp_hdr contains the possibly locked markWord.
    cmpxchg(/*memory address*/oop, /*expected value*/tmp, /*new value*/box, Assembler::int64, Assembler::aq,
            Assembler::rl, /*result*/disp_hdr);
    mv(flag, zr);
    beq(disp_hdr, tmp, cont); // prepare zero flag and goto cont if we won the cas

    assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

    // Had the compare-and-exchange succeeded, we would have found an
    // unlocked object, locked it, and continued at label cont. Since we
    // did not see an unlocked object, try the fast recursive case.

    // Check if the owner is self by comparing the value in the
    // markWord of object (disp_hdr) with the stack pointer.
    sub(disp_hdr, disp_hdr, sp);
    mv(tmp, (intptr_t) (~(os::vm_page_size()-1) | (uintptr_t)markWord::lock_mask_in_place));
    // If (mark & lock_mask) == 0 and mark - sp < page_size, we are stack-locking and goto cont,
    // hence we can store 0 as the displaced header in the box, which indicates that it is a
    // recursive lock.
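    // Worked example of the check below (illustrative, assuming 4 KiB pages):
    // tmp == ~0xfff | 0x3 == 0xffff_ffff_ffff_f003. If the object is
    // stack-locked by this thread, the mark is the address of a BasicLock on
    // our stack, so (mark - sp) is small and word-aligned, e.g. 0x7f0, and
    // 0x7f0 & 0xffff_ffff_ffff_f003 == 0. Any other owner or lock state
    // leaves high or low bits set, producing a non-zero (failure) flag.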
    andr(tmp/*==0?*/, disp_hdr, tmp);
    sd(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
    mv(flag, tmp); // we can use the value of tmp as the result here
    j(cont);
  } else {
    assert(LockingMode == LM_LIGHTWEIGHT, "");
    Label slow;
    lightweight_lock(oop, disp_hdr, tmp, tmp3Reg, slow);

    // Indicate success on completion.
    mv(flag, zr);
    j(count);
    bind(slow);
    mv(flag, 1); // Set non-zero flag to indicate 'failure' -> take slow-path
    j(no_count);
  }

  // Handle existing monitor.
  bind(object_has_monitor);
  // The object's monitor m is unlocked iff m->owner == NULL,
  // otherwise m->owner may contain a thread or a stack address.
  //
  // Try to CAS m->owner from NULL to current thread.
  add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset()) - markWord::monitor_value));
  cmpxchg(/*memory address*/tmp, /*expected value*/zr, /*new value*/xthread, Assembler::int64, Assembler::aq,
          Assembler::rl, /*result*/flag); // cas succeeds if flag == zr(expected)

  if (LockingMode != LM_LIGHTWEIGHT) {
    // Store a non-null value into the box to avoid looking like a re-entrant
    // lock. The fast-path monitor unlock code checks for
    // markWord::monitor_value so use markWord::unused_mark which has the
    // relevant bit set, and also matches ObjectSynchronizer::slow_enter.
    mv(tmp, (address)markWord::unused_mark().value());
    sd(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
  }

  beqz(flag, cont); // CAS success means locking succeeded

  bne(flag, xthread, cont); // Check for recursive locking

  // Recursive lock case
  mv(flag, zr);
  increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1, t0, tmp);

  bind(cont);
  // zero flag indicates success
  // non-zero flag indicates failure
  bnez(flag, no_count);

  bind(count);
  increment(Address(xthread, JavaThread::held_monitor_count_offset()), 1, t0, tmp);

  bind(no_count);
}

void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg,
                                    Register tmp1Reg, Register tmp2Reg) {
  // Use the flag register (t1) to indicate the fast_unlock result: zero for success; non-zero for failure.
  Register flag = t1;
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmp1Reg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;

  assert_different_registers(oop, box, tmp, disp_hdr, flag, t0);

  if (LockingMode == LM_LEGACY) {
    // Find the lock address and load the displaced header from the stack.
    ld(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // If the displaced header is 0, we have a recursive unlock.
    mv(flag, disp_hdr);
    beqz(disp_hdr, cont);
  }

  // Handle existing monitor.
  ld(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
  test_bit(t0, tmp, exact_log2(markWord::monitor_value));
  bnez(t0, object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    mv(flag, 1); // Set non-zero flag to indicate 'failure' -> take slow path
    j(cont);
  } else if (LockingMode == LM_LEGACY) {
    // Check if it is still a lightweight lock, i.e. whether we still
    // see the stack address of the basicLock in the markWord of the
    // object.

    cmpxchg(/*memory address*/oop, /*expected value*/box, /*new value*/disp_hdr, Assembler::int64, Assembler::relaxed,
            Assembler::rl, /*result*/tmp);
    xorr(flag, box, tmp); // box == tmp if cas succeeds
    j(cont);
  } else {
    assert(LockingMode == LM_LIGHTWEIGHT, "");
    Label slow;
    lightweight_unlock(oop, tmp, box, disp_hdr, slow);

    // Indicate success on completion.
    mv(flag, zr);
    j(count);
    bind(slow);
    mv(flag, 1); // Set non-zero flag to indicate 'failure' -> take slow path
    j(no_count);
  }

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // Handle existing monitor.
  bind(object_has_monitor);
  STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
  add(tmp, tmp, -(int)markWord::monitor_value); // monitor

  if (LockingMode == LM_LIGHTWEIGHT) {
    // If the owner is anonymous, we need to fix it -- in an outline stub.
    Register tmp2 = disp_hdr;
    ld(tmp2, Address(tmp, ObjectMonitor::owner_offset()));
    test_bit(t0, tmp2, exact_log2(ObjectMonitor::ANONYMOUS_OWNER));
    C2HandleAnonOMOwnerStub* stub = new (Compile::current()->comp_arena()) C2HandleAnonOMOwnerStub(tmp, tmp2);
    Compile::current()->output()->add_stub(stub);
    bnez(t0, stub->entry(), /* is_far */ true);
    bind(stub->continuation());
  }

  ld(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));

  Label notRecursive;
  beqz(disp_hdr, notRecursive); // Will be 0 if not recursive.

  // Recursive lock
  addi(disp_hdr, disp_hdr, -1);
  sd(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
  mv(flag, zr);
  j(cont);

  bind(notRecursive);
  ld(flag, Address(tmp, ObjectMonitor::EntryList_offset()));
  ld(disp_hdr, Address(tmp, ObjectMonitor::cxq_offset()));
  orr(flag, flag, disp_hdr); // Will be 0 if both are 0.
  bnez(flag, cont);
  // need a release store here
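  // A note on the release store (summary, not a spec statement): the
  // LoadStore|StoreStore barrier orders every access made inside the critical
  // section before the owner-clearing store, so a thread that subsequently
  // acquires the monitor observes all writes made while it was held.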
  la(tmp, Address(tmp, ObjectMonitor::owner_offset()));
  membar(MacroAssembler::LoadStore | MacroAssembler::StoreStore);
  sd(zr, Address(tmp)); // set unowned

  bind(cont);
  // zero flag indicates success
  // non-zero flag indicates failure
  bnez(flag, no_count);

  bind(count);
  decrement(Address(xthread, JavaThread::held_monitor_count_offset()), 1, t0, tmp);

  bind(no_count);
}

// short string
// StringUTF16.indexOfChar
// StringLatin1.indexOfChar
void C2_MacroAssembler::string_indexof_char_short(Register str1, Register cnt1,
                                                  Register ch, Register result,
                                                  bool isL)
{
  Register ch1 = t0;
  Register index = t1;

  BLOCK_COMMENT("string_indexof_char_short {");

  Label LOOP, LOOP1, LOOP4, LOOP8;
  Label MATCH,  MATCH1, MATCH2, MATCH3,
        MATCH4, MATCH5, MATCH6, MATCH7, NOMATCH;

  mv(result, -1);
  mv(index, zr);

  bind(LOOP);
  addi(t0, index, 8);
  ble(t0, cnt1, LOOP8);
  addi(t0, index, 4);
  ble(t0, cnt1, LOOP4);
  j(LOOP1);

  bind(LOOP8);
  isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0));
  beq(ch, ch1, MATCH);
  isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2));
  beq(ch, ch1, MATCH1);
  isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4));
  beq(ch, ch1, MATCH2);
  isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6));
  beq(ch, ch1, MATCH3);
  isL ? lbu(ch1, Address(str1, 4)) : lhu(ch1, Address(str1, 8));
  beq(ch, ch1, MATCH4);
  isL ? lbu(ch1, Address(str1, 5)) : lhu(ch1, Address(str1, 10));
  beq(ch, ch1, MATCH5);
  isL ? lbu(ch1, Address(str1, 6)) : lhu(ch1, Address(str1, 12));
  beq(ch, ch1, MATCH6);
  isL ? lbu(ch1, Address(str1, 7)) : lhu(ch1, Address(str1, 14));
  beq(ch, ch1, MATCH7);
  addi(index, index, 8);
  addi(str1, str1, isL ? 8 : 16);
  blt(index, cnt1, LOOP);
  j(NOMATCH);

  bind(LOOP4);
  isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0));
  beq(ch, ch1, MATCH);
  isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2));
  beq(ch, ch1, MATCH1);
  isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4));
  beq(ch, ch1, MATCH2);
  isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6));
  beq(ch, ch1, MATCH3);
  addi(index, index, 4);
  addi(str1, str1, isL ? 4 : 8);
  bge(index, cnt1, NOMATCH);

  bind(LOOP1);
  isL ? lbu(ch1, Address(str1)) : lhu(ch1, Address(str1));
  beq(ch, ch1, MATCH);
  addi(index, index, 1);
  addi(str1, str1, isL ? 1 : 2);
  blt(index, cnt1, LOOP1);
  j(NOMATCH);

  bind(MATCH1);
  addi(index, index, 1);
  j(MATCH);

  bind(MATCH2);
  addi(index, index, 2);
  j(MATCH);

  bind(MATCH3);
  addi(index, index, 3);
  j(MATCH);

  bind(MATCH4);
  addi(index, index, 4);
  j(MATCH);

  bind(MATCH5);
  addi(index, index, 5);
  j(MATCH);

  bind(MATCH6);
  addi(index, index, 6);
  j(MATCH);

  bind(MATCH7);
  addi(index, index, 7);

  bind(MATCH);
  mv(result, index);
  bind(NOMATCH);
  BLOCK_COMMENT("} string_indexof_char_short");
}

// StringUTF16.indexOfChar
// StringLatin1.indexOfChar
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2,
                                            Register tmp3, Register tmp4,
                                            bool isL)
{
  Label CH1_LOOP, HIT, NOMATCH, DONE, DO_LONG;
  Register ch1 = t0;
  Register orig_cnt = t1;
  Register mask1 = tmp3;
  Register mask2 = tmp2;
  Register match_mask = tmp1;
  Register trailing_char = tmp4;
  Register unaligned_elems = tmp4;

  BLOCK_COMMENT("string_indexof_char {");
  beqz(cnt1, NOMATCH);

  addi(t0, cnt1, isL ? -32 : -16);
  bgtz(t0, DO_LONG);
  string_indexof_char_short(str1, cnt1, ch, result, isL);
  j(DONE);

  bind(DO_LONG);
  mv(orig_cnt, cnt1);
  if (AvoidUnalignedAccesses) {
    Label ALIGNED;
    andi(unaligned_elems, str1, 0x7);
    beqz(unaligned_elems, ALIGNED);
    sub(unaligned_elems, unaligned_elems, 8);
    neg(unaligned_elems, unaligned_elems);
    if (!isL) {
      srli(unaligned_elems, unaligned_elems, 1);
    }
    // do unaligned part per element
    string_indexof_char_short(str1, unaligned_elems, ch, result, isL);
    bgez(result, DONE);
    mv(orig_cnt, cnt1);
    sub(cnt1, cnt1, unaligned_elems);
    bind(ALIGNED);
  }
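  // Alignment prologue example (illustrative): for a UTF-16 string whose
  // address ends in 0x6, str1 & 0x7 == 6, so unaligned_elems becomes
  // (8 - 6) >> 1 == 1; one leading character is handled by the short routine
  // above, after which str1 is 8-byte aligned for the word loop below.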

  // duplicate ch
  if (isL) {
    slli(ch1, ch, 8);
    orr(ch, ch1, ch);
  }
  slli(ch1, ch, 16);
  orr(ch, ch1, ch);
  slli(ch1, ch, 32);
  orr(ch, ch1, ch);
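  // Broadcast example: for Latin1 ch == 0x61 ('a') the shift-or steps yield
  // 0x6161, 0x61616161, and finally 0x6161616161616161; for UTF-16
  // ch == 0x0061 the result is 0x0061006100610061.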

  if (!isL) {
    slli(cnt1, cnt1, 1);
  }

  uint64_t mask0101 = UCONST64(0x0101010101010101);
  uint64_t mask0001 = UCONST64(0x0001000100010001);
  mv(mask1, isL ? mask0101 : mask0001);
  uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f);
  uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff);
  mv(mask2, isL ? mask7f7f : mask7fff);

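  // The loop below relies on the classic SWAR zero-detection idiom: with
  // x = ch1 ^ broadcast(ch), a lane of x is zero exactly where a character
  // matches, and (x - mask1) & ~(x | mask2) sets the top bit of each such
  // lane (mask1 = 0x01../0x0001.. per lane, mask2 = 0x7f../0x7fff.. per
  // lane). This is a sketch of what compute_match_mask computes; see its
  // definition for the exact instruction sequence.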
  bind(CH1_LOOP);
  ld(ch1, Address(str1));
  addi(str1, str1, 8);
  addi(cnt1, cnt1, -8);
  compute_match_mask(ch1, ch, match_mask, mask1, mask2);
  bnez(match_mask, HIT);
  bgtz(cnt1, CH1_LOOP);
  j(NOMATCH);

  bind(HIT);
  ctzc_bit(trailing_char, match_mask, isL, ch1, result);
  srli(trailing_char, trailing_char, 3);
  addi(cnt1, cnt1, 8);
  ble(cnt1, trailing_char, NOMATCH);
  // match case
  if (!isL) {
    srli(cnt1, cnt1, 1);
    srli(trailing_char, trailing_char, 1);
  }

  sub(result, orig_cnt, cnt1);
  add(result, result, trailing_char);
  j(DONE);

  bind(NOMATCH);
  mv(result, -1);

  bind(DONE);
  BLOCK_COMMENT("} string_indexof_char");
}

typedef void (MacroAssembler::* load_chr_insn)(Register rd, const Address &adr, Register temp);

// Search for needle in haystack and return index or -1
// x10: result
// x11: haystack
// x12: haystack_len
// x13: needle
// x14: needle_len
void C2_MacroAssembler::string_indexof(Register haystack, Register needle,
                                       Register haystack_len, Register needle_len,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       Register result, int ae)
{
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  Label LINEARSEARCH, LINEARSTUB, DONE, NOMATCH;

  Register ch1 = t0;
  Register ch2 = t1;
  Register nlen_tmp = tmp1; // needle len tmp
  Register hlen_tmp = tmp2; // haystack len tmp
  Register result_tmp = tmp4;

  bool isLL = ae == StrIntrinsicNode::LL;

  bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int needle_chr_shift = needle_isL ? 0 : 1;
  int haystack_chr_shift = haystack_isL ? 0 : 1;
  int needle_chr_size = needle_isL ? 1 : 2;
  int haystack_chr_size = haystack_isL ? 1 : 2;
  load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu :
                              (load_chr_insn)&MacroAssembler::lhu;
  load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu :
                                (load_chr_insn)&MacroAssembler::lhu;

  BLOCK_COMMENT("string_indexof {");

  // Note, inline_string_indexOf() generates checks:
  // if (pattern.count > src.count) return -1;
  // if (pattern.count == 0) return 0;

  // We have two strings, a source string in haystack, haystack_len and a pattern string
  // in needle, needle_len. Find the first occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  // needle_len >= 8 && needle_len < 256 && needle_len < haystack_len/4, use bmh algorithm.
  sub(result_tmp, haystack_len, needle_len);
  // needle_len < 8, use linear scan
  sub(t0, needle_len, 8);
  bltz(t0, LINEARSEARCH);
  // needle_len >= 256, use linear scan
  sub(t0, needle_len, 256);
  bgez(t0, LINEARSTUB);
  // needle_len >= haystack_len/4, use linear scan
  srli(t0, haystack_len, 2);
  bge(needle_len, t0, LINEARSTUB);

  // Boyer-Moore-Horspool introduction:
  // The Boyer-Moore algorithm is based on the description here:
  //
  // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  //
  // This describes an algorithm with 2 shift rules: the 'Bad Character' rule
  // and the 'Good Suffix' rule.
  //
  // These rules are essentially heuristics for how far we can shift the
  // pattern along the search string.
  //
  // The implementation here uses the 'Bad Character' rule only because of the
  // complexity of initialisation for the 'Good Suffix' rule.
  //
  // This is also known as the Boyer-Moore-Horspool algorithm:
  //
  // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
  //
  // #define ASIZE 256
  //
  //    int bm(unsigned char *pattern, int m, unsigned char *src, int n) {
  //      int i, j;
  //      unsigned c;
  //      unsigned char bc[ASIZE];
  //
  //      /* Preprocessing */
  //      for (i = 0; i < ASIZE; ++i)
  //        bc[i] = m;
  //      for (i = 0; i < m - 1; ) {
  //        c = pattern[i];
  //        ++i;
  //        // c < 256 for Latin1 string, so, no need for branch
  //        #ifdef PATTERN_STRING_IS_LATIN1
  //        bc[c] = m - i;
  //        #else
  //        if (c < ASIZE) bc[c] = m - i;
  //        #endif
  //      }
  //
  //      /* Searching */
  //      j = 0;
  //      while (j <= n - m) {
  //        c = src[j+m-1];
  //        if (pattern[m-1] == c) {
  //          int k;
  //          for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k);
  //          if (k < 0) return j;
  //        }
  //        // c < 256 for Latin1 string, so, no need for branch
  //        #ifdef SOURCE_STRING_IS_LATIN1_AND_PATTERN_STRING_IS_LATIN1
  //        // LL case: (c < 256) always true. Remove branch
  //        j += bc[pattern[j+m-1]];
  //        #endif
  //        #ifdef SOURCE_STRING_IS_UTF_AND_PATTERN_STRING_IS_UTF
  //        // UU case: need if (c < ASIZE) check. Skip 1 character if not.
  //        if (c < ASIZE)
  //          j += bc[pattern[j+m-1]];
  //        else
  //          j += 1;
  //        #endif
  //        #ifdef SOURCE_IS_UTF_AND_PATTERN_IS_LATIN1
  //        // UL case: need if (c < ASIZE) check. Skip <pattern length> if not.
  //        if (c < ASIZE)
  //          j += bc[pattern[j+m-1]];
  //        else
  //          j += m;
  //        #endif
  //      }
  //      return -1;
  //    }
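  // Bad-character table example (illustrative): for pattern "abcab" (m = 5),
  // the preprocessing loop above yields bc['a'] = 1, bc['b'] = 3, bc['c'] = 2,
  // and bc[c] = 5 for every other character - the distance the pattern may
  // slide when that character is the last one examined in the window.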

  // temp registers: t0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, result
  Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;

  Register haystack_end = haystack_len;
  Register skipch = tmp2;

  // pattern length is >= 8, so we can read at least 1 register for cases when
  // UTF->Latin1 conversion is not needed (8 LL or 4 UU symbols) and half a register
  // for the UL case. We'll re-read the last character in the inner pre-loop code to
  // have a single outer pre-loop load
  const int firstStep = isLL ? 7 : 3;

  const int ASIZE = 256;
  const int STORE_BYTES = 8; // 8 bytes stored per instruction (sd)

  sub(sp, sp, ASIZE);

  // init BC offset table with default value: needle_len
  slli(t0, needle_len, 8);
  orr(t0, t0, needle_len); // [63...16][needle_len][needle_len]
  slli(tmp1, t0, 16);
  orr(t0, tmp1, t0); // [63...32][needle_len][needle_len][needle_len][needle_len]
  slli(tmp1, t0, 32);
  orr(tmp5, tmp1, t0); // tmp5: 8 elements [needle_len]
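  // e.g. needle_len == 9 gives tmp5 == 0x0909090909090909, so each sd in the
  // init loop below writes eight default table entries at once (needle_len
  // always fits in a byte here because this path requires needle_len < 256).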

  mv(ch1, sp);  // ch1 is t0
  mv(tmp6, ASIZE / STORE_BYTES); // loop iterations

  bind(BM_INIT_LOOP);
  // for (i = 0; i < ASIZE; ++i)
  //   bc[i] = m;
  for (int i = 0; i < 4; i++) {
    sd(tmp5, Address(ch1, i * wordSize));
  }
  add(ch1, ch1, 32);
  sub(tmp6, tmp6, 4);
  bgtz(tmp6, BM_INIT_LOOP);

  sub(nlen_tmp, needle_len, 1); // m - 1, index of the last element in pattern
  Register orig_haystack = tmp5;
  mv(orig_haystack, haystack);
  // result_tmp = tmp4
  shadd(haystack_end, result_tmp, haystack, haystack_end, haystack_chr_shift);
  sub(ch2, needle_len, 1); // bc offset init value, ch2 is t1
  mv(tmp3, needle);

  //  for (i = 0; i < m - 1; ) {
  //    c = pattern[i];
  //    ++i;
  //    // c < 256 for Latin1 string, so, no need for branch
  //    #ifdef PATTERN_STRING_IS_LATIN1
  //    bc[c] = m - i;
  //    #else
  //    if (c < ASIZE) bc[c] = m - i;
  //    #endif
  //  }
  bind(BCLOOP);
  (this->*needle_load_1chr)(ch1, Address(tmp3), noreg);
  add(tmp3, tmp3, needle_chr_size);
  if (!needle_isL) {
    // ae == StrIntrinsicNode::UU
    mv(tmp6, ASIZE);
    bgeu(ch1, tmp6, BCSKIP);
  }
  add(tmp4, sp, ch1);
  sb(ch2, Address(tmp4)); // store skip offset to BC offset table

  bind(BCSKIP);
  sub(ch2, ch2, 1); // for next pattern element, skip distance -1
  bgtz(ch2, BCLOOP);

  // tmp6: pattern end, address after needle
  shadd(tmp6, needle_len, needle, tmp6, needle_chr_shift);
  if (needle_isL == haystack_isL) {
    // load last 8 bytes (8LL/4UU symbols)
    ld(tmp6, Address(tmp6, -wordSize));
  } else {
    // UL: from UTF-16(source) search Latin1(pattern)
    lwu(tmp6, Address(tmp6, -wordSize / 2)); // load last 4 bytes (4 symbols)
    // convert Latin1 to UTF. eg: 0x0000abcd -> 0x0a0b0c0d
    // We'll have to wait until the load completes, but it's still faster than per-character loads+checks
    srli(tmp3, tmp6, BitsPerByte * (wordSize / 2 - needle_chr_size)); // pattern[m-1], eg:0x0000000a
    slli(ch2, tmp6, XLEN - 24);
    srli(ch2, ch2, XLEN - 8); // pattern[m-2], 0x0000000b
    slli(ch1, tmp6, XLEN - 16);
    srli(ch1, ch1, XLEN - 8); // pattern[m-3], 0x0000000c
    andi(tmp6, tmp6, 0xff); // pattern[m-4], 0x0000000d
    slli(ch2, ch2, 16);
    orr(ch2, ch2, ch1); // 0x00000b0c
    slli(result, tmp3, 48); // use result as temp register
    orr(tmp6, tmp6, result); // 0x0a00000d
    slli(result, ch2, 16);
    orr(tmp6, tmp6, result); // UTF-16:0x0a0b0c0d
  }

  // i = m - 1;
  // skipch = j + i;
  // if (skipch == pattern[m - 1])
  //   for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k);
  // else
  //   move j with bad char offset table
  bind(BMLOOPSTR2);
  // compare pattern to source string backward
  shadd(result, nlen_tmp, haystack, result, haystack_chr_shift);
  (this->*haystack_load_1chr)(skipch, Address(result), noreg);
  sub(nlen_tmp, nlen_tmp, firstStep); // nlen_tmp is positive here, because needle_len >= 8
  if (needle_isL == haystack_isL) {
    // re-init tmp3. It's free because it executes in parallel with the
    // load above. The alternative is to initialize it before the loop, but that
    // would affect performance on in-order systems with 2 or more ld/st pipelines
    srli(tmp3, tmp6, BitsPerByte * (wordSize - needle_chr_size)); // UU/LL: pattern[m-1]
  }
  if (!isLL) { // UU/UL case
    slli(ch2, nlen_tmp, 1); // offsets in bytes
  }
  bne(tmp3, skipch, BMSKIP); // if not equal, skipch is bad char
  add(result, haystack, isLL ? nlen_tmp : ch2);
  // load 8 bytes from source string
  // if isLL is false then read granularity can be 2
  load_long_misaligned(ch2, Address(result), ch1, isLL ? 1 : 2); // can use ch1 as temp register here as it will be trashed by next mv anyway
  mv(ch1, tmp6);
  if (isLL) {
    j(BMLOOPSTR1_AFTER_LOAD);
  } else {
    sub(nlen_tmp, nlen_tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
    j(BMLOOPSTR1_CMP);
  }

  bind(BMLOOPSTR1);
  shadd(ch1, nlen_tmp, needle, ch1, needle_chr_shift);
  (this->*needle_load_1chr)(ch1, Address(ch1), noreg);
  shadd(ch2, nlen_tmp, haystack, ch2, haystack_chr_shift);
  (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);

  bind(BMLOOPSTR1_AFTER_LOAD);
  sub(nlen_tmp, nlen_tmp, 1);
  bltz(nlen_tmp, BMLOOPSTR1_LASTCMP);

  bind(BMLOOPSTR1_CMP);
  beq(ch1, ch2, BMLOOPSTR1);

  bind(BMSKIP);
  if (!isLL) {
    // if we've met a UTF symbol while searching the Latin1 pattern, then we can
    // skip needle_len symbols
    if (needle_isL != haystack_isL) {
      mv(result_tmp, needle_len);
    } else {
      mv(result_tmp, 1);
    }
    mv(t0, ASIZE);
    bgeu(skipch, t0, BMADV);
  }
  add(result_tmp, sp, skipch);
  lbu(result_tmp, Address(result_tmp)); // load skip offset

  bind(BMADV);
  sub(nlen_tmp, needle_len, 1);
  // move haystack after bad char skip offset
  shadd(haystack, result_tmp, haystack, result, haystack_chr_shift);
  ble(haystack, haystack_end, BMLOOPSTR2);
  add(sp, sp, ASIZE);
  j(NOMATCH);

  bind(BMLOOPSTR1_LASTCMP);
  bne(ch1, ch2, BMSKIP);

  bind(BMMATCH);
  sub(result, haystack, orig_haystack);
  if (!haystack_isL) {
    srli(result, result, 1);
  }
  add(sp, sp, ASIZE);
  j(DONE);

  bind(LINEARSTUB);
  sub(t0, needle_len, 16); // small patterns should still be handled by the simple algorithm
  bltz(t0, LINEARSEARCH);
  mv(result, zr);
  RuntimeAddress stub = nullptr;
  if (isLL) {
    stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ll());
    assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
  } else if (needle_isL) {
    stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ul());
    assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
  } else {
    stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_uu());
    assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
  }
  address call = trampoline_call(stub);
  if (call == nullptr) {
    DEBUG_ONLY(reset_labels(LINEARSEARCH, DONE, NOMATCH));
    ciEnv::current()->record_failure("CodeCache is full");
    return;
  }
  j(DONE);

  bind(NOMATCH);
  mv(result, -1);
  j(DONE);

  bind(LINEARSEARCH);
  string_indexof_linearscan(haystack, needle, haystack_len, needle_len, tmp1, tmp2, tmp3, tmp4, -1, result, ae);

  bind(DONE);
  BLOCK_COMMENT("} string_indexof");
}

// string_indexof
// result: x10
// src: x11
// src_count: x12
// pattern: x13
// pattern_count: x14 or 1/2/3/4
void C2_MacroAssembler::string_indexof_linearscan(Register haystack, Register needle,
                                                  Register haystack_len, Register needle_len,
                                                  Register tmp1, Register tmp2,
                                                  Register tmp3, Register tmp4,
                                                  int needle_con_cnt, Register result, int ae)
{
  // Note:
  // needle_con_cnt > 0 means the needle_len register is invalid and the needle length is constant.
  // For UU/LL: needle_con_cnt is in [1, 4]; for UL: needle_con_cnt = 1.
  assert(needle_con_cnt <= 4, "Invalid needle constant count");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  Register ch1 = t0;
  Register ch2 = t1;
  Register hlen_neg = haystack_len, nlen_neg = needle_len;
  Register nlen_tmp = tmp1, hlen_tmp = tmp2, result_tmp = tmp4;

  bool isLL = ae == StrIntrinsicNode::LL;

  bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int needle_chr_shift = needle_isL ? 0 : 1;
  int haystack_chr_shift = haystack_isL ? 0 : 1;
  int needle_chr_size = needle_isL ? 1 : 2;
  int haystack_chr_size = haystack_isL ? 1 : 2;

  load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu :
                              (load_chr_insn)&MacroAssembler::lhu;
  load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu :
                                (load_chr_insn)&MacroAssembler::lhu;
  load_chr_insn load_2chr = isLL ? (load_chr_insn)&MacroAssembler::lhu : (load_chr_insn)&MacroAssembler::lwu;
  load_chr_insn load_4chr = isLL ? (load_chr_insn)&MacroAssembler::lwu : (load_chr_insn)&MacroAssembler::ld;

  Label DO1, DO2, DO3, MATCH, NOMATCH, DONE;

  Register first = tmp3;

  if (needle_con_cnt == -1) {
    Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

    sub(t0, needle_len, needle_isL == haystack_isL ? 4 : 2);
    bltz(t0, DOSHORT);

    (this->*needle_load_1chr)(first, Address(needle), noreg);
    slli(t0, needle_len, needle_chr_shift);
    add(needle, needle, t0);
    neg(nlen_neg, t0);
    slli(t0, result_tmp, haystack_chr_shift);
    add(haystack, haystack, t0);
    neg(hlen_neg, t0);

    bind(FIRST_LOOP);
    add(t0, haystack, hlen_neg);
    (this->*haystack_load_1chr)(ch2, Address(t0), noreg);
    beq(first, ch2, STR1_LOOP);

    bind(STR2_NEXT);
    add(hlen_neg, hlen_neg, haystack_chr_size);
    blez(hlen_neg, FIRST_LOOP);
    j(NOMATCH);

    bind(STR1_LOOP);
    add(nlen_tmp, nlen_neg, needle_chr_size);
    add(hlen_tmp, hlen_neg, haystack_chr_size);
    bgez(nlen_tmp, MATCH);

    bind(STR1_NEXT);
    add(ch1, needle, nlen_tmp);
    (this->*needle_load_1chr)(ch1, Address(ch1), noreg);
    add(ch2, haystack, hlen_tmp);
    (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
    bne(ch1, ch2, STR2_NEXT);
    add(nlen_tmp, nlen_tmp, needle_chr_size);
    add(hlen_tmp, hlen_tmp, haystack_chr_size);
    bltz(nlen_tmp, STR1_NEXT);
    j(MATCH);

    bind(DOSHORT);
    if (needle_isL == haystack_isL) {
      sub(t0, needle_len, 2);
      bltz(t0, DO1);
      bgtz(t0, DO3);
    }
  }

  if (needle_con_cnt == 4) {
    Label CH1_LOOP;
    (this->*load_4chr)(ch1, Address(needle), noreg);
    sub(result_tmp, haystack_len, 4);
    slli(tmp3, result_tmp, haystack_chr_shift); // result as tmp
    add(haystack, haystack, tmp3);
    neg(hlen_neg, tmp3);
    if (AvoidUnalignedAccesses) {
      // preload the first value, then read 1 character per iteration instead of four,
      // shifting the previous ch2 right by the character size in bits
      add(tmp3, haystack, hlen_neg);
      (this->*load_4chr)(ch2, Address(tmp3), noreg);
      if (isLL) {
        // need to erase 1 most significant byte in 32-bit value of ch2
        slli(ch2, ch2, 40);
        srli(ch2, ch2, 32);
      } else {
        slli(ch2, ch2, 16); // 2 most significant bytes will be erased by this operation
      }
    }

    bind(CH1_LOOP);
    add(tmp3, haystack, hlen_neg);
    if (AvoidUnalignedAccesses) {
      srli(ch2, ch2, isLL ? 8 : 16);
      (this->*haystack_load_1chr)(tmp3, Address(tmp3, isLL ? 3 : 6), noreg);
      slli(tmp3, tmp3, isLL ? 24 : 48);
      add(ch2, ch2, tmp3);
    } else {
      (this->*load_4chr)(ch2, Address(tmp3), noreg);
    }
    beq(ch1, ch2, MATCH);
    add(hlen_neg, hlen_neg, haystack_chr_size);
    blez(hlen_neg, CH1_LOOP);
    j(NOMATCH);
  }

  if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 2) {
    Label CH1_LOOP;
    BLOCK_COMMENT("string_indexof DO2 {");
    bind(DO2);
    (this->*load_2chr)(ch1, Address(needle), noreg);
    if (needle_con_cnt == 2) {
      sub(result_tmp, haystack_len, 2);
    }
    slli(tmp3, result_tmp, haystack_chr_shift);
    add(haystack, haystack, tmp3);
    neg(hlen_neg, tmp3);
    if (AvoidUnalignedAccesses) {
      // preload the first value, then read 1 character per iteration instead of two,
      // shifting the previous ch2 right by the character size in bits
      add(tmp3, haystack, hlen_neg);
      (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg);
      slli(ch2, ch2, isLL ? 8 : 16);
    }
    bind(CH1_LOOP);
    add(tmp3, haystack, hlen_neg);
    if (AvoidUnalignedAccesses) {
      srli(ch2, ch2, isLL ? 8 : 16);
      (this->*haystack_load_1chr)(tmp3, Address(tmp3, isLL ? 1 : 2), noreg);
      slli(tmp3, tmp3, isLL ? 8 : 16);
      add(ch2, ch2, tmp3);
    } else {
      (this->*load_2chr)(ch2, Address(tmp3), noreg);
    }
    beq(ch1, ch2, MATCH);
    add(hlen_neg, hlen_neg, haystack_chr_size);
    blez(hlen_neg, CH1_LOOP);
    j(NOMATCH);
    BLOCK_COMMENT("} string_indexof DO2");
  }

  if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 3) {
    Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
    BLOCK_COMMENT("string_indexof DO3 {");

    bind(DO3);
    (this->*load_2chr)(first, Address(needle), noreg);
    (this->*needle_load_1chr)(ch1, Address(needle, 2 * needle_chr_size), noreg);
    if (needle_con_cnt == 3) {
      sub(result_tmp, haystack_len, 3);
    }
    slli(hlen_tmp, result_tmp, haystack_chr_shift);
    add(haystack, haystack, hlen_tmp);
    neg(hlen_neg, hlen_tmp);

    bind(FIRST_LOOP);
    add(ch2, haystack, hlen_neg);
    if (AvoidUnalignedAccesses) {
      (this->*haystack_load_1chr)(tmp2, Address(ch2, isLL ? 1 : 2), noreg); // we need a temp register; hlen_tmp is safe to use here as it is a synonym for tmp2
      (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
      slli(tmp2, tmp2, isLL ? 8 : 16);
      add(ch2, ch2, tmp2);
    } else {
      (this->*load_2chr)(ch2, Address(ch2), noreg);
    }
    beq(first, ch2, STR1_LOOP);

    bind(STR2_NEXT);
    add(hlen_neg, hlen_neg, haystack_chr_size);
    blez(hlen_neg, FIRST_LOOP);
    j(NOMATCH);

    bind(STR1_LOOP);
    add(hlen_tmp, hlen_neg, 2 * haystack_chr_size);
    add(ch2, haystack, hlen_tmp);
    (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
    bne(ch1, ch2, STR2_NEXT);
    j(MATCH);
    BLOCK_COMMENT("} string_indexof DO3");
  }

  if (needle_con_cnt == -1 || needle_con_cnt == 1) {
    Label DO1_LOOP;

    BLOCK_COMMENT("string_indexof DO1 {");
    bind(DO1);
    (this->*needle_load_1chr)(ch1, Address(needle), noreg);
    sub(result_tmp, haystack_len, 1);
    slli(tmp3, result_tmp, haystack_chr_shift);
    add(haystack, haystack, tmp3);
    neg(hlen_neg, tmp3);

    bind(DO1_LOOP);
    add(tmp3, haystack, hlen_neg);
    (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg);
    beq(ch1, ch2, MATCH);
    add(hlen_neg, hlen_neg, haystack_chr_size);
    blez(hlen_neg, DO1_LOOP);
    BLOCK_COMMENT("} string_indexof DO1");
  }

  bind(NOMATCH);
  mv(result, -1);
  j(DONE);

  bind(MATCH);
  srai(t0, hlen_neg, haystack_chr_shift);
  add(result, result_tmp, t0);

  bind(DONE);
}

// Compare strings.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
                                       Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
                                       Register tmp3, int ae)
{
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK, L;

  const int STUB_THRESHOLD = 64 + 8;
  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  // for L strings, 1 byte for 1 character
  // for U strings, 2 bytes for 1 character
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize / 2;

  load_chr_insn str1_load_chr = str1_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu;
  load_chr_insn str2_load_chr = str2_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; however, the result is always in characters.
  if (!str1_isL) {
    sraiw(cnt1, cnt1, 1);
  }
  if (!str2_isL) {
    sraiw(cnt2, cnt2, 1);
  }

  // Compute the minimum of the string lengths and save the difference in result.
  sub(result, cnt1, cnt2);
  bgt(cnt1, cnt2, L);
  mv(cnt2, cnt1);
  bind(L);

  // A very short string
  mv(t0, minCharsInWord);
  ble(cnt2, t0, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      // check if str1 and str2 are the same pointer
      beq(str1, str2, DONE);
      // load 8 bytes once to compare
      ld(tmp1, Address(str1));
      ld(tmp2, Address(str2));
      mv(t0, STUB_THRESHOLD);
      bge(cnt2, t0, STUB);
      sub(cnt2, cnt2, minCharsInWord);
      beqz(cnt2, TAIL_CHECK);
      // convert cnt2 from characters to bytes
      if (!str1_isL) {
        slli(cnt2, cnt2, 1);
      }
      add(str2, str2, cnt2);
      add(str1, str1, cnt2);
      sub(cnt2, zr, cnt2);
    } else if (isLU) { // LU case
      lwu(tmp1, Address(str1));
      ld(tmp2, Address(str2));
      mv(t0, STUB_THRESHOLD);
      bge(cnt2, t0, STUB);
      addi(cnt2, cnt2, -4);
      add(str1, str1, cnt2);
      sub(cnt1, zr, cnt2);
      slli(cnt2, cnt2, 1);
      add(str2, str2, cnt2);
      inflate_lo32(tmp3, tmp1);
      mv(tmp1, tmp3);
      sub(cnt2, zr, cnt2);
      addi(cnt1, cnt1, 4);
    } else { // UL case
      ld(tmp1, Address(str1));
      lwu(tmp2, Address(str2));
      mv(t0, STUB_THRESHOLD);
      bge(cnt2, t0, STUB);
      addi(cnt2, cnt2, -4);
      slli(t0, cnt2, 1);
      sub(cnt1, zr, t0);
      add(str1, str1, t0);
      add(str2, str2, cnt2);
      inflate_lo32(tmp3, tmp2);
      mv(tmp2, tmp3);
      sub(cnt2, zr, cnt2);
      addi(cnt1, cnt1, 8);
    }
    addi(cnt2, cnt2, isUL ? 4 : 8);
    bne(tmp1, tmp2, DIFFERENCE);
    bgez(cnt2, TAIL);

    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) { // LL or UU
      add(t0, str1, cnt2);
      ld(tmp1, Address(t0));
      add(t0, str2, cnt2);
      ld(tmp2, Address(t0));
      addi(cnt2, cnt2, 8);
    } else if (isLU) { // LU case
      add(t0, str1, cnt1);
      lwu(tmp1, Address(t0));
      add(t0, str2, cnt2);
      ld(tmp2, Address(t0));
      addi(cnt1, cnt1, 4);
      inflate_lo32(tmp3, tmp1);
      mv(tmp1, tmp3);
      addi(cnt2, cnt2, 8);
    } else { // UL case
      add(t0, str2, cnt2);
      lwu(tmp2, Address(t0));
      add(t0, str1, cnt1);
      ld(tmp1, Address(t0));
      inflate_lo32(tmp3, tmp2);
      mv(tmp2, tmp3);
      addi(cnt1, cnt1, 8);
      addi(cnt2, cnt2, 4);
    }
    bne(tmp1, tmp2, DIFFERENCE);
    bltz(cnt2, NEXT_WORD);
    bind(TAIL);
    if (str1_isL == str2_isL) { // LL or UU
      load_long_misaligned(tmp1, Address(str1), tmp3, isLL ? 1 : 2);
      load_long_misaligned(tmp2, Address(str2), tmp3, isLL ? 1 : 2);
    } else if (isLU) { // LU case
      load_int_misaligned(tmp1, Address(str1), tmp3, false);
      load_long_misaligned(tmp2, Address(str2), tmp3, 2);
      inflate_lo32(tmp3, tmp1);
      mv(tmp1, tmp3);
    } else { // UL case
      load_int_misaligned(tmp2, Address(str2), tmp3, false);
      load_long_misaligned(tmp1, Address(str1), tmp3, 2);
      inflate_lo32(tmp3, tmp2);
      mv(tmp2, tmp3);
    }
    bind(TAIL_CHECK);
    beq(tmp1, tmp2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
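    // Example (illustrative, LL case): tmp1 = 0x...434241 and
    // tmp2 = 0x...454241 (bytes 'A','B','C' vs 'A','B','E') differ in byte 2;
    // the xor below leaves bits only in that byte, ctzc_bit rounds the
    // trailing-zero count down to an element boundary (16 here), and after
    // shifting and masking the result is 'C' - 'E' = -2.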
    bind(DIFFERENCE);
    xorr(tmp3, tmp1, tmp2);
    ctzc_bit(result, tmp3, isLL); // count zero from lsb to msb
    srl(tmp1, tmp1, result);
    srl(tmp2, tmp2, result);
    if (isLL) {
      andi(tmp1, tmp1, 0xFF);
      andi(tmp2, tmp2, 0xFF);
    } else {
      andi(tmp1, tmp1, 0xFFFF);
      andi(tmp2, tmp2, 0xFFFF);
    }
    sub(result, tmp1, tmp2);
    j(DONE);
  }

  bind(STUB);
  RuntimeAddress stub = nullptr;
  switch (ae) {
    case StrIntrinsicNode::LL:
      stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LL());
      break;
    case StrIntrinsicNode::UU:
      stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UU());
      break;
    case StrIntrinsicNode::LU:
      stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LU());
      break;
    case StrIntrinsicNode::UL:
      stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UL());
      break;
    default:
      ShouldNotReachHere();
  }
  assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
  address call = trampoline_call(stub);
  if (call == nullptr) {
    DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
    ciEnv::current()->record_failure("CodeCache is full");
    return;
  }
  j(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  beqz(cnt2, DONE);
  // arrange code to do most branches while loading, and to load the next
  // characters while comparing the previous ones
  (this->*str1_load_chr)(tmp1, Address(str1), t0);
  addi(str1, str1, str1_chr_size);
  addi(cnt2, cnt2, -1);
  beqz(cnt2, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(str2), t0);
  addi(str2, str2, str2_chr_size);
  j(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  addi(cnt2, cnt2, -1);
  beqz(cnt2, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(str1), t0);
  addi(str1, str1, str1_chr_size);
  (this->*str2_load_chr)(t0, Address(str2), t0);
  addi(str2, str2, str2_chr_size);
  bne(tmp1, cnt1, SHORT_LOOP_TAIL);
  addi(cnt2, cnt2, -1);
  beqz(cnt2, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(str1), t0);
  addi(str1, str1, str1_chr_size);
  (this->*str2_load_chr)(cnt1, Address(str2), t0);
  addi(str2, str2, str2_chr_size);
  beq(tmp2, t0, SHORT_LOOP);
  sub(result, tmp2, t0);
  j(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  j(DONE);
  bind(SHORT_LAST2);
  beq(tmp2, t0, DONE);
  sub(result, tmp2, t0);

  j(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(str2), t0);
  addi(str2, str2, str2_chr_size);
  bind(SHORT_LAST);
  beq(tmp1, cnt1, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}

void C2_MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
                                      Register tmp4, Register tmp5, Register tmp6, Register result,
                                      Register cnt1, int elem_size) {
  Label DONE, SAME, NEXT_DWORD, SHORT, TAIL, TAIL2, IS_TMP5_ZR;
  Register tmp1 = t0;
  Register tmp2 = t1;
  Register cnt2 = tmp2;  // cnt2 only used in array length compare
  Register elem_per_word = tmp6;
  int log_elem_size = exact_log2(elem_size);
  int length_offset = arrayOopDesc::length_offset_in_bytes();
  int base_offset   = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);

  assert(elem_size == 1 || elem_size == 2, "must be char or byte");
  assert_different_registers(a1, a2, result, cnt1, t0, t1, tmp3, tmp4, tmp5, tmp6);
  mv(elem_per_word, wordSize / elem_size);

  BLOCK_COMMENT("arrays_equals {");

  // if (a1 == a2), return true
  beq(a1, a2, SAME);

  mv(result, false);
  beqz(a1, DONE);
  beqz(a2, DONE);
  lwu(cnt1, Address(a1, length_offset));
  lwu(cnt2, Address(a2, length_offset));
  bne(cnt2, cnt1, DONE);
  beqz(cnt1, SAME);

  slli(tmp5, cnt1, 3 + log_elem_size);
  sub(tmp5, zr, tmp5);
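  // tmp5 now holds -(array length in bits). A note on the trick used at TAIL
  // and SHORT below: the RV64 sll instruction uses only the low 6 bits of its
  // shift amount, so shifting the tail word left by tmp5 shifts it by
  // (64 - length_in_bits % 64), discarding the garbage bytes read past the
  // end of the data before the final comparison.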
  add(a1, a1, base_offset);
  add(a2, a2, base_offset);
  ld(tmp3, Address(a1, 0));
  ld(tmp4, Address(a2, 0));
  ble(cnt1, elem_per_word, SHORT); // short or same

  // Main 16 byte comparison loop with 2 exits
  bind(NEXT_DWORD); {
    ld(tmp1, Address(a1, wordSize));
    ld(tmp2, Address(a2, wordSize));
    sub(cnt1, cnt1, 2 * wordSize / elem_size);
    blez(cnt1, TAIL);
    bne(tmp3, tmp4, DONE);
    ld(tmp3, Address(a1, 2 * wordSize));
    ld(tmp4, Address(a2, 2 * wordSize));
    add(a1, a1, 2 * wordSize);
    add(a2, a2, 2 * wordSize);
    ble(cnt1, elem_per_word, TAIL2);
  } beq(tmp1, tmp2, NEXT_DWORD);
  j(DONE);

  bind(TAIL);
  xorr(tmp4, tmp3, tmp4);
  xorr(tmp2, tmp1, tmp2);
  sll(tmp2, tmp2, tmp5);
  orr(tmp5, tmp4, tmp2);
  j(IS_TMP5_ZR);

  bind(TAIL2);
  bne(tmp1, tmp2, DONE);

  bind(SHORT);
  xorr(tmp4, tmp3, tmp4);
  sll(tmp5, tmp4, tmp5);

  bind(IS_TMP5_ZR);
  bnez(tmp5, DONE);

  bind(SAME);
  mv(result, true);
  // That's it.
  bind(DONE);

  BLOCK_COMMENT("} arrays_equals");
}

// Compare Strings

// For Strings we're passed the address of the first characters in a1
// and a2 and the length in cnt1.
// elem_size is the element size in bytes: either 1 or 2.
// There are two implementations. For strings of 8 bytes or more, all
// comparisons are performed 8 bytes at a time; on hardware supporting
// unaligned access this includes the final comparison, which may overlap
// an earlier one. For strings < 8 bytes (and for tails of long strings
// when AvoidUnalignedAccesses is true), we compare a word, then a
// halfword, and then a byte.

void C2_MacroAssembler::string_equals(Register a1, Register a2,
                                      Register result, Register cnt1, int elem_size)
{
  Label SAME, DONE, SHORT, NEXT_WORD;
  Register tmp1 = t0;
  Register tmp2 = t1;

  assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte");
  assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2);

  BLOCK_COMMENT("string_equals {");

  beqz(cnt1, SAME);
  mv(result, false);

  // Check for short strings, i.e. smaller than wordSize.
  sub(cnt1, cnt1, wordSize);
  bltz(cnt1, SHORT);

  // Main 8 byte comparison loop.
  bind(NEXT_WORD); {
    ld(tmp1, Address(a1, 0));
    add(a1, a1, wordSize);
    ld(tmp2, Address(a2, 0));
    add(a2, a2, wordSize);
    sub(cnt1, cnt1, wordSize);
    bne(tmp1, tmp2, DONE);
  } bgez(cnt1, NEXT_WORD);

  if (!AvoidUnalignedAccesses) {
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
    // length == 4.
    add(tmp1, a1, cnt1);
    ld(tmp1, Address(tmp1, 0));
    add(tmp2, a2, cnt1);
    ld(tmp2, Address(tmp2, 0));
    bne(tmp1, tmp2, DONE);
    j(SAME);
  } else {
    add(tmp1, cnt1, wordSize);
    beqz(tmp1, SAME);
  }

  bind(SHORT);
  Label TAIL03, TAIL01;

  // 0-7 bytes left.
  test_bit(tmp1, cnt1, 2);
  beqz(tmp1, TAIL03);
  {
    lwu(tmp1, Address(a1, 0));
    add(a1, a1, 4);
    lwu(tmp2, Address(a2, 0));
    add(a2, a2, 4);
    bne(tmp1, tmp2, DONE);
  }

  bind(TAIL03);
  // 0-3 bytes left.
  test_bit(tmp1, cnt1, 1);
  beqz(tmp1, TAIL01);
  {
    lhu(tmp1, Address(a1, 0));
    add(a1, a1, 2);
    lhu(tmp2, Address(a2, 0));
    add(a2, a2, 2);
    bne(tmp1, tmp2, DONE);
  }

  bind(TAIL01);
  if (elem_size == 1) { // Only needed when comparing 1-byte elements
    // 0-1 bytes left.
    test_bit(tmp1, cnt1, 0);
    beqz(tmp1, SAME);
    {
      lbu(tmp1, Address(a1, 0));
      lbu(tmp2, Address(a2, 0));
      bne(tmp1, tmp2, DONE);
    }
  }

  // Arrays are equal.
  bind(SAME);
  mv(result, true);

  // That's it.
  bind(DONE);
  BLOCK_COMMENT("} string_equals");
}
1460 
1461 typedef void (Assembler::*conditional_branch_insn)(Register op1, Register op2, Label& label, bool is_far);
1462 typedef void (MacroAssembler::*float_conditional_branch_insn)(FloatRegister op1, FloatRegister op2, Label& label,
1463                                                               bool is_far, bool is_unordered);
1464 
1465 static conditional_branch_insn conditional_branches[] =
1466 {
1467   /* SHORT branches */
1468   (conditional_branch_insn)&MacroAssembler::beq,
1469   (conditional_branch_insn)&MacroAssembler::bgt,
1470   nullptr, // BoolTest::overflow
1471   (conditional_branch_insn)&MacroAssembler::blt,
1472   (conditional_branch_insn)&MacroAssembler::bne,
1473   (conditional_branch_insn)&MacroAssembler::ble,
1474   nullptr, // BoolTest::no_overflow
1475   (conditional_branch_insn)&MacroAssembler::bge,
1476 
1477   /* UNSIGNED branches */
1478   (conditional_branch_insn)&MacroAssembler::beq,
1479   (conditional_branch_insn)&MacroAssembler::bgtu,
1480   nullptr,
1481   (conditional_branch_insn)&MacroAssembler::bltu,
1482   (conditional_branch_insn)&MacroAssembler::bne,
1483   (conditional_branch_insn)&MacroAssembler::bleu,
1484   nullptr,
1485   (conditional_branch_insn)&MacroAssembler::bgeu
1486 };
1487 
1488 static float_conditional_branch_insn float_conditional_branches[] =
1489 {
1490   /* FLOAT SHORT branches */
1491   (float_conditional_branch_insn)&MacroAssembler::float_beq,
1492   (float_conditional_branch_insn)&MacroAssembler::float_bgt,
1493   nullptr,  // BoolTest::overflow
1494   (float_conditional_branch_insn)&MacroAssembler::float_blt,
1495   (float_conditional_branch_insn)&MacroAssembler::float_bne,
1496   (float_conditional_branch_insn)&MacroAssembler::float_ble,
1497   nullptr, // BoolTest::no_overflow
1498   (float_conditional_branch_insn)&MacroAssembler::float_bge,
1499 
1500   /* DOUBLE SHORT branches */
1501   (float_conditional_branch_insn)&MacroAssembler::double_beq,
1502   (float_conditional_branch_insn)&MacroAssembler::double_bgt,
1503   nullptr,
1504   (float_conditional_branch_insn)&MacroAssembler::double_blt,
1505   (float_conditional_branch_insn)&MacroAssembler::double_bne,
1506   (float_conditional_branch_insn)&MacroAssembler::double_ble,
1507   nullptr,
1508   (float_conditional_branch_insn)&MacroAssembler::double_bge
1509 };
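
     // Both tables are indexed by the BoolTest::mask value of the condition,
     // matching the entry order above (eq, gt, overflow, lt, ne, le,
     // no_overflow, ge); bit 3 of cmpFlag selects the second group, i.e. the
     // unsigned variants in conditional_branches and the double-precision
     // variants in float_conditional_branches.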
1510 
1511 void C2_MacroAssembler::cmp_branch(int cmpFlag, Register op1, Register op2, Label& label, bool is_far) {
1512   assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(conditional_branches) / sizeof(conditional_branches[0])),
1513          "invalid conditional branch index");
1514   (this->*conditional_branches[cmpFlag])(op1, op2, label, is_far);
1515 }
1516 
1517 // This function should only be used by C2. For unordered-greater tests the unordered flag is flipped:
1518 // C2 uses unordered-lesser instead, then commutes the result bits back in do_one_bytecode().
1519 void C2_MacroAssembler::float_cmp_branch(int cmpFlag, FloatRegister op1, FloatRegister op2, Label& label, bool is_far) {
1520   assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(float_conditional_branches) / sizeof(float_conditional_branches[0])),
1521          "invalid float conditional branch index");
1522   int booltest_flag = cmpFlag & ~(C2_MacroAssembler::double_branch_mask);
1523   (this->*float_conditional_branches[cmpFlag])(op1, op2, label, is_far,
1524     !(booltest_flag == BoolTest::ge || booltest_flag == BoolTest::gt));
1525 }
1526 
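     // Unsigned comparison against an immediate zero: 'le' degenerates to
     // 'eq' and 'gt' to 'ne', since no unsigned value is below zero.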
1527 void C2_MacroAssembler::enc_cmpUEqNeLeGt_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
1528   switch (cmpFlag) {
1529     case BoolTest::eq:
1530     case BoolTest::le:
1531       beqz(op1, L, is_far);
1532       break;
1533     case BoolTest::ne:
1534     case BoolTest::gt:
1535       bnez(op1, L, is_far);
1536       break;
1537     default:
1538       ShouldNotReachHere();
1539   }
1540 }
1541 
1542 void C2_MacroAssembler::enc_cmpEqNe_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
1543   switch (cmpFlag) {
1544     case BoolTest::eq:
1545       beqz(op1, L, is_far);
1546       break;
1547     case BoolTest::ne:
1548       bnez(op1, L, is_far);
1549       break;
1550     default:
1551       ShouldNotReachHere();
1552   }
1553 }
1554 
1555 void C2_MacroAssembler::enc_cmove(int cmpFlag, Register op1, Register op2, Register dst, Register src) {
1556   Label L;
1557   cmp_branch(cmpFlag ^ (1 << neg_cond_bits), op1, op2, L);
1558   mv(dst, src);
1559   bind(L);
1560 }
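
     // enc_cmove above emits a branch-based conditional move,
     //   dst = (op1 <cond> op2) ? src : dst,
     // by branching over the mv on the negated condition: XOR-ing cmpFlag with
     // (1 << neg_cond_bits) flips eq<->ne, gt<->le and lt<->ge in the BoolTest
     // encoding (see the table order above).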
1561 
1562 // Set dst to NaN if any NaN input.
1563 void C2_MacroAssembler::minmax_fp(FloatRegister dst, FloatRegister src1, FloatRegister src2,
1564                                   bool is_double, bool is_min) {
1565   assert_different_registers(dst, src1, src2);
1566 
1567   Label Done, Compare;
1568 
1569   is_double ? fclass_d(t0, src1)
1570             : fclass_s(t0, src1);
1571   is_double ? fclass_d(t1, src2)
1572             : fclass_s(t1, src2);
1573   orr(t0, t0, t1);
1574   andi(t0, t0, 0b1100000000); // if src1 or src2 is a quiet or signaling NaN then return NaN
1575   beqz(t0, Compare);
1576   is_double ? fadd_d(dst, src1, src2)
1577             : fadd_s(dst, src1, src2);
1578   j(Done);
1579 
1580   bind(Compare);
1581   if (is_double) {
1582     is_min ? fmin_d(dst, src1, src2)
1583            : fmax_d(dst, src1, src2);
1584   } else {
1585     is_min ? fmin_s(dst, src1, src2)
1586            : fmax_s(dst, src1, src2);
1587   }
1588 
1589   bind(Done);
1590 }
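
     // A scalar sketch of minmax_fp (illustrative only; the stub classifies
     // NaNs with fclass bits 8..9 rather than calling isnan):
     //   double minmax(double a, double b, bool is_min) {
     //     if (isnan(a) || isnan(b)) return a + b;  // fadd propagates NaN
     //     return is_min ? fmin(a, b) : fmax(a, b);
     //   }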
1591 
1592 void C2_MacroAssembler::element_compare(Register a1, Register a2, Register result, Register cnt, Register tmp1, Register tmp2,
1593                                         VectorRegister vr1, VectorRegister vr2, VectorRegister vrs, bool islatin, Label &DONE) {
1594   Label loop;
1595   Assembler::SEW sew = islatin ? Assembler::e8 : Assembler::e16;
1596 
1597   bind(loop);
1598   vsetvli(tmp1, cnt, sew, Assembler::m2);
1599   vlex_v(vr1, a1, sew);
1600   vlex_v(vr2, a2, sew);
1601   vmsne_vv(vrs, vr1, vr2);
1602   vfirst_m(tmp2, vrs);
1603   bgez(tmp2, DONE);
1604   sub(cnt, cnt, tmp1);
1605   if (!islatin) {
1606     slli(tmp1, tmp1, 1); // get byte counts
1607   }
1608   add(a1, a1, tmp1);
1609   add(a2, a2, tmp1);
1610   bnez(cnt, loop);
1611 
1612   mv(result, true);
1613 }
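
     // element_compare is a strip-mined equality loop, roughly:
     //   while (cnt > 0) {
     //     vl = vsetvli(cnt);                     // elements this pass
     //     if (any a1[i] != a2[i] for i < vl)     // vmsne_vv + vfirst_m
     //       goto DONE;                           // result still false
     //     cnt -= vl; a1 += vl * esize; a2 += vl * esize;
     //   }
     //   result = true;
     // Note tmp2 holds the in-vector index of the first mismatch (or -1),
     // which string_compare_v reuses to locate the differing character.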
1614 
1615 void C2_MacroAssembler::string_equals_v(Register a1, Register a2, Register result, Register cnt, int elem_size) {
1616   Label DONE;
1617   Register tmp1 = t0;
1618   Register tmp2 = t1;
1619 
1620   BLOCK_COMMENT("string_equals_v {");
1621 
1622   mv(result, false);
1623 
1624   if (elem_size == 2) {
1625     srli(cnt, cnt, 1);
1626   }
1627 
1628   element_compare(a1, a2, result, cnt, tmp1, tmp2, v2, v4, v2, elem_size == 1, DONE);
1629 
1630   bind(DONE);
1631   BLOCK_COMMENT("} string_equals_v");
1632 }
1633 
1634 // Used by C2 ClearArray patterns.
1635 // base: Address of a buffer to be zeroed
1636 // cnt: Count in HeapWords
1637 //
1638 // base, cnt, v4, v5, v6, v7 and t0 are clobbered.
1639 void C2_MacroAssembler::clear_array_v(Register base, Register cnt) {
1640   Label loop;
1641 
1642   // Generate a vector of zero words (v4 = 0).
1643   vsetvli(t0, cnt, Assembler::e64, Assembler::m4);
1644   vxor_vv(v4, v4, v4);
1645 
1646   bind(loop);
1647   vsetvli(t0, cnt, Assembler::e64, Assembler::m4);
1648   vse64_v(v4, base);
1649   sub(cnt, cnt, t0);
1650   shadd(base, t0, base, t0, 3);
1651   bnez(cnt, loop);
1652 }
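
     // Each pass above stores t0 (= vl) zero words and bumps base by t0 * 8
     // bytes via shadd; in effect:
     //   while (cnt > 0) { vl = min(cnt, VLMAX); store vl zero words;
     //                     cnt -= vl; base += vl * wordSize; }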
1653 
1654 void C2_MacroAssembler::arrays_equals_v(Register a1, Register a2, Register result,
1655                                         Register cnt1, int elem_size) {
1656   Label DONE;
1657   Register tmp1 = t0;
1658   Register tmp2 = t1;
1659   Register cnt2 = tmp2;
1660   int length_offset = arrayOopDesc::length_offset_in_bytes();
1661   int base_offset = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
1662 
1663   BLOCK_COMMENT("arrays_equals_v {");
1664 
1665   // if (a1 == a2), return true
1666   mv(result, true);
1667   beq(a1, a2, DONE);
1668 
1669   mv(result, false);
1670   // if a1 == null or a2 == null, return false
1671   beqz(a1, DONE);
1672   beqz(a2, DONE);
1673   // if (a1.length != a2.length), return false
1674   lwu(cnt1, Address(a1, length_offset));
1675   lwu(cnt2, Address(a2, length_offset));
1676   bne(cnt1, cnt2, DONE);
1677 
1678   la(a1, Address(a1, base_offset));
1679   la(a2, Address(a2, base_offset));
1680 
1681   element_compare(a1, a2, result, cnt1, tmp1, tmp2, v2, v4, v2, elem_size == 1, DONE);
1682 
1683   bind(DONE);
1684 
1685   BLOCK_COMMENT("} arrays_equals_v");
1686 }
1687 
1688 void C2_MacroAssembler::string_compare_v(Register str1, Register str2, Register cnt1, Register cnt2,
1689                                          Register result, Register tmp1, Register tmp2, int encForm) {
1690   Label DIFFERENCE, DONE, L, loop;
1691   bool encLL = encForm == StrIntrinsicNode::LL;
1692   bool encLU = encForm == StrIntrinsicNode::LU;
1693   bool encUL = encForm == StrIntrinsicNode::UL;
1694 
1695   bool str1_isL = encLL || encLU;
1696   bool str2_isL = encLL || encUL;
1697 
1698   int minCharsInWord = encLL ? wordSize : wordSize / 2;
1699 
1700   BLOCK_COMMENT("string_compare_v {");
1701 
1702   // for Latin-1 strings, 1 byte per character
1703   // for UTF-16 strings, 2 bytes per character
1704   if (!str1_isL)
1705     sraiw(cnt1, cnt1, 1);
1706   if (!str2_isL)
1707     sraiw(cnt2, cnt2, 1);
1708 
1709   // pre-set result to the length difference; it is returned as-is when one string is a prefix of the other
1710   // save the minimum of the string lengths in cnt2.
1711   sub(result, cnt1, cnt2);
1712   bgt(cnt1, cnt2, L);
1713   mv(cnt2, cnt1);
1714   bind(L);
1715 
1716   if (str1_isL == str2_isL) { // LL or UU
1717     element_compare(str1, str2, zr, cnt2, tmp1, tmp2, v2, v4, v2, encLL, DIFFERENCE);
1718     j(DONE);
1719   } else { // LU or UL
1720     Register strL = encLU ? str1 : str2;
1721     Register strU = encLU ? str2 : str1;
1722     VectorRegister vstr1 = encLU ? v8 : v4;
1723     VectorRegister vstr2 = encLU ? v4 : v8;
1724 
1725     bind(loop);
1726     vsetvli(tmp1, cnt2, Assembler::e8, Assembler::m2);
1727     vle8_v(vstr1, strL);
1728     vsetvli(tmp1, cnt2, Assembler::e16, Assembler::m4);
1729     vzext_vf2(vstr2, vstr1);
1730     vle16_v(vstr1, strU);
1731     vmsne_vv(v4, vstr2, vstr1);
1732     vfirst_m(tmp2, v4);
1733     bgez(tmp2, DIFFERENCE);
1734     sub(cnt2, cnt2, tmp1);
1735     add(strL, strL, tmp1);
1736     shadd(strU, tmp1, strU, tmp1, 1);
1737     bnez(cnt2, loop);
1738     j(DONE);
1739   }
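
       // In the LU/UL loop above each pass loads vl Latin-1 bytes (vle8_v),
       // widens them to 16 bits (vzext_vf2), and compares them with vl UTF-16
       // chars (vle16_v); strL then advances by vl bytes and strU by 2 * vl
       // bytes (shadd with shift 1).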
1740 
1741   bind(DIFFERENCE);
1742   slli(tmp1, tmp2, 1);
1743   add(str1, str1, str1_isL ? tmp2 : tmp1);
1744   add(str2, str2, str2_isL ? tmp2 : tmp1);
1745   str1_isL ? lbu(tmp1, Address(str1, 0)) : lhu(tmp1, Address(str1, 0));
1746   str2_isL ? lbu(tmp2, Address(str2, 0)) : lhu(tmp2, Address(str2, 0));
1747   sub(result, tmp1, tmp2);
1748 
1749   bind(DONE);
       BLOCK_COMMENT("} string_compare_v");
1750 }
1751 
1752 void C2_MacroAssembler::byte_array_inflate_v(Register src, Register dst, Register len, Register tmp) {
1753   Label loop;
1754   assert_different_registers(src, dst, len, tmp, t0);
1755 
1756   BLOCK_COMMENT("byte_array_inflate_v {");
1757   bind(loop);
1758   vsetvli(tmp, len, Assembler::e8, Assembler::m2);
1759   vle8_v(v6, src);
1760   vsetvli(t0, len, Assembler::e16, Assembler::m4);
1761   vzext_vf2(v4, v6);
1762   vse16_v(v4, dst);
1763   sub(len, len, tmp);
1764   add(src, src, tmp);
1765   shadd(dst, tmp, dst, tmp, 1);
1766   bnez(len, loop);
1767   BLOCK_COMMENT("} byte_array_inflate_v");
1768 }
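
     // Behaviorally (sketch): for (i = 0; i < len; i++) dst[i] = (jchar)(src[i] & 0xff);
     // vzext_vf2 performs the zero-extension vl bytes at a time.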
1769 
1770 // Compress char[] array to byte[].
1771 // result: the array length if every element in the array can be encoded; otherwise 0.
1772 void C2_MacroAssembler::char_array_compress_v(Register src, Register dst, Register len,
1773                                               Register result, Register tmp) {
1774   Label done;
1775   encode_iso_array_v(src, dst, len, result, tmp, false);
1776   beqz(len, done);
1777   mv(result, zr);
1778   bind(done);
1779 }
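
     // encode_iso_array_v decrements len as it copies, so len == 0 on return
     // means every char was encodable and result holds the original length;
     // otherwise result is forced to zero, matching StringUTF16.compress.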
1780 
1781 // Intrinsic for
1782 //
1783 // - sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray
1784 //     return the number of characters copied.
1785 // - java/lang/StringUTF16.compress
1786 //     return zero (0) if copy fails, otherwise 'len'.
1787 //
1788 // This version always returns the number of characters copied. A successful
1789 // copy will complete with the post-condition: 'res' == 'len', while an
1790 // unsuccessful copy will exit with the post-condition: 0 <= 'res' < 'len'.
1791 //
1792 // Clobbers: src, dst, len, result, tmp, t0
1793 void C2_MacroAssembler::encode_iso_array_v(Register src, Register dst, Register len,
1794                                            Register result, Register tmp, bool ascii) {
1795   Label loop, fail, done;
1796 
1797   BLOCK_COMMENT("encode_iso_array_v {");
1798   mv(result, 0);
1799 
1800   bind(loop);
1801   mv(tmp, ascii ? 0x7f : 0xff); // largest encodable value (0x7f ASCII, 0xff Latin-1)
1802   vsetvli(t0, len, Assembler::e16, Assembler::m2);
1803   vle16_v(v2, src);
1804 
1805   vmsgtu_vx(v1, v2, tmp);
1806   vfirst_m(tmp, v1);
1807   vmsbf_m(v0, v1);
1808   // compress char to byte
1809   vsetvli(t0, len, Assembler::e8);
1810   vncvt_x_x_w(v1, v2, Assembler::v0_t);
1811   vse8_v(v1, dst, Assembler::v0_t);
1812 
1813   // fail if char > 0x7f/0xff
1814   bgez(tmp, fail);
1815   add(result, result, t0);
1816   add(dst, dst, t0);
1817   sub(len, len, t0);
1818   shadd(src, t0, src, t0, 1);
1819   bnez(len, loop);
1820   j(done);
1821 
1822   bind(fail);
1823   add(result, result, tmp);
1824 
1825   bind(done);
1826   BLOCK_COMMENT("} encode_iso_array_v");
1827 }
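
     // Per pass (sketch): v1 flags chars above the limit (0x7f ASCII,
     // 0xff Latin-1), tmp gets the index of the first offender or -1
     // (vfirst_m), and v0 = "set-before-first" (vmsbf_m) masks the narrowing
     // store so only chars preceding the first offender are written; on
     // failure the offender's index is added to result before exiting.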
1828 
1829 void C2_MacroAssembler::count_positives_v(Register ary, Register len, Register result, Register tmp) {
1830   Label LOOP, SET_RESULT, DONE;
1831 
1832   BLOCK_COMMENT("count_positives_v {");
1833   assert_different_registers(ary, len, result, tmp);
1834 
1835   mv(result, zr);
1836 
1837   bind(LOOP);
1838   vsetvli(t0, len, Assembler::e8, Assembler::m4);
1839   vle8_v(v4, ary);
1840   vmslt_vx(v4, v4, zr);
1841   vfirst_m(tmp, v4);
1842   bgez(tmp, SET_RESULT);
1843   // if tmp == -1, all bytes are positive
1844   add(result, result, t0);
1845 
1846   sub(len, len, t0);
1847   add(ary, ary, t0);
1848   bnez(len, LOOP);
1849   j(DONE);
1850 
1851   // add remaining positive bytes count
1852   bind(SET_RESULT);
1853   add(result, result, tmp);
1854 
1855   bind(DONE);
1856   BLOCK_COMMENT("} count_positives_v");
1857 }
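
     // result counts leading bytes with the sign bit clear: vmslt_vx marks
     // bytes < 0, vfirst_m returns the first such index (or -1 when the whole
     // pass is "positive"), and the terminating pass adds that index.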
1858 
1859 void C2_MacroAssembler::string_indexof_char_v(Register str1, Register cnt1,
1860                                               Register ch, Register result,
1861                                               Register tmp1, Register tmp2,
1862                                               bool isL) {
1863   mv(result, zr);
1864 
1865   Label loop, MATCH, DONE;
1866   Assembler::SEW sew = isL ? Assembler::e8 : Assembler::e16;
1867   bind(loop);
1868   vsetvli(tmp1, cnt1, sew, Assembler::m4);
1869   vlex_v(v4, str1, sew);
1870   vmseq_vx(v4, v4, ch);
1871   vfirst_m(tmp2, v4);
1872   bgez(tmp2, MATCH); // if equal, return index
1873 
1874   add(result, result, tmp1);
1875   sub(cnt1, cnt1, tmp1);
1876   if (!isL) slli(tmp1, tmp1, 1);
1877   add(str1, str1, tmp1);
1878   bnez(cnt1, loop);
1879 
1880   mv(result, -1);
1881   j(DONE);
1882 
1883   bind(MATCH);
1884   add(result, result, tmp2);
1885 
1886   bind(DONE);
1887 }
1888 
1889 // Set dst to NaN if any NaN input.
1890 void C2_MacroAssembler::minmax_fp_v(VectorRegister dst, VectorRegister src1, VectorRegister src2,
1891                                     bool is_double, bool is_min, int vector_length) {
1892   assert_different_registers(dst, src1, src2);
1893 
1894   vsetvli_helper(is_double ? T_DOUBLE : T_FLOAT, vector_length);
1895 
1896   is_min ? vfmin_vv(dst, src1, src2)
1897          : vfmax_vv(dst, src1, src2);
1898 
1899   vmfne_vv(v0,  src1, src1);
1900   vfadd_vv(dst, src1, src1, Assembler::v0_t);
1901   vmfne_vv(v0,  src2, src2);
1902   vfadd_vv(dst, src2, src2, Assembler::v0_t);
1903 }
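
     // The two masked vfadd_vv passes patch NaN lanes after the min/max:
     // vmfne_vv(v0, src, src) is true exactly where src is NaN, and
     // src + src reproduces a NaN in those dst lanes, as required.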
1904 
1905 // Set dst to NaN if any NaN input.
1906 // The destination vector register elements corresponding to masked-off elements
1907 // are handled with a mask-undisturbed policy.
1908 void C2_MacroAssembler::minmax_fp_masked_v(VectorRegister dst, VectorRegister src1, VectorRegister src2,
1909                                            VectorRegister vmask, VectorRegister tmp1, VectorRegister tmp2,
1910                                            bool is_double, bool is_min, int vector_length) {
1911   assert_different_registers(src1, src2, tmp1, tmp2);
1912   vsetvli_helper(is_double ? T_DOUBLE : T_FLOAT, vector_length);
1913 
1914   // Check vector elements of src1 and src2 for NaN.
1915   vmfeq_vv(tmp1, src1, src1);
1916   vmfeq_vv(tmp2, src2, src2);
1917 
1918   vmandn_mm(v0, vmask, tmp1);
1919   vfadd_vv(dst, src1, src1, Assembler::v0_t);
1920   vmandn_mm(v0, vmask, tmp2);
1921   vfadd_vv(dst, src2, src2, Assembler::v0_t);
1922 
1923   vmand_mm(tmp2, tmp1, tmp2);
1924   vmand_mm(v0, vmask, tmp2);
1925   is_min ? vfmin_vv(dst, src1, src2, Assembler::v0_t)
1926          : vfmax_vv(dst, src1, src2, Assembler::v0_t);
1927 }
1928 
1929 // Set dst to NaN if any NaN input.
1930 void C2_MacroAssembler::reduce_minmax_fp_v(FloatRegister dst,
1931                                            FloatRegister src1, VectorRegister src2,
1932                                            VectorRegister tmp1, VectorRegister tmp2,
1933                                            bool is_double, bool is_min, int vector_length, VectorMask vm) {
1934   assert_different_registers(dst, src1);
1935   assert_different_registers(src2, tmp1, tmp2);
1936 
1937   Label L_done, L_NaN_1, L_NaN_2;
1938   // Set dst to src1 if src1 is NaN
1939   is_double ? feq_d(t0, src1, src1)
1940             : feq_s(t0, src1, src1);
1941   beqz(t0, L_NaN_2);
1942 
1943   vsetvli_helper(is_double ? T_DOUBLE : T_FLOAT, vector_length);
1944   vfmv_s_f(tmp2, src1);
1945 
1946   is_min ? vfredmin_vs(tmp1, src2, tmp2, vm)
1947          : vfredmax_vs(tmp1, src2, tmp2, vm);
1948   vfmv_f_s(dst, tmp1);
1949 
1950   // Checking NaNs in src2
1951   vmfne_vv(tmp1, src2, src2, vm);
1952   vcpop_m(t0, tmp1, vm);
1953   beqz(t0, L_done);
1954 
1955   bind(L_NaN_1);
1956   vfredusum_vs(tmp1, src2, tmp2, vm);
1957   vfmv_f_s(dst, tmp1);
1958   j(L_done);
1959 
1960   bind(L_NaN_2);
1961   is_double ? fmv_d(dst, src1)
1962             : fmv_s(dst, src1);
1963   bind(L_done);
1964 }
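
     // L_NaN_1 reuses vfredusum_vs only to manufacture the NaN result: an
     // unordered reduction sum over a vector containing a NaN yields NaN.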
1965 
1966 bool C2_MacroAssembler::in_scratch_emit_size() {
1967   if (ciEnv::current()->task() != nullptr) {
1968     PhaseOutput* phase_output = Compile::current()->output();
1969     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
1970       return true;
1971     }
1972   }
1973   return MacroAssembler::in_scratch_emit_size();
1974 }
1975 
1976 void C2_MacroAssembler::reduce_integral_v(Register dst, Register src1,
1977                                           VectorRegister src2, VectorRegister tmp,
1978                                           int opc, BasicType bt, int vector_length, VectorMask vm) {
1979   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
1980   vsetvli_helper(bt, vector_length);
1981   vmv_s_x(tmp, src1);
1982   switch (opc) {
1983     case Op_AddReductionVI:
1984     case Op_AddReductionVL:
1985       vredsum_vs(tmp, src2, tmp, vm);
1986       break;
1987     case Op_AndReductionV:
1988       vredand_vs(tmp, src2, tmp, vm);
1989       break;
1990     case Op_OrReductionV:
1991       vredor_vs(tmp, src2, tmp, vm);
1992       break;
1993     case Op_XorReductionV:
1994       vredxor_vs(tmp, src2, tmp, vm);
1995       break;
1996     case Op_MaxReductionV:
1997       vredmax_vs(tmp, src2, tmp, vm);
1998       break;
1999     case Op_MinReductionV:
2000       vredmin_vs(tmp, src2, tmp, vm);
2001       break;
2002     default:
2003       ShouldNotReachHere();
2004   }
2005   vmv_x_s(dst, tmp);
2006 }
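
     // Example: Op_AddReductionVI with src1 = 10 and src2 = <1, 2, 3, 4>
     // yields dst = 10 + 1 + 2 + 3 + 4 = 20; vmv_s_x seeds element 0 of tmp
     // with the scalar, vredsum_vs folds src2 into it, and vmv_x_s moves
     // element 0 back to a GPR.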
2007 
2008 // Set vl and vtype for full and partial vector operations.
2009 // (vma = mu, vta = tu, vill = false)
2010 void C2_MacroAssembler::vsetvli_helper(BasicType bt, int vector_length, LMUL vlmul, Register tmp) {
2011   Assembler::SEW sew = Assembler::elemtype_to_sew(bt);
2012   if (vector_length <= 31) {
2013     vsetivli(tmp, vector_length, sew, vlmul);
2014   } else if (vector_length == (MaxVectorSize / type2aelembytes(bt))) {
2015     vsetvli(tmp, x0, sew, vlmul);
2016   } else {
2017     mv(tmp, vector_length);
2018     vsetvli(tmp, tmp, sew, vlmul);
2019   }
2020 }
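
     // Three encodings are needed: lengths up to 31 fit vsetivli's 5-bit
     // immediate; a request for the whole register group (MaxVectorSize /
     // element size) uses vsetvli with rs1 = x0 and rd != x0, which requests
     // vl = VLMAX; any other length must be materialized in tmp first.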
2021 
2022 void C2_MacroAssembler::compare_integral_v(VectorRegister vd, VectorRegister src1, VectorRegister src2,
2023                                            int cond, BasicType bt, int vector_length, VectorMask vm) {
2024   assert(is_integral_type(bt), "unsupported element type");
2025   assert(vm != Assembler::v0_t || vd != v0, "should be different registers");
2026   vsetvli_helper(bt, vector_length);
2027   vmclr_m(vd);
2028   switch (cond) {
2029     case BoolTest::eq: vmseq_vv(vd, src1, src2, vm); break;
2030     case BoolTest::ne: vmsne_vv(vd, src1, src2, vm); break;
2031     case BoolTest::le: vmsle_vv(vd, src1, src2, vm); break;
2032     case BoolTest::ge: vmsge_vv(vd, src1, src2, vm); break;
2033     case BoolTest::lt: vmslt_vv(vd, src1, src2, vm); break;
2034     case BoolTest::gt: vmsgt_vv(vd, src1, src2, vm); break;
2035     default:
2036       assert(false, "unsupported compare condition");
2037       ShouldNotReachHere();
2038   }
2039 }
2040 
2041 void C2_MacroAssembler::compare_fp_v(VectorRegister vd, VectorRegister src1, VectorRegister src2,
2042                                      int cond, BasicType bt, int vector_length, VectorMask vm) {
2043   assert(is_floating_point_type(bt), "unsupported element type");
2044   assert(vm != Assembler::v0_t || vd != v0, "should be different registers");
2045   vsetvli_helper(bt, vector_length);
2046   vmclr_m(vd);
2047   switch (cond) {
2048     case BoolTest::eq: vmfeq_vv(vd, src1, src2, vm); break;
2049     case BoolTest::ne: vmfne_vv(vd, src1, src2, vm); break;
2050     case BoolTest::le: vmfle_vv(vd, src1, src2, vm); break;
2051     case BoolTest::ge: vmfge_vv(vd, src1, src2, vm); break;
2052     case BoolTest::lt: vmflt_vv(vd, src1, src2, vm); break;
2053     case BoolTest::gt: vmfgt_vv(vd, src1, src2, vm); break;
2054     default:
2055       assert(false, "unsupported compare condition");
2056       ShouldNotReachHere();
2057   }
2058 }
2059 
2060 void C2_MacroAssembler::integer_extend_v(VectorRegister dst, BasicType dst_bt, int vector_length,
2061                                          VectorRegister src, BasicType src_bt) {
2062   assert(type2aelembytes(dst_bt) > type2aelembytes(src_bt) && type2aelembytes(dst_bt) <= 8 && type2aelembytes(src_bt) <= 4, "invalid element size");
2063   assert(dst_bt != T_FLOAT && dst_bt != T_DOUBLE && src_bt != T_FLOAT && src_bt != T_DOUBLE, "unsupported element type");
2064   // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#52-vector-operands
2065   // The destination EEW is greater than the source EEW, the source EMUL is at least 1,
2066   // and the overlap is in the highest-numbered part of the destination register group.
2067   // Since LMUL=1, vd and vs cannot be the same.
2068   assert_different_registers(dst, src);
2069 
2070   vsetvli_helper(dst_bt, vector_length);
2071   if (src_bt == T_BYTE) {
2072     switch (dst_bt) {
2073     case T_SHORT:
2074       vsext_vf2(dst, src);
2075       break;
2076     case T_INT:
2077       vsext_vf4(dst, src);
2078       break;
2079     case T_LONG:
2080       vsext_vf8(dst, src);
2081       break;
2082     default:
2083       ShouldNotReachHere();
2084     }
2085   } else if (src_bt == T_SHORT) {
2086     if (dst_bt == T_INT) {
2087       vsext_vf2(dst, src);
2088     } else {
2089       vsext_vf4(dst, src);
2090     }
2091   } else if (src_bt == T_INT) {
2092     vsext_vf2(dst, src);
2093   }
2094 }
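
     // The vsext_vfN variant is chosen by the dst/src width ratio: byte->short,
     // short->int and int->long use vf2 (2x); byte->int and short->long use
     // vf4 (4x); byte->long uses vf8 (8x).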
2095 
2096 // Vector narrow from src to dst with specified element sizes.
2097 // High part of dst vector will be filled with zero.
2098 void C2_MacroAssembler::integer_narrow_v(VectorRegister dst, BasicType dst_bt, int vector_length,
2099                                          VectorRegister src, BasicType src_bt) {
2100   assert(type2aelembytes(dst_bt) < type2aelembytes(src_bt) && type2aelembytes(dst_bt) <= 4 && type2aelembytes(src_bt) <= 8, "invalid element size");
2101   assert(dst_bt != T_FLOAT && dst_bt != T_DOUBLE && src_bt != T_FLOAT && src_bt != T_DOUBLE, "unsupported element type");
2102   mv(t0, vector_length);
2103   if (src_bt == T_LONG) {
2104     // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#117-vector-narrowing-integer-right-shift-instructions
2105     // Future extensions might add support for versions that narrow to a destination that is 1/4 the width of the source.
2106     // So we can currently only scale down by 1/2 the width at a time.
2107     vsetvli(t0, t0, Assembler::e32, Assembler::mf2);
2108     vncvt_x_x_w(dst, src);
2109     if (dst_bt == T_SHORT || dst_bt == T_BYTE) {
2110       vsetvli(t0, t0, Assembler::e16, Assembler::mf2);
2111       vncvt_x_x_w(dst, dst);
2112       if (dst_bt == T_BYTE) {
2113         vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
2114         vncvt_x_x_w(dst, dst);
2115       }
2116     }
2117   } else if (src_bt == T_INT) {
2118     // T_SHORT
2119     vsetvli(t0, t0, Assembler::e16, Assembler::mf2);
2120     vncvt_x_x_w(dst, src);
2121     if (dst_bt == T_BYTE) {
2122       vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
2123       vncvt_x_x_w(dst, dst);
2124     }
2125   } else if (src_bt == T_SHORT) {
2126     vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
2127     vncvt_x_x_w(dst, src);
2128   }
2129 }
2130 
2131 #define VFCVT_SAFE(VFLOATCVT)                                                      \
2132 void C2_MacroAssembler::VFLOATCVT##_safe(VectorRegister dst, VectorRegister src) { \
2133   assert_different_registers(dst, src);                                            \
2134   vxor_vv(dst, dst, dst);                                                          \
2135   vmfeq_vv(v0, src, src);                                                          \
2136   VFLOATCVT(dst, src, Assembler::v0_t);                                            \
2137 }
2138 
2139 VFCVT_SAFE(vfcvt_rtz_x_f_v);
2140 
2141 #undef VFCVT_SAFE
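
     // The _safe wrapper gives Java's NaN-to-zero conversion semantics: dst is
     // cleared, v0 is set where src == src (i.e. not NaN), and the masked
     // convert writes only those lanes, so NaN inputs produce 0.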
2142 
2143 // Extract a scalar element from a vector at position 'idx'.
2144 // The input elements in src are expected to be of integral type.
2145 void C2_MacroAssembler::extract_v(Register dst, VectorRegister src, BasicType bt,
2146                                   int idx, VectorRegister tmp) {
2147   assert(is_integral_type(bt), "unsupported element type");
2148   assert(idx >= 0, "idx cannot be negative");
2149   // Only need the first element after vector slidedown
2150   vsetvli_helper(bt, 1);
2151   if (idx == 0) {
2152     vmv_x_s(dst, src);
2153   } else if (idx <= 31) {
2154     vslidedown_vi(tmp, src, idx);
2155     vmv_x_s(dst, tmp);
2156   } else {
2157     mv(t0, idx);
2158     vslidedown_vx(tmp, src, t0);
2159     vmv_x_s(dst, tmp);
2160   }
2161 }
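
     // vslidedown_vi encodes a 5-bit unsigned immediate, hence the idx <= 31
     // split; with vl set to 1 above, only element 0 of the slid-down group is
     // needed by vmv_x_s. extract_fp_v below follows the same pattern.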
2162 
2163 // Extract a scalar element from a vector at position 'idx'.
2164 // The input elements in src are expected to be of floating point type.
2165 void C2_MacroAssembler::extract_fp_v(FloatRegister dst, VectorRegister src, BasicType bt,
2166                                      int idx, VectorRegister tmp) {
2167   assert(is_floating_point_type(bt), "unsupported element type");
2168   assert(idx >= 0, "idx cannot be negative");
2169   // Only need the first element after vector slidedown
2170   vsetvli_helper(bt, 1);
2171   if (idx == 0) {
2172     vfmv_f_s(dst, src);
2173   } else if (idx <= 31) {
2174     vslidedown_vi(tmp, src, idx);
2175     vfmv_f_s(dst, tmp);
2176   } else {
2177     mv(t0, idx);
2178     vslidedown_vx(tmp, src, t0);
2179     vfmv_f_s(dst, tmp);
2180   }
2181 }