1 /*
   2  * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2020, 2022, Huawei Technologies Co., Ltd. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "opto/c2_MacroAssembler.hpp"
  29 #include "opto/compile.hpp"
  30 #include "opto/intrinsicnode.hpp"
  31 #include "opto/output.hpp"
  32 #include "opto/subnode.hpp"
  33 #include "runtime/objectMonitorTable.hpp"
  34 #include "runtime/stubRoutines.hpp"
  35 #include "runtime/synchronizer.hpp"
  36 #include "utilities/globalDefinitions.hpp"
  37 
// Helper macros for annotating the generated code.
//
// BLOCK_COMMENT(str): forwards to block_comment(str) in non-PRODUCT builds and
//                     expands to nothing in PRODUCT builds.
// STOP(error):        always calls stop(error); non-PRODUCT builds additionally
//                     pass the same message to block_comment() first.
//
// NOTE: the non-PRODUCT STOP and the BIND macro below each expand to TWO
// statements. Do not use them as the sole body of an unbraced if/else/loop.
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// BIND(label): binds the label and then emits a block comment consisting of the
// label's identifier followed by a colon (a no-op comment in PRODUCT builds).
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  47 
// Emits the C2 fast path for locking `obj`.
//
// Result protocol: the outcome is reported in t1 ("flag"). flag is preset to 1,
// cleared to 0 only at the `locked` label, and must be non-zero whenever the
// `slow_path` label is reached. C2 inspects flag after this sequence.
//
// Parameters:
//   obj   - register holding the object being locked (its mark word is read
//           from, and CAS'ed at, obj + mark_offset_in_bytes()).
//   box   - register holding the BasicLock address; only touched when
//           UseObjectMonitorTable is set (its monitor cache slot is written).
//   tmp1..tmp4 - scratch registers, all clobbered. They must be distinct from
//           each other and from obj, box, t0 and t1 (asserted below).
//
// Register reuse inside the body (the aliases below all refer to tmp1..tmp4):
//   tmp1 : mark word, later the monitor pointer
//   tmp2 : lock-stack top offset, later the hash / owner address
//   tmp3 : general scratch, later the bucket address / previous owner
//   tmp4 : OMCache probe value, later the current thread's monitor owner id
void C2_MacroAssembler::fast_lock(Register obj, Register box,
                                  Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
  // Flag register, zero for success; non-zero for failure.
  Register flag = t1;

  assert_different_registers(obj, box, tmp1, tmp2, tmp3, tmp4, flag, t0);

  // Assume failure until the `locked` label clears the flag.
  mv(flag, 1);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. MUST branch to with flag == 0
  Label locked;
  // Finish fast lock unsuccessfully. slow_path MUST branch to with flag != 0
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    sd(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Objects whose klass has the value-based-class flag set always go to the
    // slow path.
    load_klass(tmp1, obj);
    lbu(tmp1, Address(tmp1, Klass::misc_flags_offset()));
    test_bit(tmp1, tmp1, exact_log2(KlassFlags::_misc_is_value_based_class));
    bnez(tmp1, slow_path);
  }

  const Register tmp1_mark = tmp1;
  const Register tmp3_t = tmp3;

  { // Fast locking

    // Push lock to the lock stack and finish successfully. MUST branch to with flag == 0
    Label push;

    const Register tmp2_top = tmp2;

    // Check if lock-stack is full.
    // The top is loaded as a 32-bit value and is used below as a byte offset
    // relative to xthread.
    lwu(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
    mv(tmp3_t, (unsigned)LockStack::end_offset());
    bge(tmp2_top, tmp3_t, slow_path);

    // Check if recursive.
    // Compare obj against the entry just below the current top.
    add(tmp3_t, xthread, tmp2_top);
    ld(tmp3_t, Address(tmp3_t, -oopSize));
    beq(obj, tmp3_t, push);

    // Relaxed normal load to check for monitor. Optimization for monitor case.
    ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
    bnez(tmp3_t, inflated);

    // Not inflated
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a la");

    // Try to lock. Transition lock-bits 0b01 => 0b00
    // Expected value: the loaded mark with the unlocked bit forced on.
    // New value: the expected value with the unlocked bit flipped off.
    ori(tmp1_mark, tmp1_mark, markWord::unlocked_value);
    xori(tmp3_t, tmp1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ tmp1_mark, /*new*/ tmp3_t, Assembler::int64,
            /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ tmp3_t);
    // The CAS result is compared with the expected value; a mismatch means
    // the exchange did not happen.
    bne(tmp1_mark, tmp3_t, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    add(tmp3_t, xthread, tmp2_top);
    sd(obj, Address(tmp3_t));
    addw(tmp2_top, tmp2_top, oopSize);
    sw(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
    j(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register tmp1_monitor = tmp1;

    if (!UseObjectMonitorTable) {
      // Without the table, tmp1 still holds the mark word loaded above; the
      // monitor fields are addressed relative to it with the tag subtracted
      // from the offsets (see monitor_tag below).
      assert(tmp1_monitor == tmp1_mark, "should be the same here");
    } else {
      const Register tmp2_hash = tmp2;
      const Register tmp3_bucket = tmp3;
      Label monitor_found;

      // Save the mark, we might need it to extract the hash.
      // (tmp1 is about to be overwritten with monitor candidates.)
      mv(tmp2_hash, tmp1_mark);

      // Look for the monitor in the om_cache.
      // The probe is fully unrolled at code-generation time: each step loads
      // the cached monitor into tmp1 and the cached oop into tmp4, and leaves
      // with tmp1 set when the oop equals obj.

      ByteSize cache_offset   = JavaThread::om_cache_oops_offset();
      ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
      const int num_unrolled  = OMCache::CAPACITY;
      for (int i = 0; i < num_unrolled; i++) {
        ld(tmp1_monitor, Address(xthread, cache_offset + monitor_offset));
        ld(tmp4, Address(xthread, cache_offset));
        beq(obj, tmp4, monitor_found);
        cache_offset = cache_offset + OMCache::oop_to_oop_difference();
      }

      // Look for the monitor in the table.

      // Get the hash code.
      srli(tmp2_hash, tmp2_hash, markWord::hash_shift);

      // Get the table and calculate the bucket's address.
      // tmp1 temporarily holds the capacity mask used to reduce the hash to a
      // bucket index.
      la(tmp3_t, ExternalAddress(ObjectMonitorTable::current_table_address()));
      ld(tmp3_t, Address(tmp3_t));
      ld(tmp1, Address(tmp3_t, ObjectMonitorTable::table_capacity_mask_offset()));
      andr(tmp2_hash, tmp2_hash, tmp1);
      ld(tmp3_t, Address(tmp3_t, ObjectMonitorTable::table_buckets_offset()));

      // Read the monitor from the bucket.
      shadd(tmp3_bucket, tmp2_hash, tmp3_t, tmp4, LogBytesPerWord);
      ld(tmp1_monitor, Address(tmp3_bucket));

      // Check if the monitor in the bucket is special (empty, tombstone or removed).
      mv(tmp2, ObjectMonitorTable::SpecialPointerValues::below_is_special);
      bltu(tmp1_monitor, tmp2, slow_path);

      // Check if object matches.
      // The monitor's object field is passed through the barrier set's
      // weak-handle peek (which may itself branch to slow_path) before being
      // compared with obj.
      ld(tmp3, Address(tmp1_monitor, ObjectMonitor::object_offset()));
      BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
      bs_asm->try_peek_weak_handle_in_nmethod(this, tmp3, tmp3, tmp2, slow_path);
      bne(tmp3, obj, slow_path);

      bind(monitor_found);
    }

    const Register tmp2_owner_addr = tmp2;
    const Register tmp3_owner = tmp3;

    // When the table is not in use tmp1 is the tagged mark word, so the tag
    // value is folded into the field offsets; with the table tmp1 is used as
    // loaded and the adjustment is zero.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address owner_address(tmp1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
    const Address recursions_address(tmp1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);

    Label monitor_locked;

    // Compute owner address.
    la(tmp2_owner_addr, owner_address);

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    Register tid = tmp4;
    ld(tid, Address(xthread, JavaThread::monitor_owner_id_offset()));
    cmpxchg(/*addr*/ tmp2_owner_addr, /*expected*/ zr, /*new*/ tid, Assembler::int64,
            /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ tmp3_owner);
    // A zero result means the owner field was empty and is now ours.
    beqz(tmp3_owner, monitor_locked);

    // Check if recursive.
    // Any previous owner other than this thread's id sends us to the slow path.
    bne(tmp3_owner, tid, slow_path);

    // Recursive.
    increment(recursions_address, 1, tmp2, tmp3);

    bind(monitor_locked);
    if (UseObjectMonitorTable) {
      // Record the monitor in the box's cache slot (cleared on entry above).
      sd(tmp1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
    }
  }

  bind(locked);
  mv(flag, zr);

#ifdef ASSERT
  // Check that locked label is reached with flag == 0.
  Label flag_correct;
  beqz(flag, flag_correct);
  stop("Fast Lock Flag != 0");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with flag != 0.
  bnez(flag, flag_correct);
  stop("Fast Lock Flag == 0");
  bind(flag_correct);
#endif
  // C2 uses the value of flag (0 vs !0) to determine the continuation.
}
 226 
 227 void C2_MacroAssembler::fast_unlock(Register obj, Register box,
 228                                     Register tmp1, Register tmp2, Register tmp3) {
 229   // Flag register, zero for success; non-zero for failure.
 230   Register flag = t1;
 231 
 232   assert_different_registers(obj, box, tmp1, tmp2, tmp3, flag, t0);
 233 
 234   mv(flag, 1);
 235 
 236   // Handle inflated monitor.
 237   Label inflated, inflated_load_mark;
 238   // Finish fast unlock successfully. unlocked MUST branch to with flag == 0
 239   Label unlocked;
 240   // Finish fast unlock unsuccessfully. MUST branch to with flag != 0
 241   Label slow_path;
 242 
 243   const Register tmp1_mark = tmp1;
 244   const Register tmp2_top = tmp2;
 245   const Register tmp3_t = tmp3;
 246 
 247   { // Fast unlock
 248     Label push_and_slow_path;
 249 
 250     // Check if obj is top of lock-stack.
 251     lwu(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
 252     subw(tmp2_top, tmp2_top, oopSize);
 253     add(tmp3_t, xthread, tmp2_top);
 254     ld(tmp3_t, Address(tmp3_t));
 255     // Top of lock stack was not obj. Must be monitor.
 256     bne(obj, tmp3_t, inflated_load_mark);
 257 
 258     // Pop lock-stack.
 259     DEBUG_ONLY(add(tmp3_t, xthread, tmp2_top);)
 260     DEBUG_ONLY(sd(zr, Address(tmp3_t));)
 261     sw(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
 262 
 263     // Check if recursive.
 264     add(tmp3_t, xthread, tmp2_top);
 265     ld(tmp3_t, Address(tmp3_t, -oopSize));
 266     beq(obj, tmp3_t, unlocked);
 267 
 268     // Not recursive.
 269     // Load Mark.
 270     ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 271 
 272     // Check header for monitor (0b10).
 273     // Because we got here by popping (meaning we pushed in locked)
 274     // there will be no monitor in the box. So we need to push back the obj
 275     // so that the runtime can fix any potential anonymous owner.
 276     test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
 277     bnez(tmp3_t, UseObjectMonitorTable ? push_and_slow_path : inflated);
 278 
 279     // Try to unlock. Transition lock bits 0b00 => 0b01
 280     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
 281     ori(tmp3_t, tmp1_mark, markWord::unlocked_value);
 282     cmpxchg(/*addr*/ obj, /*expected*/ tmp1_mark, /*new*/ tmp3_t, Assembler::int64,
 283             /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, /*result*/ tmp3_t);
 284     beq(tmp1_mark, tmp3_t, unlocked);
 285 
 286     bind(push_and_slow_path);
 287     // Compare and exchange failed.
 288     // Restore lock-stack and handle the unlock in runtime.
 289     DEBUG_ONLY(add(tmp3_t, xthread, tmp2_top);)
 290     DEBUG_ONLY(sd(obj, Address(tmp3_t));)
 291     addw(tmp2_top, tmp2_top, oopSize);
 292     sd(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
 293     j(slow_path);
 294   }
 295 
 296   { // Handle inflated monitor.
 297     bind(inflated_load_mark);
 298     ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 299 #ifdef ASSERT
 300     test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
 301     bnez(tmp3_t, inflated);
 302     stop("Fast Unlock not monitor");
 303 #endif
 304 
 305     bind(inflated);
 306 
 307 #ifdef ASSERT
 308     Label check_done;
 309     subw(tmp2_top, tmp2_top, oopSize);
 310     mv(tmp3_t, in_bytes(JavaThread::lock_stack_base_offset()));
 311     blt(tmp2_top, tmp3_t, check_done);
 312     add(tmp3_t, xthread, tmp2_top);
 313     ld(tmp3_t, Address(tmp3_t));
 314     bne(obj, tmp3_t, inflated);
 315     stop("Fast Unlock lock on stack");
 316     bind(check_done);
 317 #endif
 318 
 319     const Register tmp1_monitor = tmp1;
 320 
 321     if (!UseObjectMonitorTable) {
 322       assert(tmp1_monitor == tmp1_mark, "should be the same here");
 323       // Untag the monitor.
 324       subi(tmp1_monitor, tmp1_mark, (int)markWord::monitor_value);
 325     } else {
 326       ld(tmp1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 327       // No valid pointer below alignof(ObjectMonitor*). Take the slow path.
 328       mv(tmp3_t, alignof(ObjectMonitor*));
 329       bltu(tmp1_monitor, tmp3_t, slow_path);
 330     }
 331 
 332     const Register tmp2_recursions = tmp2;
 333     Label not_recursive;
 334 
 335     // Check if recursive.
 336     ld(tmp2_recursions, Address(tmp1_monitor, ObjectMonitor::recursions_offset()));
 337     beqz(tmp2_recursions, not_recursive);
 338 
 339     // Recursive unlock.
 340     subi(tmp2_recursions, tmp2_recursions, 1);
 341     sd(tmp2_recursions, Address(tmp1_monitor, ObjectMonitor::recursions_offset()));
 342     j(unlocked);
 343 
 344     bind(not_recursive);
 345 
 346     const Register tmp2_owner_addr = tmp2;
 347 
 348     // Compute owner address.
 349     la(tmp2_owner_addr, Address(tmp1_monitor, ObjectMonitor::owner_offset()));
 350 
 351     // Set owner to null.
 352     // Release to satisfy the JMM
 353     membar(MacroAssembler::LoadStore | MacroAssembler::StoreStore);
 354     sd(zr, Address(tmp2_owner_addr));
 355     // We need a full fence after clearing owner to avoid stranding.
 356     // StoreLoad achieves this.
 357     membar(StoreLoad);
 358 
 359     // Check if the entry_list is empty.
 360     ld(t0, Address(tmp1_monitor, ObjectMonitor::entry_list_offset()));
 361     beqz(t0, unlocked); // If so we are done.
 362 
 363     // Check if there is a successor.
 364     ld(tmp3_t, Address(tmp1_monitor, ObjectMonitor::succ_offset()));
 365     bnez(tmp3_t, unlocked); // If so we are done.
 366 
 367     // Save the monitor pointer in the current thread, so we can try
 368     // to reacquire the lock in SharedRuntime::monitor_exit_helper().
 369     sd(tmp1_monitor, Address(xthread, JavaThread::unlocked_inflated_monitor_offset()));
 370 
 371     mv(flag, 1);
 372     j(slow_path);
 373   }
 374 
 375   bind(unlocked);
 376   mv(flag, zr);
 377 
 378 #ifdef ASSERT
 379   // Check that unlocked label is reached with flag == 0.
 380   Label flag_correct;
 381   beqz(flag, flag_correct);
 382   stop("Fast Lock Flag != 0");
 383 #endif
 384 
 385   bind(slow_path);
 386 #ifdef ASSERT
 387   // Check that slow_path label is reached with flag != 0.
 388   bnez(flag, flag_correct);
 389   stop("Fast Lock Flag == 0");
 390   bind(flag_correct);
 391 #endif
 392   // C2 uses the value of flag (0 vs !0) to determine the continuation.
 393 }
 394 
 395 // short string
 396 // StringUTF16.indexOfChar
 397 // StringLatin1.indexOfChar
 398 void C2_MacroAssembler::string_indexof_char_short(Register str1, Register cnt1,
 399                                                   Register ch, Register result,
 400                                                   bool isL)
 401 {
 402   Register ch1 = t0;
 403   Register index = t1;
 404 
 405   BLOCK_COMMENT("string_indexof_char_short {");
 406 
 407   Label LOOP, LOOP1, LOOP4, LOOP8;
 408   Label MATCH,  MATCH1, MATCH2, MATCH3,
 409         MATCH4, MATCH5, MATCH6, MATCH7, NOMATCH;
 410 
 411   mv(result, -1);
 412   mv(index, zr);
 413 
 414   bind(LOOP);
 415   addi(t0, index, 8);
 416   ble(t0, cnt1, LOOP8);
 417   addi(t0, index, 4);
 418   ble(t0, cnt1, LOOP4);
 419   j(LOOP1);
 420 
 421   bind(LOOP8);
 422   isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0));
 423   beq(ch, ch1, MATCH);
 424   isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2));
 425   beq(ch, ch1, MATCH1);
 426   isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4));
 427   beq(ch, ch1, MATCH2);
 428   isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6));
 429   beq(ch, ch1, MATCH3);
 430   isL ? lbu(ch1, Address(str1, 4)) : lhu(ch1, Address(str1, 8));
 431   beq(ch, ch1, MATCH4);
 432   isL ? lbu(ch1, Address(str1, 5)) : lhu(ch1, Address(str1, 10));
 433   beq(ch, ch1, MATCH5);
 434   isL ? lbu(ch1, Address(str1, 6)) : lhu(ch1, Address(str1, 12));
 435   beq(ch, ch1, MATCH6);
 436   isL ? lbu(ch1, Address(str1, 7)) : lhu(ch1, Address(str1, 14));
 437   beq(ch, ch1, MATCH7);
 438   addi(index, index, 8);
 439   addi(str1, str1, isL ? 8 : 16);
 440   blt(index, cnt1, LOOP);
 441   j(NOMATCH);
 442 
 443   bind(LOOP4);
 444   isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0));
 445   beq(ch, ch1, MATCH);
 446   isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2));
 447   beq(ch, ch1, MATCH1);
 448   isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4));
 449   beq(ch, ch1, MATCH2);
 450   isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6));
 451   beq(ch, ch1, MATCH3);
 452   addi(index, index, 4);
 453   addi(str1, str1, isL ? 4 : 8);
 454   bge(index, cnt1, NOMATCH);
 455 
 456   bind(LOOP1);
 457   isL ? lbu(ch1, Address(str1)) : lhu(ch1, Address(str1));
 458   beq(ch, ch1, MATCH);
 459   addi(index, index, 1);
 460   addi(str1, str1, isL ? 1 : 2);
 461   blt(index, cnt1, LOOP1);
 462   j(NOMATCH);
 463 
 464   bind(MATCH1);
 465   addi(index, index, 1);
 466   j(MATCH);
 467 
 468   bind(MATCH2);
 469   addi(index, index, 2);
 470   j(MATCH);
 471 
 472   bind(MATCH3);
 473   addi(index, index, 3);
 474   j(MATCH);
 475 
 476   bind(MATCH4);
 477   addi(index, index, 4);
 478   j(MATCH);
 479 
 480   bind(MATCH5);
 481   addi(index, index, 5);
 482   j(MATCH);
 483 
 484   bind(MATCH6);
 485   addi(index, index, 6);
 486   j(MATCH);
 487 
 488   bind(MATCH7);
 489   addi(index, index, 7);
 490 
 491   bind(MATCH);
 492   mv(result, index);
 493   bind(NOMATCH);
 494   BLOCK_COMMENT("} string_indexof_char_short");
 495 }
 496 
 497 // StringUTF16.indexOfChar
 498 // StringLatin1.indexOfChar
 499 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
 500                                             Register ch, Register result,
 501                                             Register tmp1, Register tmp2,
 502                                             Register tmp3, Register tmp4,
 503                                             bool isL)
 504 {
 505   Label CH1_LOOP, HIT, NOMATCH, DONE, DO_LONG;
 506   Register ch1 = t0;
 507   Register orig_cnt = t1;
 508   Register mask1 = tmp3;
 509   Register mask2 = tmp2;
 510   Register match_mask = tmp1;
 511   Register trailing_char = tmp4;
 512   Register unaligned_elems = tmp4;
 513 
 514   BLOCK_COMMENT("string_indexof_char {");
 515   beqz(cnt1, NOMATCH);
 516 
 517   subi(t0, cnt1, isL ? 32 : 16);
 518   bgtz(t0, DO_LONG);
 519   string_indexof_char_short(str1, cnt1, ch, result, isL);
 520   j(DONE);
 521 
 522   bind(DO_LONG);
 523   mv(orig_cnt, cnt1);
 524   if (AvoidUnalignedAccesses) {
 525     Label ALIGNED;
 526     andi(unaligned_elems, str1, 0x7);
 527     beqz(unaligned_elems, ALIGNED);
 528     sub(unaligned_elems, unaligned_elems, 8);
 529     neg(unaligned_elems, unaligned_elems);
 530     if (!isL) {
 531       srli(unaligned_elems, unaligned_elems, 1);
 532     }
 533     // do unaligned part per element
 534     string_indexof_char_short(str1, unaligned_elems, ch, result, isL);
 535     bgez(result, DONE);
 536     mv(orig_cnt, cnt1);
 537     sub(cnt1, cnt1, unaligned_elems);
 538     bind(ALIGNED);
 539   }
 540 
 541   // duplicate ch
 542   if (isL) {
 543     slli(ch1, ch, 8);
 544     orr(ch, ch1, ch);
 545   }
 546   slli(ch1, ch, 16);
 547   orr(ch, ch1, ch);
 548   slli(ch1, ch, 32);
 549   orr(ch, ch1, ch);
 550 
 551   if (!isL) {
 552     slli(cnt1, cnt1, 1);
 553   }
 554 
 555   uint64_t mask0101 = UCONST64(0x0101010101010101);
 556   uint64_t mask0001 = UCONST64(0x0001000100010001);
 557   mv(mask1, isL ? mask0101 : mask0001);
 558   uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f);
 559   uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff);
 560   mv(mask2, isL ? mask7f7f : mask7fff);
 561 
 562   bind(CH1_LOOP);
 563   ld(ch1, Address(str1));
 564   addi(str1, str1, 8);
 565   subi(cnt1, cnt1, 8);
 566   compute_match_mask(ch1, ch, match_mask, mask1, mask2);
 567   bnez(match_mask, HIT);
 568   bgtz(cnt1, CH1_LOOP);
 569   j(NOMATCH);
 570 
 571   bind(HIT);
 572   // count bits of trailing zero chars
 573   ctzc_bits(trailing_char, match_mask, isL, ch1, result);
 574   srli(trailing_char, trailing_char, 3);
 575   addi(cnt1, cnt1, 8);
 576   ble(cnt1, trailing_char, NOMATCH);
 577   // match case
 578   if (!isL) {
 579     srli(cnt1, cnt1, 1);
 580     srli(trailing_char, trailing_char, 1);
 581   }
 582 
 583   sub(result, orig_cnt, cnt1);
 584   add(result, result, trailing_char);
 585   j(DONE);
 586 
 587   bind(NOMATCH);
 588   mv(result, -1);
 589 
 590   bind(DONE);
 591   BLOCK_COMMENT("} string_indexof_char");
 592 }
 593 
 594 typedef void (MacroAssembler::* load_chr_insn)(Register rd, const Address &adr, Register temp);
 595 
 596 // Search for needle in haystack and return index or -1
 597 // x10: result
 598 // x11: haystack
 599 // x12: haystack_len
 600 // x13: needle
 601 // x14: needle_len
 602 void C2_MacroAssembler::string_indexof(Register haystack, Register needle,
 603                                        Register haystack_len, Register needle_len,
 604                                        Register tmp1, Register tmp2,
 605                                        Register tmp3, Register tmp4,
 606                                        Register tmp5, Register tmp6,
 607                                        Register result, int ae)
 608 {
 609   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
 610 
 611   Label LINEARSEARCH, LINEARSTUB, DONE, NOMATCH;
 612 
 613   Register ch1 = t0;
 614   Register ch2 = t1;
 615   Register nlen_tmp = tmp1; // needle len tmp
 616   Register hlen_tmp = tmp2; // haystack len tmp
 617   Register result_tmp = tmp4;
 618 
 619   bool isLL = ae == StrIntrinsicNode::LL;
 620 
 621   bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
 622   bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
 623   int needle_chr_shift = needle_isL ? 0 : 1;
 624   int haystack_chr_shift = haystack_isL ? 0 : 1;
 625   int needle_chr_size = needle_isL ? 1 : 2;
 626   int haystack_chr_size = haystack_isL ? 1 : 2;
 627   load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu :
 628                               (load_chr_insn)&MacroAssembler::lhu;
 629   load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu :
 630                                 (load_chr_insn)&MacroAssembler::lhu;
 631 
 632   BLOCK_COMMENT("string_indexof {");
 633 
 634   // Note, inline_string_indexOf() generates checks:
 635   // if (pattern.count > src.count) return -1;
 636   // if (pattern.count == 0) return 0;
 637 
 638   // We have two strings, a source string in haystack, haystack_len and a pattern string
 639   // in needle, needle_len. Find the first occurrence of pattern in source or return -1.
 640 
 641   // For larger pattern and source we use a simplified Boyer Moore algorithm.
 642   // With a small pattern and source we use linear scan.
 643 
 644   // needle_len >=8 && needle_len < 256 && needle_len < haystack_len/4, use bmh algorithm.
 645   sub(result_tmp, haystack_len, needle_len);
 646   // needle_len < 8, use linear scan
 647   sub(t0, needle_len, 8);
 648   bltz(t0, LINEARSEARCH);
 649   // needle_len >= 256, use linear scan
 650   sub(t0, needle_len, 256);
 651   bgez(t0, LINEARSTUB);
 652   // needle_len >= haystack_len/4, use linear scan
 653   srli(t0, haystack_len, 2);
 654   bge(needle_len, t0, LINEARSTUB);
 655 
 656   // Boyer-Moore-Horspool introduction:
 657   // The Boyer Moore alogorithm is based on the description here:-
 658   //
 659   // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
 660   //
 661   // This describes and algorithm with 2 shift rules. The 'Bad Character' rule
 662   // and the 'Good Suffix' rule.
 663   //
 664   // These rules are essentially heuristics for how far we can shift the
 665   // pattern along the search string.
 666   //
 667   // The implementation here uses the 'Bad Character' rule only because of the
 668   // complexity of initialisation for the 'Good Suffix' rule.
 669   //
 670   // This is also known as the Boyer-Moore-Horspool algorithm:
 671   //
 672   // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
 673   //
 674   // #define ASIZE 256
 675   //
 676   //    int bm(unsigned char *pattern, int m, unsigned char *src, int n) {
 677   //      int i, j;
 678   //      unsigned c;
 679   //      unsigned char bc[ASIZE];
 680   //
 681   //      /* Preprocessing */
 682   //      for (i = 0; i < ASIZE; ++i)
 683   //        bc[i] = m;
 684   //      for (i = 0; i < m - 1; ) {
 685   //        c = pattern[i];
 686   //        ++i;
 687   //        // c < 256 for Latin1 string, so, no need for branch
 688   //        #ifdef PATTERN_STRING_IS_LATIN1
 689   //        bc[c] = m - i;
 690   //        #else
 691   //        if (c < ASIZE) bc[c] = m - i;
 692   //        #endif
 693   //      }
 694   //
 695   //      /* Searching */
 696   //      j = 0;
 697   //      while (j <= n - m) {
 698   //        c = src[i+j];
 699   //        if (pattern[m-1] == c)
 700   //          int k;
 701   //          for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k);
 702   //          if (k < 0) return j;
 703   //          // c < 256 for Latin1 string, so, no need for branch
 704   //          #ifdef SOURCE_STRING_IS_LATIN1_AND_PATTERN_STRING_IS_LATIN1
 705   //          // LL case: (c< 256) always true. Remove branch
 706   //          j += bc[pattern[j+m-1]];
 707   //          #endif
 708   //          #ifdef SOURCE_STRING_IS_UTF_AND_PATTERN_STRING_IS_UTF
 709   //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
 710   //          if (c < ASIZE)
 711   //            j += bc[pattern[j+m-1]];
 712   //          else
 713   //            j += 1
 714   //          #endif
 715   //          #ifdef SOURCE_IS_UTF_AND_PATTERN_IS_LATIN1
 716   //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
 717   //          if (c < ASIZE)
 718   //            j += bc[pattern[j+m-1]];
 719   //          else
 720   //            j += m
 721   //          #endif
 722   //      }
 723   //      return -1;
 724   //    }
 725 
 726   // temp register:t0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, result
 727   Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
 728         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
 729 
 730   Register haystack_end = haystack_len;
 731   Register skipch = tmp2;
 732 
 733   // pattern length is >=8, so, we can read at least 1 register for cases when
 734   // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
 735   // UL case. We'll re-read last character in inner pre-loop code to have
 736   // single outer pre-loop load
 737   const int firstStep = isLL ? 7 : 3;
 738 
 739   const int ASIZE = 256;
 740   const int STORE_BYTES = 8; // 8 bytes stored per instruction(sd)
 741 
 742   subi(sp, sp, ASIZE);
 743 
 744   // init BC offset table with default value: needle_len
 745   slli(t0, needle_len, 8);
 746   orr(t0, t0, needle_len); // [63...16][needle_len][needle_len]
 747   slli(tmp1, t0, 16);
 748   orr(t0, tmp1, t0); // [63...32][needle_len][needle_len][needle_len][needle_len]
 749   slli(tmp1, t0, 32);
 750   orr(tmp5, tmp1, t0); // tmp5: 8 elements [needle_len]
 751 
 752   mv(ch1, sp);  // ch1 is t0
 753   mv(tmp6, ASIZE / STORE_BYTES); // loop iterations
 754 
 755   bind(BM_INIT_LOOP);
 756   // for (i = 0; i < ASIZE; ++i)
 757   //   bc[i] = m;
 758   for (int i = 0; i < 4; i++) {
 759     sd(tmp5, Address(ch1, i * wordSize));
 760   }
 761   addi(ch1, ch1, 32);
 762   subi(tmp6, tmp6, 4);
 763   bgtz(tmp6, BM_INIT_LOOP);
 764 
 765   subi(nlen_tmp, needle_len, 1); // m - 1, index of the last element in pattern
 766   Register orig_haystack = tmp5;
 767   mv(orig_haystack, haystack);
 768   // result_tmp = tmp4
 769   shadd(haystack_end, result_tmp, haystack, haystack_end, haystack_chr_shift);
 770   subi(ch2, needle_len, 1); // bc offset init value, ch2 is t1
 771   mv(tmp3, needle);
 772 
 773   //  for (i = 0; i < m - 1; ) {
 774   //    c = pattern[i];
 775   //    ++i;
 776   //    // c < 256 for Latin1 string, so, no need for branch
 777   //    #ifdef PATTERN_STRING_IS_LATIN1
 778   //    bc[c] = m - i;
 779   //    #else
 780   //    if (c < ASIZE) bc[c] = m - i;
 781   //    #endif
 782   //  }
 783   bind(BCLOOP);
 784   (this->*needle_load_1chr)(ch1, Address(tmp3), noreg);
 785   addi(tmp3, tmp3, needle_chr_size);
 786   if (!needle_isL) {
 787     // ae == StrIntrinsicNode::UU
 788     mv(tmp6, ASIZE);
 789     bgeu(ch1, tmp6, BCSKIP);
 790   }
 791   add(tmp4, sp, ch1);
 792   sb(ch2, Address(tmp4)); // store skip offset to BC offset table
 793 
 794   bind(BCSKIP);
 795   subi(ch2, ch2, 1); // for next pattern element, skip distance -1
 796   bgtz(ch2, BCLOOP);
 797 
 798   // tmp6: pattern end, address after needle
 799   shadd(tmp6, needle_len, needle, tmp6, needle_chr_shift);
 800   if (needle_isL == haystack_isL) {
 801     // load last 8 bytes (8LL/4UU symbols)
 802     ld(tmp6, Address(tmp6, -wordSize));
 803   } else {
 804     // UL: from UTF-16(source) search Latin1(pattern)
 805     lwu(tmp6, Address(tmp6, -wordSize / 2)); // load last 4 bytes(4 symbols)
 806     // convert Latin1 to UTF. eg: 0x0000abcd -> 0x0a0b0c0d
 807     // We'll have to wait until load completed, but it's still faster than per-character loads+checks
 808     srli(tmp3, tmp6, BitsPerByte * (wordSize / 2 - needle_chr_size)); // pattern[m-1], eg:0x0000000a
 809     slli(ch2, tmp6, XLEN - 24);
 810     srli(ch2, ch2, XLEN - 8); // pattern[m-2], 0x0000000b
 811     slli(ch1, tmp6, XLEN - 16);
 812     srli(ch1, ch1, XLEN - 8); // pattern[m-3], 0x0000000c
 813     zext(tmp6, tmp6, 8); // pattern[m-4], 0x0000000d
 814     slli(ch2, ch2, 16);
 815     orr(ch2, ch2, ch1); // 0x00000b0c
 816     slli(result, tmp3, 48); // use result as temp register
 817     orr(tmp6, tmp6, result); // 0x0a00000d
 818     slli(result, ch2, 16);
 819     orr(tmp6, tmp6, result); // UTF-16:0x0a0b0c0d
 820   }
 821 
 822   // i = m - 1;
 823   // skipch = j + i;
 824   // if (skipch == pattern[m - 1]
 825   //   for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k);
 826   // else
 827   //   move j with bad char offset table
 828   bind(BMLOOPSTR2);
 829   // compare pattern to source string backward
 830   shadd(result, nlen_tmp, haystack, result, haystack_chr_shift);
 831   (this->*haystack_load_1chr)(skipch, Address(result), noreg);
 832   subi(nlen_tmp, nlen_tmp, firstStep); // nlen_tmp is positive here, because needle_len >= 8
 833   if (needle_isL == haystack_isL) {
 834     // re-init tmp3. It's for free because it's executed in parallel with
 835     // load above. Alternative is to initialize it before loop, but it'll
 836     // affect performance on in-order systems with 2 or more ld/st pipelines
 837     srli(tmp3, tmp6, BitsPerByte * (wordSize - needle_chr_size)); // UU/LL: pattern[m-1]
 838   }
 839   if (!isLL) { // UU/UL case
 840     slli(ch2, nlen_tmp, 1); // offsets in bytes
 841   }
 842   bne(tmp3, skipch, BMSKIP); // if not equal, skipch is bad char
 843   add(result, haystack, isLL ? nlen_tmp : ch2);
 844   // load 8 bytes from source string
 845   // if isLL is false then read granularity can be 2
 846   load_long_misaligned(ch2, Address(result), ch1, isLL ? 1 : 2); // can use ch1 as temp register here as it will be trashed by next mv anyway
 847   mv(ch1, tmp6);
 848   if (isLL) {
 849     j(BMLOOPSTR1_AFTER_LOAD);
 850   } else {
 851     subi(nlen_tmp, nlen_tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
 852     j(BMLOOPSTR1_CMP);
 853   }
 854 
 855   bind(BMLOOPSTR1);
 856   shadd(ch1, nlen_tmp, needle, ch1, needle_chr_shift);
 857   (this->*needle_load_1chr)(ch1, Address(ch1), noreg);
 858   shadd(ch2, nlen_tmp, haystack, ch2, haystack_chr_shift);
 859   (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
 860 
 861   bind(BMLOOPSTR1_AFTER_LOAD);
 862   subi(nlen_tmp, nlen_tmp, 1);
 863   bltz(nlen_tmp, BMLOOPSTR1_LASTCMP);
 864 
 865   bind(BMLOOPSTR1_CMP);
 866   beq(ch1, ch2, BMLOOPSTR1);
 867 
 868   bind(BMSKIP);
 869   if (!isLL) {
 870     // if we've met UTF symbol while searching Latin1 pattern, then we can
 871     // skip needle_len symbols
 872     if (needle_isL != haystack_isL) {
 873       mv(result_tmp, needle_len);
 874     } else {
 875       mv(result_tmp, 1);
 876     }
 877     mv(t0, ASIZE);
 878     bgeu(skipch, t0, BMADV);
 879   }
 880   add(result_tmp, sp, skipch);
 881   lbu(result_tmp, Address(result_tmp)); // load skip offset
 882 
 883   bind(BMADV);
 884   subi(nlen_tmp, needle_len, 1);
 885   // move haystack after bad char skip offset
 886   shadd(haystack, result_tmp, haystack, result, haystack_chr_shift);
 887   ble(haystack, haystack_end, BMLOOPSTR2);
 888   addi(sp, sp, ASIZE);
 889   j(NOMATCH);
 890 
 891   bind(BMLOOPSTR1_LASTCMP);
 892   bne(ch1, ch2, BMSKIP);
 893 
 894   bind(BMMATCH);
 895   sub(result, haystack, orig_haystack);
 896   if (!haystack_isL) {
 897     srli(result, result, 1);
 898   }
 899   addi(sp, sp, ASIZE);
 900   j(DONE);
 901 
 902   bind(LINEARSTUB);
 903   subi(t0, needle_len, 16); // small patterns still should be handled by simple algorithm
 904   bltz(t0, LINEARSEARCH);
 905   mv(result, zr);
 906   RuntimeAddress stub = nullptr;
 907   if (isLL) {
 908     stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ll());
 909     assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
 910   } else if (needle_isL) {
 911     stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ul());
 912     assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
 913   } else {
 914     stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_uu());
 915     assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
 916   }
 917   address call = reloc_call(stub);
 918   if (call == nullptr) {
 919     DEBUG_ONLY(reset_labels(LINEARSEARCH, DONE, NOMATCH));
 920     ciEnv::current()->record_failure("CodeCache is full");
 921     return;
 922   }
 923   j(DONE);
 924 
 925   bind(NOMATCH);
 926   mv(result, -1);
 927   j(DONE);
 928 
 929   bind(LINEARSEARCH);
 930   string_indexof_linearscan(haystack, needle, haystack_len, needle_len, tmp1, tmp2, tmp3, tmp4, -1, result, ae);
 931 
 932   bind(DONE);
 933   BLOCK_COMMENT("} string_indexof");
 934 }
 935 
 936 // string_indexof
 937 // result: x10
 938 // src: x11
 939 // src_count: x12
 940 // pattern: x13
 941 // pattern_count: x14 or 1/2/3/4
     //
     // Emits a linear-scan search of the needle inside the haystack.
     //
     // needle_con_cnt == -1: the needle length is read from the needle_len register and
     //   the generic loop plus the short-needle paths (DO1, and DO2/DO3 when both strings
     //   use the same encoding) are emitted.
     // needle_con_cnt in [1, 4]: only the loop specialised for that constant length is
     //   emitted.
     //
     // On exit result holds -1 (NOMATCH) or result_tmp plus the haystack byte offset
     // scaled back to characters (MATCH).
     //
     // The registers haystack, needle, haystack_len, needle_len, tmp1..tmp4, t0 and t1
     // are overwritten by the emitted code.
     //
     // NOTE(review): on the needle_con_cnt == -1 path tmp4 (result_tmp) is read before
     // it is written in this function, so the caller must have initialised it --
     // presumably to haystack_len - needle_len; confirm against the caller.
 942 void C2_MacroAssembler::string_indexof_linearscan(Register haystack, Register needle,
 943                                                Register haystack_len, Register needle_len,
 944                                                Register tmp1, Register tmp2,
 945                                                Register tmp3, Register tmp4,
 946                                                int needle_con_cnt, Register result, int ae)
 947 {
 948   // Note:
 949   // needle_con_cnt > 0 means needle_len register is invalid, needle length is constant
 950   // for UU/LL: needle_con_cnt[1, 4], UL: needle_con_cnt = 1
 951   assert(needle_con_cnt <= 4, "Invalid needle constant count");
 952   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
 953 
 954   Register ch1 = t0;
 955   Register ch2 = t1;
       // hlen_neg/nlen_neg alias the length registers: once the lengths have been consumed
       // they are overwritten with negative byte offsets that count up towards zero.
 956   Register hlen_neg = haystack_len, nlen_neg = needle_len;
 957   Register nlen_tmp = tmp1, hlen_tmp = tmp2, result_tmp = tmp4;
 958 
 959   bool isLL = ae == StrIntrinsicNode::LL;
 960 
 961   bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
 962   bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
       // shift/size pairs: Latin1 chars are 1 byte, UTF-16 chars are 2 bytes
 963   int needle_chr_shift = needle_isL ? 0 : 1;
 964   int haystack_chr_shift = haystack_isL ? 0 : 1;
 965   int needle_chr_size = needle_isL ? 1 : 2;
 966   int haystack_chr_size = haystack_isL ? 1 : 2;
 967 
       // load instructions for 1, 2 and 4 characters; the 2/4-char variants are selected
       // by isLL only
 968   load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu :
 969                               (load_chr_insn)&MacroAssembler::lhu;
 970   load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu :
 971                                 (load_chr_insn)&MacroAssembler::lhu;
 972   load_chr_insn load_2chr = isLL ? (load_chr_insn)&MacroAssembler::lhu : (load_chr_insn)&MacroAssembler::lwu;
 973   load_chr_insn load_4chr = isLL ? (load_chr_insn)&MacroAssembler::lwu : (load_chr_insn)&MacroAssembler::ld;
 974 
 975   Label DO1, DO2, DO3, MATCH, NOMATCH, DONE;
 976 
 977   Register first = tmp3;
 978 
       // Generic path: needle length only known at run time.
 979   if (needle_con_cnt == -1) {
 980     Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
 981 
         // needles shorter than 4 chars (same encoding) or 2 chars (mixed encoding)
         // are handled by the short paths
 982     subi(t0, needle_len, needle_isL == haystack_isL ? 4 : 2);
 983     bltz(t0, DOSHORT);
 984 
         // first = needle[0]; then move needle to its end and haystack forward by
         // result_tmp chars, keeping the negated byte distances in nlen_neg/hlen_neg
 985     (this->*needle_load_1chr)(first, Address(needle), noreg);
 986     slli(t0, needle_len, needle_chr_shift);
 987     add(needle, needle, t0);
 988     neg(nlen_neg, t0);
 989     slli(t0, result_tmp, haystack_chr_shift);
 990     add(haystack, haystack, t0);
 991     neg(hlen_neg, t0);
 992 
         // look for the first needle char in the haystack
 993     bind(FIRST_LOOP);
 994     add(t0, haystack, hlen_neg);
 995     (this->*haystack_load_1chr)(ch2, Address(t0), noreg);
 996     beq(first, ch2, STR1_LOOP);
 997 
         // advance one haystack char; keep scanning while the offset is still <= 0
 998     bind(STR2_NEXT);
 999     addi(hlen_neg, hlen_neg, haystack_chr_size);
1000     blez(hlen_neg, FIRST_LOOP);
1001     j(NOMATCH);
1002 
         // first char matched: nlen_tmp/hlen_tmp index the char after it in each string
1003     bind(STR1_LOOP);
1004     addi(nlen_tmp, nlen_neg, needle_chr_size);
1005     addi(hlen_tmp, hlen_neg, haystack_chr_size);
1006     bgez(nlen_tmp, MATCH);
1007 
         // compare the remaining needle chars; any mismatch resumes the outer scan
1008     bind(STR1_NEXT);
1009     add(ch1, needle, nlen_tmp);
1010     (this->*needle_load_1chr)(ch1, Address(ch1), noreg);
1011     add(ch2, haystack, hlen_tmp);
1012     (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
1013     bne(ch1, ch2, STR2_NEXT);
1014     addi(nlen_tmp, nlen_tmp, needle_chr_size);
1015     addi(hlen_tmp, hlen_tmp, haystack_chr_size);
1016     bltz(nlen_tmp, STR1_NEXT);
1017     j(MATCH);
1018 
         // Short needle dispatch. Same encoding: needle_len < 2 goes to DO1, > 2 goes to
         // DO3, otherwise execution falls through into the DO2 block emitted below.
         // Mixed encoding: nothing is emitted here and execution falls through to DO1.
1019     bind(DOSHORT);
1020     if (needle_isL == haystack_isL) {
1021       subi(t0, needle_len, 2);
1022       bltz(t0, DO1);
1023       bgtz(t0, DO3);
1024     }
1025   }
1026 
       // Constant 4-char needle: ch1 holds the whole needle, ch2 a 4-char haystack window.
1027   if (needle_con_cnt == 4) {
1028     Label CH1_LOOP;
1029     (this->*load_4chr)(ch1, Address(needle), noreg);
1030     subi(result_tmp, haystack_len, 4);
1031     slli(tmp3, result_tmp, haystack_chr_shift); // tmp3 used as a temp
1032     add(haystack, haystack, tmp3);
1033     neg(hlen_neg, tmp3);
1034     if (AvoidUnalignedAccesses) {
1035       // preload first value, then we will read by 1 character per loop, instead of four
1036       // just shifting previous ch2 right by size of character in bits
1037       add(tmp3, haystack, hlen_neg);
1038       (this->*load_4chr)(ch2, Address(tmp3), noreg);
1039       if (isLL) {
1040         // need to erase 1 most significant byte in 32-bit value of ch2
1041         slli(ch2, ch2, 40);
1042         srli(ch2, ch2, 32);
1043       } else {
1044         slli(ch2, ch2, 16); // 2 most significant bytes will be erased by this operation
1045       }
1046     }
1047 
1048     bind(CH1_LOOP);
1049     add(tmp3, haystack, hlen_neg);
1050     if (AvoidUnalignedAccesses) {
           // slide the window: drop the oldest char, load the 4th char of the window
           // and insert it at the top
1051       srli(ch2, ch2, isLL ? 8 : 16);
1052       (this->*haystack_load_1chr)(tmp3, Address(tmp3, isLL ? 3 : 6), noreg);
1053       slli(tmp3, tmp3, isLL ? 24 : 48);
1054       add(ch2, ch2, tmp3);
1055     } else {
1056       (this->*load_4chr)(ch2, Address(tmp3), noreg);
1057     }
1058     beq(ch1, ch2, MATCH);
1059     addi(hlen_neg, hlen_neg, haystack_chr_size);
1060     blez(hlen_neg, CH1_LOOP);
1061     j(NOMATCH);
1062   }
1063 
       // 2-char needle: emitted for a constant count of 2 and as the fall-through target
       // of DOSHORT on the generic same-encoding path.
1064   if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 2) {
1065     Label CH1_LOOP;
1066     BLOCK_COMMENT("string_indexof DO2 {");
1067     bind(DO2);
1068     (this->*load_2chr)(ch1, Address(needle), noreg);
         // on the generic path result_tmp is not recomputed here
1069     if (needle_con_cnt == 2) {
1070       subi(result_tmp, haystack_len, 2);
1071     }
1072     slli(tmp3, result_tmp, haystack_chr_shift);
1073     add(haystack, haystack, tmp3);
1074     neg(hlen_neg, tmp3);
1075     if (AvoidUnalignedAccesses) {
1076       // preload first value, then we will read by 1 character per loop, instead of two
1077       // just shifting previous ch2 right by size of character in bits
1078       add(tmp3, haystack, hlen_neg);
1079       (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg);
1080       slli(ch2, ch2, isLL ? 8 : 16);
1081     }
1082     bind(CH1_LOOP);
1083     add(tmp3, haystack, hlen_neg);
1084     if (AvoidUnalignedAccesses) {
           // slide the 2-char window by one char using single-char loads only
1085       srli(ch2, ch2, isLL ? 8 : 16);
1086       (this->*haystack_load_1chr)(tmp3, Address(tmp3, isLL ? 1 : 2), noreg);
1087       slli(tmp3, tmp3, isLL ? 8 : 16);
1088       add(ch2, ch2, tmp3);
1089     } else {
1090       (this->*load_2chr)(ch2, Address(tmp3), noreg);
1091     }
1092     beq(ch1, ch2, MATCH);
1093     addi(hlen_neg, hlen_neg, haystack_chr_size);
1094     blez(hlen_neg, CH1_LOOP);
1095     j(NOMATCH);
1096     BLOCK_COMMENT("} string_indexof DO2");
1097   }
1098 
       // 3-char needle: the first two chars are matched as a pair (first), the third
       // char (ch1) is checked separately.
1099   if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 3) {
1100     Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
1101     BLOCK_COMMENT("string_indexof DO3 {");
1102 
1103     bind(DO3);
1104     (this->*load_2chr)(first, Address(needle), noreg);
1105     (this->*needle_load_1chr)(ch1, Address(needle, 2 * needle_chr_size), noreg);
         // on the generic path result_tmp is not recomputed here
1106     if (needle_con_cnt == 3) {
1107       subi(result_tmp, haystack_len, 3);
1108     }
1109     slli(hlen_tmp, result_tmp, haystack_chr_shift);
1110     add(haystack, haystack, hlen_tmp);
1111     neg(hlen_neg, hlen_tmp);
1112 
1113     bind(FIRST_LOOP);
1114     add(ch2, haystack, hlen_neg);
1115     if (AvoidUnalignedAccesses) {
1116       (this->*haystack_load_1chr)(tmp2, Address(ch2, isLL ? 1 : 2), noreg); // we need a temp register, we can safely use hlen_tmp here, which is a synonym for tmp2
1117       (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
1118       slli(tmp2, tmp2, isLL ? 8 : 16);
1119       add(ch2, ch2, tmp2);
1120     } else {
1121       (this->*load_2chr)(ch2, Address(ch2), noreg);
1122     }
1123     beq(first, ch2, STR1_LOOP);
1124 
1125     bind(STR2_NEXT);
1126     addi(hlen_neg, hlen_neg, haystack_chr_size);
1127     blez(hlen_neg, FIRST_LOOP);
1128     j(NOMATCH);
1129 
         // first two chars matched: check the third one
1130     bind(STR1_LOOP);
1131     addi(hlen_tmp, hlen_neg, 2 * haystack_chr_size);
1132     add(ch2, haystack, hlen_tmp);
1133     (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
1134     bne(ch1, ch2, STR2_NEXT);
1135     j(MATCH);
1136     BLOCK_COMMENT("} string_indexof DO3");
1137   }
1138 
       // 1-char needle: plain character scan. When the loop ends without a match,
       // execution falls through into NOMATCH below.
1139   if (needle_con_cnt == -1 || needle_con_cnt == 1) {
1140     Label DO1_LOOP;
1141 
1142     BLOCK_COMMENT("string_indexof DO1 {");
1143     bind(DO1);
1144     (this->*needle_load_1chr)(ch1, Address(needle), noreg);
1145     subi(result_tmp, haystack_len, 1);
1146     slli(tmp3, result_tmp, haystack_chr_shift);
1147     add(haystack, haystack, tmp3);
1148     neg(hlen_neg, tmp3);
1149 
1150     bind(DO1_LOOP);
1151     add(tmp3, haystack, hlen_neg);
1152     (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg);
1153     beq(ch1, ch2, MATCH);
1154     addi(hlen_neg, hlen_neg, haystack_chr_size);
1155     blez(hlen_neg, DO1_LOOP);
1156     BLOCK_COMMENT("} string_indexof DO1");
1157   }
1158 
1159   bind(NOMATCH);
1160   mv(result, -1);
1161   j(DONE);
1162 
       // result = result_tmp + (hlen_neg scaled from bytes back to chars); hlen_neg is
       // non-positive here, hence the arithmetic shift
1163   bind(MATCH);
1164   srai(t0, hlen_neg, haystack_chr_shift);
1165   add(result, result_tmp, t0);
1166 
1167   bind(DONE);
1168 }
1169 
1170 // Compare longwords
     //
     // Emits the 8-bytes-at-a-time comparison for two strings with the same encoding
     // (isLL: 1 byte per char, otherwise 2 bytes per char). cnt2 is the number of
     // characters to compare.
     //
     // Control leaves the emitted code through one of the supplied labels:
     //   *STUB         - cnt2 >= STUB_THRESHOLD
     //   *SHORT_STRING - after the optional 4-byte alignment step no more than
     //                   minCharsInWord characters remain
     //   *DONE         - comparison finished
     // result is written only when a difference is found (difference of the first
     // differing characters); otherwise it is left untouched by this code.
     // str1, str2, cnt2, tmp1, tmp2, tmp3 and t0 are overwritten; cnt1 is not used here.
1171 void C2_MacroAssembler::string_compare_long_same_encoding(Register result, Register str1, Register str2,
1172                                                   const bool isLL, Register cnt1, Register cnt2,
1173                                                   Register tmp1, Register tmp2, Register tmp3,
1174                                                   const int STUB_THRESHOLD, Label *STUB, Label *SHORT_STRING, Label *DONE) {
1175   Label TAIL_CHECK, TAIL, NEXT_WORD, DIFFERENCE;
1176 
1177   const int base_offset = arrayOopDesc::base_offset_in_bytes(T_BYTE);
1178   assert((base_offset % (UseCompactObjectHeaders ? 4 : 8)) == 0, "Must be");
1179 
       // number of characters held by one 8-byte word
1180   const int minCharsInWord = isLL ? wordSize : wordSize / 2;
1181 
1182   // load first parts of strings and finish initialization while loading
       // same address in both registers: nothing to compare
1183   beq(str1, str2, *DONE);
1184   // Alignment
       // When the array base is not a multiple of 8, compare the first 4 bytes on their
       // own and step both pointers past them before the 8-byte loads below.
1185   if (AvoidUnalignedAccesses && (base_offset % 8) != 0) {
1186     lwu(tmp1, Address(str1));
1187     lwu(tmp2, Address(str2));
1188     bne(tmp1, tmp2, DIFFERENCE);
1189     addi(str1, str1, 4);
1190     addi(str2, str2, 4);
1191     subi(cnt2, cnt2, minCharsInWord / 2);
1192 
1193     // A very short string
1194     mv(t0, minCharsInWord);
1195     ble(cnt2, t0, *SHORT_STRING);
1196   }
1197 #ifdef ASSERT
       // debug builds: stop if either pointer is not 8-byte aligned at this point
1198   if (AvoidUnalignedAccesses) {
1199     Label align_ok;
1200     orr(t0, str1, str2);
1201     andi(t0, t0, 0x7);
1202     beqz(t0, align_ok);
1203     stop("bad alignment");
1204     bind(align_ok);
1205   }
1206 #endif
1207   // load 8 bytes once to compare
1208   ld(tmp1, Address(str1));
1209   ld(tmp2, Address(str2));
1210   mv(t0, STUB_THRESHOLD);
1211   bge(cnt2, t0, *STUB);
       // exactly one word of characters: only the word just loaded needs checking
1212   subi(cnt2, cnt2, minCharsInWord);
1213   beqz(cnt2, TAIL_CHECK);
1214   // convert cnt2 from characters to bytes
1215   if (!isLL) {
1216     slli(cnt2, cnt2, 1);
1217   }
       // str1/str2 now point at the last 8 bytes of the compared range; cnt2 becomes the
       // negative byte offset (from there) of the second word
1218   add(str2, str2, cnt2);
1219   add(str1, str1, cnt2);
1220   sub(cnt2, zr, cnt2);
1221   addi(cnt2, cnt2, 8);
1222   bne(tmp1, tmp2, DIFFERENCE);
1223   bgez(cnt2, TAIL);
1224 
1225   // main loop
1226   bind(NEXT_WORD);
1227     // 8-byte aligned loads when AvoidUnalignedAccesses is enabled
1228     add(t0, str1, cnt2);
1229     ld(tmp1, Address(t0));
1230     add(t0, str2, cnt2);
1231     ld(tmp2, Address(t0));
1232     addi(cnt2, cnt2, 8);
1233     bne(tmp1, tmp2, DIFFERENCE);
1234     bltz(cnt2, NEXT_WORD);
1235 
       // last 8 bytes of the range; may overlap bytes already compared by the loop
1236   bind(TAIL);
1237   load_long_misaligned(tmp1, Address(str1), tmp3, isLL ? 1 : 2);
1238   load_long_misaligned(tmp2, Address(str2), tmp3, isLL ? 1 : 2);
1239 
1240   bind(TAIL_CHECK);
1241   beq(tmp1, tmp2, *DONE);
1242 
1243   // Find the first different characters in the longwords and
1244   // compute their difference.
1245   bind(DIFFERENCE);
1246   xorr(tmp3, tmp1, tmp2);
1247   // count bits of trailing zero chars
1248   ctzc_bits(result, tmp3, isLL);
       // shift the first differing character down to bit 0 and isolate it
1249   srl(tmp1, tmp1, result);
1250   srl(tmp2, tmp2, result);
1251   if (isLL) {
1252     zext(tmp1, tmp1, 8);
1253     zext(tmp2, tmp2, 8);
1254   } else {
1255     zext(tmp1, tmp1, 16);
1256     zext(tmp2, tmp2, 16);
1257   }
1258   sub(result, tmp1, tmp2);
1259 
1260   j(*DONE);
1261 }
1262 
1263 // Compare longwords
     //
     // Emits the comparison for two strings with different encodings. isLU selects which
     // argument is the 1-byte-per-char string (strL) and which is the 2-byte-per-char
     // string (strU). Each step loads 4 bytes from strL, passes them through
     // inflate_lo32 (presumably widening each byte to 16 bits -- confirm against its
     // definition) and compares the result with 8 bytes loaded from strU.
     //
     // Branches to *STUB when cnt2 >= STUB_THRESHOLD and to *DONE when finished.
     // result is written only when a difference is found; the operand order of the final
     // subtraction follows isLU so that it is always (str1 char) - (str2 char).
     // str1, str2, cnt1, cnt2, tmp1, tmp2, tmp3 and t0 are overwritten; the incoming
     // value of cnt1 is not read.
1264 void C2_MacroAssembler::string_compare_long_different_encoding(Register result, Register str1, Register str2,
1265                                                bool isLU, Register cnt1, Register cnt2,
1266                                                Register tmp1, Register tmp2, Register tmp3,
1267                                                const int STUB_THRESHOLD, Label *STUB, Label *DONE) {
1268   Label TAIL, NEXT_WORD, DIFFERENCE;
1269 
1270   const int base_offset = arrayOopDesc::base_offset_in_bytes(T_BYTE);
1271   assert((base_offset % (UseCompactObjectHeaders ? 4 : 8)) == 0, "Must be");
1272 
1273   Register strL = isLU ? str1 : str2;
1274   Register strU = isLU ? str2 : str1;
1275   Register tmpL = tmp1, tmpU = tmp2;
1276 
1277   // load first parts of strings and finish initialization while loading
1278   mv(t0, STUB_THRESHOLD);
1279   bge(cnt2, t0, *STUB);
1280   lwu(tmpL, Address(strL));
       // last argument: load granularity, 4 when the array base is not a multiple of 8
1281   load_long_misaligned(tmpU, Address(strU), tmp3, (base_offset % 8) != 0 ? 4 : 8);
       // Point strL/strU at the last 4 characters of the compared range and keep two
       // negative byte offsets: cnt1 for strL (1 byte per char), cnt2 for strU (2 bytes
       // per char). Both end up addressing the second group of 4 characters.
1282   subi(cnt2, cnt2, 4);
1283   add(strL, strL, cnt2);
1284   sub(cnt1, zr, cnt2);
1285   slli(cnt2, cnt2, 1);
1286   add(strU, strU, cnt2);
1287   inflate_lo32(tmp3, tmpL);
1288   mv(tmpL, tmp3);
1289   sub(cnt2, zr, cnt2);
1290   addi(cnt1, cnt1, 4);
1291   addi(cnt2, cnt2, 8);
1292   bne(tmpL, tmpU, DIFFERENCE);
1293   bgez(cnt2, TAIL);
1294 
1295   // main loop
1296   bind(NEXT_WORD);
1297     add(t0, strL, cnt1);
1298     lwu(tmpL, Address(t0));
1299     add(t0, strU, cnt2);
1300     load_long_misaligned(tmpU, Address(t0), tmp3, (base_offset % 8) != 0 ? 4 : 8);
1301     addi(cnt1, cnt1, 4);
1302     inflate_lo32(tmp3, tmpL);
1303     mv(tmpL, tmp3);
1304     addi(cnt2, cnt2, 8);
1305     bne(tmpL, tmpU, DIFFERENCE);
1306     bltz(cnt2, NEXT_WORD);
1307 
       // last 4 characters of the range; may overlap characters already compared
1308   bind(TAIL);
1309   load_int_misaligned(tmpL, Address(strL), tmp3, false);
1310   load_long_misaligned(tmpU, Address(strU), tmp3, 2);
1311   inflate_lo32(tmp3, tmpL);
1312   mv(tmpL, tmp3);
1313 
1314   beq(tmpL, tmpU, *DONE);
1315 
1316   // Find the first different characters in the longwords and
1317   // compute their difference.
1318   bind(DIFFERENCE);
1319   xorr(tmp3, tmpL, tmpU);
1320   // count bits of trailing zero chars
1321   ctzc_bits(result, tmp3);
       // shift the first differing 16-bit character down to bit 0 and isolate it
1322   srl(tmpL, tmpL, result);
1323   srl(tmpU, tmpU, result);
1324   zext(tmpL, tmpL, 16);
1325   zext(tmpU, tmpU, 16);
1326   if (isLU) {
1327     sub(result, tmpL, tmpU);
1328   } else {
1329     sub(result, tmpU, tmpL);
1330   }
1331 
1332   j(*DONE);
1333 }
1334 
1335 // Compare strings.
     //
     // Emits code that compares str1 with str2 and leaves the outcome in result.
     // ae selects the encoding pair (LL, UU, LU or UL). result is first set to
     // cnt1 - cnt2 (in characters) and is overwritten with the difference of the first
     // mismatching characters if one is found within the shorter length.
     //
     // Strings longer than minCharsInWord are handled by the word-wise helpers, which
     // branch to the out-of-line stub once the length reaches STUB_THRESHOLD; shorter
     // ones use the character loop at SHORT_STRING.
     //
     // str1, str2, cnt1, cnt2, tmp1, tmp2, tmp3 and t0 are overwritten. If the stub call
     // cannot be emitted, "CodeCache is full" is recorded on the current ciEnv and the
     // function returns early.
1336 void C2_MacroAssembler::string_compare(Register str1, Register str2,
1337                                        Register cnt1, Register cnt2, Register result,
1338                                        Register tmp1, Register tmp2, Register tmp3,
1339                                        int ae)
1340 {
1341   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, STUB,
1342         SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
1343         SHORT_LOOP_START, L;
1344 
1345   const int STUB_THRESHOLD = 64 + 8;
1346   bool isLL = ae == StrIntrinsicNode::LL;
1347   bool isLU = ae == StrIntrinsicNode::LU;
1348   bool isUL = ae == StrIntrinsicNode::UL;
1349 
1350   bool str1_isL = isLL || isLU;
1351   bool str2_isL = isLL || isUL;
1352 
1353   // for L strings, 1 byte for 1 character
1354   // for U strings, 2 bytes for 1 character
1355   int str1_chr_size = str1_isL ? 1 : 2;
1356   int str2_chr_size = str2_isL ? 1 : 2;
1357   int minCharsInWord = isLL ? wordSize : wordSize / 2;
1358 
1359   load_chr_insn str1_load_chr = str1_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu;
1360   load_chr_insn str2_load_chr = str2_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu;
1361 
1362   BLOCK_COMMENT("string_compare {");
1363 
1364   // Bizarrely, the counts are passed in bytes, regardless of whether they
1365   // are L or U strings, however the result is always in characters.
1366   if (!str1_isL) {
1367     sraiw(cnt1, cnt1, 1);
1368   }
1369   if (!str2_isL) {
1370     sraiw(cnt2, cnt2, 1);
1371   }
1372 
1373   // Compute the minimum of the string lengths and save the difference in result.
       // cnt2 = min(cnt1, cnt2) from here on
1374   sub(result, cnt1, cnt2);
1375   bgt(cnt1, cnt2, L);
1376   mv(cnt2, cnt1);
1377   bind(L);
1378 
1379   // A very short string
1380   mv(t0, minCharsInWord);
1381   ble(cnt2, t0, SHORT_STRING);
1382 
1383   // Compare longwords
       // Both helpers end with a jump to DONE, so the stub code below is reached only
       // through the STUB label.
1384   {
1385     if (str1_isL == str2_isL) { // LL or UU
1386       string_compare_long_same_encoding(result,
1387                                 str1, str2, isLL,
1388                                 cnt1, cnt2, tmp1, tmp2, tmp3,
1389                                 STUB_THRESHOLD, &STUB, &SHORT_STRING, &DONE);
1390     } else { // LU or UL
1391       string_compare_long_different_encoding(result,
1392                                 str1, str2, isLU,
1393                                 cnt1, cnt2, tmp1, tmp2, tmp3,
1394                                 STUB_THRESHOLD, &STUB, &DONE);
1395     }
1396   }
1397 
       // Long strings: call the out-of-line stub that matches the encoding pair.
1398   bind(STUB);
1399   RuntimeAddress stub = nullptr;
1400   switch (ae) {
1401     case StrIntrinsicNode::LL:
1402       stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LL());
1403       break;
1404     case StrIntrinsicNode::UU:
1405       stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UU());
1406       break;
1407     case StrIntrinsicNode::LU:
1408       stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LU());
1409       break;
1410     case StrIntrinsicNode::UL:
1411       stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UL());
1412       break;
1413     default:
1414       ShouldNotReachHere();
1415   }
1416   assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
1417   address call = reloc_call(stub);
       // a null return means the call was not emitted: record the failure and give up,
       // resetting the still-unbound labels in debug builds
1418   if (call == nullptr) {
1419     DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
1420     ciEnv::current()->record_failure("CodeCache is full");
1421     return;
1422   }
1423   j(DONE);
1424 
       // Character-by-character loop, unrolled twice. One pair of characters lives in
       // tmp1/cnt1 and the other in tmp2/t0; each pair is compared while the next one is
       // being loaded. cnt1 is free to serve as a character temp because the length
       // difference is already saved in result.
1425   bind(SHORT_STRING);
1426   // Is the minimum length zero?
1427   beqz(cnt2, DONE);
1428   // arrange code to do most branches while loading and loading next characters
1429   // while comparing previous
1430   (this->*str1_load_chr)(tmp1, Address(str1), t0);
1431   addi(str1, str1, str1_chr_size);
1432   subi(cnt2, cnt2, 1);
1433   beqz(cnt2, SHORT_LAST_INIT);
1434   (this->*str2_load_chr)(cnt1, Address(str2), t0);
1435   addi(str2, str2, str2_chr_size);
1436   j(SHORT_LOOP_START);
1437   bind(SHORT_LOOP);
1438   subi(cnt2, cnt2, 1);
1439   beqz(cnt2, SHORT_LAST);
1440   bind(SHORT_LOOP_START);
       // load pair (tmp2, t0), then compare the previously loaded pair (tmp1, cnt1)
1441   (this->*str1_load_chr)(tmp2, Address(str1), t0);
1442   addi(str1, str1, str1_chr_size);
1443   (this->*str2_load_chr)(t0, Address(str2), t0);
1444   addi(str2, str2, str2_chr_size);
1445   bne(tmp1, cnt1, SHORT_LOOP_TAIL);
1446   subi(cnt2, cnt2, 1);
1447   beqz(cnt2, SHORT_LAST2);
       // load pair (tmp1, cnt1), then compare the previously loaded pair (tmp2, t0)
1448   (this->*str1_load_chr)(tmp1, Address(str1), t0);
1449   addi(str1, str1, str1_chr_size);
1450   (this->*str2_load_chr)(cnt1, Address(str2), t0);
1451   addi(str2, str2, str2_chr_size);
1452   beq(tmp2, t0, SHORT_LOOP);
1453   sub(result, tmp2, t0);
1454   j(DONE);
1455   bind(SHORT_LOOP_TAIL);
1456   sub(result, tmp1, cnt1);
1457   j(DONE);
       // last pair is in (tmp2, t0)
1458   bind(SHORT_LAST2);
1459   beq(tmp2, t0, DONE);
1460   sub(result, tmp2, t0);
1461 
1462   j(DONE);
       // single-character case: the str2 character has not been loaded yet
1463   bind(SHORT_LAST_INIT);
1464   (this->*str2_load_chr)(cnt1, Address(str2), t0);
1465   addi(str2, str2, str2_chr_size);
       // last pair is in (tmp1, cnt1); when equal, result keeps the length difference
1466   bind(SHORT_LAST);
1467   beq(tmp1, cnt1, DONE);
1468   sub(result, tmp1, cnt1);
1469 
1470   bind(DONE);
1471 
1472   BLOCK_COMMENT("} string_compare");
1473 }
1474 
1475 void C2_MacroAssembler::arrays_equals(Register a1, Register a2,
1476                                       Register tmp1, Register tmp2, Register tmp3,
1477                                       Register result, int elem_size) {
       // Emits an equality check for two arrays whose elements are elem_size (1 or 2)
       // bytes wide. a1 and a2 are array references: result is true when they are the
       // same reference, false when either is null or the lengths differ, and otherwise
       // the outcome of comparing the element data 8 bytes at a time followed by a
       // 4/2/1-byte tail.
       // a1, a2, tmp1, tmp2, tmp3 and (in debug builds) t0 are overwritten.
1478   assert(elem_size == 1 || elem_size == 2, "must be char or byte");
1479   assert_different_registers(a1, a2, result, tmp1, tmp2, tmp3, t0);
1480 
1481   int elem_per_word = wordSize / elem_size;
1482   int log_elem_size = exact_log2(elem_size);
1483   int length_offset = arrayOopDesc::length_offset_in_bytes();
1484   int base_offset   = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
1485 
1486   assert((base_offset % (UseCompactObjectHeaders ? 4 : 8)) == 0, "Must be");
1487 
       // cnt1 counts elements (not bytes)
1488   Register cnt1 = tmp3;
1489   Register cnt2 = tmp1;  // cnt2 only used in array length compare
1490   Label DONE, SAME, NEXT_WORD, SHORT, TAIL03, TAIL01;
1491 
1492   BLOCK_COMMENT("arrays_equals {");
1493 
1494   // if (a1 == a2), return true
1495   beq(a1, a2, SAME);
1496 
       // every branch to DONE from here on reports false
1497   mv(result, false);
1498   // if (a1 == nullptr || a2 == nullptr)
1499   //     return false;
1500   beqz(a1, DONE);
1501   beqz(a2, DONE);
1502 
1503   // if (a1.length != a2.length)
1504   //      return false;
1505   lwu(cnt1, Address(a1, length_offset));
1506   lwu(cnt2, Address(a2, length_offset));
1507   bne(cnt1, cnt2, DONE);
1508 
       // a1/a2 now point at the first element
1509   la(a1, Address(a1, base_offset));
1510   la(a2, Address(a2, base_offset));
1511 
1512   // Load 4 bytes once to compare for alignment before main loop.
       // Fewer than 4 bytes in total: go straight to the 2-byte/1-byte tail. The low
       // bits of cnt1 tested there are unaffected by the subtraction.
1513   if (AvoidUnalignedAccesses && (base_offset % 8) != 0) {
1514     subi(cnt1, cnt1, elem_per_word / 2);
1515     bltz(cnt1, TAIL03);
1516     lwu(tmp1, Address(a1));
1517     lwu(tmp2, Address(a2));
1518     addi(a1, a1, 4);
1519     addi(a2, a2, 4);
1520     bne(tmp1, tmp2, DONE);
1521   }
1522 
1523   // Check for short strings, i.e. smaller than wordSize.
       // cnt1 is biased by -elem_per_word from here on; the bias is a multiple of 8 bytes
       // and so leaves the low bits used by the tail tests intact
1524   subi(cnt1, cnt1, elem_per_word);
1525   bltz(cnt1, SHORT);
1526 
1527 #ifdef ASSERT
       // debug builds: stop if either pointer is not 8-byte aligned at this point
1528   if (AvoidUnalignedAccesses) {
1529     Label align_ok;
1530     orr(t0, a1, a2);
1531     andi(t0, t0, 0x7);
1532     beqz(t0, align_ok);
1533     stop("bad alignment");
1534     bind(align_ok);
1535   }
1536 #endif
1537 
1538   // Main 8 byte comparison loop.
1539   bind(NEXT_WORD); {
1540     ld(tmp1, Address(a1));
1541     ld(tmp2, Address(a2));
1542     subi(cnt1, cnt1, elem_per_word);
1543     addi(a1, a1, wordSize);
1544     addi(a2, a2, wordSize);
1545     bne(tmp1, tmp2, DONE);
1546   } bgez(cnt1, NEXT_WORD);
1547 
       // undo the bias: zero means nothing is left and the arrays are equal
1548   addi(tmp1, cnt1, elem_per_word);
1549   beqz(tmp1, SAME);
1550 
       // Tail: each test_bit checks whether a 4-, 2- or 1-byte piece remains; the bit
       // index is adjusted by log_elem_size because cnt1 counts elements.
1551   bind(SHORT);
1552   test_bit(tmp1, cnt1, 2 - log_elem_size);
1553   beqz(tmp1, TAIL03); // 0-7 bytes left.
1554   {
1555     lwu(tmp1, Address(a1));
1556     lwu(tmp2, Address(a2));
1557     addi(a1, a1, 4);
1558     addi(a2, a2, 4);
1559     bne(tmp1, tmp2, DONE);
1560   }
1561 
1562   bind(TAIL03);
1563   test_bit(tmp1, cnt1, 1 - log_elem_size);
1564   beqz(tmp1, TAIL01); // 0-3 bytes left.
1565   {
1566     lhu(tmp1, Address(a1));
1567     lhu(tmp2, Address(a2));
1568     addi(a1, a1, 2);
1569     addi(a2, a2, 2);
1570     bne(tmp1, tmp2, DONE);
1571   }
1572 
1573   bind(TAIL01);
1574   if (elem_size == 1) { // Only needed when comparing byte arrays.
1575     test_bit(tmp1, cnt1, 0);
1576     beqz(tmp1, SAME); // 0-1 bytes left.
1577     {
1578       lbu(tmp1, Address(a1));
1579       lbu(tmp2, Address(a2));
1580       bne(tmp1, tmp2, DONE);
1581     }
1582   }
1583 
1584   bind(SAME);
1585   mv(result, true);
1586   // That's it.
1587   bind(DONE);
1588 
1589   BLOCK_COMMENT("} arrays_equals");
1590 }
1591 
1592 // Compare Strings
1593 
1594 // For Strings we're passed the address of the first characters in a1 and a2
1595 // and the length in cnt1. There are two implementations.
1596 // For strings >= 8 bytes, all comparisons (except for the tail) are performed
1597 // 8 bytes at a time. For the tail, we compare a word, then a halfword, and then a byte.
1598 // For strings < 8 bytes, we compare a word, then a halfword, and then a byte.
1599 
1600 void C2_MacroAssembler::string_equals(Register a1, Register a2,
1601                                       Register result, Register cnt1)
1602 {
       // Emits an equality check over cnt1 bytes starting at a1 and a2. result is set to
       // true when all bytes match and false otherwise. No null or length checks are
       // emitted here. a1, a2, cnt1, t0 and t1 are overwritten.
1603   Label SAME, DONE, SHORT, NEXT_WORD, TAIL03, TAIL01;
1604   Register tmp1 = t0;
1605   Register tmp2 = t1;
1606 
1607   assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2);
1608 
1609   int base_offset = arrayOopDesc::base_offset_in_bytes(T_BYTE);
1610 
1611   assert((base_offset % (UseCompactObjectHeaders ? 4 : 8)) == 0, "Must be");
1612 
1613   BLOCK_COMMENT("string_equals {");
1614 
       // every branch to DONE from here on reports false
1615   mv(result, false);
1616 
1617   // Load 4 bytes once to compare for alignment before main loop.
       // Fewer than 4 bytes in total: go straight to the 2-byte/1-byte tail. The low two
       // bits of cnt1 tested there are unaffected by subtracting 4.
1618   if (AvoidUnalignedAccesses && (base_offset % 8) != 0) {
1619     subi(cnt1, cnt1, 4);
1620     bltz(cnt1, TAIL03);
1621     lwu(tmp1, Address(a1));
1622     lwu(tmp2, Address(a2));
1623     addi(a1, a1, 4);
1624     addi(a2, a2, 4);
1625     bne(tmp1, tmp2, DONE);
1626   }
1627 
1628   // Check for short strings, i.e. smaller than wordSize.
       // cnt1 is biased by -wordSize from here on; the low three bits still describe
       // the remaining tail
1629   subi(cnt1, cnt1, wordSize);
1630   bltz(cnt1, SHORT);
1631 
1632 #ifdef ASSERT
       // debug builds: stop if either pointer is not 8-byte aligned at this point
1633   if (AvoidUnalignedAccesses) {
1634     Label align_ok;
1635     orr(t0, a1, a2);
1636     andi(t0, t0, 0x7);
1637     beqz(t0, align_ok);
1638     stop("bad alignment");
1639     bind(align_ok);
1640   }
1641 #endif
1642 
1643   // Main 8 byte comparison loop.
1644   bind(NEXT_WORD); {
1645     ld(tmp1, Address(a1));
1646     ld(tmp2, Address(a2));
1647     subi(cnt1, cnt1, wordSize);
1648     addi(a1, a1, wordSize);
1649     addi(a2, a2, wordSize);
1650     bne(tmp1, tmp2, DONE);
1651   } bgez(cnt1, NEXT_WORD);
1652 
       // undo the bias: zero means nothing is left and the strings are equal
1653   addi(tmp1, cnt1, wordSize);
1654   beqz(tmp1, SAME);
1655 
1656   bind(SHORT);
1657   // 0-7 bytes left.
       // bit 2 of cnt1 set: a 4-byte piece remains
1658   test_bit(tmp1, cnt1, 2);
1659   beqz(tmp1, TAIL03);
1660   {
1661     lwu(tmp1, Address(a1));
1662     lwu(tmp2, Address(a2));
1663     addi(a1, a1, 4);
1664     addi(a2, a2, 4);
1665     bne(tmp1, tmp2, DONE);
1666   }
1667 
1668   bind(TAIL03);
1669   // 0-3 bytes left.
       // bit 1 of cnt1 set: a 2-byte piece remains
1670   test_bit(tmp1, cnt1, 1);
1671   beqz(tmp1, TAIL01);
1672   {
1673     lhu(tmp1, Address(a1));
1674     lhu(tmp2, Address(a2));
1675     addi(a1, a1, 2);
1676     addi(a2, a2, 2);
1677     bne(tmp1, tmp2, DONE);
1678   }
1679 
1680   bind(TAIL01);
1681   // 0-1 bytes left.
       // bit 0 of cnt1 set: a final single byte remains
1682   test_bit(tmp1, cnt1, 0);
1683   beqz(tmp1, SAME);
1684   {
1685     lbu(tmp1, Address(a1));
1686     lbu(tmp2, Address(a2));
1687     bne(tmp1, tmp2, DONE);
1688   }
1689 
1690   // Arrays are equal.
1691   bind(SAME);
1692   mv(result, true);
1693 
1694   // That's it.
1695   bind(DONE);
1696   BLOCK_COMMENT("} string_equals");
1697 }
1698 
1699 // jdk.internal.util.ArraysSupport.vectorizedHashCode
// Emits the scalar hash loop used when RVV is not enabled (see the assert
// below): every element e is folded into the running hash as h = 31 * h + e.
// Elements are processed four at a time while at least four remain, then one
// at a time for the tail.
//
//   ary    - address of the first element to hash; advanced while hashing
//   cnt    - number of elements; overwritten with the tail count once the
//            unrolled loop is entered
//   result - in/out: the incoming hash value, updated in place
//   tmp1-6 - scratch registers; t0 and t1 are clobbered as well
//   eltype - element type; selects the element size and the load instruction
void C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
                                        Register tmp1, Register tmp2, Register tmp3,
                                        Register tmp4, Register tmp5, Register tmp6,
                                        BasicType eltype)
{
  assert(!UseRVV, "sanity");
  assert_different_registers(ary, cnt, result, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, t0, t1);

  const int elsize = arrays_hashcode_elsize(eltype);
  // log2 of the element size; used to scale element counts when computing end addresses.
  const int chunks_end_shift = exact_log2(elsize);

  switch (eltype) {
  case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
  case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
  case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
  case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
  case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
  default:
    ShouldNotReachHere();
  }

  // Number of elements handled per iteration of the unrolled loop.
  const int stride = 4;
  const Register pow31_4 = tmp1;
  const Register pow31_3 = tmp2;
  const Register pow31_2 = tmp3;
  const Register chunks  = tmp4;
  // NB: chunks_end shares tmp4 with chunks; the count is replaced by the end address.
  const Register chunks_end = chunks;

  Label DONE, TAIL, TAIL_LOOP, WIDE_LOOP;

  // result has a value initially

  // Nothing to hash: leave result untouched.
  beqz(cnt, DONE);

  // chunks = cnt rounded down to a multiple of stride; zero means tail only.
  andi(chunks, cnt, ~(stride - 1));
  beqz(chunks, TAIL);

  mv(pow31_4, 923521);           // [31^^4]
  mv(pow31_3,  29791);           // [31^^3]
  mv(pow31_2,    961);           // [31^^2]

  // chunks_end = address at which the unrolled loop stops (compared against ary below).
  shadd(chunks_end, chunks, ary, t0, chunks_end_shift);
  andi(cnt, cnt, stride - 1);    // don't forget about tail!

  bind(WIDE_LOOP);
  // Load four consecutive elements, then combine them with the running hash.
  arrays_hashcode_elload(t0,   Address(ary, 0 * elsize), eltype);
  arrays_hashcode_elload(t1,   Address(ary, 1 * elsize), eltype);
  arrays_hashcode_elload(tmp5, Address(ary, 2 * elsize), eltype);
  arrays_hashcode_elload(tmp6, Address(ary, 3 * elsize), eltype);
  mulw(result, result, pow31_4); // 31^^4 * h
  mulw(t0, t0, pow31_3);         // 31^^3 * ary[i+0]
  addw(result, result, t0);
  mulw(t1, t1, pow31_2);         // 31^^2 * ary[i+1]
  addw(result, result, t1);
  slli(t0, tmp5, 5);             // optimize 31^^1 * ary[i+2]
  subw(tmp5, t0, tmp5);          // with ary[i+2]<<5 - ary[i+2]
  addw(result, result, tmp5);
  addw(result, result, tmp6);    // 31^^4 * h + 31^^3 * ary[i+0] + 31^^2 * ary[i+1]
                                 //           + 31^^1 * ary[i+2] + 31^^0 * ary[i+3]
  addi(ary, ary, elsize * stride);
  bne(ary, chunks_end, WIDE_LOOP);
  // No leftover elements after the unrolled part.
  beqz(cnt, DONE);

  bind(TAIL);
  // Reuse chunks_end as the end address of the remaining cnt elements.
  shadd(chunks_end, cnt, ary, t0, chunks_end_shift);

  bind(TAIL_LOOP);
  arrays_hashcode_elload(t0, Address(ary), eltype);
  slli(t1, result, 5);           // optimize 31 * result
  subw(result, t1, result);      // with result<<5 - result
  addw(result, result, t0);
  addi(ary, ary, elsize);
  bne(ary, chunks_end, TAIL_LOOP);

  bind(DONE);
  BLOCK_COMMENT("} // arrays_hashcode");
}
1777 
// RVV variant of arrays_hashcode: full groups of 'stride' elements are hashed
// with vector instructions, the remaining elements with a scalar loop.
//
//   ary    - address of the first element to hash; advanced while hashing
//   cnt    - number of elements; decremented as elements are consumed
//   result - in/out: the incoming hash value, updated in place
//   tmp1-3 - scratch registers; t0 and t1 are clobbered as well
//   eltype - element type; selects the element size and the load instructions
//
// Vector registers v2, v4, v6 and v8 are used as named below.
void C2_MacroAssembler::arrays_hashcode_v(Register ary, Register cnt, Register result,
                                          Register tmp1, Register tmp2, Register tmp3,
                                          BasicType eltype)
{
  assert(UseRVV, "sanity");
  assert(StubRoutines::riscv::arrays_hashcode_powers_of_31() != nullptr, "sanity");
  assert_different_registers(ary, cnt, result, tmp1, tmp2, tmp3, t0, t1);

  // The MaxVectorSize should have been set by detecting RVV max vector register
  // size when check UseRVV (i.e. MaxVectorSize == VM_Version::_initial_vector_length).
  // Let's use T_INT as all hashCode calculations eventually deal with ints.
  const int lmul = 2;
  // Number of int lanes handled per vector iteration.
  const int stride = MaxVectorSize / sizeof(jint) * lmul;

  const int elsize_bytes = arrays_hashcode_elsize(eltype);
  const int elsize_shift = exact_log2(elsize_bytes);

  switch (eltype) {
    case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode_v(unsigned byte) {"); break;
    case T_CHAR:    BLOCK_COMMENT("arrays_hashcode_v(char) {");          break;
    case T_BYTE:    BLOCK_COMMENT("arrays_hashcode_v(byte) {");          break;
    case T_SHORT:   BLOCK_COMMENT("arrays_hashcode_v(short) {");         break;
    case T_INT:     BLOCK_COMMENT("arrays_hashcode_v(int) {");           break;
    default:
      ShouldNotReachHere();
  }

  const Register pow31_highest = tmp1;
  const Register ary_end       = tmp2;
  const Register consumed      = tmp3;

  const VectorRegister v_sum    = v2;
  const VectorRegister v_src    = v4;
  const VectorRegister v_coeffs = v6;
  const VectorRegister v_tmp    = v8;

  // Points at the second entry of the powers-of-31 table; the first entry is
  // loaded separately into pow31_highest below.
  const address adr_pows31 = StubRoutines::riscv::arrays_hashcode_powers_of_31()
                           + sizeof(jint);
  Label VEC_LOOP, DONE, SCALAR_TAIL, SCALAR_TAIL_LOOP;

  // NB: at this point (a) 'result' already has some value,
  // (b) 'cnt' is not 0 or 1, see java code for details.

  // Fewer than 'stride' elements: skip the vector part entirely.
  andi(t0, cnt, ~(stride - 1));
  beqz(t0, SCALAR_TAIL);

  la(t1, ExternalAddress(adr_pows31));
  // First table entry, located one jint below adr_pows31.
  lw(pow31_highest, Address(t1, -1 * sizeof(jint)));

  // consumed = number of elements processed per vector iteration.
  vsetvli(consumed, cnt, Assembler::e32, Assembler::m2);
  vle32_v(v_coeffs, t1); // 31^^(stride - 1) ... 31^^0
  vmv_v_x(v_sum, x0);

  bind(VEC_LOOP);
  arrays_hashcode_elload_v(v_src, v_tmp, ary, eltype);
  vmul_vv(v_src, v_src, v_coeffs);
  // v_sum = v_sum * pow31_highest + v_src
  vmadd_vx(v_sum, pow31_highest, v_src);
  // The scalar part of the hash is scaled by the same factor on every iteration.
  mulw(result, result, pow31_highest);
  shadd(ary, consumed, ary, t0, elsize_shift);
  subw(cnt, cnt, consumed);
  // Keep looping while at least 'stride' elements remain.
  andi(t1, cnt, ~(stride - 1));
  bnez(t1, VEC_LOOP);

  // Sum all lanes of v_sum (seeded with zero from v_tmp) and add that to result.
  vmv_s_x(v_tmp, x0);
  vredsum_vs(v_sum, v_sum, v_tmp);
  vmv_x_s(t0, v_sum);
  addw(result, result, t0);
  beqz(cnt, DONE);

  bind(SCALAR_TAIL);
  // ary_end = address at which the scalar loop stops (compared against ary below).
  shadd(ary_end, cnt, ary, t0, elsize_shift);

  bind(SCALAR_TAIL_LOOP);
  arrays_hashcode_elload(t0, Address(ary), eltype);
  slli(t1, result, 5);      // optimize 31 * result
  subw(result, t1, result); // with result<<5 - result
  addw(result, result, t0);
  addi(ary, ary, elsize_bytes);
  bne(ary, ary_end, SCALAR_TAIL_LOOP);

  bind(DONE);
  BLOCK_COMMENT("} // arrays_hashcode_v");
}
1861 
1862 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
1863   switch (eltype) {
1864     case T_BOOLEAN: return sizeof(jboolean);
1865     case T_BYTE:    return sizeof(jbyte);
1866     case T_SHORT:   return sizeof(jshort);
1867     case T_CHAR:    return sizeof(jchar);
1868     case T_INT:     return sizeof(jint);
1869     default:
1870       ShouldNotReachHere();
1871       return -1;
1872   }
1873 }
1874 
1875 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
1876   switch (eltype) {
1877     // T_BOOLEAN used as surrogate for unsigned byte
1878     case T_BOOLEAN: lbu(dst, src);   break;
1879     case T_BYTE:     lb(dst, src);   break;
1880     case T_SHORT:    lh(dst, src);   break;
1881     case T_CHAR:    lhu(dst, src);   break;
1882     case T_INT:      lw(dst, src);   break;
1883     default:
1884       ShouldNotReachHere();
1885   }
1886 }
1887 
1888 void C2_MacroAssembler::arrays_hashcode_elload_v(VectorRegister vdst,
1889                                                  VectorRegister vtmp,
1890                                                  Register src,
1891                                                  BasicType eltype) {
1892   assert_different_registers(vdst, vtmp);
1893   switch (eltype) {
1894     case T_BOOLEAN:
1895       vle8_v(vtmp, src);
1896       vzext_vf4(vdst, vtmp);
1897       break;
1898     case T_BYTE:
1899       vle8_v(vtmp, src);
1900       vsext_vf4(vdst, vtmp);
1901       break;
1902     case T_CHAR:
1903       vle16_v(vtmp, src);
1904       vzext_vf2(vdst, vtmp);
1905       break;
1906     case T_SHORT:
1907       vle16_v(vtmp, src);
1908       vsext_vf2(vdst, vtmp);
1909       break;
1910     case T_INT:
1911       vle32_v(vdst, src);
1912       break;
1913     default:
1914       ShouldNotReachHere();
1915   }
1916 }
1917 
// Pointer-to-member types for the entries of the branch dispatch tables below.
// Integer branches take two registers, the target label and the is_far flag.
typedef void (Assembler::*conditional_branch_insn)(Register op1, Register op2, Label& label, bool is_far);
// Floating-point branches additionally take an is_unordered flag.
typedef void (MacroAssembler::*float_conditional_branch_insn)(FloatRegister op1, FloatRegister op2, Label& label,
                                                              bool is_far, bool is_unordered);
1921 
// Dispatch table for integer compare-and-branch, indexed by the cmpFlag
// passed to cmp_branch(). The first eight entries are the signed variants,
// the next eight the unsigned variants in the same order. Entries that are
// nullptr have no branch and must not be selected.
static conditional_branch_insn conditional_branches[] =
{
  /* SHORT branches */
  (conditional_branch_insn)&MacroAssembler::beq,
  (conditional_branch_insn)&MacroAssembler::bgt,
  nullptr, // BoolTest::overflow
  (conditional_branch_insn)&MacroAssembler::blt,
  (conditional_branch_insn)&MacroAssembler::bne,
  (conditional_branch_insn)&MacroAssembler::ble,
  nullptr, // BoolTest::no_overflow
  (conditional_branch_insn)&MacroAssembler::bge,

  /* UNSIGNED branches */
  (conditional_branch_insn)&MacroAssembler::beq,
  (conditional_branch_insn)&MacroAssembler::bgtu,
  nullptr, // same slot as BoolTest::overflow above
  (conditional_branch_insn)&MacroAssembler::bltu,
  (conditional_branch_insn)&MacroAssembler::bne,
  (conditional_branch_insn)&MacroAssembler::bleu,
  nullptr, // same slot as BoolTest::no_overflow above
  (conditional_branch_insn)&MacroAssembler::bgeu
};
1944 
// Dispatch table for floating-point compare-and-branch, indexed by the
// cmpFlag passed to float_cmp_branch(). The first eight entries are the
// single-precision variants, the next eight the double-precision variants in
// the same order. Entries that are nullptr have no branch and must not be
// selected.
static float_conditional_branch_insn float_conditional_branches[] =
{
  /* FLOAT SHORT branches */
  (float_conditional_branch_insn)&MacroAssembler::float_beq,
  (float_conditional_branch_insn)&MacroAssembler::float_bgt,
  nullptr,  // BoolTest::overflow
  (float_conditional_branch_insn)&MacroAssembler::float_blt,
  (float_conditional_branch_insn)&MacroAssembler::float_bne,
  (float_conditional_branch_insn)&MacroAssembler::float_ble,
  nullptr, // BoolTest::no_overflow
  (float_conditional_branch_insn)&MacroAssembler::float_bge,

  /* DOUBLE SHORT branches */
  (float_conditional_branch_insn)&MacroAssembler::double_beq,
  (float_conditional_branch_insn)&MacroAssembler::double_bgt,
  nullptr, // same slot as BoolTest::overflow above
  (float_conditional_branch_insn)&MacroAssembler::double_blt,
  (float_conditional_branch_insn)&MacroAssembler::double_bne,
  (float_conditional_branch_insn)&MacroAssembler::double_ble,
  nullptr, // same slot as BoolTest::no_overflow above
  (float_conditional_branch_insn)&MacroAssembler::double_bge
};
1967 
1968 void C2_MacroAssembler::cmp_branch(int cmpFlag, Register op1, Register op2, Label& label, bool is_far) {
1969   assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(conditional_branches) / sizeof(conditional_branches[0])),
1970          "invalid conditional branch index");
1971   (this->*conditional_branches[cmpFlag])(op1, op2, label, is_far);
1972 }
1973 
// This function should only be used by C2. Flip the unordered flag when unordered-greater: C2 uses
// unordered-lesser instead of unordered-greater. Finally, the result bits are commuted in do_one_bytecode().
1976 void C2_MacroAssembler::float_cmp_branch(int cmpFlag, FloatRegister op1, FloatRegister op2, Label& label, bool is_far) {
1977   assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(float_conditional_branches) / sizeof(float_conditional_branches[0])),
1978          "invalid float conditional branch index");
1979   int booltest_flag = cmpFlag & ~(C2_MacroAssembler::double_branch_mask);
1980   (this->*float_conditional_branches[cmpFlag])(op1, op2, label, is_far,
1981     (booltest_flag == (BoolTest::ge) || booltest_flag == (BoolTest::gt)) ? false : true);
1982 }
1983 
1984 void C2_MacroAssembler::enc_cmpUEqNeLeGt_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
1985   switch (cmpFlag) {
1986     case BoolTest::eq:
1987     case BoolTest::le:
1988       beqz(op1, L, is_far);
1989       break;
1990     case BoolTest::ne:
1991     case BoolTest::gt:
1992       bnez(op1, L, is_far);
1993       break;
1994     default:
1995       ShouldNotReachHere();
1996   }
1997 }
1998 
1999 void C2_MacroAssembler::enc_cmpEqNe_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
2000   switch (cmpFlag) {
2001     case BoolTest::eq:
2002       beqz(op1, L, is_far);
2003       break;
2004     case BoolTest::ne:
2005       bnez(op1, L, is_far);
2006       break;
2007     default:
2008       ShouldNotReachHere();
2009   }
2010 }
2011 
2012 void C2_MacroAssembler::enc_cmove(int cmpFlag, Register op1, Register op2, Register dst, Register src) {
2013   bool is_unsigned = (cmpFlag & unsigned_branch_mask) == unsigned_branch_mask;
2014   int op_select = cmpFlag & (~unsigned_branch_mask);
2015 
2016   switch (op_select) {
2017     case BoolTest::eq:
2018       cmov_eq(op1, op2, dst, src);
2019       break;
2020     case BoolTest::ne:
2021       cmov_ne(op1, op2, dst, src);
2022       break;
2023     case BoolTest::le:
2024       if (is_unsigned) {
2025         cmov_leu(op1, op2, dst, src);
2026       } else {
2027         cmov_le(op1, op2, dst, src);
2028       }
2029       break;
2030     case BoolTest::ge:
2031       if (is_unsigned) {
2032         cmov_geu(op1, op2, dst, src);
2033       } else {
2034         cmov_ge(op1, op2, dst, src);
2035       }
2036       break;
2037     case BoolTest::lt:
2038       if (is_unsigned) {
2039         cmov_ltu(op1, op2, dst, src);
2040       } else {
2041         cmov_lt(op1, op2, dst, src);
2042       }
2043       break;
2044     case BoolTest::gt:
2045       if (is_unsigned) {
2046         cmov_gtu(op1, op2, dst, src);
2047       } else {
2048         cmov_gt(op1, op2, dst, src);
2049       }
2050       break;
2051     default:
2052       assert(false, "unsupported compare condition");
2053       ShouldNotReachHere();
2054   }
2055 }
2056 
2057 void C2_MacroAssembler::enc_cmove_cmp_fp(int cmpFlag, FloatRegister op1, FloatRegister op2, Register dst, Register src, bool is_single) {
2058   int op_select = cmpFlag & (~unsigned_branch_mask);
2059 
2060   switch (op_select) {
2061     case BoolTest::eq:
2062       cmov_cmp_fp_eq(op1, op2, dst, src, is_single);
2063       break;
2064     case BoolTest::ne:
2065       cmov_cmp_fp_ne(op1, op2, dst, src, is_single);
2066       break;
2067     case BoolTest::le:
2068       cmov_cmp_fp_le(op1, op2, dst, src, is_single);
2069       break;
2070     case BoolTest::ge:
2071       cmov_cmp_fp_ge(op1, op2, dst, src, is_single);
2072       break;
2073     case BoolTest::lt:
2074       cmov_cmp_fp_lt(op1, op2, dst, src, is_single);
2075       break;
2076     case BoolTest::gt:
2077       cmov_cmp_fp_gt(op1, op2, dst, src, is_single);
2078       break;
2079     default:
2080       assert(false, "unsupported compare condition");
2081       ShouldNotReachHere();
2082   }
2083 }
2084 
2085 void C2_MacroAssembler::enc_cmove_fp_cmp(int cmpFlag, Register op1, Register op2,
2086                         FloatRegister dst, FloatRegister src, bool is_single) {
2087   bool is_unsigned = (cmpFlag & unsigned_branch_mask) == unsigned_branch_mask;
2088   int op_select = cmpFlag & (~unsigned_branch_mask);
2089 
2090   switch (op_select) {
2091     case BoolTest::eq:
2092       cmov_fp_eq(op1, op2, dst, src, is_single);
2093       break;
2094     case BoolTest::ne:
2095       cmov_fp_ne(op1, op2, dst, src, is_single);
2096       break;
2097     case BoolTest::le:
2098       if (is_unsigned) {
2099         cmov_fp_leu(op1, op2, dst, src, is_single);
2100       } else {
2101         cmov_fp_le(op1, op2, dst, src, is_single);
2102       }
2103       break;
2104     case BoolTest::ge:
2105       if (is_unsigned) {
2106         cmov_fp_geu(op1, op2, dst, src, is_single);
2107       } else {
2108         cmov_fp_ge(op1, op2, dst, src, is_single);
2109       }
2110       break;
2111     case BoolTest::lt:
2112       if (is_unsigned) {
2113         cmov_fp_ltu(op1, op2, dst, src, is_single);
2114       } else {
2115         cmov_fp_lt(op1, op2, dst, src, is_single);
2116       }
2117       break;
2118     case BoolTest::gt:
2119       if (is_unsigned) {
2120         cmov_fp_gtu(op1, op2, dst, src, is_single);
2121       } else {
2122         cmov_fp_gt(op1, op2, dst, src, is_single);
2123       }
2124       break;
2125     default:
2126       assert(false, "unsupported compare condition");
2127       ShouldNotReachHere();
2128   }
2129 }
2130 
2131 void C2_MacroAssembler::enc_cmove_fp_cmp_fp(int cmpFlag,
2132                            FloatRegister op1, FloatRegister op2,
2133                            FloatRegister dst, FloatRegister src,
2134                            bool cmp_single, bool cmov_single) {
2135   int op_select = cmpFlag & (~unsigned_branch_mask);
2136 
2137   switch (op_select) {
2138     case BoolTest::eq:
2139       cmov_fp_cmp_fp_eq(op1, op2, dst, src, cmp_single, cmov_single);
2140       break;
2141     case BoolTest::ne:
2142       cmov_fp_cmp_fp_ne(op1, op2, dst, src, cmp_single, cmov_single);
2143       break;
2144     case BoolTest::le:
2145       cmov_fp_cmp_fp_le(op1, op2, dst, src, cmp_single, cmov_single);
2146       break;
2147     case BoolTest::ge:
2148       cmov_fp_cmp_fp_ge(op1, op2, dst, src, cmp_single, cmov_single);
2149       break;
2150     case BoolTest::lt:
2151       cmov_fp_cmp_fp_lt(op1, op2, dst, src, cmp_single, cmov_single);
2152       break;
2153     case BoolTest::gt:
2154       cmov_fp_cmp_fp_gt(op1, op2, dst, src, cmp_single, cmov_single);
2155       break;
2156     default:
2157       assert(false, "unsupported compare condition");
2158       ShouldNotReachHere();
2159   }
2160 }
2161 
2162 // Set dst to NaN if any NaN input.
2163 void C2_MacroAssembler::minmax_fp(FloatRegister dst, FloatRegister src1, FloatRegister src2,
2164                                   FLOAT_TYPE ft, bool is_min) {
2165   assert_cond((ft != FLOAT_TYPE::half_precision) || UseZfh);
2166 
2167   Label Done, Compare;
2168 
2169   switch (ft) {
2170     case FLOAT_TYPE::half_precision:
2171       fclass_h(t0, src1);
2172       fclass_h(t1, src2);
2173 
2174       orr(t0, t0, t1);
2175       andi(t0, t0, FClassBits::nan); // if src1 or src2 is quiet or signaling NaN then return NaN
2176       beqz(t0, Compare);
2177 
2178       fadd_h(dst, src1, src2);
2179       j(Done);
2180 
2181       bind(Compare);
2182       if (is_min) {
2183         fmin_h(dst, src1, src2);
2184       } else {
2185         fmax_h(dst, src1, src2);
2186       }
2187       break;
2188     case FLOAT_TYPE::single_precision:
2189       fclass_s(t0, src1);
2190       fclass_s(t1, src2);
2191 
2192       orr(t0, t0, t1);
2193       andi(t0, t0, FClassBits::nan); // if src1 or src2 is quiet or signaling NaN then return NaN
2194       beqz(t0, Compare);
2195 
2196       fadd_s(dst, src1, src2);
2197       j(Done);
2198 
2199       bind(Compare);
2200       if (is_min) {
2201         fmin_s(dst, src1, src2);
2202       } else {
2203         fmax_s(dst, src1, src2);
2204       }
2205       break;
2206     case FLOAT_TYPE::double_precision:
2207       fclass_d(t0, src1);
2208       fclass_d(t1, src2);
2209 
2210       orr(t0, t0, t1);
2211       andi(t0, t0, FClassBits::nan); // if src1 or src2 is quiet or signaling NaN then return NaN
2212       beqz(t0, Compare);
2213 
2214       fadd_d(dst, src1, src2);
2215       j(Done);
2216 
2217       bind(Compare);
2218       if (is_min) {
2219         fmin_d(dst, src1, src2);
2220       } else {
2221         fmax_d(dst, src1, src2);
2222       }
2223       break;
2224     default:
2225       ShouldNotReachHere();
2226   }
2227 
2228   bind(Done);
2229 }
2230 
2231 // According to Java SE specification, for floating-point round operations, if
2232 // the input is NaN, +/-infinity, or +/-0, the same input is returned as the
2233 // rounded result; this differs from behavior of RISC-V fcvt instructions (which
2234 // round out-of-range values to the nearest max or min value), therefore special
2235 // handling is needed by NaN, +/-Infinity, +/-0.
//   dst        - receives the rounded value; must differ from src
//   src        - value to round
//   round_mode - RoundDoubleModeNode::rmode_ceil, rmode_floor or rmode_rint
//   tmp1-tmp3  - integer scratch registers, all different from each other
void C2_MacroAssembler::round_double_mode(FloatRegister dst, FloatRegister src, int round_mode,
                                          Register tmp1, Register tmp2, Register tmp3) {

  assert_different_registers(dst, src);
  assert_different_registers(tmp1, tmp2, tmp3);

  // Set rounding mode for conversions
  // Here we use similar modes to double->long and long->double conversions
  // Different mode for long->double conversion matter only if long value was not representable as double,
  // we got long value as a result of double->long conversion so, it is definitely representable
  RoundingMode rm;
  switch (round_mode) {
    case RoundDoubleModeNode::rmode_ceil:
      rm = RoundingMode::rup;
      break;
    case RoundDoubleModeNode::rmode_floor:
      rm = RoundingMode::rdn;
      break;
    case RoundDoubleModeNode::rmode_rint:
      rm = RoundingMode::rne;
      break;
    default:
      ShouldNotReachHere();
  }

  // tmp1 - is a register to store double converted to long int
  // tmp2 - is a register to create constant for comparison
  // tmp3 - is a register where we store modified result of double->long conversion
  Label done, bad_val;

  // Conversion from double to long
  fcvt_l_d(tmp1, src, rm);

  // Generate constant (tmp2)
  // tmp2 = 100...0000
  addi(tmp2, zr, 1);
  slli(tmp2, tmp2, 63);

  // Prepare converted long (tmp1)
  // as a result when conversion overflow we got:
  // tmp1 = 011...1111 or 100...0000
  // Convert it to: tmp3 = 100...0000
  // (adding 1 and clearing bit 0 maps both overflow patterns to the same value)
  addi(tmp3, tmp1, 1);
  andi(tmp3, tmp3, -2);
  beq(tmp3, tmp2, bad_val);

  // Conversion from long to double
  fcvt_d_l(dst, tmp1, rm);
  // Add sign of input value to result for +/- 0 cases
  fsgnj_d(dst, dst, src);
  j(done);

  // If got conversion overflow return src
  bind(bad_val);
  fmv_d(dst, src);

  bind(done);
}
2294 
2295 // According to Java SE specification, for floating-point signum operations, if
2296 // on input we have NaN or +/-0.0 value we should return it,
2297 // otherwise return +/- 1.0 using sign of input.
2298 // one - gives us a floating-point 1.0 (got from matching rule)
2299 // bool is_double - specifies single or double precision operations will be used.
2300 void C2_MacroAssembler::signum_fp(FloatRegister dst, FloatRegister one, bool is_double) {
2301   Label done;
2302 
2303   is_double ? fclass_d(t0, dst)
2304             : fclass_s(t0, dst);
2305 
2306   // check if input is -0, +0, signaling NaN or quiet NaN
2307   andi(t0, t0, FClassBits::zero | FClassBits::nan);
2308 
2309   bnez(t0, done);
2310 
2311   // use floating-point 1.0 with a sign of input
2312   is_double ? fsgnj_d(dst, one, dst)
2313             : fsgnj_s(dst, one, dst);
2314 
2315   bind(done);
2316 }
2317 
// Out-of-line part of float16_to_float(): builds the 32-bit float bit pattern
// by hand for inputs that the fast path does not convert with fcvt.
// Stub data: <0> destination float register, <1> register holding the 16-bit
// input, <2> integer scratch register. Clobbers t0.
static void float16_to_float_slow_path(C2_MacroAssembler& masm, C2GeneralStub<FloatRegister, Register, Register>& stub) {
#define __ masm.
  FloatRegister dst = stub.data<0>();
  Register src = stub.data<1>();
  Register tmp = stub.data<2>();
  __ bind(stub.entry());

  // following instructions mainly focus on NaN, as riscv does not handle
  // NaN well with fcvt, but the code also works for Inf at the same time.

  // construct a NaN in 32 bits from the NaN in 16 bits,
  // we need the payloads of non-canonical NaNs to be preserved.
  // 0x7f800000: float bit pattern with every exponent bit set.
  __ mv(tmp, 0x7f800000);
  // sign-bit was already set via sign-extension if necessary.
  // Shift by 13: moves the 10-bit half-precision mantissa to the top of the
  // 23-bit single-precision mantissa.
  __ slli(t0, src, 13);
  __ orr(tmp, t0, tmp);
  __ fmv_w_x(dst, tmp);

  // Resume in the main code right after the fast path.
  __ j(stub.continuation());
#undef __
}
2339 
2340 // j.l.Float.float16ToFloat
// Converts the half-precision bit pattern in 'src' to a float in 'dst'.
// 'tmp' is an integer scratch register; t0 is clobbered as well. NaN and
// +/- Inf inputs are handled out of line by float16_to_float_slow_path.
void C2_MacroAssembler::float16_to_float(FloatRegister dst, Register src, Register tmp) {
  // NOTE(review): 20 is presumably the size reserved for the stub code - confirm against C2CodeStub::make.
  auto stub = C2CodeStub::make<FloatRegister, Register, Register>(dst, src, tmp, 20, float16_to_float_slow_path);

  // On riscv, NaN needs a special process as fcvt does not work in that case.
  // On riscv, Inf does not need a special process as fcvt can handle it correctly.
  // but we consider to get the slow path to process NaN and Inf at the same time,
  // as both of them are rare cases, and if we try to get the slow path to handle
  // only NaN case it would sacrifice the performance for normal cases,
  // i.e. non-NaN and non-Inf cases.

  // check whether it's a NaN or +/- Inf.
  // 0x7c00 selects the five exponent bits of a half-precision value; all of
  // them set means NaN or Inf.
  mv(t0, 0x7c00);
  andr(tmp, src, t0);
  // jump to stub processing NaN and Inf cases.
  beq(t0, tmp, stub->entry(), true);

  // non-NaN or non-Inf cases, just use built-in instructions.
  fmv_h_x(dst, src);
  fcvt_s_h(dst, dst);

  bind(stub->continuation());
}
2363 
// Out-of-line part of float_to_float16(): delegates the NaN case to
// float_to_float16_NaN and then jumps back to the main code.
// Stub data: <0> destination integer register, <1> source float register,
// <2> integer scratch register. t0 is passed on as a further scratch register.
static void float_to_float16_slow_path(C2_MacroAssembler& masm, C2GeneralStub<Register, FloatRegister, Register>& stub) {
#define __ masm.
  Register dst = stub.data<0>();
  FloatRegister src = stub.data<1>();
  Register tmp = stub.data<2>();
  __ bind(stub.entry());

  __ float_to_float16_NaN(dst, src, t0, tmp);

  // Resume in the main code right after the fast path.
  __ j(stub.continuation());
#undef __
}
2376 
2377 // j.l.Float.floatToFloat16
// Converts the float in 'src' to a half-precision bit pattern in 'dst'.
// 'ftmp' holds the converted value before it is moved to 'dst'; 'xtmp' is
// handed to the slow path as scratch; t0 is clobbered. NaN inputs are
// handled out of line by float_to_float16_slow_path.
void C2_MacroAssembler::float_to_float16(Register dst, FloatRegister src, FloatRegister ftmp, Register xtmp) {
  // NOTE(review): 64 is presumably the size reserved for the stub code - confirm against C2CodeStub::make.
  auto stub = C2CodeStub::make<Register, FloatRegister, Register>(dst, src, xtmp, 64, float_to_float16_slow_path);

  // On riscv, NaN needs a special process as fcvt does not work in that case.

  // check whether it's a NaN.
  // replace fclass with feq as performance optimization.
  // (a NaN compares unequal to itself, so t0 is 0 exactly for NaN inputs)
  feq_s(t0, src, src);
  // jump to stub processing NaN cases.
  beqz(t0, stub->entry(), true);

  // non-NaN cases, just use built-in instructions.
  fcvt_h_s(ftmp, src);
  fmv_x_h(dst, ftmp);

  bind(stub->continuation());
}
2395 
// Out-of-line part of float16_to_float_v(): rebuilds, under the v0 mask, the
// 32-bit float bit patterns for the lanes flagged by the fast path.
// Stub data: <0> destination vector, <1> source vector, <2> vector length.
// Clobbers t0.
static void float16_to_float_v_slow_path(C2_MacroAssembler& masm, C2GeneralStub<VectorRegister, VectorRegister, uint>& stub) {
#define __ masm.
  VectorRegister dst = stub.data<0>();
  VectorRegister src = stub.data<1>();
  uint vector_length = stub.data<2>();
  __ bind(stub.entry());

  // following instructions mainly focus on NaN, as riscv does not handle
  // NaN well with vfwcvt_f_f_v, but the code also works for Inf at the same time.
  //
  // construct NaN's in 32 bits from the NaN's in 16 bits,
  // we need the payloads of non-canonical NaNs to be preserved.

  // adjust vector type to 2 * SEW.
  __ vsetvli_helper(T_FLOAT, vector_length, Assembler::m1);
  // widen and sign-extend src data.
  __ vsext_vf2(dst, src, Assembler::v0_t);
  // 0x7f800000: float bit pattern with every exponent bit set.
  __ mv(t0, 0x7f800000);
  // sign-bit was already set via sign-extension if necessary.
  __ vsll_vi(dst, dst, 13, Assembler::v0_t);
  __ vor_vx(dst, dst, t0, Assembler::v0_t);

  // Resume in the main code right after the fast path.
  __ j(stub.continuation());
#undef __
}
2421 
2422 // j.l.Float.float16ToFloat
// Vector version of float16_to_float: converts 'vector_length' half-precision
// values in 'src' to floats in 'dst' (the two registers must differ).
// Clobbers v0 (used as the NaN/Inf lane mask) and t0.
void C2_MacroAssembler::float16_to_float_v(VectorRegister dst, VectorRegister src, uint vector_length) {
  // NOTE(review): 24 is presumably the size reserved for the stub code - confirm against C2CodeStub::make.
  auto stub = C2CodeStub::make<VectorRegister, VectorRegister, uint>
              (dst, src, vector_length, 24, float16_to_float_v_slow_path);
  assert_different_registers(dst, src);

  // On riscv, NaN needs a special process as vfwcvt_f_f_v does not work in that case.
  // On riscv, Inf does not need a special process as vfwcvt_f_f_v can handle it correctly.
  // but we consider to get the slow path to process NaN and Inf at the same time,
  // as both of them are rare cases, and if we try to get the slow path to handle
  // only NaN case it would sacrifice the performance for normal cases,
  // i.e. non-NaN and non-Inf cases.

  vsetvli_helper(BasicType::T_SHORT, vector_length, Assembler::mf2);

  // check whether there is a NaN or +/- Inf.
  // 0x7c00 selects the five exponent bits of a half-precision value.
  mv(t0, 0x7c00);
  vand_vx(v0, src, t0);
  // v0 will be used as mask in slow path.
  vmseq_vx(v0, v0, t0);
  // t0 = number of lanes flagged in the mask.
  vcpop_m(t0, v0);

  // For non-NaN or non-Inf cases, just use built-in instructions.
  vfwcvt_f_f_v(dst, src);

  // jump to stub processing NaN and Inf cases if there is any of them in the vector-wide.
  bnez(t0, stub->entry(), true);

  bind(stub->continuation());
}
2452 
// Out-of-line part of float_to_float16_v(): rebuilds, under the v0 mask, the
// 16-bit results for the NaN lanes from the bits of the 32-bit inputs.
// Stub data: <0> destination vector, <1> source vector, <2> temporary vector;
// all three must be different registers. Clobbers t0.
static void float_to_float16_v_slow_path(C2_MacroAssembler& masm,
                                         C2GeneralStub<VectorRegister, VectorRegister, VectorRegister>& stub) {
#define __ masm.
  VectorRegister dst = stub.data<0>();
  VectorRegister src = stub.data<1>();
  VectorRegister vtmp = stub.data<2>();
  assert_different_registers(dst, src, vtmp);

  __ bind(stub.entry());

  // Active elements (NaNs) are marked in v0 mask register.
  // mul is already set to mf2 in float_to_float16_v.

  //  Float (32 bits)
  //    Bit:     31        30 to 23          22 to 0
  //          +---+------------------+-----------------------------+
  //          | S |     Exponent     |      Mantissa (Fraction)    |
  //          +---+------------------+-----------------------------+
  //          1 bit       8 bits                  23 bits
  //
  //  Float (16 bits)
  //    Bit:    15        14 to 10         9 to 0
  //          +---+----------------+------------------+
  //          | S |    Exponent    |     Mantissa     |
  //          +---+----------------+------------------+
  //          1 bit      5 bits          10 bits
  const int fp_sign_bits = 1;
  const int fp32_bits = 32;
  const int fp32_mantissa_2nd_part_bits = 9;
  const int fp32_mantissa_3rd_part_bits = 4;
  const int fp16_exponent_bits = 5;
  const int fp16_mantissa_bits = 10;

  // preserve the sign bit and exponent, clear mantissa.
  // (narrowing shift right by 26 keeps the top six bits of the float, which
  // are then shifted left into the sign/exponent position of the 16-bit value)
  __ vnsra_wi(dst, src, fp32_bits - fp_sign_bits - fp16_exponent_bits, Assembler::v0_t);
  __ vsll_vi(dst, dst, fp16_mantissa_bits, Assembler::v0_t);

  // Preserve high order bit of float NaN in the
  // binary16 result NaN (tenth bit); OR in remaining
  // bits into lower 9 bits of binary 16 significand.
  //   | (doppel & 0x007f_e000) >> 13 // 10 bits
  //   | (doppel & 0x0000_1ff0) >> 4  //  9 bits
  //   | (doppel & 0x0000_000f));     //  4 bits
  //
  // Check j.l.Float.floatToFloat16 for more information.
  // 10 bits
  __ vnsrl_wi(vtmp, src, fp32_mantissa_2nd_part_bits + fp32_mantissa_3rd_part_bits, Assembler::v0_t);
  __ mv(t0, 0x3ff); // retain first part of mantissa in a float 32
  __ vand_vx(vtmp, vtmp, t0, Assembler::v0_t);
  __ vor_vv(dst, dst, vtmp, Assembler::v0_t);
  // 9 bits
  __ vnsrl_wi(vtmp, src, fp32_mantissa_3rd_part_bits, Assembler::v0_t);
  __ mv(t0, 0x1ff); // retain second part of mantissa in a float 32
  __ vand_vx(vtmp, vtmp, t0, Assembler::v0_t);
  __ vor_vv(dst, dst, vtmp, Assembler::v0_t);
  // 4 bits
  // Narrow shift is necessary to move data from 32 bits element to 16 bits element in vector register.
  __ vnsrl_wi(vtmp, src, 0, Assembler::v0_t);
  __ vand_vi(vtmp, vtmp, 0xf, Assembler::v0_t);
  __ vor_vv(dst, dst, vtmp, Assembler::v0_t);

  // Resume in the main code right after the fast path.
  __ j(stub.continuation());
#undef __
}
2517 
// j.l.Float.floatToFloat16
// Vector float -> half-precision conversion: narrows 'vector_length' floats
// in 'src' to 16-bit values in 'dst'. 'vtmp' is a vector temporary for the
// slow path, 'tmp' is passed to the second vsetvli_helper call. dst, src and
// vtmp must all differ. Clobbers v0 (used as the NaN lane mask) and t0.
void C2_MacroAssembler::float_to_float16_v(VectorRegister dst, VectorRegister src,
                                           VectorRegister vtmp, Register tmp, uint vector_length) {
  assert_different_registers(dst, src, vtmp);

  // NOTE(review): 56 is presumably the size reserved for the stub code - confirm against C2CodeStub::make.
  auto stub = C2CodeStub::make<VectorRegister, VectorRegister, VectorRegister>
              (dst, src, vtmp, 56, float_to_float16_v_slow_path);

  // On riscv, NaN needs a special process as vfncvt_f_f_w does not work in that case.

  vsetvli_helper(BasicType::T_FLOAT, vector_length, Assembler::m1);

  // check whether there is a NaN.
  // replace v_fclass with vmfne_vv as performance optimization.
  // (a NaN compares unequal to itself, so v0 marks exactly the NaN lanes)
  vmfne_vv(v0, src, src);
  // t0 = number of NaN lanes.
  vcpop_m(t0, v0);

  // Switch to 16-bit elements for the narrowing conversion.
  vsetvli_helper(BasicType::T_SHORT, vector_length, Assembler::mf2, tmp);

  // For non-NaN cases, just use built-in instructions.
  vfncvt_f_f_w(dst, src);

  // jump to stub processing NaN cases.
  bnez(t0, stub->entry(), true);

  bind(stub->continuation());
}
2545 
2546 void C2_MacroAssembler::signum_fp_v(VectorRegister dst, VectorRegister one, BasicType bt, int vlen) {
2547   vsetvli_helper(bt, vlen);
2548 
2549   // check if input is -0, +0, signaling NaN or quiet NaN
2550   vfclass_v(v0, dst);
2551   mv(t0, FClassBits::zero | FClassBits::nan);
2552   vand_vx(v0, v0, t0);
2553   vmseq_vi(v0, v0, 0);
2554 
2555   // use floating-point 1.0 with a sign of input
2556   vfsgnj_vv(dst, one, dst, v0_t);
2557 }
2558 
2559 // j.l.Math.round(float)
2560 //  Returns the closest int to the argument, with ties rounding to positive infinity.
2561 // We need to handle 3 special cases defined by java api spec:
2562 //    NaN,
2563 //    float >= Integer.MAX_VALUE,
2564 //    float <= Integer.MIN_VALUE.
2565 void C2_MacroAssembler::java_round_float_v(VectorRegister dst, VectorRegister src, FloatRegister ftmp,
2566                                            BasicType bt, uint vector_length) {
2567   // In riscv, there is no straight corresponding rounding mode to satisfy the behaviour defined,
2568   // in java api spec, i.e. any rounding mode can not handle some corner cases, e.g.
2569   //  RNE is the closest one, but it ties to "even", which means 1.5/2.5 both will be converted
2570   //    to 2, instead of 2 and 3 respectively.
2571   //  RUP does not work either, although java api requires "rounding to positive infinity",
2572   //    but both 1.3/1.8 will be converted to 2, instead of 1 and 2 respectively.
2573   //
2574   // The optimal solution for non-NaN cases is:
2575   //    src+0.5 => dst, with rdn rounding mode,
2576   //    convert dst from float to int, with rnd rounding mode.
2577   // and, this solution works as expected for float >= Integer.MAX_VALUE and float <= Integer.MIN_VALUE.
2578   //
2579   // But, we still need to handle NaN explicilty with vector mask instructions.
2580   //
2581   // Check MacroAssembler::java_round_float and C2_MacroAssembler::vector_round_sve in aarch64 for more details.
2582 
2583   csrwi(CSR_FRM, C2_MacroAssembler::rdn);
2584   vsetvli_helper(bt, vector_length);
2585 
2586   // don't rearrage the instructions sequence order without performance testing.
2587   // check MacroAssembler::java_round_float in riscv64 for more details.
2588   mv(t0, jint_cast(0.5f));
2589   fmv_w_x(ftmp, t0);
2590 
2591   // replacing vfclass with feq as performance optimization
2592   vmfeq_vv(v0, src, src);
2593   // set dst = 0 in cases of NaN
2594   vmv_v_x(dst, zr);
2595 
2596   // dst = (src + 0.5) rounded down towards negative infinity
2597   vfadd_vf(dst, src, ftmp, Assembler::v0_t);
2598   vfcvt_x_f_v(dst, dst, Assembler::v0_t); // in RoundingMode::rdn
2599 
2600   csrwi(CSR_FRM, C2_MacroAssembler::rne);
2601 }
2602 
2603 // java.lang.Math.round(double a)
2604 // Returns the closest long to the argument, with ties rounding to positive infinity.
2605 void C2_MacroAssembler::java_round_double_v(VectorRegister dst, VectorRegister src, FloatRegister ftmp,
2606                                             BasicType bt, uint vector_length) {
2607   // check C2_MacroAssembler::java_round_float_v above for more details.
2608 
2609   csrwi(CSR_FRM, C2_MacroAssembler::rdn);
2610   vsetvli_helper(bt, vector_length);
2611 
2612   mv(t0, julong_cast(0.5));
2613   fmv_d_x(ftmp, t0);
2614 
2615   // replacing vfclass with feq as performance optimization
2616   vmfeq_vv(v0, src, src);
2617   // set dst = 0 in cases of NaN
2618   vmv_v_x(dst, zr);
2619 
2620   // dst = (src + 0.5) rounded down towards negative infinity
2621   vfadd_vf(dst, src, ftmp, Assembler::v0_t);
2622   vfcvt_x_f_v(dst, dst, Assembler::v0_t); // in RoundingMode::rdn
2623 
2624   csrwi(CSR_FRM, C2_MacroAssembler::rne);
2625 }
2626 
2627 void C2_MacroAssembler::element_compare(Register a1, Register a2, Register result, Register cnt, Register tmp1, Register tmp2,
2628                                         VectorRegister vr1, VectorRegister vr2, VectorRegister vrs, bool islatin, Label &DONE,
2629                                         Assembler::LMUL lmul) {
2630   Label loop;
2631   Assembler::SEW sew = islatin ? Assembler::e8 : Assembler::e16;
2632 
2633   bind(loop);
2634   vsetvli(tmp1, cnt, sew, lmul);
2635   vlex_v(vr1, a1, sew);
2636   vlex_v(vr2, a2, sew);
2637   vmsne_vv(vrs, vr1, vr2);
2638   vfirst_m(tmp2, vrs);
2639   bgez(tmp2, DONE);
2640   sub(cnt, cnt, tmp1);
2641   if (!islatin) {
2642     slli(tmp1, tmp1, 1); // get byte counts
2643   }
2644   add(a1, a1, tmp1);
2645   add(a2, a2, tmp1);
2646   bnez(cnt, loop);
2647 
2648   mv(result, true);
2649 }
2650 
2651 void C2_MacroAssembler::string_equals_v(Register a1, Register a2, Register result, Register cnt) {
2652   Label DONE;
2653   Register tmp1 = t0;
2654   Register tmp2 = t1;
2655 
2656   BLOCK_COMMENT("string_equals_v {");
2657 
2658   mv(result, false);
2659 
2660   element_compare(a1, a2, result, cnt, tmp1, tmp2, v2, v4, v2, true, DONE, Assembler::m2);
2661 
2662   bind(DONE);
2663   BLOCK_COMMENT("} string_equals_v");
2664 }
2665 
2666 // used by C2 ClearArray patterns.
2667 // base: Address of a buffer to be zeroed
2668 // cnt: Count in HeapWords
2669 //
2670 // base, cnt, v4, v5, v6, v7 and t0 are clobbered.
2671 void C2_MacroAssembler::clear_array_v(Register base, Register cnt) {
2672   Label loop;
2673 
2674   // making zero words
2675   vsetvli(t0, cnt, Assembler::e64, Assembler::m4);
2676   vxor_vv(v4, v4, v4);
2677 
2678   bind(loop);
2679   vsetvli(t0, cnt, Assembler::e64, Assembler::m4);
2680   vse64_v(v4, base);
2681   sub(cnt, cnt, t0);
2682   shadd(base, t0, base, t0, 3);
2683   bnez(cnt, loop);
2684 }
2685 
2686 void C2_MacroAssembler::arrays_equals_v(Register a1, Register a2, Register result,
2687                                         Register cnt1, int elem_size) {
2688   assert(elem_size == 1 || elem_size == 2, "must be char or byte");
2689   assert_different_registers(a1, a2, result, cnt1, t0, t1);
2690 
2691   Label DONE;
2692   Register tmp1 = t0;
2693   Register tmp2 = t1;
2694   Register cnt2 = tmp2;
2695   int length_offset = arrayOopDesc::length_offset_in_bytes();
2696   int base_offset = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
2697 
2698   assert((base_offset % (UseCompactObjectHeaders ? 4 : 8)) == 0, "Must be");
2699 
2700   BLOCK_COMMENT("arrays_equals_v {");
2701 
2702   // if (a1 == a2), return true
2703   mv(result, true);
2704   beq(a1, a2, DONE);
2705 
2706   mv(result, false);
2707   // if a1 == null or a2 == null, return false
2708   beqz(a1, DONE);
2709   beqz(a2, DONE);
2710   // if (a1.length != a2.length), return false
2711   lwu(cnt1, Address(a1, length_offset));
2712   lwu(cnt2, Address(a2, length_offset));
2713   bne(cnt1, cnt2, DONE);
2714 
2715   la(a1, Address(a1, base_offset));
2716   la(a2, Address(a2, base_offset));
2717 
2718   element_compare(a1, a2, result, cnt1, tmp1, tmp2, v2, v4, v2, elem_size == 1, DONE, Assembler::m2);
2719 
2720   bind(DONE);
2721 
2722   BLOCK_COMMENT("} arrays_equals_v");
2723 }
2724 
2725 void C2_MacroAssembler::string_compare_v(Register str1, Register str2, Register cnt1, Register cnt2,
2726                                          Register result, Register tmp1, Register tmp2, int encForm) {
2727   Label DIFFERENCE, DONE, L, loop;
2728   bool encLL = encForm == StrIntrinsicNode::LL;
2729   bool encLU = encForm == StrIntrinsicNode::LU;
2730   bool encUL = encForm == StrIntrinsicNode::UL;
2731 
2732   bool str1_isL = encLL || encLU;
2733   bool str2_isL = encLL || encUL;
2734 
2735   int minCharsInWord = encLL ? wordSize : wordSize / 2;
2736 
2737   BLOCK_COMMENT("string_compare_v {");
2738 
2739   // for Latin strings, 1 byte for 1 character
2740   // for UTF16 strings, 2 bytes for 1 character
2741   if (!str1_isL)
2742     sraiw(cnt1, cnt1, 1);
2743   if (!str2_isL)
2744     sraiw(cnt2, cnt2, 1);
2745 
2746   // if str1 == str2, return the difference
2747   // save the minimum of the string lengths in cnt2.
2748   sub(result, cnt1, cnt2);
2749   bgt(cnt1, cnt2, L);
2750   mv(cnt2, cnt1);
2751   bind(L);
2752 
2753   // We focus on the optimization of small sized string.
2754   // Please check below document for string size distribution statistics.
2755   // https://cr.openjdk.org/~shade/density/string-density-report.pdf
2756   if (str1_isL == str2_isL) { // LL or UU
2757     // Below construction of v regs and lmul is based on test on 2 different boards,
2758     // vlen == 128 and vlen == 256 respectively.
2759     if (!encLL && MaxVectorSize == 16) { // UU
2760       element_compare(str1, str2, zr, cnt2, tmp1, tmp2, v4, v8, v4, encLL, DIFFERENCE, Assembler::m4);
2761     } else { // UU + MaxVectorSize or LL
2762       element_compare(str1, str2, zr, cnt2, tmp1, tmp2, v2, v4, v2, encLL, DIFFERENCE, Assembler::m2);
2763     }
2764 
2765     j(DONE);
2766   } else { // LU or UL
2767     Register strL = encLU ? str1 : str2;
2768     Register strU = encLU ? str2 : str1;
2769     VectorRegister vstr1 = encLU ? v8 : v4;
2770     VectorRegister vstr2 = encLU ? v4 : v8;
2771 
2772     bind(loop);
2773     vsetvli(tmp1, cnt2, Assembler::e8, Assembler::m2);
2774     vle8_v(vstr1, strL);
2775     vsetvli(tmp1, cnt2, Assembler::e16, Assembler::m4);
2776     vzext_vf2(vstr2, vstr1);
2777     vle16_v(vstr1, strU);
2778     vmsne_vv(v4, vstr2, vstr1);
2779     vfirst_m(tmp2, v4);
2780     bgez(tmp2, DIFFERENCE);
2781     sub(cnt2, cnt2, tmp1);
2782     add(strL, strL, tmp1);
2783     shadd(strU, tmp1, strU, tmp1, 1);
2784     bnez(cnt2, loop);
2785     j(DONE);
2786   }
2787 
2788   bind(DIFFERENCE);
2789   slli(tmp1, tmp2, 1);
2790   add(str1, str1, str1_isL ? tmp2 : tmp1);
2791   add(str2, str2, str2_isL ? tmp2 : tmp1);
2792   str1_isL ? lbu(tmp1, Address(str1, 0)) : lhu(tmp1, Address(str1, 0));
2793   str2_isL ? lbu(tmp2, Address(str2, 0)) : lhu(tmp2, Address(str2, 0));
2794   sub(result, tmp1, tmp2);
2795 
2796   bind(DONE);
2797 
2798   BLOCK_COMMENT("} string_compare_v");
2799 }
2800 
2801 void C2_MacroAssembler::byte_array_inflate_v(Register src, Register dst, Register len, Register tmp) {
2802   Label loop;
2803   assert_different_registers(src, dst, len, tmp, t0);
2804 
2805   BLOCK_COMMENT("byte_array_inflate_v {");
2806   bind(loop);
2807   vsetvli(tmp, len, Assembler::e8, Assembler::m2);
2808   vle8_v(v6, src);
2809   vsetvli(t0, len, Assembler::e16, Assembler::m4);
2810   vzext_vf2(v4, v6);
2811   vse16_v(v4, dst);
2812   sub(len, len, tmp);
2813   add(src, src, tmp);
2814   shadd(dst, tmp, dst, tmp, 1);
2815   bnez(len, loop);
2816   BLOCK_COMMENT("} byte_array_inflate_v");
2817 }
2818 
2819 // Compress char[] array to byte[].
2820 // Intrinsic for java.lang.StringUTF16.compress(char[] src, int srcOff, byte[] dst, int dstOff, int len)
2821 // result: the array length if every element in array can be encoded,
2822 // otherwise, the index of first non-latin1 (> 0xff) character.
2823 void C2_MacroAssembler::char_array_compress_v(Register src, Register dst, Register len,
2824                                               Register result, Register tmp) {
2825   encode_iso_array_v(src, dst, len, result, tmp, false);
2826 }
2827 
2828 // Intrinsic for
2829 //
2830 // - sun.nio.cs.ISO_8859_1.Encoder#encodeISOArray0(byte[] sa, int sp, byte[] da, int dp, int len)
2831 //   Encodes char[] to byte[] in ISO-8859-1
2832 //
2833 // - java.lang.StringCoding#encodeISOArray0(byte[] sa, int sp, byte[] da, int dp, int len)
2834 //   Encodes byte[] (containing UTF-16) to byte[] in ISO-8859-1
2835 //
2836 // - java.lang.StringCoding#encodeAsciiArray0(char[] sa, int sp, byte[] da, int dp, int len)
2837 //   Encodes char[] to byte[] in ASCII
2838 //
2839 // This version always returns the number of characters copied. A successful
2840 // copy will complete with the post-condition: 'res' == 'len', while an
2841 // unsuccessful copy will exit with the post-condition: 0 <= 'res' < 'len'.
2842 //
2843 // Clobbers: src, dst, len, result, t0
2844 void C2_MacroAssembler::encode_iso_array_v(Register src, Register dst, Register len,
2845                                            Register result, Register tmp, bool ascii) {
2846   Label loop, fail, done;
2847 
2848   BLOCK_COMMENT("encode_iso_array_v {");
2849   mv(result, 0);
2850 
2851   bind(loop);
2852   mv(tmp, ascii ? 0x7f : 0xff);
2853   vsetvli(t0, len, Assembler::e16, Assembler::m2);
2854   vle16_v(v2, src);
2855 
2856   vmsgtu_vx(v1, v2, tmp);
2857   vfirst_m(tmp, v1);
2858   vmsbf_m(v0, v1);
2859   // compress char to byte
2860   vsetvli(t0, len, Assembler::e8);
2861   vncvt_x_x_w(v1, v2, Assembler::v0_t);
2862   vse8_v(v1, dst, Assembler::v0_t);
2863 
2864   // fail if char > 0x7f/0xff
2865   bgez(tmp, fail);
2866   add(result, result, t0);
2867   add(dst, dst, t0);
2868   sub(len, len, t0);
2869   shadd(src, t0, src, t0, 1);
2870   bnez(len, loop);
2871   j(done);
2872 
2873   bind(fail);
2874   add(result, result, tmp);
2875 
2876   bind(done);
2877   BLOCK_COMMENT("} encode_iso_array_v");
2878 }
2879 
2880 void C2_MacroAssembler::count_positives_v(Register ary, Register len, Register result, Register tmp) {
2881   Label LOOP, SET_RESULT, DONE;
2882 
2883   BLOCK_COMMENT("count_positives_v {");
2884   assert_different_registers(ary, len, result, tmp);
2885 
2886   mv(result, zr);
2887 
2888   bind(LOOP);
2889   vsetvli(t0, len, Assembler::e8, Assembler::m4);
2890   vle8_v(v4, ary);
2891   vmslt_vx(v4, v4, zr);
2892   vfirst_m(tmp, v4);
2893   bgez(tmp, SET_RESULT);
2894   // if tmp == -1, all bytes are positive
2895   add(result, result, t0);
2896 
2897   sub(len, len, t0);
2898   add(ary, ary, t0);
2899   bnez(len, LOOP);
2900   j(DONE);
2901 
2902   // add remaining positive bytes count
2903   bind(SET_RESULT);
2904   add(result, result, tmp);
2905 
2906   bind(DONE);
2907   BLOCK_COMMENT("} count_positives_v");
2908 }
2909 
2910 void C2_MacroAssembler::string_indexof_char_v(Register str1, Register cnt1,
2911                                               Register ch, Register result,
2912                                               Register tmp1, Register tmp2,
2913                                               bool isL) {
2914   mv(result, zr);
2915 
2916   Label loop, MATCH, DONE;
2917   Assembler::SEW sew = isL ? Assembler::e8 : Assembler::e16;
2918   bind(loop);
2919   vsetvli(tmp1, cnt1, sew, Assembler::m4);
2920   vlex_v(v4, str1, sew);
2921   vmseq_vx(v4, v4, ch);
2922   vfirst_m(tmp2, v4);
2923   bgez(tmp2, MATCH); // if equal, return index
2924 
2925   add(result, result, tmp1);
2926   sub(cnt1, cnt1, tmp1);
2927   if (!isL) slli(tmp1, tmp1, 1);
2928   add(str1, str1, tmp1);
2929   bnez(cnt1, loop);
2930 
2931   mv(result, -1);
2932   j(DONE);
2933 
2934   bind(MATCH);
2935   add(result, result, tmp2);
2936 
2937   bind(DONE);
2938 }
2939 
2940 // Set dst to NaN if any NaN input.
2941 void C2_MacroAssembler::minmax_fp_v(VectorRegister dst, VectorRegister src1, VectorRegister src2,
2942                                     BasicType bt, bool is_min, uint vector_length) {
2943   assert_different_registers(dst, src1, src2);
2944 
2945   vsetvli_helper(bt, vector_length);
2946 
2947   is_min ? vfmin_vv(dst, src1, src2)
2948          : vfmax_vv(dst, src1, src2);
2949 
2950   vmfne_vv(v0,  src1, src1);
2951   vfadd_vv(dst, src1, src1, Assembler::v0_t);
2952   vmfne_vv(v0,  src2, src2);
2953   vfadd_vv(dst, src2, src2, Assembler::v0_t);
2954 }
2955 
2956 // Set dst to NaN if any NaN input.
2957 // The destination vector register elements corresponding to masked-off elements
2958 // are handled with a mask-undisturbed policy.
2959 void C2_MacroAssembler::minmax_fp_masked_v(VectorRegister dst, VectorRegister src1, VectorRegister src2,
2960                                            VectorRegister vmask, VectorRegister tmp1, VectorRegister tmp2,
2961                                            BasicType bt, bool is_min, uint vector_length) {
2962   assert_different_registers(src1, src2, tmp1, tmp2);
2963   vsetvli_helper(bt, vector_length);
2964 
2965   // Check vector elements of src1 and src2 for NaN.
2966   vmfeq_vv(tmp1, src1, src1);
2967   vmfeq_vv(tmp2, src2, src2);
2968 
2969   vmandn_mm(v0, vmask, tmp1);
2970   vfadd_vv(dst, src1, src1, Assembler::v0_t);
2971   vmandn_mm(v0, vmask, tmp2);
2972   vfadd_vv(dst, src2, src2, Assembler::v0_t);
2973 
2974   vmand_mm(tmp2, tmp1, tmp2);
2975   vmand_mm(v0, vmask, tmp2);
2976   is_min ? vfmin_vv(dst, src1, src2, Assembler::v0_t)
2977          : vfmax_vv(dst, src1, src2, Assembler::v0_t);
2978 }
2979 
2980 // Set dst to NaN if any NaN input.
2981 void C2_MacroAssembler::reduce_minmax_fp_v(FloatRegister dst,
2982                                            FloatRegister src1, VectorRegister src2,
2983                                            VectorRegister tmp1, VectorRegister tmp2,
2984                                            bool is_double, bool is_min, uint vector_length, VectorMask vm) {
2985   assert_different_registers(dst, src1);
2986   assert_different_registers(src2, tmp1, tmp2);
2987 
2988   Label L_done, L_NaN_1, L_NaN_2;
2989   // Set dst to src1 if src1 is NaN
2990   is_double ? feq_d(t0, src1, src1)
2991             : feq_s(t0, src1, src1);
2992   beqz(t0, L_NaN_2);
2993 
2994   vsetvli_helper(is_double ? T_DOUBLE : T_FLOAT, vector_length);
2995   vfmv_s_f(tmp2, src1);
2996 
2997   is_min ? vfredmin_vs(tmp1, src2, tmp2, vm)
2998          : vfredmax_vs(tmp1, src2, tmp2, vm);
2999   vfmv_f_s(dst, tmp1);
3000 
3001   // Checking NaNs in src2
3002   vmfne_vv(tmp1, src2, src2, vm);
3003   vcpop_m(t0, tmp1, vm);
3004   beqz(t0, L_done);
3005 
3006   bind(L_NaN_1);
3007   vfredusum_vs(tmp1, src2, tmp2, vm);
3008   vfmv_f_s(dst, tmp1);
3009   j(L_done);
3010 
3011   bind(L_NaN_2);
3012   is_double ? fmv_d(dst, src1)
3013             : fmv_s(dst, src1);
3014   bind(L_done);
3015 }
3016 
3017 bool C2_MacroAssembler::in_scratch_emit_size() {
3018   if (ciEnv::current()->task() != nullptr) {
3019     PhaseOutput* phase_output = Compile::current()->output();
3020     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
3021       return true;
3022     }
3023   }
3024   return MacroAssembler::in_scratch_emit_size();
3025 }
3026 
3027 void C2_MacroAssembler::reduce_integral_v(Register dst, Register src1,
3028                                           VectorRegister src2, VectorRegister tmp,
3029                                           int opc, BasicType bt, uint vector_length, VectorMask vm) {
3030   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
3031   vsetvli_helper(bt, vector_length);
3032   vmv_s_x(tmp, src1);
3033   switch (opc) {
3034     case Op_AddReductionVI:
3035     case Op_AddReductionVL:
3036       vredsum_vs(tmp, src2, tmp, vm);
3037       break;
3038     case Op_AndReductionV:
3039       vredand_vs(tmp, src2, tmp, vm);
3040       break;
3041     case Op_OrReductionV:
3042       vredor_vs(tmp, src2, tmp, vm);
3043       break;
3044     case Op_XorReductionV:
3045       vredxor_vs(tmp, src2, tmp, vm);
3046       break;
3047     case Op_MaxReductionV:
3048       vredmax_vs(tmp, src2, tmp, vm);
3049       break;
3050     case Op_MinReductionV:
3051       vredmin_vs(tmp, src2, tmp, vm);
3052       break;
3053     default:
3054       ShouldNotReachHere();
3055   }
3056   vmv_x_s(dst, tmp);
3057 }
3058 
3059 void C2_MacroAssembler::reduce_mul_integral_v(Register dst, Register src1, VectorRegister src2,
3060                                               VectorRegister vtmp1, VectorRegister vtmp2,
3061                                               BasicType bt, uint vector_length, VectorMask vm) {
3062   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
3063   vsetvli_helper(bt, vector_length);
3064 
3065   vector_length /= 2;
3066   if (vm != Assembler::unmasked) {
3067     // This behaviour is consistent with spec requirements of vector API, for `reduceLanes`:
3068     //  If no elements are selected, an operation-specific identity value is returned.
3069     //    If the operation is MUL, then the identity value is one.
3070     vmv_v_i(vtmp1, 1);
3071     vmerge_vvm(vtmp2, vtmp1, src2); // vm == v0
3072     slidedown_v(vtmp1, vtmp2, vector_length);
3073 
3074     vsetvli_helper(bt, vector_length);
3075     vmul_vv(vtmp1, vtmp1, vtmp2);
3076   } else {
3077     slidedown_v(vtmp1, src2, vector_length);
3078 
3079     vsetvli_helper(bt, vector_length);
3080     vmul_vv(vtmp1, vtmp1, src2);
3081   }
3082 
3083   while (vector_length > 1) {
3084     vector_length /= 2;
3085     slidedown_v(vtmp2, vtmp1, vector_length);
3086     vsetvli_helper(bt, vector_length);
3087     vmul_vv(vtmp1, vtmp1, vtmp2);
3088   }
3089 
3090   vmv_x_s(dst, vtmp1);
3091   if (bt == T_INT) {
3092     mulw(dst, dst, src1);
3093   } else {
3094     mul(dst, dst, src1);
3095   }
3096 }
3097 
3098 // Set vl and vtype for full and partial vector operations.
3099 // (vma = mu, vta = tu, vill = false)
3100 void C2_MacroAssembler::vsetvli_helper(BasicType bt, uint vector_length, LMUL vlmul, Register tmp) {
3101   Assembler::SEW sew = Assembler::elemtype_to_sew(bt);
3102   if (vector_length <= 31) {
3103     vsetivli(tmp, vector_length, sew, vlmul);
3104   } else if (vector_length == (MaxVectorSize / type2aelembytes(bt))) {
3105     vsetvli(tmp, x0, sew, vlmul);
3106   } else {
3107     mv(tmp, vector_length);
3108     vsetvli(tmp, tmp, sew, vlmul);
3109   }
3110 }
3111 
3112 void C2_MacroAssembler::compare_integral_v(VectorRegister vd, VectorRegister src1, VectorRegister src2,
3113                                            int cond, BasicType bt, uint vector_length, VectorMask vm) {
3114   assert(is_integral_type(bt), "unsupported element type");
3115   assert(vm == Assembler::v0_t ? vd != v0 : true, "should be different registers");
3116   vsetvli_helper(bt, vector_length);
3117   if (vm == Assembler::v0_t) {
3118     vmclr_m(vd);
3119   }
3120   switch (cond) {
3121     case BoolTest::eq: vmseq_vv(vd, src1, src2, vm); break;
3122     case BoolTest::ne: vmsne_vv(vd, src1, src2, vm); break;
3123     case BoolTest::le: vmsle_vv(vd, src1, src2, vm); break;
3124     case BoolTest::ge: vmsge_vv(vd, src1, src2, vm); break;
3125     case BoolTest::lt: vmslt_vv(vd, src1, src2, vm); break;
3126     case BoolTest::gt: vmsgt_vv(vd, src1, src2, vm); break;
3127     case BoolTest::ule: vmsleu_vv(vd, src1, src2, vm); break;
3128     case BoolTest::uge: vmsgeu_vv(vd, src1, src2, vm); break;
3129     case BoolTest::ult: vmsltu_vv(vd, src1, src2, vm); break;
3130     case BoolTest::ugt: vmsgtu_vv(vd, src1, src2, vm); break;
3131     default:
3132       assert(false, "unsupported compare condition");
3133       ShouldNotReachHere();
3134   }
3135 }
3136 
3137 void C2_MacroAssembler::compare_fp_v(VectorRegister vd, VectorRegister src1, VectorRegister src2,
3138                                      int cond, BasicType bt, uint vector_length, VectorMask vm) {
3139   assert(is_floating_point_type(bt), "unsupported element type");
3140   assert(vm == Assembler::v0_t ? vd != v0 : true, "should be different registers");
3141   vsetvli_helper(bt, vector_length);
3142   if (vm == Assembler::v0_t) {
3143     vmclr_m(vd);
3144   }
3145   switch (cond) {
3146     case BoolTest::eq: vmfeq_vv(vd, src1, src2, vm); break;
3147     case BoolTest::ne: vmfne_vv(vd, src1, src2, vm); break;
3148     case BoolTest::le: vmfle_vv(vd, src1, src2, vm); break;
3149     case BoolTest::ge: vmfge_vv(vd, src1, src2, vm); break;
3150     case BoolTest::lt: vmflt_vv(vd, src1, src2, vm); break;
3151     case BoolTest::gt: vmfgt_vv(vd, src1, src2, vm); break;
3152     default:
3153       assert(false, "unsupported compare condition");
3154       ShouldNotReachHere();
3155   }
3156 }
3157 
3158 // In Matcher::scalable_predicate_reg_slots,
3159 // we assume each predicate register is one-eighth of the size of
3160 // scalable vector register, one mask bit per vector byte.
3161 void C2_MacroAssembler::spill_vmask(VectorRegister v, int offset) {
3162   vsetvli_helper(T_BYTE, MaxVectorSize >> 3);
3163   add(t0, sp, offset);
3164   vse8_v(v, t0);
3165 }
3166 
3167 void C2_MacroAssembler::unspill_vmask(VectorRegister v, int offset) {
3168   vsetvli_helper(T_BYTE, MaxVectorSize >> 3);
3169   add(t0, sp, offset);
3170   vle8_v(v, t0);
3171 }
3172 
3173 void C2_MacroAssembler::integer_extend_v(VectorRegister dst, BasicType dst_bt, uint vector_length,
3174                                          VectorRegister src, BasicType src_bt, bool is_signed) {
3175   assert(type2aelembytes(dst_bt) > type2aelembytes(src_bt) && type2aelembytes(dst_bt) <= 8 && type2aelembytes(src_bt) <= 4, "invalid element size");
3176   assert(dst_bt != T_FLOAT && dst_bt != T_DOUBLE && src_bt != T_FLOAT && src_bt != T_DOUBLE, "unsupported element type");
3177   // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#52-vector-operands
3178   // The destination EEW is greater than the source EEW, the source EMUL is at least 1,
3179   // and the overlap is in the highest-numbered part of the destination register group.
3180   // Since LMUL=1, vd and vs cannot be the same.
3181   assert_different_registers(dst, src);
3182 
3183   vsetvli_helper(dst_bt, vector_length);
3184   if (is_signed) {
3185     if (src_bt == T_BYTE) {
3186       switch (dst_bt) {
3187       case T_SHORT:
3188         vsext_vf2(dst, src);
3189         break;
3190       case T_INT:
3191         vsext_vf4(dst, src);
3192         break;
3193       case T_LONG:
3194         vsext_vf8(dst, src);
3195         break;
3196       default:
3197         ShouldNotReachHere();
3198       }
3199     } else if (src_bt == T_SHORT) {
3200       if (dst_bt == T_INT) {
3201         vsext_vf2(dst, src);
3202       } else {
3203         vsext_vf4(dst, src);
3204       }
3205     } else if (src_bt == T_INT) {
3206       vsext_vf2(dst, src);
3207     }
3208   } else {
3209     if (src_bt == T_BYTE) {
3210       switch (dst_bt) {
3211       case T_SHORT:
3212         vzext_vf2(dst, src);
3213         break;
3214       case T_INT:
3215         vzext_vf4(dst, src);
3216         break;
3217       case T_LONG:
3218         vzext_vf8(dst, src);
3219         break;
3220       default:
3221         ShouldNotReachHere();
3222       }
3223     } else if (src_bt == T_SHORT) {
3224       if (dst_bt == T_INT) {
3225         vzext_vf2(dst, src);
3226       } else {
3227         vzext_vf4(dst, src);
3228       }
3229     } else if (src_bt == T_INT) {
3230       vzext_vf2(dst, src);
3231     }
3232   }
3233 }
3234 
3235 // Vector narrow from src to dst with specified element sizes.
3236 // High part of dst vector will be filled with zero.
3237 void C2_MacroAssembler::integer_narrow_v(VectorRegister dst, BasicType dst_bt, uint vector_length,
3238                                          VectorRegister src, BasicType src_bt) {
3239   assert(type2aelembytes(dst_bt) < type2aelembytes(src_bt) && type2aelembytes(dst_bt) <= 4 && type2aelembytes(src_bt) <= 8, "invalid element size");
3240   assert(dst_bt != T_FLOAT && dst_bt != T_DOUBLE && src_bt != T_FLOAT && src_bt != T_DOUBLE, "unsupported element type");
3241   mv(t0, vector_length);
3242   if (src_bt == T_LONG) {
3243     // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#117-vector-narrowing-integer-right-shift-instructions
3244     // Future extensions might add support for versions that narrow to a destination that is 1/4 the width of the source.
3245     // So we can currently only scale down by 1/2 the width at a time.
3246     vsetvli(t0, t0, Assembler::e32, Assembler::mf2);
3247     vncvt_x_x_w(dst, src);
3248     if (dst_bt == T_SHORT || dst_bt == T_BYTE) {
3249       vsetvli(t0, t0, Assembler::e16, Assembler::mf2);
3250       vncvt_x_x_w(dst, dst);
3251       if (dst_bt == T_BYTE) {
3252         vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
3253         vncvt_x_x_w(dst, dst);
3254       }
3255     }
3256   } else if (src_bt == T_INT) {
3257     // T_SHORT
3258     vsetvli(t0, t0, Assembler::e16, Assembler::mf2);
3259     vncvt_x_x_w(dst, src);
3260     if (dst_bt == T_BYTE) {
3261       vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
3262       vncvt_x_x_w(dst, dst);
3263     }
3264   } else if (src_bt == T_SHORT) {
3265     vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
3266     vncvt_x_x_w(dst, src);
3267   }
3268 }
3269 
3270 #define VFCVT_SAFE(VFLOATCVT)                                                      \
3271 void C2_MacroAssembler::VFLOATCVT##_safe(VectorRegister dst, VectorRegister src) { \
3272   assert_different_registers(dst, src);                                            \
3273   vxor_vv(dst, dst, dst);                                                          \
3274   vmfeq_vv(v0, src, src);                                                          \
3275   VFLOATCVT(dst, src, Assembler::v0_t);                                            \
3276 }
3277 
3278 VFCVT_SAFE(vfcvt_rtz_x_f_v);
3279 
3280 #undef VFCVT_SAFE
3281 
3282 // Extract a scalar element from an vector at position 'idx'.
3283 // The input elements in src are expected to be of integral type.
3284 void C2_MacroAssembler::extract_v(Register dst, VectorRegister src,
3285                                   BasicType bt, int idx, VectorRegister vtmp) {
3286   assert(is_integral_type(bt), "unsupported element type");
3287   assert(idx >= 0, "idx cannot be negative");
3288   // Only need the first element after vector slidedown
3289   vsetvli_helper(bt, 1);
3290   if (idx == 0) {
3291     vmv_x_s(dst, src);
3292   } else {
3293     slidedown_v(vtmp, src, idx);
3294     vmv_x_s(dst, vtmp);
3295   }
3296 }
3297 
3298 // Extract a scalar element from an vector at position 'idx'.
3299 // The input elements in src are expected to be of floating point type.
3300 void C2_MacroAssembler::extract_fp_v(FloatRegister dst, VectorRegister src,
3301                                      BasicType bt, int idx, VectorRegister vtmp) {
3302   assert(is_floating_point_type(bt), "unsupported element type");
3303   assert(idx >= 0, "idx cannot be negative");
3304   // Only need the first element after vector slidedown
3305   vsetvli_helper(bt, 1);
3306   if (idx == 0) {
3307     vfmv_f_s(dst, src);
3308   } else {
3309     slidedown_v(vtmp, src, idx);
3310     vfmv_f_s(dst, vtmp);
3311   }
3312 }
3313 
3314 // Move elements down a vector register group.
3315 // Offset is the start index (offset) for the source.
3316 void C2_MacroAssembler::slidedown_v(VectorRegister dst, VectorRegister src,
3317                                     uint32_t offset, Register tmp) {
3318   if (is_uimm5(offset)) {
3319     vslidedown_vi(dst, src, offset);
3320   } else {
3321     mv(tmp, offset);
3322     vslidedown_vx(dst, src, tmp);
3323   }
3324 }