1 /*
   2  * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2020, 2022, Huawei Technologies Co., Ltd. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "opto/c2_MacroAssembler.hpp"
  29 #include "opto/compile.hpp"
  30 #include "opto/intrinsicnode.hpp"
  31 #include "opto/output.hpp"
  32 #include "opto/subnode.hpp"
  33 #include "runtime/objectMonitorTable.hpp"
  34 #include "runtime/stubRoutines.hpp"
  35 #include "runtime/synchronizer.hpp"
  36 #include "utilities/globalDefinitions.hpp"
  37 
  38 #ifdef PRODUCT
  39 #define BLOCK_COMMENT(str) /* nothing */
  40 #define STOP(error) stop(error)
  41 #else
  42 #define BLOCK_COMMENT(str) block_comment(str)
  43 #define STOP(error) block_comment(error); stop(error)
  44 #endif
  45 
  46 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  47 
  48 void C2_MacroAssembler::entry_barrier() {
  49   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  50   // Dummy labels for just measuring the code size
  51   Label dummy_slow_path;
  52   Label dummy_continuation;
  53   Label dummy_guard;
  54   Label* slow_path = &dummy_slow_path;
  55   Label* continuation = &dummy_continuation;
  56   Label* guard = &dummy_guard;
  57 
  58   if (!Compile::current()->output()->in_scratch_emit_size()) {
  59     // Use real labels from actual stub when not emitting code for the purpose of measuring its size
  60     C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
  61     Compile::current()->output()->add_stub(stub);
  62     slow_path = &stub->entry();
  63     continuation = &stub->continuation();
  64     guard = &stub->guard();
  65   }
  66 
  67   // In the C2 code, we move the non-hot part of nmethod entry barriers out-of-line to a stub.
  68   bs->nmethod_entry_barrier(this, slow_path, continuation, guard);
  69 }
  70 
  71 void C2_MacroAssembler::fast_lock(Register obj, Register box,
  72                                   Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
  73   // Flag register, zero for success; non-zero for failure.
  74   Register flag = t1;
  75 
  76   assert_different_registers(obj, box, tmp1, tmp2, tmp3, tmp4, flag, t0);
  77 
  78   mv(flag, 1);
  79 
  80   // Handle inflated monitor.
  81   Label inflated;
  82   // Finish fast lock successfully. MUST branch to with flag == 0
  83   Label locked;
  84   // Finish fast lock unsuccessfully. slow_path MUST branch to with flag != 0
  85   Label slow_path;
  86 
  87   if (UseObjectMonitorTable) {
  88     // Clear cache in case fast locking succeeds or we need to take the slow-path.
  89     sd(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
  90   }
  91 
  92   if (DiagnoseSyncOnValueBasedClasses != 0) {
  93     load_klass(tmp1, obj);
  94     lbu(tmp1, Address(tmp1, Klass::misc_flags_offset()));
  95     test_bit(tmp1, tmp1, exact_log2(KlassFlags::_misc_is_value_based_class));
  96     bnez(tmp1, slow_path);
  97   }
  98 
  99   const Register tmp1_mark = tmp1;
 100   const Register tmp3_t = tmp3;
 101 
 102   { // Fast locking
 103 
 104     // Push lock to the lock stack and finish successfully. MUST branch to with flag == 0
 105     Label push;
 106 
 107     const Register tmp2_top = tmp2;
 108 
 109     // Check if lock-stack is full.
 110     lwu(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
 111     mv(tmp3_t, (unsigned)LockStack::end_offset());
 112     bge(tmp2_top, tmp3_t, slow_path);
 113 
 114     // Check if recursive.
 115     add(tmp3_t, xthread, tmp2_top);
 116     ld(tmp3_t, Address(tmp3_t, -oopSize));
 117     beq(obj, tmp3_t, push);
 118 
 119     // Relaxed normal load to check for monitor. Optimization for monitor case.
 120     ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 121     test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
 122     bnez(tmp3_t, inflated);
 123 
 124     // Not inflated
 125     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a la");
 126 
 127     // Try to lock. Transition lock-bits 0b01 => 0b00
 128     ori(tmp1_mark, tmp1_mark, markWord::unlocked_value);
 129     xori(tmp3_t, tmp1_mark, markWord::unlocked_value);
 130     cmpxchg(/*addr*/ obj, /*expected*/ tmp1_mark, /*new*/ tmp3_t, Assembler::int64,
 131             /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ tmp3_t);
 132     bne(tmp1_mark, tmp3_t, slow_path);
 133 
 134     bind(push);
 135     // After successful lock, push object on lock-stack.
 136     add(tmp3_t, xthread, tmp2_top);
 137     sd(obj, Address(tmp3_t));
 138     addw(tmp2_top, tmp2_top, oopSize);
 139     sw(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
 140     j(locked);
 141   }
 142 
 143   { // Handle inflated monitor.
 144     bind(inflated);
 145 
 146     const Register tmp1_monitor = tmp1;
 147 
 148     if (!UseObjectMonitorTable) {
 149       assert(tmp1_monitor == tmp1_mark, "should be the same here");
 150     } else {
 151       const Register tmp2_hash = tmp2;
 152       const Register tmp3_bucket = tmp3;
 153       Label monitor_found;
 154 
 155       // Save the mark, we might need it to extract the hash.
 156       mv(tmp2_hash, tmp1_mark);
 157 
 158       // Look for the monitor in the om_cache.
 159 
 160       ByteSize cache_offset   = JavaThread::om_cache_oops_offset();
 161       ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
 162       const int num_unrolled  = OMCache::CAPACITY;
 163       for (int i = 0; i < num_unrolled; i++) {
 164         ld(tmp1_monitor, Address(xthread, cache_offset + monitor_offset));
 165         ld(tmp4, Address(xthread, cache_offset));
 166         beq(obj, tmp4, monitor_found);
 167         cache_offset = cache_offset + OMCache::oop_to_oop_difference();
 168       }
 169 
 170       // Look for the monitor in the table.
 171 
 172       // Get the hash code.
 173       srli(tmp2_hash, tmp2_hash, markWord::hash_shift);
 174 
 175       // Get the table and calculate the bucket's address.
 176       la(tmp3_t, ExternalAddress(ObjectMonitorTable::current_table_address()));
 177       ld(tmp3_t, Address(tmp3_t));
 178       ld(tmp1, Address(tmp3_t, ObjectMonitorTable::table_capacity_mask_offset()));
 179       andr(tmp2_hash, tmp2_hash, tmp1);
 180       ld(tmp3_t, Address(tmp3_t, ObjectMonitorTable::table_buckets_offset()));
 181 
 182       // Read the monitor from the bucket.
 183       shadd(tmp3_bucket, tmp2_hash, tmp3_t, tmp4, LogBytesPerWord);
 184       ld(tmp1_monitor, Address(tmp3_bucket));
 185 
 186       // Check if the monitor in the bucket is special (empty, tombstone or removed).
 187       mv(tmp2, ObjectMonitorTable::SpecialPointerValues::below_is_special);
 188       bltu(tmp1_monitor, tmp2, slow_path);
 189 
 190       // Check if object matches.
 191       ld(tmp3, Address(tmp1_monitor, ObjectMonitor::object_offset()));
 192       BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 193       bs_asm->try_peek_weak_handle_in_nmethod(this, tmp3, tmp3, tmp2, slow_path);
 194       bne(tmp3, obj, slow_path);
 195 
 196       bind(monitor_found);
 197     }
 198 
 199     const Register tmp2_owner_addr = tmp2;
 200     const Register tmp3_owner = tmp3;
 201 
 202     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 203     const Address owner_address(tmp1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
 204     const Address recursions_address(tmp1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);
 205 
 206     Label monitor_locked;
 207 
 208     // Compute owner address.
 209     la(tmp2_owner_addr, owner_address);
 210 
 211     // Try to CAS owner (no owner => current thread's _monitor_owner_id).
 212     Register tid = tmp4;
 213     ld(tid, Address(xthread, JavaThread::monitor_owner_id_offset()));
 214     cmpxchg(/*addr*/ tmp2_owner_addr, /*expected*/ zr, /*new*/ tid, Assembler::int64,
 215             /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ tmp3_owner);
 216     beqz(tmp3_owner, monitor_locked);
 217 
 218     // Check if recursive.
 219     bne(tmp3_owner, tid, slow_path);
 220 
 221     // Recursive.
 222     increment(recursions_address, 1, tmp2, tmp3);
 223 
 224     bind(monitor_locked);
 225     if (UseObjectMonitorTable) {
 226       sd(tmp1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 227     }
 228   }
 229 
 230   bind(locked);
 231   mv(flag, zr);
 232 
 233 #ifdef ASSERT
 234   // Check that locked label is reached with flag == 0.
 235   Label flag_correct;
 236   beqz(flag, flag_correct);
 237   stop("Fast Lock Flag != 0");
 238 #endif
 239 
 240   bind(slow_path);
 241 #ifdef ASSERT
 242   // Check that slow_path label is reached with flag != 0.
 243   bnez(flag, flag_correct);
 244   stop("Fast Lock Flag == 0");
 245   bind(flag_correct);
 246 #endif
 247   // C2 uses the value of flag (0 vs !0) to determine the continuation.
 248 }
 249 
 250 void C2_MacroAssembler::fast_unlock(Register obj, Register box,
 251                                     Register tmp1, Register tmp2, Register tmp3) {
 252   // Flag register, zero for success; non-zero for failure.
 253   Register flag = t1;
 254 
 255   assert_different_registers(obj, box, tmp1, tmp2, tmp3, flag, t0);
 256 
 257   mv(flag, 1);
 258 
 259   // Handle inflated monitor.
 260   Label inflated, inflated_load_mark;
 261   // Finish fast unlock successfully. unlocked MUST branch to with flag == 0
 262   Label unlocked;
 263   // Finish fast unlock unsuccessfully. MUST branch to with flag != 0
 264   Label slow_path;
 265 
 266   const Register tmp1_mark = tmp1;
 267   const Register tmp2_top = tmp2;
 268   const Register tmp3_t = tmp3;
 269 
 270   { // Fast unlock
 271     Label push_and_slow_path;
 272 
 273     // Check if obj is top of lock-stack.
 274     lwu(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
 275     subw(tmp2_top, tmp2_top, oopSize);
 276     add(tmp3_t, xthread, tmp2_top);
 277     ld(tmp3_t, Address(tmp3_t));
 278     // Top of lock stack was not obj. Must be monitor.
 279     bne(obj, tmp3_t, inflated_load_mark);
 280 
 281     // Pop lock-stack.
 282     DEBUG_ONLY(add(tmp3_t, xthread, tmp2_top);)
 283     DEBUG_ONLY(sd(zr, Address(tmp3_t));)
 284     sw(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
 285 
 286     // Check if recursive.
 287     add(tmp3_t, xthread, tmp2_top);
 288     ld(tmp3_t, Address(tmp3_t, -oopSize));
 289     beq(obj, tmp3_t, unlocked);
 290 
 291     // Not recursive.
 292     // Load Mark.
 293     ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 294 
 295     // Check header for monitor (0b10).
 296     // Because we got here by popping (meaning we pushed in locked)
 297     // there will be no monitor in the box. So we need to push back the obj
 298     // so that the runtime can fix any potential anonymous owner.
 299     test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
 300     bnez(tmp3_t, UseObjectMonitorTable ? push_and_slow_path : inflated);
 301 
 302     // Try to unlock. Transition lock bits 0b00 => 0b01
 303     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
 304     ori(tmp3_t, tmp1_mark, markWord::unlocked_value);
 305     cmpxchg(/*addr*/ obj, /*expected*/ tmp1_mark, /*new*/ tmp3_t, Assembler::int64,
 306             /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, /*result*/ tmp3_t);
 307     beq(tmp1_mark, tmp3_t, unlocked);
 308 
 309     bind(push_and_slow_path);
 310     // Compare and exchange failed.
 311     // Restore lock-stack and handle the unlock in runtime.
 312     DEBUG_ONLY(add(tmp3_t, xthread, tmp2_top);)
 313     DEBUG_ONLY(sd(obj, Address(tmp3_t));)
 314     addw(tmp2_top, tmp2_top, oopSize);
 315     sd(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
 316     j(slow_path);
 317   }
 318 
 319   { // Handle inflated monitor.
 320     bind(inflated_load_mark);
 321     ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 322 #ifdef ASSERT
 323     test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
 324     bnez(tmp3_t, inflated);
 325     stop("Fast Unlock not monitor");
 326 #endif
 327 
 328     bind(inflated);
 329 
 330 #ifdef ASSERT
 331     Label check_done;
 332     subw(tmp2_top, tmp2_top, oopSize);
 333     mv(tmp3_t, in_bytes(JavaThread::lock_stack_base_offset()));
 334     blt(tmp2_top, tmp3_t, check_done);
 335     add(tmp3_t, xthread, tmp2_top);
 336     ld(tmp3_t, Address(tmp3_t));
 337     bne(obj, tmp3_t, inflated);
 338     stop("Fast Unlock lock on stack");
 339     bind(check_done);
 340 #endif
 341 
 342     const Register tmp1_monitor = tmp1;
 343 
 344     if (!UseObjectMonitorTable) {
 345       assert(tmp1_monitor == tmp1_mark, "should be the same here");
 346       // Untag the monitor.
 347       subi(tmp1_monitor, tmp1_mark, (int)markWord::monitor_value);
 348     } else {
 349       ld(tmp1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 350       // No valid pointer below alignof(ObjectMonitor*). Take the slow path.
 351       mv(tmp3_t, alignof(ObjectMonitor*));
 352       bltu(tmp1_monitor, tmp3_t, slow_path);
 353     }
 354 
 355     const Register tmp2_recursions = tmp2;
 356     Label not_recursive;
 357 
 358     // Check if recursive.
 359     ld(tmp2_recursions, Address(tmp1_monitor, ObjectMonitor::recursions_offset()));
 360     beqz(tmp2_recursions, not_recursive);
 361 
 362     // Recursive unlock.
 363     subi(tmp2_recursions, tmp2_recursions, 1);
 364     sd(tmp2_recursions, Address(tmp1_monitor, ObjectMonitor::recursions_offset()));
 365     j(unlocked);
 366 
 367     bind(not_recursive);
 368 
 369     const Register tmp2_owner_addr = tmp2;
 370 
 371     // Compute owner address.
 372     la(tmp2_owner_addr, Address(tmp1_monitor, ObjectMonitor::owner_offset()));
 373 
 374     // Set owner to null.
 375     // Release to satisfy the JMM
 376     membar(MacroAssembler::LoadStore | MacroAssembler::StoreStore);
 377     sd(zr, Address(tmp2_owner_addr));
 378     // We need a full fence after clearing owner to avoid stranding.
 379     // StoreLoad achieves this.
 380     membar(StoreLoad);
 381 
 382     // Check if the entry_list is empty.
 383     ld(t0, Address(tmp1_monitor, ObjectMonitor::entry_list_offset()));
 384     beqz(t0, unlocked); // If so we are done.
 385 
 386     // Check if there is a successor.
 387     ld(tmp3_t, Address(tmp1_monitor, ObjectMonitor::succ_offset()));
 388     bnez(tmp3_t, unlocked); // If so we are done.
 389 
 390     // Save the monitor pointer in the current thread, so we can try
 391     // to reacquire the lock in SharedRuntime::monitor_exit_helper().
 392     sd(tmp1_monitor, Address(xthread, JavaThread::unlocked_inflated_monitor_offset()));
 393 
 394     mv(flag, 1);
 395     j(slow_path);
 396   }
 397 
 398   bind(unlocked);
 399   mv(flag, zr);
 400 
 401 #ifdef ASSERT
 402   // Check that unlocked label is reached with flag == 0.
 403   Label flag_correct;
 404   beqz(flag, flag_correct);
 405   stop("Fast Lock Flag != 0");
 406 #endif
 407 
 408   bind(slow_path);
 409 #ifdef ASSERT
 410   // Check that slow_path label is reached with flag != 0.
 411   bnez(flag, flag_correct);
 412   stop("Fast Lock Flag == 0");
 413   bind(flag_correct);
 414 #endif
 415   // C2 uses the value of flag (0 vs !0) to determine the continuation.
 416 }
 417 
 418 // short string
 419 // StringUTF16.indexOfChar
 420 // StringLatin1.indexOfChar
 421 void C2_MacroAssembler::string_indexof_char_short(Register str1, Register cnt1,
 422                                                   Register ch, Register result,
 423                                                   bool isL)
 424 {
 425   Register ch1 = t0;
 426   Register index = t1;
 427 
 428   BLOCK_COMMENT("string_indexof_char_short {");
 429 
 430   Label LOOP, LOOP1, LOOP4, LOOP8;
 431   Label MATCH,  MATCH1, MATCH2, MATCH3,
 432         MATCH4, MATCH5, MATCH6, MATCH7, NOMATCH;
 433 
 434   mv(result, -1);
 435   mv(index, zr);
 436 
 437   bind(LOOP);
 438   addi(t0, index, 8);
 439   ble(t0, cnt1, LOOP8);
 440   addi(t0, index, 4);
 441   ble(t0, cnt1, LOOP4);
 442   j(LOOP1);
 443 
 444   bind(LOOP8);
 445   isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0));
 446   beq(ch, ch1, MATCH);
 447   isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2));
 448   beq(ch, ch1, MATCH1);
 449   isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4));
 450   beq(ch, ch1, MATCH2);
 451   isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6));
 452   beq(ch, ch1, MATCH3);
 453   isL ? lbu(ch1, Address(str1, 4)) : lhu(ch1, Address(str1, 8));
 454   beq(ch, ch1, MATCH4);
 455   isL ? lbu(ch1, Address(str1, 5)) : lhu(ch1, Address(str1, 10));
 456   beq(ch, ch1, MATCH5);
 457   isL ? lbu(ch1, Address(str1, 6)) : lhu(ch1, Address(str1, 12));
 458   beq(ch, ch1, MATCH6);
 459   isL ? lbu(ch1, Address(str1, 7)) : lhu(ch1, Address(str1, 14));
 460   beq(ch, ch1, MATCH7);
 461   addi(index, index, 8);
 462   addi(str1, str1, isL ? 8 : 16);
 463   blt(index, cnt1, LOOP);
 464   j(NOMATCH);
 465 
 466   bind(LOOP4);
 467   isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0));
 468   beq(ch, ch1, MATCH);
 469   isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2));
 470   beq(ch, ch1, MATCH1);
 471   isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4));
 472   beq(ch, ch1, MATCH2);
 473   isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6));
 474   beq(ch, ch1, MATCH3);
 475   addi(index, index, 4);
 476   addi(str1, str1, isL ? 4 : 8);
 477   bge(index, cnt1, NOMATCH);
 478 
 479   bind(LOOP1);
 480   isL ? lbu(ch1, Address(str1)) : lhu(ch1, Address(str1));
 481   beq(ch, ch1, MATCH);
 482   addi(index, index, 1);
 483   addi(str1, str1, isL ? 1 : 2);
 484   blt(index, cnt1, LOOP1);
 485   j(NOMATCH);
 486 
 487   bind(MATCH1);
 488   addi(index, index, 1);
 489   j(MATCH);
 490 
 491   bind(MATCH2);
 492   addi(index, index, 2);
 493   j(MATCH);
 494 
 495   bind(MATCH3);
 496   addi(index, index, 3);
 497   j(MATCH);
 498 
 499   bind(MATCH4);
 500   addi(index, index, 4);
 501   j(MATCH);
 502 
 503   bind(MATCH5);
 504   addi(index, index, 5);
 505   j(MATCH);
 506 
 507   bind(MATCH6);
 508   addi(index, index, 6);
 509   j(MATCH);
 510 
 511   bind(MATCH7);
 512   addi(index, index, 7);
 513 
 514   bind(MATCH);
 515   mv(result, index);
 516   bind(NOMATCH);
 517   BLOCK_COMMENT("} string_indexof_char_short");
 518 }
 519 
 520 // StringUTF16.indexOfChar
 521 // StringLatin1.indexOfChar
 522 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
 523                                             Register ch, Register result,
 524                                             Register tmp1, Register tmp2,
 525                                             Register tmp3, Register tmp4,
 526                                             bool isL)
 527 {
 528   Label CH1_LOOP, HIT, NOMATCH, DONE, DO_LONG;
 529   Register ch1 = t0;
 530   Register orig_cnt = t1;
 531   Register mask1 = tmp3;
 532   Register mask2 = tmp2;
 533   Register match_mask = tmp1;
 534   Register trailing_char = tmp4;
 535   Register unaligned_elems = tmp4;
 536 
 537   BLOCK_COMMENT("string_indexof_char {");
 538   beqz(cnt1, NOMATCH);
 539 
 540   subi(t0, cnt1, isL ? 32 : 16);
 541   bgtz(t0, DO_LONG);
 542   string_indexof_char_short(str1, cnt1, ch, result, isL);
 543   j(DONE);
 544 
 545   bind(DO_LONG);
 546   mv(orig_cnt, cnt1);
 547   if (AvoidUnalignedAccesses) {
 548     Label ALIGNED;
 549     andi(unaligned_elems, str1, 0x7);
 550     beqz(unaligned_elems, ALIGNED);
 551     sub(unaligned_elems, unaligned_elems, 8);
 552     neg(unaligned_elems, unaligned_elems);
 553     if (!isL) {
 554       srli(unaligned_elems, unaligned_elems, 1);
 555     }
 556     // do unaligned part per element
 557     string_indexof_char_short(str1, unaligned_elems, ch, result, isL);
 558     bgez(result, DONE);
 559     mv(orig_cnt, cnt1);
 560     sub(cnt1, cnt1, unaligned_elems);
 561     bind(ALIGNED);
 562   }
 563 
 564   // duplicate ch
 565   if (isL) {
 566     slli(ch1, ch, 8);
 567     orr(ch, ch1, ch);
 568   }
 569   slli(ch1, ch, 16);
 570   orr(ch, ch1, ch);
 571   slli(ch1, ch, 32);
 572   orr(ch, ch1, ch);
 573 
 574   if (!isL) {
 575     slli(cnt1, cnt1, 1);
 576   }
 577 
 578   uint64_t mask0101 = UCONST64(0x0101010101010101);
 579   uint64_t mask0001 = UCONST64(0x0001000100010001);
 580   mv(mask1, isL ? mask0101 : mask0001);
 581   uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f);
 582   uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff);
 583   mv(mask2, isL ? mask7f7f : mask7fff);
 584 
 585   bind(CH1_LOOP);
 586   ld(ch1, Address(str1));
 587   addi(str1, str1, 8);
 588   subi(cnt1, cnt1, 8);
 589   compute_match_mask(ch1, ch, match_mask, mask1, mask2);
 590   bnez(match_mask, HIT);
 591   bgtz(cnt1, CH1_LOOP);
 592   j(NOMATCH);
 593 
 594   bind(HIT);
 595   // count bits of trailing zero chars
 596   ctzc_bits(trailing_char, match_mask, isL, ch1, result);
 597   srli(trailing_char, trailing_char, 3);
 598   addi(cnt1, cnt1, 8);
 599   ble(cnt1, trailing_char, NOMATCH);
 600   // match case
 601   if (!isL) {
 602     srli(cnt1, cnt1, 1);
 603     srli(trailing_char, trailing_char, 1);
 604   }
 605 
 606   sub(result, orig_cnt, cnt1);
 607   add(result, result, trailing_char);
 608   j(DONE);
 609 
 610   bind(NOMATCH);
 611   mv(result, -1);
 612 
 613   bind(DONE);
 614   BLOCK_COMMENT("} string_indexof_char");
 615 }
 616 
 617 typedef void (MacroAssembler::* load_chr_insn)(Register rd, const Address &adr, Register temp);
 618 
 619 // Search for needle in haystack and return index or -1
 620 // x10: result
 621 // x11: haystack
 622 // x12: haystack_len
 623 // x13: needle
 624 // x14: needle_len
 625 void C2_MacroAssembler::string_indexof(Register haystack, Register needle,
 626                                        Register haystack_len, Register needle_len,
 627                                        Register tmp1, Register tmp2,
 628                                        Register tmp3, Register tmp4,
 629                                        Register tmp5, Register tmp6,
 630                                        Register result, int ae)
 631 {
 632   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
 633 
 634   Label LINEARSEARCH, LINEARSTUB, DONE, NOMATCH;
 635 
 636   Register ch1 = t0;
 637   Register ch2 = t1;
 638   Register nlen_tmp = tmp1; // needle len tmp
 639   Register hlen_tmp = tmp2; // haystack len tmp
 640   Register result_tmp = tmp4;
 641 
 642   bool isLL = ae == StrIntrinsicNode::LL;
 643 
 644   bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
 645   bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
 646   int needle_chr_shift = needle_isL ? 0 : 1;
 647   int haystack_chr_shift = haystack_isL ? 0 : 1;
 648   int needle_chr_size = needle_isL ? 1 : 2;
 649   int haystack_chr_size = haystack_isL ? 1 : 2;
 650   load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu :
 651                               (load_chr_insn)&MacroAssembler::lhu;
 652   load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu :
 653                                 (load_chr_insn)&MacroAssembler::lhu;
 654 
 655   BLOCK_COMMENT("string_indexof {");
 656 
 657   // Note, inline_string_indexOf() generates checks:
 658   // if (pattern.count > src.count) return -1;
 659   // if (pattern.count == 0) return 0;
 660 
 661   // We have two strings, a source string in haystack, haystack_len and a pattern string
 662   // in needle, needle_len. Find the first occurrence of pattern in source or return -1.
 663 
 664   // For larger pattern and source we use a simplified Boyer Moore algorithm.
 665   // With a small pattern and source we use linear scan.
 666 
 667   // needle_len >=8 && needle_len < 256 && needle_len < haystack_len/4, use bmh algorithm.
 668   sub(result_tmp, haystack_len, needle_len);
 669   // needle_len < 8, use linear scan
 670   sub(t0, needle_len, 8);
 671   bltz(t0, LINEARSEARCH);
 672   // needle_len >= 256, use linear scan
 673   sub(t0, needle_len, 256);
 674   bgez(t0, LINEARSTUB);
 675   // needle_len >= haystack_len/4, use linear scan
 676   srli(t0, haystack_len, 2);
 677   bge(needle_len, t0, LINEARSTUB);
 678 
 679   // Boyer-Moore-Horspool introduction:
  // The Boyer Moore algorithm is based on the description here:-
 681   //
 682   // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
 683   //
  // This describes an algorithm with 2 shift rules. The 'Bad Character' rule
 685   // and the 'Good Suffix' rule.
 686   //
 687   // These rules are essentially heuristics for how far we can shift the
 688   // pattern along the search string.
 689   //
 690   // The implementation here uses the 'Bad Character' rule only because of the
 691   // complexity of initialisation for the 'Good Suffix' rule.
 692   //
 693   // This is also known as the Boyer-Moore-Horspool algorithm:
 694   //
 695   // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
 696   //
 697   // #define ASIZE 256
 698   //
 699   //    int bm(unsigned char *pattern, int m, unsigned char *src, int n) {
 700   //      int i, j;
 701   //      unsigned c;
 702   //      unsigned char bc[ASIZE];
 703   //
 704   //      /* Preprocessing */
 705   //      for (i = 0; i < ASIZE; ++i)
 706   //        bc[i] = m;
 707   //      for (i = 0; i < m - 1; ) {
 708   //        c = pattern[i];
 709   //        ++i;
 710   //        // c < 256 for Latin1 string, so, no need for branch
 711   //        #ifdef PATTERN_STRING_IS_LATIN1
 712   //        bc[c] = m - i;
 713   //        #else
 714   //        if (c < ASIZE) bc[c] = m - i;
 715   //        #endif
 716   //      }
 717   //
 718   //      /* Searching */
 719   //      j = 0;
 720   //      while (j <= n - m) {
 721   //        c = src[i+j];
 722   //        if (pattern[m-1] == c)
 723   //          int k;
 724   //          for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k);
 725   //          if (k < 0) return j;
 726   //          // c < 256 for Latin1 string, so, no need for branch
 727   //          #ifdef SOURCE_STRING_IS_LATIN1_AND_PATTERN_STRING_IS_LATIN1
 728   //          // LL case: (c< 256) always true. Remove branch
 729   //          j += bc[pattern[j+m-1]];
 730   //          #endif
 731   //          #ifdef SOURCE_STRING_IS_UTF_AND_PATTERN_STRING_IS_UTF
 732   //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
 733   //          if (c < ASIZE)
 734   //            j += bc[pattern[j+m-1]];
 735   //          else
 736   //            j += 1
 737   //          #endif
 738   //          #ifdef SOURCE_IS_UTF_AND_PATTERN_IS_LATIN1
 739   //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
 740   //          if (c < ASIZE)
 741   //            j += bc[pattern[j+m-1]];
 742   //          else
 743   //            j += m
 744   //          #endif
 745   //      }
 746   //      return -1;
 747   //    }
 748 
 749   // temp register:t0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, result
 750   Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
 751         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
 752 
 753   Register haystack_end = haystack_len;
 754   Register skipch = tmp2;
 755 
 756   // pattern length is >=8, so, we can read at least 1 register for cases when
 757   // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
 758   // UL case. We'll re-read last character in inner pre-loop code to have
 759   // single outer pre-loop load
 760   const int firstStep = isLL ? 7 : 3;
 761 
 762   const int ASIZE = 256;
 763   const int STORE_BYTES = 8; // 8 bytes stored per instruction(sd)
 764 
 765   subi(sp, sp, ASIZE);
 766 
 767   // init BC offset table with default value: needle_len
 768   slli(t0, needle_len, 8);
 769   orr(t0, t0, needle_len); // [63...16][needle_len][needle_len]
 770   slli(tmp1, t0, 16);
 771   orr(t0, tmp1, t0); // [63...32][needle_len][needle_len][needle_len][needle_len]
 772   slli(tmp1, t0, 32);
 773   orr(tmp5, tmp1, t0); // tmp5: 8 elements [needle_len]
 774 
 775   mv(ch1, sp);  // ch1 is t0
 776   mv(tmp6, ASIZE / STORE_BYTES); // loop iterations
 777 
 778   bind(BM_INIT_LOOP);
 779   // for (i = 0; i < ASIZE; ++i)
 780   //   bc[i] = m;
 781   for (int i = 0; i < 4; i++) {
 782     sd(tmp5, Address(ch1, i * wordSize));
 783   }
 784   addi(ch1, ch1, 32);
 785   subi(tmp6, tmp6, 4);
 786   bgtz(tmp6, BM_INIT_LOOP);
 787 
 788   subi(nlen_tmp, needle_len, 1); // m - 1, index of the last element in pattern
 789   Register orig_haystack = tmp5;
 790   mv(orig_haystack, haystack);
 791   // result_tmp = tmp4
 792   shadd(haystack_end, result_tmp, haystack, haystack_end, haystack_chr_shift);
 793   subi(ch2, needle_len, 1); // bc offset init value, ch2 is t1
 794   mv(tmp3, needle);
 795 
 796   //  for (i = 0; i < m - 1; ) {
 797   //    c = pattern[i];
 798   //    ++i;
 799   //    // c < 256 for Latin1 string, so, no need for branch
 800   //    #ifdef PATTERN_STRING_IS_LATIN1
 801   //    bc[c] = m - i;
 802   //    #else
 803   //    if (c < ASIZE) bc[c] = m - i;
 804   //    #endif
 805   //  }
 806   bind(BCLOOP);
 807   (this->*needle_load_1chr)(ch1, Address(tmp3), noreg);
 808   addi(tmp3, tmp3, needle_chr_size);
 809   if (!needle_isL) {
 810     // ae == StrIntrinsicNode::UU
 811     mv(tmp6, ASIZE);
 812     bgeu(ch1, tmp6, BCSKIP);
 813   }
 814   add(tmp4, sp, ch1);
 815   sb(ch2, Address(tmp4)); // store skip offset to BC offset table
 816 
 817   bind(BCSKIP);
 818   subi(ch2, ch2, 1); // for next pattern element, skip distance -1
 819   bgtz(ch2, BCLOOP);
 820 
 821   // tmp6: pattern end, address after needle
 822   shadd(tmp6, needle_len, needle, tmp6, needle_chr_shift);
 823   if (needle_isL == haystack_isL) {
 824     // load last 8 bytes (8LL/4UU symbols)
 825     ld(tmp6, Address(tmp6, -wordSize));
 826   } else {
 827     // UL: from UTF-16(source) search Latin1(pattern)
 828     lwu(tmp6, Address(tmp6, -wordSize / 2)); // load last 4 bytes(4 symbols)
 829     // convert Latin1 to UTF. eg: 0x0000abcd -> 0x0a0b0c0d
 830     // We'll have to wait until load completed, but it's still faster than per-character loads+checks
 831     srli(tmp3, tmp6, BitsPerByte * (wordSize / 2 - needle_chr_size)); // pattern[m-1], eg:0x0000000a
 832     slli(ch2, tmp6, XLEN - 24);
 833     srli(ch2, ch2, XLEN - 8); // pattern[m-2], 0x0000000b
 834     slli(ch1, tmp6, XLEN - 16);
 835     srli(ch1, ch1, XLEN - 8); // pattern[m-3], 0x0000000c
 836     zext(tmp6, tmp6, 8); // pattern[m-4], 0x0000000d
 837     slli(ch2, ch2, 16);
 838     orr(ch2, ch2, ch1); // 0x00000b0c
 839     slli(result, tmp3, 48); // use result as temp register
 840     orr(tmp6, tmp6, result); // 0x0a00000d
 841     slli(result, ch2, 16);
 842     orr(tmp6, tmp6, result); // UTF-16:0x0a0b0c0d
 843   }
 844 
 845   // i = m - 1;
 846   // skipch = j + i;
 847   // if (skipch == pattern[m - 1]
 848   //   for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k);
 849   // else
 850   //   move j with bad char offset table
 851   bind(BMLOOPSTR2);
 852   // compare pattern to source string backward
 853   shadd(result, nlen_tmp, haystack, result, haystack_chr_shift);
 854   (this->*haystack_load_1chr)(skipch, Address(result), noreg);
 855   subi(nlen_tmp, nlen_tmp, firstStep); // nlen_tmp is positive here, because needle_len >= 8
 856   if (needle_isL == haystack_isL) {
 857     // re-init tmp3. It's for free because it's executed in parallel with
 858     // load above. Alternative is to initialize it before loop, but it'll
 859     // affect performance on in-order systems with 2 or more ld/st pipelines
 860     srli(tmp3, tmp6, BitsPerByte * (wordSize - needle_chr_size)); // UU/LL: pattern[m-1]
 861   }
 862   if (!isLL) { // UU/UL case
 863     slli(ch2, nlen_tmp, 1); // offsets in bytes
 864   }
 865   bne(tmp3, skipch, BMSKIP); // if not equal, skipch is bad char
 866   add(result, haystack, isLL ? nlen_tmp : ch2);
 867   // load 8 bytes from source string
 868   // if isLL is false then read granularity can be 2
 869   load_long_misaligned(ch2, Address(result), ch1, isLL ? 1 : 2); // can use ch1 as temp register here as it will be trashed by next mv anyway
 870   mv(ch1, tmp6);
 871   if (isLL) {
 872     j(BMLOOPSTR1_AFTER_LOAD);
 873   } else {
 874     subi(nlen_tmp, nlen_tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
 875     j(BMLOOPSTR1_CMP);
 876   }
 877 
 878   bind(BMLOOPSTR1);
 879   shadd(ch1, nlen_tmp, needle, ch1, needle_chr_shift);
 880   (this->*needle_load_1chr)(ch1, Address(ch1), noreg);
 881   shadd(ch2, nlen_tmp, haystack, ch2, haystack_chr_shift);
 882   (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
 883 
 884   bind(BMLOOPSTR1_AFTER_LOAD);
 885   subi(nlen_tmp, nlen_tmp, 1);
 886   bltz(nlen_tmp, BMLOOPSTR1_LASTCMP);
 887 
 888   bind(BMLOOPSTR1_CMP);
 889   beq(ch1, ch2, BMLOOPSTR1);
 890 
 891   bind(BMSKIP);
 892   if (!isLL) {
 893     // if we've met UTF symbol while searching Latin1 pattern, then we can
 894     // skip needle_len symbols
 895     if (needle_isL != haystack_isL) {
 896       mv(result_tmp, needle_len);
 897     } else {
 898       mv(result_tmp, 1);
 899     }
 900     mv(t0, ASIZE);
 901     bgeu(skipch, t0, BMADV);
 902   }
 903   add(result_tmp, sp, skipch);
 904   lbu(result_tmp, Address(result_tmp)); // load skip offset
 905 
 906   bind(BMADV);
 907   subi(nlen_tmp, needle_len, 1);
 908   // move haystack after bad char skip offset
 909   shadd(haystack, result_tmp, haystack, result, haystack_chr_shift);
 910   ble(haystack, haystack_end, BMLOOPSTR2);
 911   addi(sp, sp, ASIZE);
 912   j(NOMATCH);
 913 
 914   bind(BMLOOPSTR1_LASTCMP);
 915   bne(ch1, ch2, BMSKIP);
 916 
 917   bind(BMMATCH);
 918   sub(result, haystack, orig_haystack);
 919   if (!haystack_isL) {
 920     srli(result, result, 1);
 921   }
 922   addi(sp, sp, ASIZE);
 923   j(DONE);
 924 
 925   bind(LINEARSTUB);
 926   subi(t0, needle_len, 16); // small patterns still should be handled by simple algorithm
 927   bltz(t0, LINEARSEARCH);
 928   mv(result, zr);
 929   RuntimeAddress stub = nullptr;
 930   if (isLL) {
 931     stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ll());
 932     assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
 933   } else if (needle_isL) {
 934     stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ul());
 935     assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
 936   } else {
 937     stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_uu());
 938     assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
 939   }
 940   address call = reloc_call(stub);
 941   if (call == nullptr) {
 942     DEBUG_ONLY(reset_labels(LINEARSEARCH, DONE, NOMATCH));
 943     ciEnv::current()->record_failure("CodeCache is full");
 944     return;
 945   }
 946   j(DONE);
 947 
 948   bind(NOMATCH);
 949   mv(result, -1);
 950   j(DONE);
 951 
 952   bind(LINEARSEARCH);
 953   string_indexof_linearscan(haystack, needle, haystack_len, needle_len, tmp1, tmp2, tmp3, tmp4, -1, result, ae);
 954 
 955   bind(DONE);
 956   BLOCK_COMMENT("} string_indexof");
 957 }
 958 
 959 // string_indexof
 960 // result: x10
 961 // src: x11
 962 // src_count: x12
 963 // pattern: x13
 964 // pattern_count: x14 or 1/2/3/4
     //
     // Emits the linear-scan flavour of the indexOf intrinsic: candidate start
     // positions in the haystack are tried one after another, from the lowest
     // index upwards.
     //
     //   needle_con_cnt == -1      needle length is read from needle_len
     //   needle_con_cnt in 1..4    needle length is that constant; only the code
     //                             specialised for it is emitted
     //   ae                        StrIntrinsicNode encoding pair (LU is rejected)
     //
     // On exit result holds the character index of the match, or -1 if there is
     // none. haystack, needle, haystack_len, needle_len, tmp1-tmp4, t0 and t1 may
     // all be overwritten.
     //
     // NOTE(review): when needle_con_cnt == -1, every path except DO1 reads
     // result_tmp (tmp4) without writing it first, so the caller apparently has
     // to preload it -- presumably with haystack_len - needle_len, mirroring what
     // the constant-length paths compute here. Confirm against the caller.
 965 void C2_MacroAssembler::string_indexof_linearscan(Register haystack, Register needle,
 966                                                Register haystack_len, Register needle_len,
 967                                                Register tmp1, Register tmp2,
 968                                                Register tmp3, Register tmp4,
 969                                                int needle_con_cnt, Register result, int ae)
 970 {
 971   // Note:
 972   // needle_con_cnt > 0 means needle_len register is invalid, needle length is constant
 973   // for UU/LL: needle_con_cnt[1, 4], UL: needle_con_cnt = 1
 974   assert(needle_con_cnt <= 4, "Invalid needle constant count");
 975   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
 976 
 977   Register ch1 = t0;
 978   Register ch2 = t1;
       // The length registers double as negative byte offsets that count up
       // towards zero while scanning; they are overwritten when those offsets
       // are set up.
 979   Register hlen_neg = haystack_len, nlen_neg = needle_len;
 980   Register nlen_tmp = tmp1, hlen_tmp = tmp2, result_tmp = tmp4;
 981 
 982   bool isLL = ae == StrIntrinsicNode::LL;
 983 
 984   bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
 985   bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
 986   int needle_chr_shift = needle_isL ? 0 : 1;
 987   int haystack_chr_shift = haystack_isL ? 0 : 1;
 988   int needle_chr_size = needle_isL ? 1 : 2;
 989   int haystack_chr_size = haystack_isL ? 1 : 2;
 990 
       // Single-character loads sized for each operand, plus wider loads that
       // fetch two or four characters at once (selected by isLL alone).
 991   load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu :
 992                               (load_chr_insn)&MacroAssembler::lhu;
 993   load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu :
 994                                 (load_chr_insn)&MacroAssembler::lhu;
 995   load_chr_insn load_2chr = isLL ? (load_chr_insn)&MacroAssembler::lhu : (load_chr_insn)&MacroAssembler::lwu;
 996   load_chr_insn load_4chr = isLL ? (load_chr_insn)&MacroAssembler::lwu : (load_chr_insn)&MacroAssembler::ld;
 997 
 998   Label DO1, DO2, DO3, MATCH, NOMATCH, DONE;
 999 
1000   Register first = tmp3;
1001 
       // Needle length only known at run time: generic loop, with short needles
       // dispatched to the DO1/DO2/DO3 sections emitted further down.
1002   if (needle_con_cnt == -1) {
1003     Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
1004 
         // Needles shorter than 4 characters (same encoding) or 2 characters
         // (mixed encoding) are handled at DOSHORT.
1005     subi(t0, needle_len, needle_isL == haystack_isL ? 4 : 2);
1006     bltz(t0, DOSHORT);
1007 
         // first = needle[0]. Then move needle to its end and haystack to the
         // last candidate start position, keeping the distance back to the
         // beginning as negative byte offsets in nlen_neg / hlen_neg.
1008     (this->*needle_load_1chr)(first, Address(needle), noreg);
1009     slli(t0, needle_len, needle_chr_shift);
1010     add(needle, needle, t0);
1011     neg(nlen_neg, t0);
1012     slli(t0, result_tmp, haystack_chr_shift);
1013     add(haystack, haystack, t0);
1014     neg(hlen_neg, t0);
1015 
         // Look for a haystack character equal to the first needle character.
1016     bind(FIRST_LOOP);
1017     add(t0, haystack, hlen_neg);
1018     (this->*haystack_load_1chr)(ch2, Address(t0), noreg);
1019     beq(first, ch2, STR1_LOOP);
1020 
         // Advance one haystack character; an offset <= 0 is still a valid
         // candidate position.
1021     bind(STR2_NEXT);
1022     addi(hlen_neg, hlen_neg, haystack_chr_size);
1023     blez(hlen_neg, FIRST_LOOP);
1024     j(NOMATCH);
1025 
         // First character matched: compare the remaining needle characters
         // using temporary offsets so that hlen_neg keeps the candidate position.
1026     bind(STR1_LOOP);
1027     addi(nlen_tmp, nlen_neg, needle_chr_size);
1028     addi(hlen_tmp, hlen_neg, haystack_chr_size);
1029     bgez(nlen_tmp, MATCH);
1030 
1031     bind(STR1_NEXT);
1032     add(ch1, needle, nlen_tmp);
1033     (this->*needle_load_1chr)(ch1, Address(ch1), noreg);
1034     add(ch2, haystack, hlen_tmp);
1035     (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
1036     bne(ch1, ch2, STR2_NEXT);
1037     addi(nlen_tmp, nlen_tmp, needle_chr_size);
1038     addi(hlen_tmp, hlen_tmp, haystack_chr_size);
1039     bltz(nlen_tmp, STR1_NEXT);
1040     j(MATCH);
1041 
         // Short needle. Same encoding: length < 2 goes to DO1, length > 2 goes
         // to DO3, and length == 2 falls through into the DO2 section emitted
         // next. Mixed encoding: no dispatch is emitted and control falls through
         // to DO1, the only short section generated in that case.
1042     bind(DOSHORT);
1043     if (needle_isL == haystack_isL) {
1044       subi(t0, needle_len, 2);
1045       bltz(t0, DO1);
1046       bgtz(t0, DO3);
1047     }
1048   }
1049 
       // Constant needle of 4 characters: the whole needle is held in ch1 and
       // compared against 4 haystack characters per iteration.
1050   if (needle_con_cnt == 4) {
1051     Label CH1_LOOP;
1052     (this->*load_4chr)(ch1, Address(needle), noreg);
1053     subi(result_tmp, haystack_len, 4);
1054     slli(tmp3, result_tmp, haystack_chr_shift); // result as tmp
1055     add(haystack, haystack, tmp3);
1056     neg(hlen_neg, tmp3);
1057     if (AvoidUnalignedAccesses) {
1058       // preload first value, then we will read by 1 character per loop, instead of four
1059       // just shifting previous ch2 right by size of character in bits
1060       add(tmp3, haystack, hlen_neg);
1061       (this->*load_4chr)(ch2, Address(tmp3), noreg);
1062       if (isLL) {
1063         // need to erase 1 most significant byte in 32-bit value of ch2
1064         slli(ch2, ch2, 40);
1065         srli(ch2, ch2, 32);
1066       } else {
1067         slli(ch2, ch2, 16); // 2 most significant bytes will be erased by this operation
1068       }
1069     }
1070 
1071     bind(CH1_LOOP);
1072     add(tmp3, haystack, hlen_neg);
1073     if (AvoidUnalignedAccesses) {
           // Slide the window: drop the oldest character from the low end and
           // insert the newly loaded fourth character at the top.
1074       srli(ch2, ch2, isLL ? 8 : 16);
1075       (this->*haystack_load_1chr)(tmp3, Address(tmp3, isLL ? 3 : 6), noreg);
1076       slli(tmp3, tmp3, isLL ? 24 : 48);
1077       add(ch2, ch2, tmp3);
1078     } else {
1079       (this->*load_4chr)(ch2, Address(tmp3), noreg);
1080     }
1081     beq(ch1, ch2, MATCH);
1082     addi(hlen_neg, hlen_neg, haystack_chr_size);
1083     blez(hlen_neg, CH1_LOOP);
1084     j(NOMATCH);
1085   }
1086 
       // Needle of 2 characters, either constant or reached from DOSHORT.
1087   if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 2) {
1088     Label CH1_LOOP;
1089     BLOCK_COMMENT("string_indexof DO2 {");
1090     bind(DO2);
1091     (this->*load_2chr)(ch1, Address(needle), noreg);
1092     if (needle_con_cnt == 2) {
1093       subi(result_tmp, haystack_len, 2);
1094     }
1095     slli(tmp3, result_tmp, haystack_chr_shift);
1096     add(haystack, haystack, tmp3);
1097     neg(hlen_neg, tmp3);
1098     if (AvoidUnalignedAccesses) {
1099       // preload first value, then we will read by 1 character per loop, instead of two
1100       // just shifting previous ch2 right by size of character in bits
1101       add(tmp3, haystack, hlen_neg);
1102       (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg);
1103       slli(ch2, ch2, isLL ? 8 : 16);
1104     }
1105     bind(CH1_LOOP);
1106     add(tmp3, haystack, hlen_neg);
1107     if (AvoidUnalignedAccesses) {
           // Same sliding-window scheme as above, two characters wide.
1108       srli(ch2, ch2, isLL ? 8 : 16);
1109       (this->*haystack_load_1chr)(tmp3, Address(tmp3, isLL ? 1 : 2), noreg);
1110       slli(tmp3, tmp3, isLL ? 8 : 16);
1111       add(ch2, ch2, tmp3);
1112     } else {
1113       (this->*load_2chr)(ch2, Address(tmp3), noreg);
1114     }
1115     beq(ch1, ch2, MATCH);
1116     addi(hlen_neg, hlen_neg, haystack_chr_size);
1117     blez(hlen_neg, CH1_LOOP);
1118     j(NOMATCH);
1119     BLOCK_COMMENT("} string_indexof DO2");
1120   }
1121 
       // Needle of 3 characters: the first two are compared as one unit (first),
       // the third (ch1) is checked separately once the pair has matched.
1122   if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 3) {
1123     Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
1124     BLOCK_COMMENT("string_indexof DO3 {");
1125 
1126     bind(DO3);
1127     (this->*load_2chr)(first, Address(needle), noreg);
1128     (this->*needle_load_1chr)(ch1, Address(needle, 2 * needle_chr_size), noreg);
1129     if (needle_con_cnt == 3) {
1130       subi(result_tmp, haystack_len, 3);
1131     }
1132     slli(hlen_tmp, result_tmp, haystack_chr_shift);
1133     add(haystack, haystack, hlen_tmp);
1134     neg(hlen_neg, hlen_tmp);
1135 
1136     bind(FIRST_LOOP);
1137     add(ch2, haystack, hlen_neg);
1138     if (AvoidUnalignedAccesses) {
1139       (this->*haystack_load_1chr)(tmp2, Address(ch2, isLL ? 1 : 2), noreg); // we need a temp register, we can safely use hlen_tmp here, which is a synonym for tmp2
1140       (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
1141       slli(tmp2, tmp2, isLL ? 8 : 16);
1142       add(ch2, ch2, tmp2);
1143     } else {
1144       (this->*load_2chr)(ch2, Address(ch2), noreg);
1145     }
1146     beq(first, ch2, STR1_LOOP);
1147 
1148     bind(STR2_NEXT);
1149     addi(hlen_neg, hlen_neg, haystack_chr_size);
1150     blez(hlen_neg, FIRST_LOOP);
1151     j(NOMATCH);
1152 
1153     bind(STR1_LOOP);
1154     addi(hlen_tmp, hlen_neg, 2 * haystack_chr_size);
1155     add(ch2, haystack, hlen_tmp);
1156     (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
1157     bne(ch1, ch2, STR2_NEXT);
1158     j(MATCH);
1159     BLOCK_COMMENT("} string_indexof DO3");
1160   }
1161 
       // Needle of a single character. Unlike DO2/DO3, result_tmp is always
       // recomputed here. A failed scan falls through into NOMATCH below.
1162   if (needle_con_cnt == -1 || needle_con_cnt == 1) {
1163     Label DO1_LOOP;
1164 
1165     BLOCK_COMMENT("string_indexof DO1 {");
1166     bind(DO1);
1167     (this->*needle_load_1chr)(ch1, Address(needle), noreg);
1168     subi(result_tmp, haystack_len, 1);
1169     slli(tmp3, result_tmp, haystack_chr_shift);
1170     add(haystack, haystack, tmp3);
1171     neg(hlen_neg, tmp3);
1172 
1173     bind(DO1_LOOP);
1174     add(tmp3, haystack, hlen_neg);
1175     (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg);
1176     beq(ch1, ch2, MATCH);
1177     addi(hlen_neg, hlen_neg, haystack_chr_size);
1178     blez(hlen_neg, DO1_LOOP);
1179     BLOCK_COMMENT("} string_indexof DO1");
1180   }
1181 
1182   bind(NOMATCH);
1183   mv(result, -1);
1184   j(DONE);
1185 
       // hlen_neg is a (non-positive) byte offset from the last candidate
       // position; convert it to characters and add it to that position's index.
1186   bind(MATCH);
1187   srai(t0, hlen_neg, haystack_chr_shift);
1188   add(result, result_tmp, t0);
1189 
1190   bind(DONE);
1191 }
1192 
1193 // Compare longwords
     //
     // Emits the 8-bytes-at-a-time comparison used by string_compare() when both
     // strings share an encoding (Latin1 if isLL, otherwise UTF-16).
     //
     //   result        written only when a differing character is found: the
     //                 difference of the two zero-extended characters
     //   str1, str2    addresses of the character data; advanced by this code
     //   cnt2          number of characters to compare; reused as a negative
     //                 byte offset
     //   cnt1          not referenced here
     //   tmp1-tmp3     scratch (t0 is clobbered as well)
     //   STUB          taken when cnt2 >= STUB_THRESHOLD
     //   SHORT_STRING  taken when the alignment prologue leaves at most one
     //                 word's worth of characters
     //   DONE          taken when no difference is found, and after result has
     //                 been set
1194 void C2_MacroAssembler::string_compare_long_same_encoding(Register result, Register str1, Register str2,
1195                                                   const bool isLL, Register cnt1, Register cnt2,
1196                                                   Register tmp1, Register tmp2, Register tmp3,
1197                                                   const int STUB_THRESHOLD, Label *STUB, Label *SHORT_STRING, Label *DONE) {
1198   Label TAIL_CHECK, TAIL, NEXT_WORD, DIFFERENCE;
1199 
1200   const int base_offset = arrayOopDesc::base_offset_in_bytes(T_BYTE);
1201   assert((base_offset % (UseCompactObjectHeaders ? 4 : 8)) == 0, "Must be");
1202 
       // Characters held by one 8-byte word.
1203   const int minCharsInWord = isLL ? wordSize : wordSize / 2;
1204 
1205   // load first parts of strings and finish initialization while loading
       // Identical addresses: nothing to compare.
1206   beq(str1, str2, *DONE);
1207   // Alignment
       // When the array payload is only 4-byte aligned, compare the first 4 bytes
       // separately so that the 8-byte loads below start on an 8-byte boundary.
1208   if (AvoidUnalignedAccesses && (base_offset % 8) != 0) {
1209     lwu(tmp1, Address(str1));
1210     lwu(tmp2, Address(str2));
1211     bne(tmp1, tmp2, DIFFERENCE);
1212     addi(str1, str1, 4);
1213     addi(str2, str2, 4);
1214     subi(cnt2, cnt2, minCharsInWord / 2);
1215 
1216     // A very short string
1217     mv(t0, minCharsInWord);
1218     ble(cnt2, t0, *SHORT_STRING);
1219   }
1220 #ifdef ASSERT
1221   if (AvoidUnalignedAccesses) {
1222     Label align_ok;
1223     orr(t0, str1, str2);
1224     andi(t0, t0, 0x7);
1225     beqz(t0, align_ok);
1226     stop("bad alignment");
1227     bind(align_ok);
1228   }
1229 #endif
1230   // load 8 bytes once to compare
1231   ld(tmp1, Address(str1));
1232   ld(tmp2, Address(str2));
1233   mv(t0, STUB_THRESHOLD);
1234   bge(cnt2, t0, *STUB);
       // Exactly one word in total: only the words just loaded need comparing.
1235   subi(cnt2, cnt2, minCharsInWord);
1236   beqz(cnt2, TAIL_CHECK);
1237   // convert cnt2 from characters to bytes
1238   if (!isLL) {
1239     slli(cnt2, cnt2, 1);
1240   }
       // cnt2 is now (total bytes - 8). Point both strings at their final 8
       // bytes and turn cnt2 into the negative byte offset of the second word
       // (the first word is already in tmp1/tmp2).
1241   add(str2, str2, cnt2);
1242   add(str1, str1, cnt2);
1243   sub(cnt2, zr, cnt2);
1244   addi(cnt2, cnt2, 8);
1245   bne(tmp1, tmp2, DIFFERENCE);
1246   bgez(cnt2, TAIL);
1247 
1248   // main loop
1249   bind(NEXT_WORD);
1250     // 8-byte aligned loads when AvoidUnalignedAccesses is enabled
1251     add(t0, str1, cnt2);
1252     ld(tmp1, Address(t0));
1253     add(t0, str2, cnt2);
1254     ld(tmp2, Address(t0));
1255     addi(cnt2, cnt2, 8);
1256     bne(tmp1, tmp2, DIFFERENCE);
1257     bltz(cnt2, NEXT_WORD);
1258 
       // Final 8 bytes of each string. They may overlap bytes already compared
       // and start at any character boundary, hence the misaligned-load helper.
1259   bind(TAIL);
1260   load_long_misaligned(tmp1, Address(str1), tmp3, isLL ? 1 : 2);
1261   load_long_misaligned(tmp2, Address(str2), tmp3, isLL ? 1 : 2);
1262 
1263   bind(TAIL_CHECK);
1264   beq(tmp1, tmp2, *DONE);
1265 
1266   // Find the first different characters in the longwords and
1267   // compute their difference.
1268   bind(DIFFERENCE);
1269   xorr(tmp3, tmp1, tmp2);
1270   // count bits of trailing zero chars
1271   ctzc_bits(result, tmp3, isLL);
       // Shift the first differing character of each word down to bit 0 and
       // isolate it before subtracting.
1272   srl(tmp1, tmp1, result);
1273   srl(tmp2, tmp2, result);
1274   if (isLL) {
1275     zext(tmp1, tmp1, 8);
1276     zext(tmp2, tmp2, 8);
1277   } else {
1278     zext(tmp1, tmp1, 16);
1279     zext(tmp2, tmp2, 16);
1280   }
1281   sub(result, tmp1, tmp2);
1282 
1283   j(*DONE);
1284 }
1285 
1286 // Compare longwords
     //
     // Emits the comparison used by string_compare() when one string is Latin1
     // and the other UTF-16. Each step takes 4 Latin1 bytes and 8 UTF-16 bytes,
     // i.e. 4 characters from each side.
     //
     //   result      written only when a differing character is found; the
     //               subtraction order is chosen so that it is always
     //               (str1 character) - (str2 character)
     //   str1, str2  addresses of the character data; advanced by this code
     //   isLU        true: str1 is the Latin1 string; false: str2 is
     //   cnt2        number of characters to compare; reused as the negative
     //               byte offset into the UTF-16 string
     //   cnt1        incoming value ignored; reused as the negative byte offset
     //               into the Latin1 string
     //   tmp1-tmp3   scratch (t0 is clobbered as well)
     //   STUB        taken when cnt2 >= STUB_THRESHOLD
     //   DONE        taken when no difference is found, and after result has
     //               been set
1287 void C2_MacroAssembler::string_compare_long_different_encoding(Register result, Register str1, Register str2,
1288                                                bool isLU, Register cnt1, Register cnt2,
1289                                                Register tmp1, Register tmp2, Register tmp3,
1290                                                const int STUB_THRESHOLD, Label *STUB, Label *DONE) {
1291   Label TAIL, NEXT_WORD, DIFFERENCE;
1292 
1293   const int base_offset = arrayOopDesc::base_offset_in_bytes(T_BYTE);
1294   assert((base_offset % (UseCompactObjectHeaders ? 4 : 8)) == 0, "Must be");
1295 
1296   Register strL = isLU ? str1 : str2;
1297   Register strU = isLU ? str2 : str1;
1298   Register tmpL = tmp1, tmpU = tmp2;
1299 
1300   // load first parts of strings and finish initialization while loading
1301   mv(t0, STUB_THRESHOLD);
1302   bge(cnt2, t0, *STUB);
1303   lwu(tmpL, Address(strL));
1304   load_long_misaligned(tmpU, Address(strU), tmp3, (base_offset % 8) != 0 ? 4 : 8);
       // Point strL at its last 4 bytes and strU at its last 8 bytes; cnt1 and
       // cnt2 become the matching negative byte offsets, already advanced past
       // the first group loaded above.
1305   subi(cnt2, cnt2, 4);
1306   add(strL, strL, cnt2);
1307   sub(cnt1, zr, cnt2);
1308   slli(cnt2, cnt2, 1);
1309   add(strU, strU, cnt2);
       // NOTE(review): inflate_lo32 presumably widens the four Latin1 bytes in
       // tmpL to four 16-bit characters so they can be compared with tmpU --
       // confirm against its definition.
1310   inflate_lo32(tmp3, tmpL);
1311   mv(tmpL, tmp3);
1312   sub(cnt2, zr, cnt2);
1313   addi(cnt1, cnt1, 4);
1314   addi(cnt2, cnt2, 8);
1315   bne(tmpL, tmpU, DIFFERENCE);
1316   bgez(cnt2, TAIL);
1317 
1318   // main loop
1319   bind(NEXT_WORD);
1320     add(t0, strL, cnt1);
1321     lwu(tmpL, Address(t0));
1322     add(t0, strU, cnt2);
1323     load_long_misaligned(tmpU, Address(t0), tmp3, (base_offset % 8) != 0 ? 4 : 8);
1324     addi(cnt1, cnt1, 4);
1325     inflate_lo32(tmp3, tmpL);
1326     mv(tmpL, tmp3);
1327     addi(cnt2, cnt2, 8);
1328     bne(tmpL, tmpU, DIFFERENCE);
1329     bltz(cnt2, NEXT_WORD);
1330 
       // Final 4 characters of each string; they may overlap characters that
       // were already compared.
1331   bind(TAIL);
1332   load_int_misaligned(tmpL, Address(strL), tmp3, false);
1333   load_long_misaligned(tmpU, Address(strU), tmp3, 2);
1334   inflate_lo32(tmp3, tmpL);
1335   mv(tmpL, tmp3);
1336 
1337   beq(tmpL, tmpU, *DONE);
1338 
1339   // Find the first different characters in the longwords and
1340   // compute their difference.
1341   bind(DIFFERENCE);
1342   xorr(tmp3, tmpL, tmpU);
1343   // count bits of trailing zero chars
1344   ctzc_bits(result, tmp3);
1345   srl(tmpL, tmpL, result);
1346   srl(tmpU, tmpU, result);
1347   zext(tmpL, tmpL, 16);
1348   zext(tmpU, tmpU, 16);
       // Keep the operand order str1 - str2 whichever side is Latin1.
1349   if (isLU) {
1350     sub(result, tmpL, tmpU);
1351   } else {
1352     sub(result, tmpU, tmpL);
1353   }
1354 
1355   j(*DONE);
1356 }
1357 
1358 // Compare strings.
     //
     // Emits code comparing two strings character by character.
     //
     //   str1, str2  addresses of the character data (advanced by this code)
     //   cnt1, cnt2  lengths in bytes on entry; both are overwritten
     //   result      difference of the first pair of differing characters, or
     //               the difference of the lengths (in characters) if none of
     //               the first min(cnt1, cnt2) characters differ
     //   tmp1-tmp3   scratch (t0 is clobbered as well)
     //   ae          StrIntrinsicNode encoding pair of (str1, str2)
     //
     // Strings of at most one word's worth of characters use the per-character
     // loop at SHORT_STRING, strings of STUB_THRESHOLD or more characters call an
     // out-of-line stub, and everything in between is compared 8 bytes at a time.
     // If the stub call cannot be emitted, a compilation failure is recorded and
     // code generation stops early.
1359 void C2_MacroAssembler::string_compare(Register str1, Register str2,
1360                                        Register cnt1, Register cnt2, Register result,
1361                                        Register tmp1, Register tmp2, Register tmp3,
1362                                        int ae)
1363 {
1364   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, STUB,
1365         SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
1366         SHORT_LOOP_START, L;
1367 
1368   const int STUB_THRESHOLD = 64 + 8;
1369   bool isLL = ae == StrIntrinsicNode::LL;
1370   bool isLU = ae == StrIntrinsicNode::LU;
1371   bool isUL = ae == StrIntrinsicNode::UL;
1372 
1373   bool str1_isL = isLL || isLU;
1374   bool str2_isL = isLL || isUL;
1375 
1376   // for L strings, 1 byte for 1 character
1377   // for U strings, 2 bytes for 1 character
1378   int str1_chr_size = str1_isL ? 1 : 2;
1379   int str2_chr_size = str2_isL ? 1 : 2;
1380   int minCharsInWord = isLL ? wordSize : wordSize / 2;
1381 
1382   load_chr_insn str1_load_chr = str1_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu;
1383   load_chr_insn str2_load_chr = str2_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu;
1384 
1385   BLOCK_COMMENT("string_compare {");
1386 
1387   // Bizarrely, the counts are passed in bytes, regardless of whether they
1388   // are L or U strings, however the result is always in characters.
1389   if (!str1_isL) {
1390     sraiw(cnt1, cnt1, 1);
1391   }
1392   if (!str2_isL) {
1393     sraiw(cnt2, cnt2, 1);
1394   }
1395 
1396   // Compute the minimum of the string lengths and save the difference in result.
       // From here on cnt2 is the number of characters to compare, and result
       // already holds the answer for the case that all of them are equal.
1397   sub(result, cnt1, cnt2);
1398   bgt(cnt1, cnt2, L);
1399   mv(cnt2, cnt1);
1400   bind(L);
1401 
1402   // A very short string
1403   mv(t0, minCharsInWord);
1404   ble(cnt2, t0, SHORT_STRING);
1405 
1406   // Compare longwords
       // Both helpers leave through STUB, SHORT_STRING or DONE; neither falls
       // through into the stub call emitted below.
1407   {
1408     if (str1_isL == str2_isL) { // LL or UU
1409       string_compare_long_same_encoding(result,
1410                                 str1, str2, isLL,
1411                                 cnt1, cnt2, tmp1, tmp2, tmp3,
1412                                 STUB_THRESHOLD, &STUB, &SHORT_STRING, &DONE);
1413     } else { // LU or UL
1414       string_compare_long_different_encoding(result,
1415                                 str1, str2, isLU,
1416                                 cnt1, cnt2, tmp1, tmp2, tmp3,
1417                                 STUB_THRESHOLD, &STUB, &DONE);
1418     }
1419   }
1420 
       // Long strings: call the out-of-line stub that matches the encoding pair.
1421   bind(STUB);
1422   RuntimeAddress stub = nullptr;
1423   switch (ae) {
1424     case StrIntrinsicNode::LL:
1425       stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LL());
1426       break;
1427     case StrIntrinsicNode::UU:
1428       stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UU());
1429       break;
1430     case StrIntrinsicNode::LU:
1431       stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LU());
1432       break;
1433     case StrIntrinsicNode::UL:
1434       stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UL());
1435       break;
1436     default:
1437       ShouldNotReachHere();
1438   }
1439   assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
1440   address call = reloc_call(stub);
       // A null return from reloc_call is reported as a full code cache. The
       // labels that are referenced but not yet bound are reset in debug builds
       // before bailing out.
1441   if (call == nullptr) {
1442     DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
1443     ciEnv::current()->record_failure("CodeCache is full");
1444     return;
1445   }
1446   j(DONE);
1447 
1448   bind(SHORT_STRING);
1449   // Is the minimum length zero?
1450   beqz(cnt2, DONE);
1451   // arrange code to do most branches while loading and loading next characters
1452   // while comparing previous
       // Two character pairs are kept in flight: (tmp1, cnt1) and (tmp2, t0).
       // cnt1 is free to hold a character because its length value was already
       // folded into result above.
1453   (this->*str1_load_chr)(tmp1, Address(str1), t0);
1454   addi(str1, str1, str1_chr_size);
1455   subi(cnt2, cnt2, 1);
1456   beqz(cnt2, SHORT_LAST_INIT);
1457   (this->*str2_load_chr)(cnt1, Address(str2), t0);
1458   addi(str2, str2, str2_chr_size);
1459   j(SHORT_LOOP_START);
1460   bind(SHORT_LOOP);
1461   subi(cnt2, cnt2, 1);
1462   beqz(cnt2, SHORT_LAST);
1463   bind(SHORT_LOOP_START);
       // Load the next pair into (tmp2, t0), then check the pair in (tmp1, cnt1).
1464   (this->*str1_load_chr)(tmp2, Address(str1), t0);
1465   addi(str1, str1, str1_chr_size);
1466   (this->*str2_load_chr)(t0, Address(str2), t0);
1467   addi(str2, str2, str2_chr_size);
1468   bne(tmp1, cnt1, SHORT_LOOP_TAIL);
1469   subi(cnt2, cnt2, 1);
1470   beqz(cnt2, SHORT_LAST2);
       // Load the next pair into (tmp1, cnt1), then check the pair in (tmp2, t0).
1471   (this->*str1_load_chr)(tmp1, Address(str1), t0);
1472   addi(str1, str1, str1_chr_size);
1473   (this->*str2_load_chr)(cnt1, Address(str2), t0);
1474   addi(str2, str2, str2_chr_size);
1475   beq(tmp2, t0, SHORT_LOOP);
1476   sub(result, tmp2, t0);
1477   j(DONE);
1478   bind(SHORT_LOOP_TAIL);
1479   sub(result, tmp1, cnt1);
1480   j(DONE);
       // Last pair is in (tmp2, t0).
1481   bind(SHORT_LAST2);
1482   beq(tmp2, t0, DONE);
1483   sub(result, tmp2, t0);
1484 
1485   j(DONE);
       // Only one character to compare: fetch its partner from str2 first.
1486   bind(SHORT_LAST_INIT);
1487   (this->*str2_load_chr)(cnt1, Address(str2), t0);
1488   addi(str2, str2, str2_chr_size);
       // Last pair is in (tmp1, cnt1).
1489   bind(SHORT_LAST);
1490   beq(tmp1, cnt1, DONE);
1491   sub(result, tmp1, cnt1);
1492 
1493   bind(DONE);
1494 
1495   BLOCK_COMMENT("} string_compare");
1496 }
1497 
     // Emits code that tests two byte or char arrays for equality.
     //
     //   a1, a2      array references; either may be null. Both are overwritten
     //               (turned into element addresses and advanced).
     //   tmp1-tmp3   scratch; tmp3 holds the remaining element count
     //   result      set to true if both references are the same, or if both
     //               are non-null with equal length and equal contents;
     //               otherwise set to false
     //   elem_size   element size in bytes, 1 or 2
     //
     // t0 must be distinct from every register argument (asserted below).
1498 void C2_MacroAssembler::arrays_equals(Register a1, Register a2,
1499                                       Register tmp1, Register tmp2, Register tmp3,
1500                                       Register result, int elem_size) {
1501   assert(elem_size == 1 || elem_size == 2, "must be char or byte");
1502   assert_different_registers(a1, a2, result, tmp1, tmp2, tmp3, t0);
1503 
1504   int elem_per_word = wordSize / elem_size;
1505   int log_elem_size = exact_log2(elem_size);
1506   int length_offset = arrayOopDesc::length_offset_in_bytes();
1507   int base_offset   = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
1508 
1509   assert((base_offset % (UseCompactObjectHeaders ? 4 : 8)) == 0, "Must be");
1510 
1511   Register cnt1 = tmp3;
1512   Register cnt2 = tmp1;  // cnt2 only used in array length compare
1513   Label DONE, SAME, NEXT_WORD, SHORT, TAIL03, TAIL01;
1514 
1515   BLOCK_COMMENT("arrays_equals {");
1516 
1517   // if (a1 == a2), return true
1518   beq(a1, a2, SAME);
1519 
       // Every early exit to DONE below reports "not equal".
1520   mv(result, false);
1521   // if (a1 == nullptr || a2 == nullptr)
1522   //     return false;
1523   beqz(a1, DONE);
1524   beqz(a2, DONE);
1525 
1526   // if (a1.length != a2.length)
1527   //      return false;
1528   lwu(cnt1, Address(a1, length_offset));
1529   lwu(cnt2, Address(a2, length_offset));
1530   bne(cnt1, cnt2, DONE);
1531 
       // From here on a1/a2 address the element data.
1532   la(a1, Address(a1, base_offset));
1533   la(a2, Address(a2, base_offset));
1534 
1535   // Load 4 bytes once to compare for alignment before main loop.
1536   if (AvoidUnalignedAccesses && (base_offset % 8) != 0) {
1537     subi(cnt1, cnt1, elem_per_word / 2);
1538     bltz(cnt1, TAIL03);
1539     lwu(tmp1, Address(a1));
1540     lwu(tmp2, Address(a2));
1541     addi(a1, a1, 4);
1542     addi(a2, a2, 4);
1543     bne(tmp1, tmp2, DONE);
1544   }
1545 
1546   // Check for short arrays, i.e. smaller than wordSize.
       // cnt1 is kept biased by -elem_per_word so that the loop can end on a
       // sign test. The bias is a multiple of every chunk size tested in the
       // tail, so the low bits of cnt1 still describe the leftover elements.
1547   subi(cnt1, cnt1, elem_per_word);
1548   bltz(cnt1, SHORT);
1549 
1550 #ifdef ASSERT
1551   if (AvoidUnalignedAccesses) {
1552     Label align_ok;
1553     orr(t0, a1, a2);
1554     andi(t0, t0, 0x7);
1555     beqz(t0, align_ok);
1556     stop("bad alignment");
1557     bind(align_ok);
1558   }
1559 #endif
1560 
1561   // Main 8 byte comparison loop.
1562   bind(NEXT_WORD); {
1563     ld(tmp1, Address(a1));
1564     ld(tmp2, Address(a2));
1565     subi(cnt1, cnt1, elem_per_word);
1566     addi(a1, a1, wordSize);
1567     addi(a2, a2, wordSize);
1568     bne(tmp1, tmp2, DONE);
1569   } bgez(cnt1, NEXT_WORD);
1570 
       // Nothing left over after the last full word: the arrays are equal.
1571   addi(tmp1, cnt1, elem_per_word);
1572   beqz(tmp1, SAME);
1573 
       // Tail: compare a 4-byte, a 2-byte and (byte arrays only) a 1-byte
       // chunk, each one only if the corresponding bit of the count is set.
1574   bind(SHORT);
1575   test_bit(tmp1, cnt1, 2 - log_elem_size);
1576   beqz(tmp1, TAIL03); // 0-7 bytes left.
1577   {
1578     lwu(tmp1, Address(a1));
1579     lwu(tmp2, Address(a2));
1580     addi(a1, a1, 4);
1581     addi(a2, a2, 4);
1582     bne(tmp1, tmp2, DONE);
1583   }
1584 
1585   bind(TAIL03);
1586   test_bit(tmp1, cnt1, 1 - log_elem_size);
1587   beqz(tmp1, TAIL01); // 0-3 bytes left.
1588   {
1589     lhu(tmp1, Address(a1));
1590     lhu(tmp2, Address(a2));
1591     addi(a1, a1, 2);
1592     addi(a2, a2, 2);
1593     bne(tmp1, tmp2, DONE);
1594   }
1595 
1596   bind(TAIL01);
1597   if (elem_size == 1) { // Only needed when comparing byte arrays.
1598     test_bit(tmp1, cnt1, 0);
1599     beqz(tmp1, SAME); // 0-1 bytes left.
1600     {
1601       lbu(tmp1, Address(a1));
1602       lbu(tmp2, Address(a2));
1603       bne(tmp1, tmp2, DONE);
1604     }
1605   }
1606 
1607   bind(SAME);
1608   mv(result, true);
1609   // That's it.
1610   bind(DONE);
1611 
1612   BLOCK_COMMENT("} arrays_equals");
1613 }
1614 
1615 // Compare Strings
1616 
1617 // For Strings we're passed the address of the first characters in a1 and a2
1618 // and the length in cnt1. The length is handled as a byte count here: it is
     // decremented by wordSize for every 8 bytes compared.
1619 // For strings >= 8 bytes, all comparisons (except for the tail) are performed
1620 // 8 bytes at a time. For the tail, we compare a word (4 bytes), then a halfword, and then a byte.
1621 // For strings < 8 bytes, we compare a word (4 bytes), then a halfword, and then a byte.
     //
     // result is set to true if all cnt1 bytes are equal and to false otherwise.
     // a1, a2 and cnt1 are overwritten; t0 and t1 are used as scratch registers.
1622 
1623 void C2_MacroAssembler::string_equals(Register a1, Register a2,
1624                                       Register result, Register cnt1)
1625 {
1626   Label SAME, DONE, SHORT, NEXT_WORD, TAIL03, TAIL01;
1627   Register tmp1 = t0;
1628   Register tmp2 = t1;
1629 
1630   assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2);
1631 
1632   int base_offset = arrayOopDesc::base_offset_in_bytes(T_BYTE);
1633 
1634   assert((base_offset % (UseCompactObjectHeaders ? 4 : 8)) == 0, "Must be");
1635 
1636   BLOCK_COMMENT("string_equals {");
1637 
       // Every early exit to DONE below reports "not equal".
1638   mv(result, false);
1639 
1640   // Load 4 bytes once to compare for alignment before main loop.
1641   if (AvoidUnalignedAccesses && (base_offset % 8) != 0) {
1642     subi(cnt1, cnt1, 4);
1643     bltz(cnt1, TAIL03);
1644     lwu(tmp1, Address(a1));
1645     lwu(tmp2, Address(a2));
1646     addi(a1, a1, 4);
1647     addi(a2, a2, 4);
1648     bne(tmp1, tmp2, DONE);
1649   }
1650 
1651   // Check for short strings, i.e. smaller than wordSize.
       // cnt1 is kept biased by -wordSize so that the loop can end on a sign
       // test; the bias leaves the low three bits, which the tail inspects,
       // unchanged.
1652   subi(cnt1, cnt1, wordSize);
1653   bltz(cnt1, SHORT);
1654 
1655 #ifdef ASSERT
1656   if (AvoidUnalignedAccesses) {
1657     Label align_ok;
1658     orr(t0, a1, a2);
1659     andi(t0, t0, 0x7);
1660     beqz(t0, align_ok);
1661     stop("bad alignment");
1662     bind(align_ok);
1663   }
1664 #endif
1665 
1666   // Main 8 byte comparison loop.
1667   bind(NEXT_WORD); {
1668     ld(tmp1, Address(a1));
1669     ld(tmp2, Address(a2));
1670     subi(cnt1, cnt1, wordSize);
1671     addi(a1, a1, wordSize);
1672     addi(a2, a2, wordSize);
1673     bne(tmp1, tmp2, DONE);
1674   } bgez(cnt1, NEXT_WORD);
1675 
       // Nothing left over after the last full word: the strings are equal.
1676   addi(tmp1, cnt1, wordSize);
1677   beqz(tmp1, SAME);
1678 
1679   bind(SHORT);
1680   // 0-7 bytes left.
       // Bit 2 of the count set: compare 4 bytes.
1681   test_bit(tmp1, cnt1, 2);
1682   beqz(tmp1, TAIL03);
1683   {
1684     lwu(tmp1, Address(a1));
1685     lwu(tmp2, Address(a2));
1686     addi(a1, a1, 4);
1687     addi(a2, a2, 4);
1688     bne(tmp1, tmp2, DONE);
1689   }
1690 
1691   bind(TAIL03);
1692   // 0-3 bytes left.
       // Bit 1 of the count set: compare 2 bytes.
1693   test_bit(tmp1, cnt1, 1);
1694   beqz(tmp1, TAIL01);
1695   {
1696     lhu(tmp1, Address(a1));
1697     lhu(tmp2, Address(a2));
1698     addi(a1, a1, 2);
1699     addi(a2, a2, 2);
1700     bne(tmp1, tmp2, DONE);
1701   }
1702 
1703   bind(TAIL01);
1704   // 0-1 bytes left.
       // Bit 0 of the count set: compare the final byte.
1705   test_bit(tmp1, cnt1, 0);
1706   beqz(tmp1, SAME);
1707   {
1708     lbu(tmp1, Address(a1));
1709     lbu(tmp2, Address(a2));
1710     bne(tmp1, tmp2, DONE);
1711   }
1712 
1713   // Arrays are equal.
1714   bind(SAME);
1715   mv(result, true);
1716 
1717   // That's it.
1718   bind(DONE);
1719   BLOCK_COMMENT("} string_equals");
1720 }
1721 
1722 // jdk.internal.util.ArraysSupport.vectorizedHashCode
1723 void C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
1724                                         Register tmp1, Register tmp2, Register tmp3,
1725                                         Register tmp4, Register tmp5, Register tmp6,
1726                                         BasicType eltype)
1727 {
1728   assert(!UseRVV, "sanity");
1729   assert_different_registers(ary, cnt, result, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, t0, t1);
1730 
1731   const int elsize = arrays_hashcode_elsize(eltype);
1732   const int chunks_end_shift = exact_log2(elsize);
1733 
1734   switch (eltype) {
1735   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
1736   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
1737   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
1738   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
1739   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
1740   default:
1741     ShouldNotReachHere();
1742   }
1743 
1744   const int stride = 4;
1745   const Register pow31_4 = tmp1;
1746   const Register pow31_3 = tmp2;
1747   const Register pow31_2 = tmp3;
1748   const Register chunks  = tmp4;
1749   const Register chunks_end = chunks;
1750 
1751   Label DONE, TAIL, TAIL_LOOP, WIDE_LOOP;
1752 
1753   // result has a value initially
1754 
1755   beqz(cnt, DONE);
1756 
1757   andi(chunks, cnt, ~(stride - 1));
1758   beqz(chunks, TAIL);
1759 
1760   mv(pow31_4, 923521);           // [31^^4]
1761   mv(pow31_3,  29791);           // [31^^3]
1762   mv(pow31_2,    961);           // [31^^2]
1763 
1764   shadd(chunks_end, chunks, ary, t0, chunks_end_shift);
1765   andi(cnt, cnt, stride - 1);    // don't forget about tail!
1766 
1767   bind(WIDE_LOOP);
1768   arrays_hashcode_elload(t0,   Address(ary, 0 * elsize), eltype);
1769   arrays_hashcode_elload(t1,   Address(ary, 1 * elsize), eltype);
1770   arrays_hashcode_elload(tmp5, Address(ary, 2 * elsize), eltype);
1771   arrays_hashcode_elload(tmp6, Address(ary, 3 * elsize), eltype);
1772   mulw(result, result, pow31_4); // 31^^4 * h
1773   mulw(t0, t0, pow31_3);         // 31^^3 * ary[i+0]
1774   addw(result, result, t0);
1775   mulw(t1, t1, pow31_2);         // 31^^2 * ary[i+1]
1776   addw(result, result, t1);
1777   slli(t0, tmp5, 5);             // optimize 31^^1 * ary[i+2]
1778   subw(tmp5, t0, tmp5);          // with ary[i+2]<<5 - ary[i+2]
1779   addw(result, result, tmp5);
1780   addw(result, result, tmp6);    // 31^^4 * h + 31^^3 * ary[i+0] + 31^^2 * ary[i+1]
1781                                  //           + 31^^1 * ary[i+2] + 31^^0 * ary[i+3]
1782   addi(ary, ary, elsize * stride);
1783   bne(ary, chunks_end, WIDE_LOOP);
1784   beqz(cnt, DONE);
1785 
1786   bind(TAIL);
1787   shadd(chunks_end, cnt, ary, t0, chunks_end_shift);
1788 
1789   bind(TAIL_LOOP);
1790   arrays_hashcode_elload(t0, Address(ary), eltype);
1791   slli(t1, result, 5);           // optimize 31 * result
1792   subw(result, t1, result);      // with result<<5 - result
1793   addw(result, result, t0);
1794   addi(ary, ary, elsize);
1795   bne(ary, chunks_end, TAIL_LOOP);
1796 
1797   bind(DONE);
1798   BLOCK_COMMENT("} // arrays_hashcode");
1799 }
1800 
1801 void C2_MacroAssembler::arrays_hashcode_v(Register ary, Register cnt, Register result,
1802                                           Register tmp1, Register tmp2, Register tmp3,
1803                                           BasicType eltype)
1804 {
1805   assert(UseRVV, "sanity");
1806   assert(StubRoutines::riscv::arrays_hashcode_powers_of_31() != nullptr, "sanity");
1807   assert_different_registers(ary, cnt, result, tmp1, tmp2, tmp3, t0, t1);
1808 
1809   // The MaxVectorSize should have been set by detecting RVV max vector register
1810   // size when check UseRVV (i.e. MaxVectorSize == VM_Version::_initial_vector_length).
1811   // Let's use T_INT as all hashCode calculations eventually deal with ints.
1812   const int lmul = 2;
1813   const int stride = MaxVectorSize / sizeof(jint) * lmul;
1814 
1815   const int elsize_bytes = arrays_hashcode_elsize(eltype);
1816   const int elsize_shift = exact_log2(elsize_bytes);
1817 
1818   switch (eltype) {
1819     case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode_v(unsigned byte) {"); break;
1820     case T_CHAR:    BLOCK_COMMENT("arrays_hashcode_v(char) {");          break;
1821     case T_BYTE:    BLOCK_COMMENT("arrays_hashcode_v(byte) {");          break;
1822     case T_SHORT:   BLOCK_COMMENT("arrays_hashcode_v(short) {");         break;
1823     case T_INT:     BLOCK_COMMENT("arrays_hashcode_v(int) {");           break;
1824     default:
1825       ShouldNotReachHere();
1826   }
1827 
1828   const Register pow31_highest = tmp1;
1829   const Register ary_end       = tmp2;
1830   const Register consumed      = tmp3;
1831 
1832   const VectorRegister v_sum    = v2;
1833   const VectorRegister v_src    = v4;
1834   const VectorRegister v_coeffs = v6;
1835   const VectorRegister v_tmp    = v8;
1836 
1837   const address adr_pows31 = StubRoutines::riscv::arrays_hashcode_powers_of_31()
1838                            + sizeof(jint);
1839   Label VEC_LOOP, DONE, SCALAR_TAIL, SCALAR_TAIL_LOOP;
1840 
1841   // NB: at this point (a) 'result' already has some value,
1842   // (b) 'cnt' is not 0 or 1, see java code for details.
1843 
1844   andi(t0, cnt, ~(stride - 1));
1845   beqz(t0, SCALAR_TAIL);
1846 
1847   la(t1, ExternalAddress(adr_pows31));
1848   lw(pow31_highest, Address(t1, -1 * sizeof(jint)));
1849 
1850   vsetvli(consumed, cnt, Assembler::e32, Assembler::m2);
1851   vle32_v(v_coeffs, t1); // 31^^(stride - 1) ... 31^^0
1852   vmv_v_x(v_sum, x0);
1853 
1854   bind(VEC_LOOP);
1855   arrays_hashcode_elload_v(v_src, v_tmp, ary, eltype);
1856   vmul_vv(v_src, v_src, v_coeffs);
1857   vmadd_vx(v_sum, pow31_highest, v_src);
1858   mulw(result, result, pow31_highest);
1859   shadd(ary, consumed, ary, t0, elsize_shift);
1860   subw(cnt, cnt, consumed);
1861   andi(t1, cnt, ~(stride - 1));
1862   bnez(t1, VEC_LOOP);
1863 
1864   vmv_s_x(v_tmp, x0);
1865   vredsum_vs(v_sum, v_sum, v_tmp);
1866   vmv_x_s(t0, v_sum);
1867   addw(result, result, t0);
1868   beqz(cnt, DONE);
1869 
1870   bind(SCALAR_TAIL);
1871   shadd(ary_end, cnt, ary, t0, elsize_shift);
1872 
1873   bind(SCALAR_TAIL_LOOP);
1874   arrays_hashcode_elload(t0, Address(ary), eltype);
1875   slli(t1, result, 5);      // optimize 31 * result
1876   subw(result, t1, result); // with result<<5 - result
1877   addw(result, result, t0);
1878   addi(ary, ary, elsize_bytes);
1879   bne(ary, ary_end, SCALAR_TAIL_LOOP);
1880 
1881   bind(DONE);
1882   BLOCK_COMMENT("} // arrays_hashcode_v");
1883 }
1884 
1885 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
1886   switch (eltype) {
1887     case T_BOOLEAN: return sizeof(jboolean);
1888     case T_BYTE:    return sizeof(jbyte);
1889     case T_SHORT:   return sizeof(jshort);
1890     case T_CHAR:    return sizeof(jchar);
1891     case T_INT:     return sizeof(jint);
1892     default:
1893       ShouldNotReachHere();
1894       return -1;
1895   }
1896 }
1897 
1898 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
1899   switch (eltype) {
1900     // T_BOOLEAN used as surrogate for unsigned byte
1901     case T_BOOLEAN: lbu(dst, src);   break;
1902     case T_BYTE:     lb(dst, src);   break;
1903     case T_SHORT:    lh(dst, src);   break;
1904     case T_CHAR:    lhu(dst, src);   break;
1905     case T_INT:      lw(dst, src);   break;
1906     default:
1907       ShouldNotReachHere();
1908   }
1909 }
1910 
1911 void C2_MacroAssembler::arrays_hashcode_elload_v(VectorRegister vdst,
1912                                                  VectorRegister vtmp,
1913                                                  Register src,
1914                                                  BasicType eltype) {
1915   assert_different_registers(vdst, vtmp);
1916   switch (eltype) {
1917     case T_BOOLEAN:
1918       vle8_v(vtmp, src);
1919       vzext_vf4(vdst, vtmp);
1920       break;
1921     case T_BYTE:
1922       vle8_v(vtmp, src);
1923       vsext_vf4(vdst, vtmp);
1924       break;
1925     case T_CHAR:
1926       vle16_v(vtmp, src);
1927       vzext_vf2(vdst, vtmp);
1928       break;
1929     case T_SHORT:
1930       vle16_v(vtmp, src);
1931       vsext_vf2(vdst, vtmp);
1932       break;
1933     case T_INT:
1934       vle32_v(vdst, src);
1935       break;
1936     default:
1937       ShouldNotReachHere();
1938   }
1939 }
1940 
1941 typedef void (Assembler::*conditional_branch_insn)(Register op1, Register op2, Label& label, bool is_far);
1942 typedef void (MacroAssembler::*float_conditional_branch_insn)(FloatRegister op1, FloatRegister op2, Label& label,
1943                                                               bool is_far, bool is_unordered);
1944 
1945 static conditional_branch_insn conditional_branches[] =
1946 {
1947   /* SHORT branches */
1948   (conditional_branch_insn)&MacroAssembler::beq,
1949   (conditional_branch_insn)&MacroAssembler::bgt,
1950   nullptr, // BoolTest::overflow
1951   (conditional_branch_insn)&MacroAssembler::blt,
1952   (conditional_branch_insn)&MacroAssembler::bne,
1953   (conditional_branch_insn)&MacroAssembler::ble,
1954   nullptr, // BoolTest::no_overflow
1955   (conditional_branch_insn)&MacroAssembler::bge,
1956 
1957   /* UNSIGNED branches */
1958   (conditional_branch_insn)&MacroAssembler::beq,
1959   (conditional_branch_insn)&MacroAssembler::bgtu,
1960   nullptr,
1961   (conditional_branch_insn)&MacroAssembler::bltu,
1962   (conditional_branch_insn)&MacroAssembler::bne,
1963   (conditional_branch_insn)&MacroAssembler::bleu,
1964   nullptr,
1965   (conditional_branch_insn)&MacroAssembler::bgeu
1966 };
1967 
1968 static float_conditional_branch_insn float_conditional_branches[] =
1969 {
1970   /* FLOAT SHORT branches */
1971   (float_conditional_branch_insn)&MacroAssembler::float_beq,
1972   (float_conditional_branch_insn)&MacroAssembler::float_bgt,
1973   nullptr,  // BoolTest::overflow
1974   (float_conditional_branch_insn)&MacroAssembler::float_blt,
1975   (float_conditional_branch_insn)&MacroAssembler::float_bne,
1976   (float_conditional_branch_insn)&MacroAssembler::float_ble,
1977   nullptr, // BoolTest::no_overflow
1978   (float_conditional_branch_insn)&MacroAssembler::float_bge,
1979 
1980   /* DOUBLE SHORT branches */
1981   (float_conditional_branch_insn)&MacroAssembler::double_beq,
1982   (float_conditional_branch_insn)&MacroAssembler::double_bgt,
1983   nullptr,
1984   (float_conditional_branch_insn)&MacroAssembler::double_blt,
1985   (float_conditional_branch_insn)&MacroAssembler::double_bne,
1986   (float_conditional_branch_insn)&MacroAssembler::double_ble,
1987   nullptr,
1988   (float_conditional_branch_insn)&MacroAssembler::double_bge
1989 };
1990 
1991 void C2_MacroAssembler::cmp_branch(int cmpFlag, Register op1, Register op2, Label& label, bool is_far) {
1992   assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(conditional_branches) / sizeof(conditional_branches[0])),
1993          "invalid conditional branch index");
1994   (this->*conditional_branches[cmpFlag])(op1, op2, label, is_far);
1995 }
1996 
1997 // This is a function should only be used by C2. Flip the unordered when unordered-greater, C2 would use
1998 // unordered-lesser instead of unordered-greater. Finally, commute the result bits at function do_one_bytecode().
1999 void C2_MacroAssembler::float_cmp_branch(int cmpFlag, FloatRegister op1, FloatRegister op2, Label& label, bool is_far) {
2000   assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(float_conditional_branches) / sizeof(float_conditional_branches[0])),
2001          "invalid float conditional branch index");
2002   int booltest_flag = cmpFlag & ~(C2_MacroAssembler::double_branch_mask);
2003   (this->*float_conditional_branches[cmpFlag])(op1, op2, label, is_far,
2004     (booltest_flag == (BoolTest::ge) || booltest_flag == (BoolTest::gt)) ? false : true);
2005 }
2006 
2007 void C2_MacroAssembler::enc_cmpUEqNeLeGt_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
2008   switch (cmpFlag) {
2009     case BoolTest::eq:
2010     case BoolTest::le:
2011       beqz(op1, L, is_far);
2012       break;
2013     case BoolTest::ne:
2014     case BoolTest::gt:
2015       bnez(op1, L, is_far);
2016       break;
2017     default:
2018       ShouldNotReachHere();
2019   }
2020 }
2021 
2022 void C2_MacroAssembler::enc_cmpEqNe_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
2023   switch (cmpFlag) {
2024     case BoolTest::eq:
2025       beqz(op1, L, is_far);
2026       break;
2027     case BoolTest::ne:
2028       bnez(op1, L, is_far);
2029       break;
2030     default:
2031       ShouldNotReachHere();
2032   }
2033 }
2034 
2035 void C2_MacroAssembler::enc_cmove(int cmpFlag, Register op1, Register op2, Register dst, Register src) {
2036   bool is_unsigned = (cmpFlag & unsigned_branch_mask) == unsigned_branch_mask;
2037   int op_select = cmpFlag & (~unsigned_branch_mask);
2038 
2039   switch (op_select) {
2040     case BoolTest::eq:
2041       cmov_eq(op1, op2, dst, src);
2042       break;
2043     case BoolTest::ne:
2044       cmov_ne(op1, op2, dst, src);
2045       break;
2046     case BoolTest::le:
2047       if (is_unsigned) {
2048         cmov_leu(op1, op2, dst, src);
2049       } else {
2050         cmov_le(op1, op2, dst, src);
2051       }
2052       break;
2053     case BoolTest::ge:
2054       if (is_unsigned) {
2055         cmov_geu(op1, op2, dst, src);
2056       } else {
2057         cmov_ge(op1, op2, dst, src);
2058       }
2059       break;
2060     case BoolTest::lt:
2061       if (is_unsigned) {
2062         cmov_ltu(op1, op2, dst, src);
2063       } else {
2064         cmov_lt(op1, op2, dst, src);
2065       }
2066       break;
2067     case BoolTest::gt:
2068       if (is_unsigned) {
2069         cmov_gtu(op1, op2, dst, src);
2070       } else {
2071         cmov_gt(op1, op2, dst, src);
2072       }
2073       break;
2074     default:
2075       assert(false, "unsupported compare condition");
2076       ShouldNotReachHere();
2077   }
2078 }
2079 
2080 void C2_MacroAssembler::enc_cmove_cmp_fp(int cmpFlag, FloatRegister op1, FloatRegister op2, Register dst, Register src, bool is_single) {
2081   int op_select = cmpFlag & (~unsigned_branch_mask);
2082 
2083   switch (op_select) {
2084     case BoolTest::eq:
2085       cmov_cmp_fp_eq(op1, op2, dst, src, is_single);
2086       break;
2087     case BoolTest::ne:
2088       cmov_cmp_fp_ne(op1, op2, dst, src, is_single);
2089       break;
2090     case BoolTest::le:
2091       cmov_cmp_fp_le(op1, op2, dst, src, is_single);
2092       break;
2093     case BoolTest::ge:
2094       cmov_cmp_fp_ge(op1, op2, dst, src, is_single);
2095       break;
2096     case BoolTest::lt:
2097       cmov_cmp_fp_lt(op1, op2, dst, src, is_single);
2098       break;
2099     case BoolTest::gt:
2100       cmov_cmp_fp_gt(op1, op2, dst, src, is_single);
2101       break;
2102     default:
2103       assert(false, "unsupported compare condition");
2104       ShouldNotReachHere();
2105   }
2106 }
2107 
2108 void C2_MacroAssembler::enc_cmove_fp_cmp(int cmpFlag, Register op1, Register op2,
2109                         FloatRegister dst, FloatRegister src, bool is_single) {
2110   bool is_unsigned = (cmpFlag & unsigned_branch_mask) == unsigned_branch_mask;
2111   int op_select = cmpFlag & (~unsigned_branch_mask);
2112 
2113   switch (op_select) {
2114     case BoolTest::eq:
2115       cmov_fp_eq(op1, op2, dst, src, is_single);
2116       break;
2117     case BoolTest::ne:
2118       cmov_fp_ne(op1, op2, dst, src, is_single);
2119       break;
2120     case BoolTest::le:
2121       if (is_unsigned) {
2122         cmov_fp_leu(op1, op2, dst, src, is_single);
2123       } else {
2124         cmov_fp_le(op1, op2, dst, src, is_single);
2125       }
2126       break;
2127     case BoolTest::ge:
2128       if (is_unsigned) {
2129         cmov_fp_geu(op1, op2, dst, src, is_single);
2130       } else {
2131         cmov_fp_ge(op1, op2, dst, src, is_single);
2132       }
2133       break;
2134     case BoolTest::lt:
2135       if (is_unsigned) {
2136         cmov_fp_ltu(op1, op2, dst, src, is_single);
2137       } else {
2138         cmov_fp_lt(op1, op2, dst, src, is_single);
2139       }
2140       break;
2141     case BoolTest::gt:
2142       if (is_unsigned) {
2143         cmov_fp_gtu(op1, op2, dst, src, is_single);
2144       } else {
2145         cmov_fp_gt(op1, op2, dst, src, is_single);
2146       }
2147       break;
2148     default:
2149       assert(false, "unsupported compare condition");
2150       ShouldNotReachHere();
2151   }
2152 }
2153 
2154 void C2_MacroAssembler::enc_cmove_fp_cmp_fp(int cmpFlag,
2155                            FloatRegister op1, FloatRegister op2,
2156                            FloatRegister dst, FloatRegister src,
2157                            bool cmp_single, bool cmov_single) {
2158   int op_select = cmpFlag & (~unsigned_branch_mask);
2159 
2160   switch (op_select) {
2161     case BoolTest::eq:
2162       cmov_fp_cmp_fp_eq(op1, op2, dst, src, cmp_single, cmov_single);
2163       break;
2164     case BoolTest::ne:
2165       cmov_fp_cmp_fp_ne(op1, op2, dst, src, cmp_single, cmov_single);
2166       break;
2167     case BoolTest::le:
2168       cmov_fp_cmp_fp_le(op1, op2, dst, src, cmp_single, cmov_single);
2169       break;
2170     case BoolTest::ge:
2171       cmov_fp_cmp_fp_ge(op1, op2, dst, src, cmp_single, cmov_single);
2172       break;
2173     case BoolTest::lt:
2174       cmov_fp_cmp_fp_lt(op1, op2, dst, src, cmp_single, cmov_single);
2175       break;
2176     case BoolTest::gt:
2177       cmov_fp_cmp_fp_gt(op1, op2, dst, src, cmp_single, cmov_single);
2178       break;
2179     default:
2180       assert(false, "unsupported compare condition");
2181       ShouldNotReachHere();
2182   }
2183 }
2184 
2185 // Set dst to NaN if any NaN input.
2186 void C2_MacroAssembler::minmax_fp(FloatRegister dst, FloatRegister src1, FloatRegister src2,
2187                                   FLOAT_TYPE ft, bool is_min) {
2188   assert_cond((ft != FLOAT_TYPE::half_precision) || UseZfh);
2189 
2190   Label Done, Compare;
2191 
2192   switch (ft) {
2193     case FLOAT_TYPE::half_precision:
2194       fclass_h(t0, src1);
2195       fclass_h(t1, src2);
2196 
2197       orr(t0, t0, t1);
2198       andi(t0, t0, FClassBits::nan); // if src1 or src2 is quiet or signaling NaN then return NaN
2199       beqz(t0, Compare);
2200 
2201       fadd_h(dst, src1, src2);
2202       j(Done);
2203 
2204       bind(Compare);
2205       if (is_min) {
2206         fmin_h(dst, src1, src2);
2207       } else {
2208         fmax_h(dst, src1, src2);
2209       }
2210       break;
2211     case FLOAT_TYPE::single_precision:
2212       fclass_s(t0, src1);
2213       fclass_s(t1, src2);
2214 
2215       orr(t0, t0, t1);
2216       andi(t0, t0, FClassBits::nan); // if src1 or src2 is quiet or signaling NaN then return NaN
2217       beqz(t0, Compare);
2218 
2219       fadd_s(dst, src1, src2);
2220       j(Done);
2221 
2222       bind(Compare);
2223       if (is_min) {
2224         fmin_s(dst, src1, src2);
2225       } else {
2226         fmax_s(dst, src1, src2);
2227       }
2228       break;
2229     case FLOAT_TYPE::double_precision:
2230       fclass_d(t0, src1);
2231       fclass_d(t1, src2);
2232 
2233       orr(t0, t0, t1);
2234       andi(t0, t0, FClassBits::nan); // if src1 or src2 is quiet or signaling NaN then return NaN
2235       beqz(t0, Compare);
2236 
2237       fadd_d(dst, src1, src2);
2238       j(Done);
2239 
2240       bind(Compare);
2241       if (is_min) {
2242         fmin_d(dst, src1, src2);
2243       } else {
2244         fmax_d(dst, src1, src2);
2245       }
2246       break;
2247     default:
2248       ShouldNotReachHere();
2249   }
2250 
2251   bind(Done);
2252 }
2253 
2254 // According to Java SE specification, for floating-point round operations, if
2255 // the input is NaN, +/-infinity, or +/-0, the same input is returned as the
2256 // rounded result; this differs from behavior of RISC-V fcvt instructions (which
2257 // round out-of-range values to the nearest max or min value), therefore special
2258 // handling is needed by NaN, +/-Infinity, +/-0.
2259 void C2_MacroAssembler::round_double_mode(FloatRegister dst, FloatRegister src, int round_mode,
2260                                           Register tmp1, Register tmp2, Register tmp3) {
2261 
2262   assert_different_registers(dst, src);
2263   assert_different_registers(tmp1, tmp2, tmp3);
2264 
2265   // Set rounding mode for conversions
2266   // Here we use similar modes to double->long and long->double conversions
2267   // Different mode for long->double conversion matter only if long value was not representable as double,
2268   // we got long value as a result of double->long conversion so, it is definitely representable
2269   RoundingMode rm;
2270   switch (round_mode) {
2271     case RoundDoubleModeNode::rmode_ceil:
2272       rm = RoundingMode::rup;
2273       break;
2274     case RoundDoubleModeNode::rmode_floor:
2275       rm = RoundingMode::rdn;
2276       break;
2277     case RoundDoubleModeNode::rmode_rint:
2278       rm = RoundingMode::rne;
2279       break;
2280     default:
2281       ShouldNotReachHere();
2282   }
2283 
2284   // tmp1 - is a register to store double converted to long int
2285   // tmp2 - is a register to create constant for comparison
2286   // tmp3 - is a register where we store modified result of double->long conversion
2287   Label done, bad_val;
2288 
2289   // Conversion from double to long
2290   fcvt_l_d(tmp1, src, rm);
2291 
2292   // Generate constant (tmp2)
2293   // tmp2 = 100...0000
2294   addi(tmp2, zr, 1);
2295   slli(tmp2, tmp2, 63);
2296 
2297   // Prepare converted long (tmp1)
2298   // as a result when conversion overflow we got:
2299   // tmp1 = 011...1111 or 100...0000
2300   // Convert it to: tmp3 = 100...0000
2301   addi(tmp3, tmp1, 1);
2302   andi(tmp3, tmp3, -2);
2303   beq(tmp3, tmp2, bad_val);
2304 
2305   // Conversion from long to double
2306   fcvt_d_l(dst, tmp1, rm);
2307   // Add sign of input value to result for +/- 0 cases
2308   fsgnj_d(dst, dst, src);
2309   j(done);
2310 
2311   // If got conversion overflow return src
2312   bind(bad_val);
2313   fmv_d(dst, src);
2314 
2315   bind(done);
2316 }
2317 
2318 // According to Java SE specification, for floating-point signum operations, if
2319 // on input we have NaN or +/-0.0 value we should return it,
2320 // otherwise return +/- 1.0 using sign of input.
2321 // one - gives us a floating-point 1.0 (got from matching rule)
2322 // bool is_double - specifies single or double precision operations will be used.
2323 void C2_MacroAssembler::signum_fp(FloatRegister dst, FloatRegister one, bool is_double) {
2324   Label done;
2325 
2326   is_double ? fclass_d(t0, dst)
2327             : fclass_s(t0, dst);
2328 
2329   // check if input is -0, +0, signaling NaN or quiet NaN
2330   andi(t0, t0, FClassBits::zero | FClassBits::nan);
2331 
2332   bnez(t0, done);
2333 
2334   // use floating-point 1.0 with a sign of input
2335   is_double ? fsgnj_d(dst, one, dst)
2336             : fsgnj_s(dst, one, dst);
2337 
2338   bind(done);
2339 }
2340 
2341 static void float16_to_float_slow_path(C2_MacroAssembler& masm, C2GeneralStub<FloatRegister, Register, Register>& stub) {
2342 #define __ masm.
2343   FloatRegister dst = stub.data<0>();
2344   Register src = stub.data<1>();
2345   Register tmp = stub.data<2>();
2346   __ bind(stub.entry());
2347 
2348   // following instructions mainly focus on NaN, as riscv does not handle
2349   // NaN well with fcvt, but the code also works for Inf at the same time.
2350 
2351   // construct a NaN in 32 bits from the NaN in 16 bits,
2352   // we need the payloads of non-canonical NaNs to be preserved.
2353   __ mv(tmp, 0x7f800000);
2354   // sign-bit was already set via sign-extension if necessary.
2355   __ slli(t0, src, 13);
2356   __ orr(tmp, t0, tmp);
2357   __ fmv_w_x(dst, tmp);
2358 
2359   __ j(stub.continuation());
2360 #undef __
2361 }
2362 
2363 // j.l.Float.float16ToFloat
2364 void C2_MacroAssembler::float16_to_float(FloatRegister dst, Register src, Register tmp) {
2365   auto stub = C2CodeStub::make<FloatRegister, Register, Register>(dst, src, tmp, 20, float16_to_float_slow_path);
2366 
2367   // On riscv, NaN needs a special process as fcvt does not work in that case.
2368   // On riscv, Inf does not need a special process as fcvt can handle it correctly.
2369   // but we consider to get the slow path to process NaN and Inf at the same time,
2370   // as both of them are rare cases, and if we try to get the slow path to handle
2371   // only NaN case it would sacrifise the performance for normal cases,
2372   // i.e. non-NaN and non-Inf cases.
2373 
2374   // check whether it's a NaN or +/- Inf.
2375   mv(t0, 0x7c00);
2376   andr(tmp, src, t0);
2377   // jump to stub processing NaN and Inf cases.
2378   beq(t0, tmp, stub->entry(), /* is_far */ true);
2379 
2380   // non-NaN or non-Inf cases, just use built-in instructions.
2381   fmv_h_x(dst, src);
2382   fcvt_s_h(dst, dst);
2383 
2384   bind(stub->continuation());
2385 }
2386 
2387 static void float_to_float16_slow_path(C2_MacroAssembler& masm, C2GeneralStub<Register, FloatRegister, Register>& stub) {
2388 #define __ masm.
2389   Register dst = stub.data<0>();
2390   FloatRegister src = stub.data<1>();
2391   Register tmp = stub.data<2>();
2392   __ bind(stub.entry());
2393 
2394   __ float_to_float16_NaN(dst, src, t0, tmp);
2395 
2396   __ j(stub.continuation());
2397 #undef __
2398 }
2399 
2400 // j.l.Float.floatToFloat16
2401 void C2_MacroAssembler::float_to_float16(Register dst, FloatRegister src, FloatRegister ftmp, Register xtmp) {
2402   auto stub = C2CodeStub::make<Register, FloatRegister, Register>(dst, src, xtmp, 64, float_to_float16_slow_path);
2403 
2404   // On riscv, NaN needs a special process as fcvt does not work in that case.
2405 
2406   // check whether it's a NaN.
2407   // replace fclass with feq as performance optimization.
2408   feq_s(t0, src, src);
2409   // jump to stub processing NaN cases.
2410   beqz(t0, stub->entry(), /* is_far */ true);
2411 
2412   // non-NaN cases, just use built-in instructions.
2413   fcvt_h_s(ftmp, src);
2414   fmv_x_h(dst, ftmp);
2415 
2416   bind(stub->continuation());
2417 }
2418 
2419 static void float16_to_float_v_slow_path(C2_MacroAssembler& masm, C2GeneralStub<VectorRegister, VectorRegister, uint>& stub) {
2420 #define __ masm.
2421   VectorRegister dst = stub.data<0>();
2422   VectorRegister src = stub.data<1>();
2423   uint vector_length = stub.data<2>();
2424   __ bind(stub.entry());
2425 
2426   // following instructions mainly focus on NaN, as riscv does not handle
2427   // NaN well with vfwcvt_f_f_v, but the code also works for Inf at the same time.
2428   //
2429   // construct NaN's in 32 bits from the NaN's in 16 bits,
2430   // we need the payloads of non-canonical NaNs to be preserved.
2431 
2432   // adjust vector type to 2 * SEW.
2433   __ vsetvli_helper(T_FLOAT, vector_length, Assembler::m1);
2434   // widen and sign-extend src data.
2435   __ vsext_vf2(dst, src, Assembler::v0_t);
2436   __ mv(t0, 0x7f800000);
2437   // sign-bit was already set via sign-extension if necessary.
2438   __ vsll_vi(dst, dst, 13, Assembler::v0_t);
2439   __ vor_vx(dst, dst, t0, Assembler::v0_t);
2440 
2441   __ j(stub.continuation());
2442 #undef __
2443 }
2444 
2445 // j.l.Float.float16ToFloat
2446 void C2_MacroAssembler::float16_to_float_v(VectorRegister dst, VectorRegister src, uint vector_length) {
2447   auto stub = C2CodeStub::make<VectorRegister, VectorRegister, uint>
2448               (dst, src, vector_length, 24, float16_to_float_v_slow_path);
2449   assert_different_registers(dst, src);
2450 
2451   // On riscv, NaN needs a special process as vfwcvt_f_f_v does not work in that case.
2452   // On riscv, Inf does not need a special process as vfwcvt_f_f_v can handle it correctly.
2453   // but we consider to get the slow path to process NaN and Inf at the same time,
2454   // as both of them are rare cases, and if we try to get the slow path to handle
2455   // only NaN case it would sacrifise the performance for normal cases,
2456   // i.e. non-NaN and non-Inf cases.
2457 
2458   vsetvli_helper(BasicType::T_SHORT, vector_length, Assembler::mf2);
2459 
2460   // check whether there is a NaN or +/- Inf.
2461   mv(t0, 0x7c00);
2462   vand_vx(v0, src, t0);
2463   // v0 will be used as mask in slow path.
2464   vmseq_vx(v0, v0, t0);
2465   vcpop_m(t0, v0);
2466 
2467   // For non-NaN or non-Inf cases, just use built-in instructions.
2468   vfwcvt_f_f_v(dst, src);
2469 
2470   // jump to stub processing NaN and Inf cases if there is any of them in the vector-wide.
2471   bnez(t0, stub->entry(), /* is_far */ true);
2472 
2473   bind(stub->continuation());
2474 }
2475 
2476 static void float_to_float16_v_slow_path(C2_MacroAssembler& masm,
2477                                          C2GeneralStub<VectorRegister, VectorRegister, VectorRegister>& stub) {
2478 #define __ masm.
2479   VectorRegister dst = stub.data<0>();
2480   VectorRegister src = stub.data<1>();
2481   VectorRegister vtmp = stub.data<2>();
2482   assert_different_registers(dst, src, vtmp);
2483 
2484   __ bind(stub.entry());
2485 
2486   // Active elements (NaNs) are marked in v0 mask register.
2487   // mul is already set to mf2 in float_to_float16_v.
2488 
2489   //  Float (32 bits)
2490   //    Bit:     31        30 to 23          22 to 0
2491   //          +---+------------------+-----------------------------+
2492   //          | S |     Exponent     |      Mantissa (Fraction)    |
2493   //          +---+------------------+-----------------------------+
2494   //          1 bit       8 bits                  23 bits
2495   //
2496   //  Float (16 bits)
2497   //    Bit:    15        14 to 10         9 to 0
2498   //          +---+----------------+------------------+
2499   //          | S |    Exponent    |     Mantissa     |
2500   //          +---+----------------+------------------+
2501   //          1 bit      5 bits          10 bits
2502   const int fp_sign_bits = 1;
2503   const int fp32_bits = 32;
2504   const int fp32_mantissa_2nd_part_bits = 9;
2505   const int fp32_mantissa_3rd_part_bits = 4;
2506   const int fp16_exponent_bits = 5;
2507   const int fp16_mantissa_bits = 10;
2508 
2509   // preserve the sign bit and exponent, clear mantissa.
2510   __ vnsra_wi(dst, src, fp32_bits - fp_sign_bits - fp16_exponent_bits, Assembler::v0_t);
2511   __ vsll_vi(dst, dst, fp16_mantissa_bits, Assembler::v0_t);
2512 
2513   // Preserve high order bit of float NaN in the
2514   // binary16 result NaN (tenth bit); OR in remaining
2515   // bits into lower 9 bits of binary 16 significand.
2516   //   | (doppel & 0x007f_e000) >> 13 // 10 bits
2517   //   | (doppel & 0x0000_1ff0) >> 4  //  9 bits
2518   //   | (doppel & 0x0000_000f));     //  4 bits
2519   //
2520   // Check j.l.Float.floatToFloat16 for more information.
2521   // 10 bits
2522   __ vnsrl_wi(vtmp, src, fp32_mantissa_2nd_part_bits + fp32_mantissa_3rd_part_bits, Assembler::v0_t);
2523   __ mv(t0, 0x3ff); // retain first part of mantissa in a float 32
2524   __ vand_vx(vtmp, vtmp, t0, Assembler::v0_t);
2525   __ vor_vv(dst, dst, vtmp, Assembler::v0_t);
2526   // 9 bits
2527   __ vnsrl_wi(vtmp, src, fp32_mantissa_3rd_part_bits, Assembler::v0_t);
2528   __ mv(t0, 0x1ff); // retain second part of mantissa in a float 32
2529   __ vand_vx(vtmp, vtmp, t0, Assembler::v0_t);
2530   __ vor_vv(dst, dst, vtmp, Assembler::v0_t);
2531   // 4 bits
2532   // Narrow shift is necessary to move data from 32 bits element to 16 bits element in vector register.
2533   __ vnsrl_wi(vtmp, src, 0, Assembler::v0_t);
2534   __ vand_vi(vtmp, vtmp, 0xf, Assembler::v0_t);
2535   __ vor_vv(dst, dst, vtmp, Assembler::v0_t);
2536 
2537   __ j(stub.continuation());
2538 #undef __
2539 }
2540 
// j.l.Float.floatToFloat16
// Converts the float elements of src into float16 elements in dst.
// Non-NaN elements are converted by vfncvt_f_f_w; if at least one element is
// NaN, control is transferred to the out-of-line stub
// float_to_float16_v_slow_path, which jumps back to the continuation label.
// Writes v0 and t0; the second vsetvli_helper call writes tmp.
void C2_MacroAssembler::float_to_float16_v(VectorRegister dst, VectorRegister src,
                                           VectorRegister vtmp, Register tmp, uint vector_length) {
  assert_different_registers(dst, src, vtmp);

  // NOTE(review): 56 is presumably the size budget of the stub - confirm against C2CodeStub::make.
  auto stub = C2CodeStub::make<VectorRegister, VectorRegister, VectorRegister>
              (dst, src, vtmp, 56, float_to_float16_v_slow_path);

  // On riscv, NaN needs a special process as vfncvt_f_f_w does not work in that case.

  vsetvli_helper(BasicType::T_FLOAT, vector_length, Assembler::m1);

  // check whether there is a NaN.
  // replace v_fclass with vmfne_vv as performance optimization.
  // v0 = mask of elements with src != src (i.e. NaN), t0 = number of such elements.
  vmfne_vv(v0, src, src);
  vcpop_m(t0, v0);

  // Switch to 16-bit elements for the narrowing conversion. tmp is handed to the
  // helper as its scratch register; t0 is still read by the bnez below.
  vsetvli_helper(BasicType::T_SHORT, vector_length, Assembler::mf2, tmp);

  // For non-NaN cases, just use built-in instructions.
  vfncvt_f_f_w(dst, src);

  // jump to stub processing NaN cases.
  bnez(t0, stub->entry(), /* is_far */ true);

  bind(stub->continuation());
}
2568 
2569 void C2_MacroAssembler::signum_fp_v(VectorRegister dst, VectorRegister one, BasicType bt, int vlen) {
2570   vsetvli_helper(bt, vlen);
2571 
2572   // check if input is -0, +0, signaling NaN or quiet NaN
2573   vfclass_v(v0, dst);
2574   mv(t0, FClassBits::zero | FClassBits::nan);
2575   vand_vx(v0, v0, t0);
2576   vmseq_vi(v0, v0, 0);
2577 
2578   // use floating-point 1.0 with a sign of input
2579   vfsgnj_vv(dst, one, dst, v0_t);
2580 }
2581 
// j.l.Math.round(float)
//  Returns the closest int to the argument, with ties rounding to positive infinity.
// We need to handle 3 special cases defined by java api spec:
//    NaN,
//    float >= Integer.MAX_VALUE,
//    float <= Integer.MIN_VALUE.
// Writes t0, ftmp and v0. The dynamic rounding mode (frm) is set to rdn for the
// duration of the sequence and set to rne on exit.
void C2_MacroAssembler::java_round_float_v(VectorRegister dst, VectorRegister src, FloatRegister ftmp,
                                           BasicType bt, uint vector_length) {
  // In riscv, there is no rounding mode that directly matches the behaviour defined
  // in java api spec, i.e. any rounding mode can not handle some corner cases, e.g.
  //  RNE is the closest one, but it ties to "even", which means 1.5/2.5 both will be converted
  //    to 2, instead of 2 and 3 respectively.
  //  RUP does not work either, although java api requires "rounding to positive infinity",
  //    but both 1.3/1.8 will be converted to 2, instead of 1 and 2 respectively.
  //
  // The optimal solution for non-NaN cases is:
  //    src+0.5 => dst, with rdn rounding mode,
  //    convert dst from float to int, with rdn rounding mode.
  // and, this solution works as expected for float >= Integer.MAX_VALUE and float <= Integer.MIN_VALUE.
  //
  // But, we still need to handle NaN explicitly with vector mask instructions.
  //
  // Check MacroAssembler::java_round_float and C2_MacroAssembler::vector_round_sve in aarch64 for more details.

  csrwi(CSR_FRM, C2_MacroAssembler::rdn);
  vsetvli_helper(bt, vector_length);

  // don't rearrange the instruction sequence without performance testing.
  // check MacroAssembler::java_round_float in riscv64 for more details.
  // ftmp = 0.5f, moved through t0 as raw bits.
  mv(t0, jint_cast(0.5f));
  fmv_w_x(ftmp, t0);

  // replacing vfclass with feq as performance optimization
  // v0 = mask of the non-NaN elements (src == src).
  vmfeq_vv(v0, src, src);
  // set dst = 0 in cases of NaN
  vmv_v_x(dst, zr);

  // dst = (src + 0.5) rounded down towards negative infinity
  // Both instructions are masked by v0, so NaN elements keep the 0 written above.
  vfadd_vf(dst, src, ftmp, Assembler::v0_t);
  vfcvt_x_f_v(dst, dst, Assembler::v0_t); // in RoundingMode::rdn

  csrwi(CSR_FRM, C2_MacroAssembler::rne);
}
2625 
2626 // java.lang.Math.round(double a)
2627 // Returns the closest long to the argument, with ties rounding to positive infinity.
2628 void C2_MacroAssembler::java_round_double_v(VectorRegister dst, VectorRegister src, FloatRegister ftmp,
2629                                             BasicType bt, uint vector_length) {
2630   // check C2_MacroAssembler::java_round_float_v above for more details.
2631 
2632   csrwi(CSR_FRM, C2_MacroAssembler::rdn);
2633   vsetvli_helper(bt, vector_length);
2634 
2635   mv(t0, julong_cast(0.5));
2636   fmv_d_x(ftmp, t0);
2637 
2638   // replacing vfclass with feq as performance optimization
2639   vmfeq_vv(v0, src, src);
2640   // set dst = 0 in cases of NaN
2641   vmv_v_x(dst, zr);
2642 
2643   // dst = (src + 0.5) rounded down towards negative infinity
2644   vfadd_vf(dst, src, ftmp, Assembler::v0_t);
2645   vfcvt_x_f_v(dst, dst, Assembler::v0_t); // in RoundingMode::rdn
2646 
2647   csrwi(CSR_FRM, C2_MacroAssembler::rne);
2648 }
2649 
2650 void C2_MacroAssembler::element_compare(Register a1, Register a2, Register result, Register cnt, Register tmp1, Register tmp2,
2651                                         VectorRegister vr1, VectorRegister vr2, VectorRegister vrs, bool islatin, Label &DONE,
2652                                         Assembler::LMUL lmul) {
2653   Label loop;
2654   Assembler::SEW sew = islatin ? Assembler::e8 : Assembler::e16;
2655 
2656   bind(loop);
2657   vsetvli(tmp1, cnt, sew, lmul);
2658   vlex_v(vr1, a1, sew);
2659   vlex_v(vr2, a2, sew);
2660   vmsne_vv(vrs, vr1, vr2);
2661   vfirst_m(tmp2, vrs);
2662   bgez(tmp2, DONE);
2663   sub(cnt, cnt, tmp1);
2664   if (!islatin) {
2665     slli(tmp1, tmp1, 1); // get byte counts
2666   }
2667   add(a1, a1, tmp1);
2668   add(a2, a2, tmp1);
2669   bnez(cnt, loop);
2670 
2671   mv(result, true);
2672 }
2673 
2674 void C2_MacroAssembler::string_equals_v(Register a1, Register a2, Register result, Register cnt) {
2675   Label DONE;
2676   Register tmp1 = t0;
2677   Register tmp2 = t1;
2678 
2679   BLOCK_COMMENT("string_equals_v {");
2680 
2681   mv(result, false);
2682 
2683   element_compare(a1, a2, result, cnt, tmp1, tmp2, v2, v4, v2, true, DONE, Assembler::m2);
2684 
2685   bind(DONE);
2686   BLOCK_COMMENT("} string_equals_v");
2687 }
2688 
2689 // used by C2 ClearArray patterns.
2690 // base: Address of a buffer to be zeroed
2691 // cnt: Count in HeapWords
2692 //
2693 // base, cnt, v4, v5, v6, v7 and t0 are clobbered.
2694 void C2_MacroAssembler::clear_array_v(Register base, Register cnt) {
2695   Label loop;
2696 
2697   // making zero words
2698   vsetvli(t0, cnt, Assembler::e64, Assembler::m4);
2699   vxor_vv(v4, v4, v4);
2700 
2701   bind(loop);
2702   vsetvli(t0, cnt, Assembler::e64, Assembler::m4);
2703   vse64_v(v4, base);
2704   sub(cnt, cnt, t0);
2705   shadd(base, t0, base, t0, 3);
2706   bnez(cnt, loop);
2707 }
2708 
2709 void C2_MacroAssembler::arrays_equals_v(Register a1, Register a2, Register result,
2710                                         Register cnt1, int elem_size) {
2711   assert(elem_size == 1 || elem_size == 2, "must be char or byte");
2712   assert_different_registers(a1, a2, result, cnt1, t0, t1);
2713 
2714   Label DONE;
2715   Register tmp1 = t0;
2716   Register tmp2 = t1;
2717   Register cnt2 = tmp2;
2718   int length_offset = arrayOopDesc::length_offset_in_bytes();
2719   int base_offset = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
2720 
2721   assert((base_offset % (UseCompactObjectHeaders ? 4 : 8)) == 0, "Must be");
2722 
2723   BLOCK_COMMENT("arrays_equals_v {");
2724 
2725   // if (a1 == a2), return true
2726   mv(result, true);
2727   beq(a1, a2, DONE);
2728 
2729   mv(result, false);
2730   // if a1 == null or a2 == null, return false
2731   beqz(a1, DONE);
2732   beqz(a2, DONE);
2733   // if (a1.length != a2.length), return false
2734   lwu(cnt1, Address(a1, length_offset));
2735   lwu(cnt2, Address(a2, length_offset));
2736   bne(cnt1, cnt2, DONE);
2737 
2738   la(a1, Address(a1, base_offset));
2739   la(a2, Address(a2, base_offset));
2740 
2741   element_compare(a1, a2, result, cnt1, tmp1, tmp2, v2, v4, v2, elem_size == 1, DONE, Assembler::m2);
2742 
2743   bind(DONE);
2744 
2745   BLOCK_COMMENT("} arrays_equals_v");
2746 }
2747 
2748 void C2_MacroAssembler::string_compare_v(Register str1, Register str2, Register cnt1, Register cnt2,
2749                                          Register result, Register tmp1, Register tmp2, int encForm) {
2750   Label DIFFERENCE, DONE, L, loop;
2751   bool encLL = encForm == StrIntrinsicNode::LL;
2752   bool encLU = encForm == StrIntrinsicNode::LU;
2753   bool encUL = encForm == StrIntrinsicNode::UL;
2754 
2755   bool str1_isL = encLL || encLU;
2756   bool str2_isL = encLL || encUL;
2757 
2758   int minCharsInWord = encLL ? wordSize : wordSize / 2;
2759 
2760   BLOCK_COMMENT("string_compare_v {");
2761 
2762   // for Latin strings, 1 byte for 1 character
2763   // for UTF16 strings, 2 bytes for 1 character
2764   if (!str1_isL)
2765     sraiw(cnt1, cnt1, 1);
2766   if (!str2_isL)
2767     sraiw(cnt2, cnt2, 1);
2768 
2769   // if str1 == str2, return the difference
2770   // save the minimum of the string lengths in cnt2.
2771   sub(result, cnt1, cnt2);
2772   bgt(cnt1, cnt2, L);
2773   mv(cnt2, cnt1);
2774   bind(L);
2775 
2776   // We focus on the optimization of small sized string.
2777   // Please check below document for string size distribution statistics.
2778   // https://cr.openjdk.org/~shade/density/string-density-report.pdf
2779   if (str1_isL == str2_isL) { // LL or UU
2780     // Below construction of v regs and lmul is based on test on 2 different boards,
2781     // vlen == 128 and vlen == 256 respectively.
2782     if (!encLL && MaxVectorSize == 16) { // UU
2783       element_compare(str1, str2, zr, cnt2, tmp1, tmp2, v4, v8, v4, encLL, DIFFERENCE, Assembler::m4);
2784     } else { // UU + MaxVectorSize or LL
2785       element_compare(str1, str2, zr, cnt2, tmp1, tmp2, v2, v4, v2, encLL, DIFFERENCE, Assembler::m2);
2786     }
2787 
2788     j(DONE);
2789   } else { // LU or UL
2790     Register strL = encLU ? str1 : str2;
2791     Register strU = encLU ? str2 : str1;
2792     VectorRegister vstr1 = encLU ? v8 : v4;
2793     VectorRegister vstr2 = encLU ? v4 : v8;
2794 
2795     bind(loop);
2796     vsetvli(tmp1, cnt2, Assembler::e8, Assembler::m2);
2797     vle8_v(vstr1, strL);
2798     vsetvli(tmp1, cnt2, Assembler::e16, Assembler::m4);
2799     vzext_vf2(vstr2, vstr1);
2800     vle16_v(vstr1, strU);
2801     vmsne_vv(v4, vstr2, vstr1);
2802     vfirst_m(tmp2, v4);
2803     bgez(tmp2, DIFFERENCE);
2804     sub(cnt2, cnt2, tmp1);
2805     add(strL, strL, tmp1);
2806     shadd(strU, tmp1, strU, tmp1, 1);
2807     bnez(cnt2, loop);
2808     j(DONE);
2809   }
2810 
2811   bind(DIFFERENCE);
2812   slli(tmp1, tmp2, 1);
2813   add(str1, str1, str1_isL ? tmp2 : tmp1);
2814   add(str2, str2, str2_isL ? tmp2 : tmp1);
2815   str1_isL ? lbu(tmp1, Address(str1, 0)) : lhu(tmp1, Address(str1, 0));
2816   str2_isL ? lbu(tmp2, Address(str2, 0)) : lhu(tmp2, Address(str2, 0));
2817   sub(result, tmp1, tmp2);
2818 
2819   bind(DONE);
2820 
2821   BLOCK_COMMENT("} string_compare_v");
2822 }
2823 
2824 void C2_MacroAssembler::byte_array_inflate_v(Register src, Register dst, Register len, Register tmp) {
2825   Label loop;
2826   assert_different_registers(src, dst, len, tmp, t0);
2827 
2828   BLOCK_COMMENT("byte_array_inflate_v {");
2829   bind(loop);
2830   vsetvli(tmp, len, Assembler::e8, Assembler::m2);
2831   vle8_v(v6, src);
2832   vsetvli(t0, len, Assembler::e16, Assembler::m4);
2833   vzext_vf2(v4, v6);
2834   vse16_v(v4, dst);
2835   sub(len, len, tmp);
2836   add(src, src, tmp);
2837   shadd(dst, tmp, dst, tmp, 1);
2838   bnez(len, loop);
2839   BLOCK_COMMENT("} byte_array_inflate_v");
2840 }
2841 
2842 // Compress char[] array to byte[].
2843 // Intrinsic for java.lang.StringUTF16.compress(char[] src, int srcOff, byte[] dst, int dstOff, int len)
2844 // result: the array length if every element in array can be encoded,
2845 // otherwise, the index of first non-latin1 (> 0xff) character.
2846 void C2_MacroAssembler::char_array_compress_v(Register src, Register dst, Register len,
2847                                               Register result, Register tmp) {
2848   encode_iso_array_v(src, dst, len, result, tmp, false);
2849 }
2850 
2851 // Intrinsic for
2852 //
2853 // - sun.nio.cs.ISO_8859_1.Encoder#encodeISOArray0(byte[] sa, int sp, byte[] da, int dp, int len)
2854 //   Encodes char[] to byte[] in ISO-8859-1
2855 //
2856 // - java.lang.StringCoding#encodeISOArray0(byte[] sa, int sp, byte[] da, int dp, int len)
2857 //   Encodes byte[] (containing UTF-16) to byte[] in ISO-8859-1
2858 //
2859 // - java.lang.StringCoding#encodeAsciiArray0(char[] sa, int sp, byte[] da, int dp, int len)
2860 //   Encodes char[] to byte[] in ASCII
2861 //
2862 // This version always returns the number of characters copied. A successful
2863 // copy will complete with the post-condition: 'res' == 'len', while an
2864 // unsuccessful copy will exit with the post-condition: 0 <= 'res' < 'len'.
2865 //
2866 // Clobbers: src, dst, len, result, t0
2867 void C2_MacroAssembler::encode_iso_array_v(Register src, Register dst, Register len,
2868                                            Register result, Register tmp, bool ascii) {
2869   Label loop, fail, done;
2870 
2871   BLOCK_COMMENT("encode_iso_array_v {");
2872   mv(result, 0);
2873 
2874   bind(loop);
2875   mv(tmp, ascii ? 0x7f : 0xff);
2876   vsetvli(t0, len, Assembler::e16, Assembler::m2);
2877   vle16_v(v2, src);
2878 
2879   vmsgtu_vx(v1, v2, tmp);
2880   vfirst_m(tmp, v1);
2881   vmsbf_m(v0, v1);
2882   // compress char to byte
2883   vsetvli(t0, len, Assembler::e8);
2884   vncvt_x_x_w(v1, v2, Assembler::v0_t);
2885   vse8_v(v1, dst, Assembler::v0_t);
2886 
2887   // fail if char > 0x7f/0xff
2888   bgez(tmp, fail);
2889   add(result, result, t0);
2890   add(dst, dst, t0);
2891   sub(len, len, t0);
2892   shadd(src, t0, src, t0, 1);
2893   bnez(len, loop);
2894   j(done);
2895 
2896   bind(fail);
2897   add(result, result, tmp);
2898 
2899   bind(done);
2900   BLOCK_COMMENT("} encode_iso_array_v");
2901 }
2902 
2903 void C2_MacroAssembler::count_positives_v(Register ary, Register len, Register result, Register tmp) {
2904   Label LOOP, SET_RESULT, DONE;
2905 
2906   BLOCK_COMMENT("count_positives_v {");
2907   assert_different_registers(ary, len, result, tmp);
2908 
2909   mv(result, zr);
2910 
2911   bind(LOOP);
2912   vsetvli(t0, len, Assembler::e8, Assembler::m4);
2913   vle8_v(v4, ary);
2914   vmslt_vx(v4, v4, zr);
2915   vfirst_m(tmp, v4);
2916   bgez(tmp, SET_RESULT);
2917   // if tmp == -1, all bytes are positive
2918   add(result, result, t0);
2919 
2920   sub(len, len, t0);
2921   add(ary, ary, t0);
2922   bnez(len, LOOP);
2923   j(DONE);
2924 
2925   // add remaining positive bytes count
2926   bind(SET_RESULT);
2927   add(result, result, tmp);
2928 
2929   bind(DONE);
2930   BLOCK_COMMENT("} count_positives_v");
2931 }
2932 
2933 void C2_MacroAssembler::string_indexof_char_v(Register str1, Register cnt1,
2934                                               Register ch, Register result,
2935                                               Register tmp1, Register tmp2,
2936                                               bool isL) {
2937   mv(result, zr);
2938 
2939   Label loop, MATCH, DONE;
2940   Assembler::SEW sew = isL ? Assembler::e8 : Assembler::e16;
2941   bind(loop);
2942   vsetvli(tmp1, cnt1, sew, Assembler::m4);
2943   vlex_v(v4, str1, sew);
2944   vmseq_vx(v4, v4, ch);
2945   vfirst_m(tmp2, v4);
2946   bgez(tmp2, MATCH); // if equal, return index
2947 
2948   add(result, result, tmp1);
2949   sub(cnt1, cnt1, tmp1);
2950   if (!isL) slli(tmp1, tmp1, 1);
2951   add(str1, str1, tmp1);
2952   bnez(cnt1, loop);
2953 
2954   mv(result, -1);
2955   j(DONE);
2956 
2957   bind(MATCH);
2958   add(result, result, tmp2);
2959 
2960   bind(DONE);
2961 }
2962 
2963 // Set dst to NaN if any NaN input.
2964 void C2_MacroAssembler::minmax_fp_v(VectorRegister dst, VectorRegister src1, VectorRegister src2,
2965                                     BasicType bt, bool is_min, uint vector_length) {
2966   assert_different_registers(dst, src1, src2);
2967 
2968   vsetvli_helper(bt, vector_length);
2969 
2970   is_min ? vfmin_vv(dst, src1, src2)
2971          : vfmax_vv(dst, src1, src2);
2972 
2973   vmfne_vv(v0,  src1, src1);
2974   vfadd_vv(dst, src1, src1, Assembler::v0_t);
2975   vmfne_vv(v0,  src2, src2);
2976   vfadd_vv(dst, src2, src2, Assembler::v0_t);
2977 }
2978 
2979 // Set dst to NaN if any NaN input.
2980 // The destination vector register elements corresponding to masked-off elements
2981 // are handled with a mask-undisturbed policy.
2982 void C2_MacroAssembler::minmax_fp_masked_v(VectorRegister dst, VectorRegister src1, VectorRegister src2,
2983                                            VectorRegister vmask, VectorRegister tmp1, VectorRegister tmp2,
2984                                            BasicType bt, bool is_min, uint vector_length) {
2985   assert_different_registers(src1, src2, tmp1, tmp2);
2986   vsetvli_helper(bt, vector_length);
2987 
2988   // Check vector elements of src1 and src2 for NaN.
2989   vmfeq_vv(tmp1, src1, src1);
2990   vmfeq_vv(tmp2, src2, src2);
2991 
2992   vmandn_mm(v0, vmask, tmp1);
2993   vfadd_vv(dst, src1, src1, Assembler::v0_t);
2994   vmandn_mm(v0, vmask, tmp2);
2995   vfadd_vv(dst, src2, src2, Assembler::v0_t);
2996 
2997   vmand_mm(tmp2, tmp1, tmp2);
2998   vmand_mm(v0, vmask, tmp2);
2999   is_min ? vfmin_vv(dst, src1, src2, Assembler::v0_t)
3000          : vfmax_vv(dst, src1, src2, Assembler::v0_t);
3001 }
3002 
3003 // Set dst to NaN if any NaN input.
3004 void C2_MacroAssembler::reduce_minmax_fp_v(FloatRegister dst,
3005                                            FloatRegister src1, VectorRegister src2,
3006                                            VectorRegister tmp1, VectorRegister tmp2,
3007                                            bool is_double, bool is_min, uint vector_length, VectorMask vm) {
3008   assert_different_registers(dst, src1);
3009   assert_different_registers(src2, tmp1, tmp2);
3010 
3011   Label L_done, L_NaN_1, L_NaN_2;
3012   // Set dst to src1 if src1 is NaN
3013   is_double ? feq_d(t0, src1, src1)
3014             : feq_s(t0, src1, src1);
3015   beqz(t0, L_NaN_2);
3016 
3017   vsetvli_helper(is_double ? T_DOUBLE : T_FLOAT, vector_length);
3018   vfmv_s_f(tmp2, src1);
3019 
3020   is_min ? vfredmin_vs(tmp1, src2, tmp2, vm)
3021          : vfredmax_vs(tmp1, src2, tmp2, vm);
3022   vfmv_f_s(dst, tmp1);
3023 
3024   // Checking NaNs in src2
3025   vmfne_vv(tmp1, src2, src2, vm);
3026   vcpop_m(t0, tmp1, vm);
3027   beqz(t0, L_done);
3028 
3029   bind(L_NaN_1);
3030   vfredusum_vs(tmp1, src2, tmp2, vm);
3031   vfmv_f_s(dst, tmp1);
3032   j(L_done);
3033 
3034   bind(L_NaN_2);
3035   is_double ? fmv_d(dst, src1)
3036             : fmv_s(dst, src1);
3037   bind(L_done);
3038 }
3039 
3040 bool C2_MacroAssembler::in_scratch_emit_size() {
3041   if (ciEnv::current()->task() != nullptr) {
3042     PhaseOutput* phase_output = Compile::current()->output();
3043     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
3044       return true;
3045     }
3046   }
3047   return MacroAssembler::in_scratch_emit_size();
3048 }
3049 
3050 void C2_MacroAssembler::reduce_integral_v(Register dst, Register src1,
3051                                           VectorRegister src2, VectorRegister tmp,
3052                                           int opc, BasicType bt, uint vector_length, VectorMask vm) {
3053   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
3054   vsetvli_helper(bt, vector_length);
3055   vmv_s_x(tmp, src1);
3056   switch (opc) {
3057     case Op_AddReductionVI:
3058     case Op_AddReductionVL:
3059       vredsum_vs(tmp, src2, tmp, vm);
3060       break;
3061     case Op_AndReductionV:
3062       vredand_vs(tmp, src2, tmp, vm);
3063       break;
3064     case Op_OrReductionV:
3065       vredor_vs(tmp, src2, tmp, vm);
3066       break;
3067     case Op_XorReductionV:
3068       vredxor_vs(tmp, src2, tmp, vm);
3069       break;
3070     case Op_MaxReductionV:
3071       vredmax_vs(tmp, src2, tmp, vm);
3072       break;
3073     case Op_MinReductionV:
3074       vredmin_vs(tmp, src2, tmp, vm);
3075       break;
3076     default:
3077       ShouldNotReachHere();
3078   }
3079   vmv_x_s(dst, tmp);
3080 }
3081 
3082 void C2_MacroAssembler::reduce_mul_integral_v(Register dst, Register src1, VectorRegister src2,
3083                                               VectorRegister vtmp1, VectorRegister vtmp2,
3084                                               BasicType bt, uint vector_length, VectorMask vm) {
3085   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
3086   vsetvli_helper(bt, vector_length);
3087 
3088   vector_length /= 2;
3089   if (vm != Assembler::unmasked) {
3090     // This behaviour is consistent with spec requirements of vector API, for `reduceLanes`:
3091     //  If no elements are selected, an operation-specific identity value is returned.
3092     //    If the operation is MUL, then the identity value is one.
3093     vmv_v_i(vtmp1, 1);
3094     vmerge_vvm(vtmp2, vtmp1, src2); // vm == v0
3095     slidedown_v(vtmp1, vtmp2, vector_length);
3096 
3097     vsetvli_helper(bt, vector_length);
3098     vmul_vv(vtmp1, vtmp1, vtmp2);
3099   } else {
3100     slidedown_v(vtmp1, src2, vector_length);
3101 
3102     vsetvli_helper(bt, vector_length);
3103     vmul_vv(vtmp1, vtmp1, src2);
3104   }
3105 
3106   while (vector_length > 1) {
3107     vector_length /= 2;
3108     slidedown_v(vtmp2, vtmp1, vector_length);
3109     vsetvli_helper(bt, vector_length);
3110     vmul_vv(vtmp1, vtmp1, vtmp2);
3111   }
3112 
3113   vmv_x_s(dst, vtmp1);
3114   if (bt == T_INT) {
3115     mulw(dst, dst, src1);
3116   } else {
3117     mul(dst, dst, src1);
3118   }
3119 }
3120 
3121 // Set vl and vtype for full and partial vector operations.
3122 // (vma = mu, vta = tu, vill = false)
3123 void C2_MacroAssembler::vsetvli_helper(BasicType bt, uint vector_length, LMUL vlmul, Register tmp) {
3124   Assembler::SEW sew = Assembler::elemtype_to_sew(bt);
3125   if (vector_length <= 31) {
3126     vsetivli(tmp, vector_length, sew, vlmul);
3127   } else if (vector_length == (MaxVectorSize / type2aelembytes(bt))) {
3128     vsetvli(tmp, x0, sew, vlmul);
3129   } else {
3130     mv(tmp, vector_length);
3131     vsetvli(tmp, tmp, sew, vlmul);
3132   }
3133 }
3134 
3135 void C2_MacroAssembler::compare_integral_v(VectorRegister vd, VectorRegister src1, VectorRegister src2,
3136                                            int cond, BasicType bt, uint vector_length, VectorMask vm) {
3137   assert(is_integral_type(bt), "unsupported element type");
3138   assert(vm == Assembler::v0_t ? vd != v0 : true, "should be different registers");
3139   vsetvli_helper(bt, vector_length);
3140   if (vm == Assembler::v0_t) {
3141     vmclr_m(vd);
3142   }
3143   switch (cond) {
3144     case BoolTest::eq: vmseq_vv(vd, src1, src2, vm); break;
3145     case BoolTest::ne: vmsne_vv(vd, src1, src2, vm); break;
3146     case BoolTest::le: vmsle_vv(vd, src1, src2, vm); break;
3147     case BoolTest::ge: vmsge_vv(vd, src1, src2, vm); break;
3148     case BoolTest::lt: vmslt_vv(vd, src1, src2, vm); break;
3149     case BoolTest::gt: vmsgt_vv(vd, src1, src2, vm); break;
3150     case BoolTest::ule: vmsleu_vv(vd, src1, src2, vm); break;
3151     case BoolTest::uge: vmsgeu_vv(vd, src1, src2, vm); break;
3152     case BoolTest::ult: vmsltu_vv(vd, src1, src2, vm); break;
3153     case BoolTest::ugt: vmsgtu_vv(vd, src1, src2, vm); break;
3154     default:
3155       assert(false, "unsupported compare condition");
3156       ShouldNotReachHere();
3157   }
3158 }
3159 
3160 void C2_MacroAssembler::compare_fp_v(VectorRegister vd, VectorRegister src1, VectorRegister src2,
3161                                      int cond, BasicType bt, uint vector_length, VectorMask vm) {
3162   assert(is_floating_point_type(bt), "unsupported element type");
3163   assert(vm == Assembler::v0_t ? vd != v0 : true, "should be different registers");
3164   vsetvli_helper(bt, vector_length);
3165   if (vm == Assembler::v0_t) {
3166     vmclr_m(vd);
3167   }
3168   switch (cond) {
3169     case BoolTest::eq: vmfeq_vv(vd, src1, src2, vm); break;
3170     case BoolTest::ne: vmfne_vv(vd, src1, src2, vm); break;
3171     case BoolTest::le: vmfle_vv(vd, src1, src2, vm); break;
3172     case BoolTest::ge: vmfge_vv(vd, src1, src2, vm); break;
3173     case BoolTest::lt: vmflt_vv(vd, src1, src2, vm); break;
3174     case BoolTest::gt: vmfgt_vv(vd, src1, src2, vm); break;
3175     default:
3176       assert(false, "unsupported compare condition");
3177       ShouldNotReachHere();
3178   }
3179 }
3180 
3181 // In Matcher::scalable_predicate_reg_slots,
3182 // we assume each predicate register is one-eighth of the size of
3183 // scalable vector register, one mask bit per vector byte.
3184 void C2_MacroAssembler::spill_vmask(VectorRegister v, int offset) {
3185   vsetvli_helper(T_BYTE, MaxVectorSize >> 3);
3186   add(t0, sp, offset);
3187   vse8_v(v, t0);
3188 }
3189 
3190 void C2_MacroAssembler::unspill_vmask(VectorRegister v, int offset) {
3191   vsetvli_helper(T_BYTE, MaxVectorSize >> 3);
3192   add(t0, sp, offset);
3193   vle8_v(v, t0);
3194 }
3195 
3196 void C2_MacroAssembler::integer_extend_v(VectorRegister dst, BasicType dst_bt, uint vector_length,
3197                                          VectorRegister src, BasicType src_bt, bool is_signed) {
3198   assert(type2aelembytes(dst_bt) > type2aelembytes(src_bt) && type2aelembytes(dst_bt) <= 8 && type2aelembytes(src_bt) <= 4, "invalid element size");
3199   assert(dst_bt != T_FLOAT && dst_bt != T_DOUBLE && src_bt != T_FLOAT && src_bt != T_DOUBLE, "unsupported element type");
3200   // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#52-vector-operands
3201   // The destination EEW is greater than the source EEW, the source EMUL is at least 1,
3202   // and the overlap is in the highest-numbered part of the destination register group.
3203   // Since LMUL=1, vd and vs cannot be the same.
3204   assert_different_registers(dst, src);
3205 
3206   vsetvli_helper(dst_bt, vector_length);
3207   if (is_signed) {
3208     if (src_bt == T_BYTE) {
3209       switch (dst_bt) {
3210       case T_SHORT:
3211         vsext_vf2(dst, src);
3212         break;
3213       case T_INT:
3214         vsext_vf4(dst, src);
3215         break;
3216       case T_LONG:
3217         vsext_vf8(dst, src);
3218         break;
3219       default:
3220         ShouldNotReachHere();
3221       }
3222     } else if (src_bt == T_SHORT) {
3223       if (dst_bt == T_INT) {
3224         vsext_vf2(dst, src);
3225       } else {
3226         vsext_vf4(dst, src);
3227       }
3228     } else if (src_bt == T_INT) {
3229       vsext_vf2(dst, src);
3230     }
3231   } else {
3232     if (src_bt == T_BYTE) {
3233       switch (dst_bt) {
3234       case T_SHORT:
3235         vzext_vf2(dst, src);
3236         break;
3237       case T_INT:
3238         vzext_vf4(dst, src);
3239         break;
3240       case T_LONG:
3241         vzext_vf8(dst, src);
3242         break;
3243       default:
3244         ShouldNotReachHere();
3245       }
3246     } else if (src_bt == T_SHORT) {
3247       if (dst_bt == T_INT) {
3248         vzext_vf2(dst, src);
3249       } else {
3250         vzext_vf4(dst, src);
3251       }
3252     } else if (src_bt == T_INT) {
3253       vzext_vf2(dst, src);
3254     }
3255   }
3256 }
3257 
3258 // Vector narrow from src to dst with specified element sizes.
3259 // High part of dst vector will be filled with zero.
3260 void C2_MacroAssembler::integer_narrow_v(VectorRegister dst, BasicType dst_bt, uint vector_length,
3261                                          VectorRegister src, BasicType src_bt) {
3262   assert(type2aelembytes(dst_bt) < type2aelembytes(src_bt) && type2aelembytes(dst_bt) <= 4 && type2aelembytes(src_bt) <= 8, "invalid element size");
3263   assert(dst_bt != T_FLOAT && dst_bt != T_DOUBLE && src_bt != T_FLOAT && src_bt != T_DOUBLE, "unsupported element type");
3264   mv(t0, vector_length);
3265   if (src_bt == T_LONG) {
3266     // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#117-vector-narrowing-integer-right-shift-instructions
3267     // Future extensions might add support for versions that narrow to a destination that is 1/4 the width of the source.
3268     // So we can currently only scale down by 1/2 the width at a time.
3269     vsetvli(t0, t0, Assembler::e32, Assembler::mf2);
3270     vncvt_x_x_w(dst, src);
3271     if (dst_bt == T_SHORT || dst_bt == T_BYTE) {
3272       vsetvli(t0, t0, Assembler::e16, Assembler::mf2);
3273       vncvt_x_x_w(dst, dst);
3274       if (dst_bt == T_BYTE) {
3275         vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
3276         vncvt_x_x_w(dst, dst);
3277       }
3278     }
3279   } else if (src_bt == T_INT) {
3280     // T_SHORT
3281     vsetvli(t0, t0, Assembler::e16, Assembler::mf2);
3282     vncvt_x_x_w(dst, src);
3283     if (dst_bt == T_BYTE) {
3284       vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
3285       vncvt_x_x_w(dst, dst);
3286     }
3287   } else if (src_bt == T_SHORT) {
3288     vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
3289     vncvt_x_x_w(dst, src);
3290   }
3291 }
3292 
// Defines C2_MacroAssembler::<VFLOATCVT>_safe(dst, src), a NaN-safe wrapper
// around the float-to-integer conversion VFLOATCVT:
//   - dst is zeroed,
//   - v0 is set to the mask of the non-NaN elements of src (src == src),
//   - the conversion is executed under that mask, so NaN elements keep 0.
// The generated function writes v0 and emits no vsetvli of its own.
#define VFCVT_SAFE(VFLOATCVT)                                                      \
void C2_MacroAssembler::VFLOATCVT##_safe(VectorRegister dst, VectorRegister src) { \
  assert_different_registers(dst, src);                                            \
  vxor_vv(dst, dst, dst);                                                          \
  vmfeq_vv(v0, src, src);                                                          \
  VFLOATCVT(dst, src, Assembler::v0_t);                                            \
}

// Instantiates vfcvt_rtz_x_f_v_safe.
VFCVT_SAFE(vfcvt_rtz_x_f_v);

#undef VFCVT_SAFE
3304 
3305 // Extract a scalar element from an vector at position 'idx'.
3306 // The input elements in src are expected to be of integral type.
3307 void C2_MacroAssembler::extract_v(Register dst, VectorRegister src,
3308                                   BasicType bt, int idx, VectorRegister vtmp) {
3309   assert(is_integral_type(bt), "unsupported element type");
3310   assert(idx >= 0, "idx cannot be negative");
3311   // Only need the first element after vector slidedown
3312   vsetvli_helper(bt, 1);
3313   if (idx == 0) {
3314     vmv_x_s(dst, src);
3315   } else {
3316     slidedown_v(vtmp, src, idx);
3317     vmv_x_s(dst, vtmp);
3318   }
3319 }
3320 
3321 // Extract a scalar element from an vector at position 'idx'.
3322 // The input elements in src are expected to be of floating point type.
3323 void C2_MacroAssembler::extract_fp_v(FloatRegister dst, VectorRegister src,
3324                                      BasicType bt, int idx, VectorRegister vtmp) {
3325   assert(is_floating_point_type(bt), "unsupported element type");
3326   assert(idx >= 0, "idx cannot be negative");
3327   // Only need the first element after vector slidedown
3328   vsetvli_helper(bt, 1);
3329   if (idx == 0) {
3330     vfmv_f_s(dst, src);
3331   } else {
3332     slidedown_v(vtmp, src, idx);
3333     vfmv_f_s(dst, vtmp);
3334   }
3335 }
3336 
3337 // Move elements down a vector register group.
3338 // Offset is the start index (offset) for the source.
3339 void C2_MacroAssembler::slidedown_v(VectorRegister dst, VectorRegister src,
3340                                     uint32_t offset, Register tmp) {
3341   if (is_uimm5(offset)) {
3342     vslidedown_vi(dst, src, offset);
3343   } else {
3344     mv(tmp, offset);
3345     vslidedown_vx(dst, src, tmp);
3346   }
3347 }