New src/hotspot/cpu/riscv/c2_MacroAssembler

   1 /*
   2  * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2020, 2022, Huawei Technologies Co., Ltd. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/assembler.hpp"
  28 #include "asm/assembler.inline.hpp"
  29 #include "opto/c2_MacroAssembler.hpp"
  30 #include "opto/compile.hpp"
  31 #include "opto/intrinsicnode.hpp"
  32 #include "opto/output.hpp"
  33 #include "opto/subnode.hpp"
  34 #include "runtime/stubRoutines.hpp"
  35 #include "utilities/globalDefinitions.hpp"
  36 
  37 #ifdef PRODUCT
  38 #define BLOCK_COMMENT(str) /* nothing */
  39 #define STOP(error) stop(error)
  40 #else
  41 #define BLOCK_COMMENT(str) block_comment(str)
  42 #define STOP(error) block_comment(error); stop(error)
  43 #endif
  44 
  45 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  46 
  47 void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg,
  48                                   Register tmp1Reg, Register tmp2Reg, Register tmp3Reg) {
  49   // Use cr register to indicate the fast_lock result: zero for success; non-zero for failure.
  50   Register flag = t1;
  51   Register oop = objectReg;
  52   Register box = boxReg;
  53   Register disp_hdr = tmp1Reg;
  54   Register tmp = tmp2Reg;
  55   Label object_has_monitor;
  56   // Finish fast lock successfully. MUST branch to with flag == 0
  57   Label locked;
  58   // Finish fast lock unsuccessfully. slow_path MUST branch to with flag != 0
  59   Label slow_path;
  60 
  61   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  62   assert_different_registers(oop, box, tmp, disp_hdr, flag, tmp3Reg, t0);
  63 
  64   mv(flag, 1);
  65 
  66   // Load markWord from object into displaced_header.
  67   ld(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));
  68 
  69   if (DiagnoseSyncOnValueBasedClasses != 0) {
  70     load_klass(tmp, oop);
  71     lwu(tmp, Address(tmp, Klass::access_flags_offset()));
  72     test_bit(tmp, tmp, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS));
  73     bnez(tmp, slow_path);
  74   }
  75 
  76   // Check for existing monitor
  77   test_bit(tmp, disp_hdr, exact_log2(markWord::monitor_value));
  78   bnez(tmp, object_has_monitor);
  79 
  80   if (LockingMode == LM_MONITOR) {
  81     j(slow_path);
  82   } else {
  83     assert(LockingMode == LM_LEGACY, "must be");
  84     // Set tmp to be (markWord of object | UNLOCK_VALUE).
  85     ori(tmp, disp_hdr, markWord::unlocked_value);
  86 
  87     // Initialize the box. (Must happen before we update the object mark!)
  88     sd(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
  89 
  90     // Compare object markWord with an unlocked value (tmp) and if
  91     // equal exchange the stack address of our box with object markWord.
  92     // On failure disp_hdr contains the possibly locked markWord.
  93     cmpxchg(/*memory address*/oop, /*expected value*/tmp, /*new value*/box, Assembler::int64,
  94             Assembler::aq, Assembler::rl, /*result*/disp_hdr);
  95     beq(disp_hdr, tmp, locked);
  96 
  97     assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
  98 
  99     // If the compare-and-exchange succeeded, then we found an unlocked
 100     // object, will have now locked it will continue at label locked
 101     // We did not see an unlocked object so try the fast recursive case.
 102 
 103     // Check if the owner is self by comparing the value in the
 104     // markWord of object (disp_hdr) with the stack pointer.
 105     sub(disp_hdr, disp_hdr, sp);
 106     mv(tmp, (intptr_t) (~(os::vm_page_size()-1) | (uintptr_t)markWord::lock_mask_in_place));
 107     // If (mark & lock_mask) == 0 and mark - sp < page_size, we are stack-locking and goto label locked,
 108     // hence we can store 0 as the displaced header in the box, which indicates that it is a
 109     // recursive lock.
 110     andr(tmp/*==0?*/, disp_hdr, tmp);
 111     sd(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 112     beqz(tmp, locked);
 113     j(slow_path);
 114   }
 115 
 116   // Handle existing monitor.
 117   bind(object_has_monitor);
 118   // The object's monitor m is unlocked iff m->owner == nullptr,
 119   // otherwise m->owner may contain a thread or a stack address.
 120   //
 121   // Try to CAS m->owner from null to current thread id.
 122   Register tid = flag;
 123   mv(tid, Address(xthread, JavaThread::lock_id_offset()));
 124   add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset()) - markWord::monitor_value));
 125   cmpxchg(/*memory address*/tmp, /*expected value*/zr, /*new value*/tid, Assembler::int64,
 126           Assembler::aq, Assembler::rl, /*result*/tmp3Reg); // cas succeeds if tmp3Reg == zr(expected)
 127 
 128   // Store a non-null value into the box to avoid looking like a re-entrant
 129   // lock. The fast-path monitor unlock code checks for
 130   // markWord::monitor_value so use markWord::unused_mark which has the
 131   // relevant bit set, and also matches ObjectSynchronizer::slow_enter.
 132   mv(tmp, (address)markWord::unused_mark().value());
 133   sd(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 134 
 135   beqz(tmp3Reg, locked); // CAS success means locking succeeded
 136 
 137   bne(tmp3Reg, tid, slow_path); // Check for recursive locking
 138 
 139   // Recursive lock case
 140   increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1, tmp2Reg, tmp3Reg);
 141 
 142   bind(locked);
 143   mv(flag, zr);
 144   increment(Address(xthread, JavaThread::held_monitor_count_offset()), 1, tmp2Reg, tmp3Reg);
 145 
 146 #ifdef ASSERT
 147   // Check that locked label is reached with flag == 0.
 148   Label flag_correct;
 149   beqz(flag, flag_correct);
 150   stop("Fast Lock Flag != 0");
 151 #endif
 152 
 153   bind(slow_path);
 154 #ifdef ASSERT
 155   // Check that slow_path label is reached with flag != 0.
 156   bnez(flag, flag_correct);
 157   stop("Fast Lock Flag == 0");
 158   bind(flag_correct);
 159 #endif
 160   // C2 uses the value of flag (0 vs !0) to determine the continuation.
 161 }
 162 
 163 void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg,
 164                                     Register tmp1Reg, Register tmp2Reg) {
 165   // Use cr register to indicate the fast_unlock result: zero for success; non-zero for failure.
 166   Register flag = t1;
 167   Register oop = objectReg;
 168   Register box = boxReg;
 169   Register disp_hdr = tmp1Reg;
 170   Register tmp = tmp2Reg;
 171   Label object_has_monitor;
 172   // Finish fast lock successfully. MUST branch to vwith flag == 0
 173   Label unlocked;
 174   // Finish fast lock unsuccessfully. slow_path MUST branch to with flag != 0
 175   Label slow_path;
 176 
 177   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 178   assert_different_registers(oop, box, tmp, disp_hdr, flag, t0);
 179 
 180   mv(flag, 1);
 181 
 182   if (LockingMode == LM_LEGACY) {
 183     // Find the lock address and load the displaced header from the stack.
 184     ld(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 185 
 186     // If the displaced header is 0, we have a recursive unlock.
 187     beqz(disp_hdr, unlocked);
 188   }
 189 
 190   // Handle existing monitor.
 191   ld(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
 192   test_bit(t0, tmp, exact_log2(markWord::monitor_value));
 193   bnez(t0, object_has_monitor);
 194 
 195   if (LockingMode == LM_MONITOR) {
 196     j(slow_path);
 197   } else {
 198     assert(LockingMode == LM_LEGACY, "must be");
 199     // Check if it is still a light weight lock, this is true if we
 200     // see the stack address of the basicLock in the markWord of the
 201     // object.
 202 
 203     cmpxchg(/*memory address*/oop, /*expected value*/box, /*new value*/disp_hdr, Assembler::int64,
 204             Assembler::relaxed, Assembler::rl, /*result*/tmp);
 205     beq(box, tmp, unlocked); // box == tmp if cas succeeds
 206     j(slow_path);
 207   }
 208 
 209   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
 210 
 211   // Handle existing monitor.
 212   bind(object_has_monitor);
 213   STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
 214   add(tmp, tmp, -(int)markWord::monitor_value); // monitor
 215 
 216   ld(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
 217 
 218   Label notRecursive;
 219   beqz(disp_hdr, notRecursive); // Will be 0 if not recursive.
 220 
 221   // Recursive lock
 222   addi(disp_hdr, disp_hdr, -1);
 223   sd(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
 224   j(unlocked);
 225 
 226   bind(notRecursive);
 227   ld(t0, Address(tmp, ObjectMonitor::EntryList_offset()));
 228   ld(disp_hdr, Address(tmp, ObjectMonitor::cxq_offset()));
 229   orr(t0, t0, disp_hdr); // Will be 0 if both are 0.
 230   bnez(t0, slow_path);
 231 
 232   // need a release store here
 233   la(tmp, Address(tmp, ObjectMonitor::owner_offset()));
 234   membar(MacroAssembler::LoadStore | MacroAssembler::StoreStore);
 235   sd(zr, Address(tmp)); // set unowned
 236 
 237   bind(unlocked);
 238   mv(flag, zr);
 239   decrement(Address(xthread, JavaThread::held_monitor_count_offset()), 1, tmp1Reg, tmp2Reg);
 240 
 241 #ifdef ASSERT
 242   // Check that unlocked label is reached with flag == 0.
 243   Label flag_correct;
 244   beqz(flag, flag_correct);
 245   stop("Fast Lock Flag != 0");
 246 #endif
 247 
 248   bind(slow_path);
 249 #ifdef ASSERT
 250   // Check that slow_path label is reached with flag != 0.
 251   bnez(flag, flag_correct);
 252   stop("Fast Lock Flag == 0");
 253   bind(flag_correct);
 254 #endif
 255   // C2 uses the value of flag (0 vs !0) to determine the continuation.
 256 }
 257 
 258 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register tmp1, Register tmp2, Register tmp3) {
 259   // Flag register, zero for success; non-zero for failure.
 260   Register flag = t1;
 261 
 262   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 263   assert_different_registers(obj, tmp1, tmp2, tmp3, flag, t0);
 264 
 265   mv(flag, 1);
 266 
 267   // Handle inflated monitor.
 268   Label inflated;
 269   // Finish fast lock successfully. MUST branch to with flag == 0
 270   Label locked;
 271   // Finish fast lock unsuccessfully. slow_path MUST branch to with flag != 0
 272   Label slow_path;
 273 
 274   if (DiagnoseSyncOnValueBasedClasses != 0) {
 275     load_klass(tmp1, obj);
 276     lwu(tmp1, Address(tmp1, Klass::access_flags_offset()));
 277     test_bit(tmp1, tmp1, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS));
 278     bnez(tmp1, slow_path);
 279   }
 280 
 281   const Register tmp1_mark = tmp1;
 282 
 283   { // Lightweight locking
 284 
 285     // Push lock to the lock stack and finish successfully. MUST branch to with flag == 0
 286     Label push;
 287 
 288     const Register tmp2_top = tmp2;
 289     const Register tmp3_t = tmp3;
 290 
 291     // Check if lock-stack is full.
 292     lwu(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
 293     mv(tmp3_t, (unsigned)LockStack::end_offset());
 294     bge(tmp2_top, tmp3_t, slow_path);
 295 
 296     // Check if recursive.
 297     add(tmp3_t, xthread, tmp2_top);
 298     ld(tmp3_t, Address(tmp3_t, -oopSize));
 299     beq(obj, tmp3_t, push);
 300 
 301     // Relaxed normal load to check for monitor. Optimization for monitor case.
 302     ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 303     test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
 304     bnez(tmp3_t, inflated);
 305 
 306     // Not inflated
 307     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a la");
 308 
 309     // Try to lock. Transition lock-bits 0b01 => 0b00
 310     ori(tmp1_mark, tmp1_mark, markWord::unlocked_value);
 311     xori(tmp3_t, tmp1_mark, markWord::unlocked_value);
 312     cmpxchg(/*addr*/ obj, /*expected*/ tmp1_mark, /*new*/ tmp3_t, Assembler::int64,
 313             /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ tmp3_t);
 314     bne(tmp1_mark, tmp3_t, slow_path);
 315 
 316     bind(push);
 317     // After successful lock, push object on lock-stack.
 318     add(tmp3_t, xthread, tmp2_top);
 319     sd(obj, Address(tmp3_t));
 320     addw(tmp2_top, tmp2_top, oopSize);
 321     sw(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
 322     j(locked);
 323   }
 324 
 325   { // Handle inflated monitor.
 326     bind(inflated);
 327 
 328     // mark contains the tagged ObjectMonitor*.
 329     const Register tmp1_tagged_monitor = tmp1_mark;
 330     const uintptr_t monitor_tag = markWord::monitor_value;
 331     const Register tmp2_owner_addr = tmp2;
 332     const Register tmp3_owner = tmp3;
 333 
 334     // Compute owner address.
 335     la(tmp2_owner_addr, Address(tmp1_tagged_monitor, (in_bytes(ObjectMonitor::owner_offset()) - monitor_tag)));
 336 
 337     // CAS owner (null => current thread id).
 338     Register tid = flag;
 339     mv(tid, Address(xthread, JavaThread::lock_id_offset()));
 340     cmpxchg(/*addr*/ tmp2_owner_addr, /*expected*/ zr, /*new*/ tid, Assembler::int64,
 341             /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ tmp3_owner);
 342     beqz(tmp3_owner, locked);
 343 
 344     // Check if recursive.
 345     bne(tmp3_owner, tid, slow_path);
 346 
 347     // Recursive.
 348     increment(Address(tmp1_tagged_monitor, in_bytes(ObjectMonitor::recursions_offset()) - monitor_tag), 1, tmp2, tmp3);
 349   }
 350 
 351   bind(locked);
 352   mv(flag, zr);
 353   increment(Address(xthread, JavaThread::held_monitor_count_offset()), 1, tmp2, tmp3);
 354 
 355 #ifdef ASSERT
 356   // Check that locked label is reached with flag == 0.
 357   Label flag_correct;
 358   beqz(flag, flag_correct);
 359   stop("Fast Lock Flag != 0");
 360 #endif
 361 
 362   bind(slow_path);
 363 #ifdef ASSERT
 364   // Check that slow_path label is reached with flag != 0.
 365   bnez(flag, flag_correct);
 366   stop("Fast Lock Flag == 0");
 367   bind(flag_correct);
 368 #endif
 369   // C2 uses the value of flag (0 vs !0) to determine the continuation.
 370 }
 371 
 372 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register tmp1, Register tmp2,
 373                                                 Register tmp3) {
 374   // Flag register, zero for success; non-zero for failure.
 375   Register flag = t1;
 376 
 377   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 378   assert_different_registers(obj, tmp1, tmp2, tmp3, flag, t0);
 379 
 380   mv(flag, 1);
 381 
 382   // Handle inflated monitor.
 383   Label inflated, inflated_load_monitor;
 384   // Finish fast unlock successfully. unlocked MUST branch to with flag == 0
 385   Label unlocked;
 386   // Finish fast unlock unsuccessfully. MUST branch to with flag != 0
 387   Label slow_path;
 388 
 389   const Register tmp1_mark = tmp1;
 390   const Register tmp2_top = tmp2;
 391   const Register tmp3_t = tmp3;
 392 
 393   { // Lightweight unlock
 394 
 395     // Check if obj is top of lock-stack.
 396     lwu(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
 397     subw(tmp2_top, tmp2_top, oopSize);
 398     add(tmp3_t, xthread, tmp2_top);
 399     ld(tmp3_t, Address(tmp3_t));
 400     // Top of lock stack was not obj. Must be monitor.
 401     bne(obj, tmp3_t, inflated_load_monitor);
 402 
 403     // Pop lock-stack.
 404     DEBUG_ONLY(add(tmp3_t, xthread, tmp2_top);)
 405     DEBUG_ONLY(sd(zr, Address(tmp3_t));)
 406     sw(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
 407 
 408     // Check if recursive.
 409     add(tmp3_t, xthread, tmp2_top);
 410     ld(tmp3_t, Address(tmp3_t, -oopSize));
 411     beq(obj, tmp3_t, unlocked);
 412 
 413     // Not recursive.
 414     // Load Mark.
 415     ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 416 
 417     // Check header for monitor (0b10).
 418     test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
 419     bnez(tmp3_t, inflated);
 420 
 421     // Try to unlock. Transition lock bits 0b00 => 0b01
 422     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
 423     ori(tmp3_t, tmp1_mark, markWord::unlocked_value);
 424     cmpxchg(/*addr*/ obj, /*expected*/ tmp1_mark, /*new*/ tmp3_t, Assembler::int64,
 425             /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, /*result*/ tmp3_t);
 426     beq(tmp1_mark, tmp3_t, unlocked);
 427 
 428     // Compare and exchange failed.
 429     // Restore lock-stack and handle the unlock in runtime.
 430     DEBUG_ONLY(add(tmp3_t, xthread, tmp2_top);)
 431     DEBUG_ONLY(sd(obj, Address(tmp3_t));)
 432     addw(tmp2_top, tmp2_top, oopSize);
 433     sd(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
 434     j(slow_path);
 435   }
 436 
 437   { // Handle inflated monitor.
 438     bind(inflated_load_monitor);
 439     ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 440 #ifdef ASSERT
 441     test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
 442     bnez(tmp3_t, inflated);
 443     stop("Fast Unlock not monitor");
 444 #endif
 445 
 446     bind(inflated);
 447 
 448 #ifdef ASSERT
 449     Label check_done;
 450     subw(tmp2_top, tmp2_top, oopSize);
 451     mv(tmp3_t, in_bytes(JavaThread::lock_stack_base_offset()));
 452     blt(tmp2_top, tmp3_t, check_done);
 453     add(tmp3_t, xthread, tmp2_top);
 454     ld(tmp3_t, Address(tmp3_t));
 455     bne(obj, tmp3_t, inflated);
 456     stop("Fast Unlock lock on stack");
 457     bind(check_done);
 458 #endif
 459 
 460     // mark contains the tagged ObjectMonitor*.
 461     const Register tmp1_monitor = tmp1_mark;
 462     const uintptr_t monitor_tag = markWord::monitor_value;
 463 
 464     // Untag the monitor.
 465     sub(tmp1_monitor, tmp1_mark, monitor_tag);
 466 
 467     const Register tmp2_recursions = tmp2;
 468     Label not_recursive;
 469 
 470     // Check if recursive.
 471     ld(tmp2_recursions, Address(tmp1_monitor, ObjectMonitor::recursions_offset()));
 472     beqz(tmp2_recursions, not_recursive);
 473 
 474     // Recursive unlock.
 475     addi(tmp2_recursions, tmp2_recursions, -1);
 476     sd(tmp2_recursions, Address(tmp1_monitor, ObjectMonitor::recursions_offset()));
 477     j(unlocked);
 478 
 479     bind(not_recursive);
 480 
 481     Label release;
 482     const Register tmp2_owner_addr = tmp2;
 483 
 484     // Compute owner address.
 485     la(tmp2_owner_addr, Address(tmp1_monitor, ObjectMonitor::owner_offset()));
 486 
 487     // Check if the entry lists are empty.
 488     ld(t0, Address(tmp1_monitor, ObjectMonitor::EntryList_offset()));
 489     ld(tmp3_t, Address(tmp1_monitor, ObjectMonitor::cxq_offset()));
 490     orr(t0, t0, tmp3_t);
 491     beqz(t0, release);
 492 
 493     // The owner may be anonymous and we removed the last obj entry in
 494     // the lock-stack. This loses the information about the owner.
 495     // Write the thread to the owner field so the runtime knows the owner.
 496     Register tid = flag;
 497     mv(tid, Address(xthread, JavaThread::lock_id_offset()));
 498     sd(tid, Address(tmp2_owner_addr));
 499     j(slow_path);
 500 
 501     bind(release);
 502     // Set owner to null.
 503     membar(MacroAssembler::LoadStore | MacroAssembler::StoreStore);
 504     sd(zr, Address(tmp2_owner_addr));
 505   }
 506 
 507   bind(unlocked);
 508   mv(flag, zr);
 509   decrement(Address(xthread, JavaThread::held_monitor_count_offset()), 1, tmp2, tmp3);
 510 
 511 #ifdef ASSERT
 512   // Check that unlocked label is reached with flag == 0.
 513   Label flag_correct;
 514   beqz(flag, flag_correct);
 515   stop("Fast Lock Flag != 0");
 516 #endif
 517 
 518   bind(slow_path);
 519 #ifdef ASSERT
 520   // Check that slow_path label is reached with flag != 0.
 521   bnez(flag, flag_correct);
 522   stop("Fast Lock Flag == 0");
 523   bind(flag_correct);
 524 #endif
 525   // C2 uses the value of flag (0 vs !0) to determine the continuation.
 526 }
 527 
 528 // short string
 529 // StringUTF16.indexOfChar
 530 // StringLatin1.indexOfChar
 531 void C2_MacroAssembler::string_indexof_char_short(Register str1, Register cnt1,
 532                                                   Register ch, Register result,
 533                                                   bool isL)
 534 {
 535   Register ch1 = t0;
 536   Register index = t1;
 537 
 538   BLOCK_COMMENT("string_indexof_char_short {");
 539 
 540   Label LOOP, LOOP1, LOOP4, LOOP8;
 541   Label MATCH,  MATCH1, MATCH2, MATCH3,
 542         MATCH4, MATCH5, MATCH6, MATCH7, NOMATCH;
 543 
 544   mv(result, -1);
 545   mv(index, zr);
 546 
 547   bind(LOOP);
 548   addi(t0, index, 8);
 549   ble(t0, cnt1, LOOP8);
 550   addi(t0, index, 4);
 551   ble(t0, cnt1, LOOP4);
 552   j(LOOP1);
 553 
 554   bind(LOOP8);
 555   isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0));
 556   beq(ch, ch1, MATCH);
 557   isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2));
 558   beq(ch, ch1, MATCH1);
 559   isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4));
 560   beq(ch, ch1, MATCH2);
 561   isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6));
 562   beq(ch, ch1, MATCH3);
 563   isL ? lbu(ch1, Address(str1, 4)) : lhu(ch1, Address(str1, 8));
 564   beq(ch, ch1, MATCH4);
 565   isL ? lbu(ch1, Address(str1, 5)) : lhu(ch1, Address(str1, 10));
 566   beq(ch, ch1, MATCH5);
 567   isL ? lbu(ch1, Address(str1, 6)) : lhu(ch1, Address(str1, 12));
 568   beq(ch, ch1, MATCH6);
 569   isL ? lbu(ch1, Address(str1, 7)) : lhu(ch1, Address(str1, 14));
 570   beq(ch, ch1, MATCH7);
 571   addi(index, index, 8);
 572   addi(str1, str1, isL ? 8 : 16);
 573   blt(index, cnt1, LOOP);
 574   j(NOMATCH);
 575 
 576   bind(LOOP4);
 577   isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0));
 578   beq(ch, ch1, MATCH);
 579   isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2));
 580   beq(ch, ch1, MATCH1);
 581   isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4));
 582   beq(ch, ch1, MATCH2);
 583   isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6));
 584   beq(ch, ch1, MATCH3);
 585   addi(index, index, 4);
 586   addi(str1, str1, isL ? 4 : 8);
 587   bge(index, cnt1, NOMATCH);
 588 
 589   bind(LOOP1);
 590   isL ? lbu(ch1, Address(str1)) : lhu(ch1, Address(str1));
 591   beq(ch, ch1, MATCH);
 592   addi(index, index, 1);
 593   addi(str1, str1, isL ? 1 : 2);
 594   blt(index, cnt1, LOOP1);
 595   j(NOMATCH);
 596 
 597   bind(MATCH1);
 598   addi(index, index, 1);
 599   j(MATCH);
 600 
 601   bind(MATCH2);
 602   addi(index, index, 2);
 603   j(MATCH);
 604 
 605   bind(MATCH3);
 606   addi(index, index, 3);
 607   j(MATCH);
 608 
 609   bind(MATCH4);
 610   addi(index, index, 4);
 611   j(MATCH);
 612 
 613   bind(MATCH5);
 614   addi(index, index, 5);
 615   j(MATCH);
 616 
 617   bind(MATCH6);
 618   addi(index, index, 6);
 619   j(MATCH);
 620 
 621   bind(MATCH7);
 622   addi(index, index, 7);
 623 
 624   bind(MATCH);
 625   mv(result, index);
 626   bind(NOMATCH);
 627   BLOCK_COMMENT("} string_indexof_char_short");
 628 }
 629 
 630 // StringUTF16.indexOfChar
 631 // StringLatin1.indexOfChar
 632 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
 633                                             Register ch, Register result,
 634                                             Register tmp1, Register tmp2,
 635                                             Register tmp3, Register tmp4,
 636                                             bool isL)
 637 {
 638   Label CH1_LOOP, HIT, NOMATCH, DONE, DO_LONG;
 639   Register ch1 = t0;
 640   Register orig_cnt = t1;
 641   Register mask1 = tmp3;
 642   Register mask2 = tmp2;
 643   Register match_mask = tmp1;
 644   Register trailing_char = tmp4;
 645   Register unaligned_elems = tmp4;
 646 
 647   BLOCK_COMMENT("string_indexof_char {");
 648   beqz(cnt1, NOMATCH);
 649 
 650   addi(t0, cnt1, isL ? -32 : -16);
 651   bgtz(t0, DO_LONG);
 652   string_indexof_char_short(str1, cnt1, ch, result, isL);
 653   j(DONE);
 654 
 655   bind(DO_LONG);
 656   mv(orig_cnt, cnt1);
 657   if (AvoidUnalignedAccesses) {
 658     Label ALIGNED;
 659     andi(unaligned_elems, str1, 0x7);
 660     beqz(unaligned_elems, ALIGNED);
 661     sub(unaligned_elems, unaligned_elems, 8);
 662     neg(unaligned_elems, unaligned_elems);
 663     if (!isL) {
 664       srli(unaligned_elems, unaligned_elems, 1);
 665     }
 666     // do unaligned part per element
 667     string_indexof_char_short(str1, unaligned_elems, ch, result, isL);
 668     bgez(result, DONE);
 669     mv(orig_cnt, cnt1);
 670     sub(cnt1, cnt1, unaligned_elems);
 671     bind(ALIGNED);
 672   }
 673 
 674   // duplicate ch
 675   if (isL) {
 676     slli(ch1, ch, 8);
 677     orr(ch, ch1, ch);
 678   }
 679   slli(ch1, ch, 16);
 680   orr(ch, ch1, ch);
 681   slli(ch1, ch, 32);
 682   orr(ch, ch1, ch);
 683 
 684   if (!isL) {
 685     slli(cnt1, cnt1, 1);
 686   }
 687 
 688   uint64_t mask0101 = UCONST64(0x0101010101010101);
 689   uint64_t mask0001 = UCONST64(0x0001000100010001);
 690   mv(mask1, isL ? mask0101 : mask0001);
 691   uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f);
 692   uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff);
 693   mv(mask2, isL ? mask7f7f : mask7fff);
 694 
 695   bind(CH1_LOOP);
 696   ld(ch1, Address(str1));
 697   addi(str1, str1, 8);
 698   addi(cnt1, cnt1, -8);
 699   compute_match_mask(ch1, ch, match_mask, mask1, mask2);
 700   bnez(match_mask, HIT);
 701   bgtz(cnt1, CH1_LOOP);
 702   j(NOMATCH);
 703 
 704   bind(HIT);
 705   ctzc_bit(trailing_char, match_mask, isL, ch1, result);
 706   srli(trailing_char, trailing_char, 3);
 707   addi(cnt1, cnt1, 8);
 708   ble(cnt1, trailing_char, NOMATCH);
 709   // match case
 710   if (!isL) {
 711     srli(cnt1, cnt1, 1);
 712     srli(trailing_char, trailing_char, 1);
 713   }
 714 
 715   sub(result, orig_cnt, cnt1);
 716   add(result, result, trailing_char);
 717   j(DONE);
 718 
 719   bind(NOMATCH);
 720   mv(result, -1);
 721 
 722   bind(DONE);
 723   BLOCK_COMMENT("} string_indexof_char");
 724 }
 725 
 726 typedef void (MacroAssembler::* load_chr_insn)(Register rd, const Address &adr, Register temp);
 727 
 728 // Search for needle in haystack and return index or -1
 729 // x10: result
 730 // x11: haystack
 731 // x12: haystack_len
 732 // x13: needle
 733 // x14: needle_len
 734 void C2_MacroAssembler::string_indexof(Register haystack, Register needle,
 735                                        Register haystack_len, Register needle_len,
 736                                        Register tmp1, Register tmp2,
 737                                        Register tmp3, Register tmp4,
 738                                        Register tmp5, Register tmp6,
 739                                        Register result, int ae)
 740 {
 741   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
 742 
 743   Label LINEARSEARCH, LINEARSTUB, DONE, NOMATCH;
 744 
 745   Register ch1 = t0;
 746   Register ch2 = t1;
 747   Register nlen_tmp = tmp1; // needle len tmp
 748   Register hlen_tmp = tmp2; // haystack len tmp
 749   Register result_tmp = tmp4;
 750 
 751   bool isLL = ae == StrIntrinsicNode::LL;
 752 
 753   bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
 754   bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
 755   int needle_chr_shift = needle_isL ? 0 : 1;
 756   int haystack_chr_shift = haystack_isL ? 0 : 1;
 757   int needle_chr_size = needle_isL ? 1 : 2;
 758   int haystack_chr_size = haystack_isL ? 1 : 2;
 759   load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu :
 760                               (load_chr_insn)&MacroAssembler::lhu;
 761   load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu :
 762                                 (load_chr_insn)&MacroAssembler::lhu;
 763 
 764   BLOCK_COMMENT("string_indexof {");
 765 
 766   // Note, inline_string_indexOf() generates checks:
 767   // if (pattern.count > src.count) return -1;
 768   // if (pattern.count == 0) return 0;
 769 
 770   // We have two strings, a source string in haystack, haystack_len and a pattern string
 771   // in needle, needle_len. Find the first occurrence of pattern in source or return -1.
 772 
 773   // For larger pattern and source we use a simplified Boyer Moore algorithm.
 774   // With a small pattern and source we use linear scan.
 775 
 776   // needle_len >=8 && needle_len < 256 && needle_len < haystack_len/4, use bmh algorithm.
 777   sub(result_tmp, haystack_len, needle_len);
 778   // needle_len < 8, use linear scan
 779   sub(t0, needle_len, 8);
 780   bltz(t0, LINEARSEARCH);
 781   // needle_len >= 256, use linear scan
 782   sub(t0, needle_len, 256);
 783   bgez(t0, LINEARSTUB);
 784   // needle_len >= haystack_len/4, use linear scan
 785   srli(t0, haystack_len, 2);
 786   bge(needle_len, t0, LINEARSTUB);
 787 
 788   // Boyer-Moore-Horspool introduction:
 789   // The Boyer Moore alogorithm is based on the description here:-
 790   //
 791   // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
 792   //
 793   // This describes and algorithm with 2 shift rules. The 'Bad Character' rule
 794   // and the 'Good Suffix' rule.
 795   //
 796   // These rules are essentially heuristics for how far we can shift the
 797   // pattern along the search string.
 798   //
 799   // The implementation here uses the 'Bad Character' rule only because of the
 800   // complexity of initialisation for the 'Good Suffix' rule.
 801   //
 802   // This is also known as the Boyer-Moore-Horspool algorithm:
 803   //
 804   // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
 805   //
 806   // #define ASIZE 256
 807   //
 808   //    int bm(unsigned char *pattern, int m, unsigned char *src, int n) {
 809   //      int i, j;
 810   //      unsigned c;
 811   //      unsigned char bc[ASIZE];
 812   //
 813   //      /* Preprocessing */
 814   //      for (i = 0; i < ASIZE; ++i)
 815   //        bc[i] = m;
 816   //      for (i = 0; i < m - 1; ) {
 817   //        c = pattern[i];
 818   //        ++i;
 819   //        // c < 256 for Latin1 string, so, no need for branch
 820   //        #ifdef PATTERN_STRING_IS_LATIN1
 821   //        bc[c] = m - i;
 822   //        #else
 823   //        if (c < ASIZE) bc[c] = m - i;
 824   //        #endif
 825   //      }
 826   //
 827   //      /* Searching */
 828   //      j = 0;
 829   //      while (j <= n - m) {
 830   //        c = src[i+j];
 831   //        if (pattern[m-1] == c)
 832   //          int k;
 833   //          for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k);
 834   //          if (k < 0) return j;
 835   //          // c < 256 for Latin1 string, so, no need for branch
 836   //          #ifdef SOURCE_STRING_IS_LATIN1_AND_PATTERN_STRING_IS_LATIN1
 837   //          // LL case: (c< 256) always true. Remove branch
 838   //          j += bc[pattern[j+m-1]];
 839   //          #endif
 840   //          #ifdef SOURCE_STRING_IS_UTF_AND_PATTERN_STRING_IS_UTF
 841   //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
 842   //          if (c < ASIZE)
 843   //            j += bc[pattern[j+m-1]];
 844   //          else
 845   //            j += 1
 846   //          #endif
 847   //          #ifdef SOURCE_IS_UTF_AND_PATTERN_IS_LATIN1
 848   //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
 849   //          if (c < ASIZE)
 850   //            j += bc[pattern[j+m-1]];
 851   //          else
 852   //            j += m
 853   //          #endif
 854   //      }
 855   //      return -1;
 856   //    }
 857 
 858   // temp register:t0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, result
 859   Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
 860         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
 861 
 862   Register haystack_end = haystack_len;
 863   Register skipch = tmp2;
 864 
 865   // pattern length is >=8, so, we can read at least 1 register for cases when
 866   // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
 867   // UL case. We'll re-read last character in inner pre-loop code to have
 868   // single outer pre-loop load
 869   const int firstStep = isLL ? 7 : 3;
 870 
 871   const int ASIZE = 256;
 872   const int STORE_BYTES = 8; // 8 bytes stored per instruction(sd)
 873 
 874   sub(sp, sp, ASIZE);
 875 
 876   // init BC offset table with default value: needle_len
 877   slli(t0, needle_len, 8);
 878   orr(t0, t0, needle_len); // [63...16][needle_len][needle_len]
 879   slli(tmp1, t0, 16);
 880   orr(t0, tmp1, t0); // [63...32][needle_len][needle_len][needle_len][needle_len]
 881   slli(tmp1, t0, 32);
 882   orr(tmp5, tmp1, t0); // tmp5: 8 elements [needle_len]
 883 
 884   mv(ch1, sp);  // ch1 is t0
 885   mv(tmp6, ASIZE / STORE_BYTES); // loop iterations
 886 
 887   bind(BM_INIT_LOOP);
 888   // for (i = 0; i < ASIZE; ++i)
 889   //   bc[i] = m;
 890   for (int i = 0; i < 4; i++) {
 891     sd(tmp5, Address(ch1, i * wordSize));
 892   }
 893   add(ch1, ch1, 32);
 894   sub(tmp6, tmp6, 4);
 895   bgtz(tmp6, BM_INIT_LOOP);
 896 
 897   sub(nlen_tmp, needle_len, 1); // m - 1, index of the last element in pattern
 898   Register orig_haystack = tmp5;
 899   mv(orig_haystack, haystack);
 900   // result_tmp = tmp4
 901   shadd(haystack_end, result_tmp, haystack, haystack_end, haystack_chr_shift);
 902   sub(ch2, needle_len, 1); // bc offset init value, ch2 is t1
 903   mv(tmp3, needle);
 904 
 905   //  for (i = 0; i < m - 1; ) {
 906   //    c = pattern[i];
 907   //    ++i;
 908   //    // c < 256 for Latin1 string, so, no need for branch
 909   //    #ifdef PATTERN_STRING_IS_LATIN1
 910   //    bc[c] = m - i;
 911   //    #else
 912   //    if (c < ASIZE) bc[c] = m - i;
 913   //    #endif
 914   //  }
 915   bind(BCLOOP);
 916   (this->*needle_load_1chr)(ch1, Address(tmp3), noreg);
 917   add(tmp3, tmp3, needle_chr_size);
 918   if (!needle_isL) {
 919     // ae == StrIntrinsicNode::UU
 920     mv(tmp6, ASIZE);
 921     bgeu(ch1, tmp6, BCSKIP);
 922   }
 923   add(tmp4, sp, ch1);
 924   sb(ch2, Address(tmp4)); // store skip offset to BC offset table
 925 
 926   bind(BCSKIP);
 927   sub(ch2, ch2, 1); // for next pattern element, skip distance -1
 928   bgtz(ch2, BCLOOP);
 929 
 930   // tmp6: pattern end, address after needle
 931   shadd(tmp6, needle_len, needle, tmp6, needle_chr_shift);
 932   if (needle_isL == haystack_isL) {
 933     // load last 8 bytes (8LL/4UU symbols)
 934     ld(tmp6, Address(tmp6, -wordSize));
 935   } else {
 936     // UL: from UTF-16(source) search Latin1(pattern)
 937     lwu(tmp6, Address(tmp6, -wordSize / 2)); // load last 4 bytes(4 symbols)
 938     // convert Latin1 to UTF. eg: 0x0000abcd -> 0x0a0b0c0d
 939     // We'll have to wait until load completed, but it's still faster than per-character loads+checks
 940     srli(tmp3, tmp6, BitsPerByte * (wordSize / 2 - needle_chr_size)); // pattern[m-1], eg:0x0000000a
 941     slli(ch2, tmp6, XLEN - 24);
 942     srli(ch2, ch2, XLEN - 8); // pattern[m-2], 0x0000000b
 943     slli(ch1, tmp6, XLEN - 16);
 944     srli(ch1, ch1, XLEN - 8); // pattern[m-3], 0x0000000c
 945     andi(tmp6, tmp6, 0xff); // pattern[m-4], 0x0000000d
 946     slli(ch2, ch2, 16);
 947     orr(ch2, ch2, ch1); // 0x00000b0c
 948     slli(result, tmp3, 48); // use result as temp register
 949     orr(tmp6, tmp6, result); // 0x0a00000d
 950     slli(result, ch2, 16);
 951     orr(tmp6, tmp6, result); // UTF-16:0x0a0b0c0d
 952   }
 953 
 954   // i = m - 1;
 955   // skipch = j + i;
 956   // if (skipch == pattern[m - 1]
 957   //   for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k);
 958   // else
 959   //   move j with bad char offset table
 960   bind(BMLOOPSTR2);
 961   // compare pattern to source string backward
 962   shadd(result, nlen_tmp, haystack, result, haystack_chr_shift);
 963   (this->*haystack_load_1chr)(skipch, Address(result), noreg);
 964   sub(nlen_tmp, nlen_tmp, firstStep); // nlen_tmp is positive here, because needle_len >= 8
 965   if (needle_isL == haystack_isL) {
 966     // re-init tmp3. It's for free because it's executed in parallel with
 967     // load above. Alternative is to initialize it before loop, but it'll
 968     // affect performance on in-order systems with 2 or more ld/st pipelines
 969     srli(tmp3, tmp6, BitsPerByte * (wordSize - needle_chr_size)); // UU/LL: pattern[m-1]
 970   }
 971   if (!isLL) { // UU/UL case
 972     slli(ch2, nlen_tmp, 1); // offsets in bytes
 973   }
 974   bne(tmp3, skipch, BMSKIP); // if not equal, skipch is bad char
 975   add(result, haystack, isLL ? nlen_tmp : ch2);
 976   // load 8 bytes from source string
 977   // if isLL is false then read granularity can be 2
 978   load_long_misaligned(ch2, Address(result), ch1, isLL ? 1 : 2); // can use ch1 as temp register here as it will be trashed by next mv anyway
 979   mv(ch1, tmp6);
 980   if (isLL) {
 981     j(BMLOOPSTR1_AFTER_LOAD);
 982   } else {
 983     sub(nlen_tmp, nlen_tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
 984     j(BMLOOPSTR1_CMP);
 985   }
 986 
 987   bind(BMLOOPSTR1);
 988   shadd(ch1, nlen_tmp, needle, ch1, needle_chr_shift);
 989   (this->*needle_load_1chr)(ch1, Address(ch1), noreg);
 990   shadd(ch2, nlen_tmp, haystack, ch2, haystack_chr_shift);
 991   (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
 992 
 993   bind(BMLOOPSTR1_AFTER_LOAD);
 994   sub(nlen_tmp, nlen_tmp, 1);
 995   bltz(nlen_tmp, BMLOOPSTR1_LASTCMP);
 996 
 997   bind(BMLOOPSTR1_CMP);
 998   beq(ch1, ch2, BMLOOPSTR1);
 999 
1000   bind(BMSKIP);
1001   if (!isLL) {
1002     // if we've met UTF symbol while searching Latin1 pattern, then we can
1003     // skip needle_len symbols
1004     if (needle_isL != haystack_isL) {
1005       mv(result_tmp, needle_len);
1006     } else {
1007       mv(result_tmp, 1);
1008     }
1009     mv(t0, ASIZE);
1010     bgeu(skipch, t0, BMADV);
1011   }
1012   add(result_tmp, sp, skipch);
1013   lbu(result_tmp, Address(result_tmp)); // load skip offset
1014 
1015   bind(BMADV);
1016   sub(nlen_tmp, needle_len, 1);
1017   // move haystack after bad char skip offset
1018   shadd(haystack, result_tmp, haystack, result, haystack_chr_shift);
1019   ble(haystack, haystack_end, BMLOOPSTR2);
1020   add(sp, sp, ASIZE);
1021   j(NOMATCH);
1022 
1023   bind(BMLOOPSTR1_LASTCMP);
1024   bne(ch1, ch2, BMSKIP);
1025 
1026   bind(BMMATCH);
1027   sub(result, haystack, orig_haystack);
1028   if (!haystack_isL) {
1029     srli(result, result, 1);
1030   }
1031   add(sp, sp, ASIZE);
1032   j(DONE);
1033 
1034   bind(LINEARSTUB);
1035   sub(t0, needle_len, 16); // small patterns still should be handled by simple algorithm
1036   bltz(t0, LINEARSEARCH);
1037   mv(result, zr);
1038   RuntimeAddress stub = nullptr;
1039   if (isLL) {
1040     stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ll());
1041     assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
1042   } else if (needle_isL) {
1043     stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ul());
1044     assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
1045   } else {
1046     stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_uu());
1047     assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
1048   }
1049   address call = reloc_call(stub);
1050   if (call == nullptr) {
1051     DEBUG_ONLY(reset_labels(LINEARSEARCH, DONE, NOMATCH));
1052     ciEnv::current()->record_failure("CodeCache is full");
1053     return;
1054   }
1055   j(DONE);
1056 
1057   bind(NOMATCH);
1058   mv(result, -1);
1059   j(DONE);
1060 
1061   bind(LINEARSEARCH);
1062   string_indexof_linearscan(haystack, needle, haystack_len, needle_len, tmp1, tmp2, tmp3, tmp4, -1, result, ae);
1063 
1064   bind(DONE);
1065   BLOCK_COMMENT("} string_indexof");
1066 }
1067 
1068 // string_indexof
1069 // result: x10
1070 // src: x11
1071 // src_count: x12
1072 // pattern: x13
1073 // pattern_count: x14 or 1/2/3/4
1074 void C2_MacroAssembler::string_indexof_linearscan(Register haystack, Register needle,
1075                                                Register haystack_len, Register needle_len,
1076                                                Register tmp1, Register tmp2,
1077                                                Register tmp3, Register tmp4,
1078                                                int needle_con_cnt, Register result, int ae)
1079 {
1080   // Note:
1081   // needle_con_cnt > 0 means needle_len register is invalid, needle length is constant
1082   // for UU/LL: needle_con_cnt[1, 4], UL: needle_con_cnt = 1
1083   assert(needle_con_cnt <= 4, "Invalid needle constant count");
1084   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
1085 
1086   Register ch1 = t0;
1087   Register ch2 = t1;
1088   Register hlen_neg = haystack_len, nlen_neg = needle_len;
1089   Register nlen_tmp = tmp1, hlen_tmp = tmp2, result_tmp = tmp4;
1090 
1091   bool isLL = ae == StrIntrinsicNode::LL;
1092 
1093   bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
1094   bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
1095   int needle_chr_shift = needle_isL ? 0 : 1;
1096   int haystack_chr_shift = haystack_isL ? 0 : 1;
1097   int needle_chr_size = needle_isL ? 1 : 2;
1098   int haystack_chr_size = haystack_isL ? 1 : 2;
1099 
1100   load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu :
1101                               (load_chr_insn)&MacroAssembler::lhu;
1102   load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu :
1103                                 (load_chr_insn)&MacroAssembler::lhu;
1104   load_chr_insn load_2chr = isLL ? (load_chr_insn)&MacroAssembler::lhu : (load_chr_insn)&MacroAssembler::lwu;
1105   load_chr_insn load_4chr = isLL ? (load_chr_insn)&MacroAssembler::lwu : (load_chr_insn)&MacroAssembler::ld;
1106 
1107   Label DO1, DO2, DO3, MATCH, NOMATCH, DONE;
1108 
1109   Register first = tmp3;
1110 
1111   if (needle_con_cnt == -1) {
1112     Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
1113 
1114     sub(t0, needle_len, needle_isL == haystack_isL ? 4 : 2);
1115     bltz(t0, DOSHORT);
1116 
1117     (this->*needle_load_1chr)(first, Address(needle), noreg);
1118     slli(t0, needle_len, needle_chr_shift);
1119     add(needle, needle, t0);
1120     neg(nlen_neg, t0);
1121     slli(t0, result_tmp, haystack_chr_shift);
1122     add(haystack, haystack, t0);
1123     neg(hlen_neg, t0);
1124 
1125     bind(FIRST_LOOP);
1126     add(t0, haystack, hlen_neg);
1127     (this->*haystack_load_1chr)(ch2, Address(t0), noreg);
1128     beq(first, ch2, STR1_LOOP);
1129 
1130     bind(STR2_NEXT);
1131     add(hlen_neg, hlen_neg, haystack_chr_size);
1132     blez(hlen_neg, FIRST_LOOP);
1133     j(NOMATCH);
1134 
1135     bind(STR1_LOOP);
1136     add(nlen_tmp, nlen_neg, needle_chr_size);
1137     add(hlen_tmp, hlen_neg, haystack_chr_size);
1138     bgez(nlen_tmp, MATCH);
1139 
1140     bind(STR1_NEXT);
1141     add(ch1, needle, nlen_tmp);
1142     (this->*needle_load_1chr)(ch1, Address(ch1), noreg);
1143     add(ch2, haystack, hlen_tmp);
1144     (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
1145     bne(ch1, ch2, STR2_NEXT);
1146     add(nlen_tmp, nlen_tmp, needle_chr_size);
1147     add(hlen_tmp, hlen_tmp, haystack_chr_size);
1148     bltz(nlen_tmp, STR1_NEXT);
1149     j(MATCH);
1150 
1151     bind(DOSHORT);
1152     if (needle_isL == haystack_isL) {
1153       sub(t0, needle_len, 2);
1154       bltz(t0, DO1);
1155       bgtz(t0, DO3);
1156     }
1157   }
1158 
1159   if (needle_con_cnt == 4) {
1160     Label CH1_LOOP;
1161     (this->*load_4chr)(ch1, Address(needle), noreg);
1162     sub(result_tmp, haystack_len, 4);
1163     slli(tmp3, result_tmp, haystack_chr_shift); // result as tmp
1164     add(haystack, haystack, tmp3);
1165     neg(hlen_neg, tmp3);
1166     if (AvoidUnalignedAccesses) {
1167       // preload first value, then we will read by 1 character per loop, instead of four
1168       // just shifting previous ch2 right by size of character in bits
1169       add(tmp3, haystack, hlen_neg);
1170       (this->*load_4chr)(ch2, Address(tmp3), noreg);
1171       if (isLL) {
1172         // need to erase 1 most significant byte in 32-bit value of ch2
1173         slli(ch2, ch2, 40);
1174         srli(ch2, ch2, 32);
1175       } else {
1176         slli(ch2, ch2, 16); // 2 most significant bytes will be erased by this operation
1177       }
1178     }
1179 
1180     bind(CH1_LOOP);
1181     add(tmp3, haystack, hlen_neg);
1182     if (AvoidUnalignedAccesses) {
1183       srli(ch2, ch2, isLL ? 8 : 16);
1184       (this->*haystack_load_1chr)(tmp3, Address(tmp3, isLL ? 3 : 6), noreg);
1185       slli(tmp3, tmp3, isLL ? 24 : 48);
1186       add(ch2, ch2, tmp3);
1187     } else {
1188       (this->*load_4chr)(ch2, Address(tmp3), noreg);
1189     }
1190     beq(ch1, ch2, MATCH);
1191     add(hlen_neg, hlen_neg, haystack_chr_size);
1192     blez(hlen_neg, CH1_LOOP);
1193     j(NOMATCH);
1194   }
1195 
1196   if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 2) {
1197     Label CH1_LOOP;
1198     BLOCK_COMMENT("string_indexof DO2 {");
1199     bind(DO2);
1200     (this->*load_2chr)(ch1, Address(needle), noreg);
1201     if (needle_con_cnt == 2) {
1202       sub(result_tmp, haystack_len, 2);
1203     }
1204     slli(tmp3, result_tmp, haystack_chr_shift);
1205     add(haystack, haystack, tmp3);
1206     neg(hlen_neg, tmp3);
1207     if (AvoidUnalignedAccesses) {
1208       // preload first value, then we will read by 1 character per loop, instead of two
1209       // just shifting previous ch2 right by size of character in bits
1210       add(tmp3, haystack, hlen_neg);
1211       (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg);
1212       slli(ch2, ch2, isLL ? 8 : 16);
1213     }
1214     bind(CH1_LOOP);
1215     add(tmp3, haystack, hlen_neg);
1216     if (AvoidUnalignedAccesses) {
1217       srli(ch2, ch2, isLL ? 8 : 16);
1218       (this->*haystack_load_1chr)(tmp3, Address(tmp3, isLL ? 1 : 2), noreg);
1219       slli(tmp3, tmp3, isLL ? 8 : 16);
1220       add(ch2, ch2, tmp3);
1221     } else {
1222       (this->*load_2chr)(ch2, Address(tmp3), noreg);
1223     }
1224     beq(ch1, ch2, MATCH);
1225     add(hlen_neg, hlen_neg, haystack_chr_size);
1226     blez(hlen_neg, CH1_LOOP);
1227     j(NOMATCH);
1228     BLOCK_COMMENT("} string_indexof DO2");
1229   }
1230 
1231   if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 3) {
1232     Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
1233     BLOCK_COMMENT("string_indexof DO3 {");
1234 
1235     bind(DO3);
1236     (this->*load_2chr)(first, Address(needle), noreg);
1237     (this->*needle_load_1chr)(ch1, Address(needle, 2 * needle_chr_size), noreg);
1238     if (needle_con_cnt == 3) {
1239       sub(result_tmp, haystack_len, 3);
1240     }
1241     slli(hlen_tmp, result_tmp, haystack_chr_shift);
1242     add(haystack, haystack, hlen_tmp);
1243     neg(hlen_neg, hlen_tmp);
1244 
1245     bind(FIRST_LOOP);
1246     add(ch2, haystack, hlen_neg);
1247     if (AvoidUnalignedAccesses) {
1248       (this->*haystack_load_1chr)(tmp2, Address(ch2, isLL ? 1 : 2), noreg); // we need a temp register, we can safely use hlen_tmp here, which is a synonym for tmp2
1249       (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
1250       slli(tmp2, tmp2, isLL ? 8 : 16);
1251       add(ch2, ch2, tmp2);
1252     } else {
1253       (this->*load_2chr)(ch2, Address(ch2), noreg);
1254     }
1255     beq(first, ch2, STR1_LOOP);
1256 
1257     bind(STR2_NEXT);
1258     add(hlen_neg, hlen_neg, haystack_chr_size);
1259     blez(hlen_neg, FIRST_LOOP);
1260     j(NOMATCH);
1261 
1262     bind(STR1_LOOP);
1263     add(hlen_tmp, hlen_neg, 2 * haystack_chr_size);
1264     add(ch2, haystack, hlen_tmp);
1265     (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
1266     bne(ch1, ch2, STR2_NEXT);
1267     j(MATCH);
1268     BLOCK_COMMENT("} string_indexof DO3");
1269   }
1270 
1271   if (needle_con_cnt == -1 || needle_con_cnt == 1) {
1272     Label DO1_LOOP;
1273 
1274     BLOCK_COMMENT("string_indexof DO1 {");
1275     bind(DO1);
1276     (this->*needle_load_1chr)(ch1, Address(needle), noreg);
1277     sub(result_tmp, haystack_len, 1);
1278     slli(tmp3, result_tmp, haystack_chr_shift);
1279     add(haystack, haystack, tmp3);
1280     neg(hlen_neg, tmp3);
1281 
1282     bind(DO1_LOOP);
1283     add(tmp3, haystack, hlen_neg);
1284     (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg);
1285     beq(ch1, ch2, MATCH);
1286     add(hlen_neg, hlen_neg, haystack_chr_size);
1287     blez(hlen_neg, DO1_LOOP);
1288     BLOCK_COMMENT("} string_indexof DO1");
1289   }
1290 
1291   bind(NOMATCH);
1292   mv(result, -1);
1293   j(DONE);
1294 
1295   bind(MATCH);
1296   srai(t0, hlen_neg, haystack_chr_shift);
1297   add(result, result_tmp, t0);
1298 
1299   bind(DONE);
1300 }
1301 
1302 // Compare strings.
1303 void C2_MacroAssembler::string_compare(Register str1, Register str2,
1304                                        Register cnt1, Register cnt2, Register result,
1305                                        Register tmp1, Register tmp2, Register tmp3,
1306                                        int ae)
1307 {
1308   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
1309         DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
1310         SHORT_LOOP_START, TAIL_CHECK, L;
1311 
1312   const int STUB_THRESHOLD = 64 + 8;
1313   bool isLL = ae == StrIntrinsicNode::LL;
1314   bool isLU = ae == StrIntrinsicNode::LU;
1315   bool isUL = ae == StrIntrinsicNode::UL;
1316 
1317   bool str1_isL = isLL || isLU;
1318   bool str2_isL = isLL || isUL;
1319 
1320   // for L strings, 1 byte for 1 character
1321   // for U strings, 2 bytes for 1 character
1322   int str1_chr_size = str1_isL ? 1 : 2;
1323   int str2_chr_size = str2_isL ? 1 : 2;
1324   int minCharsInWord = isLL ? wordSize : wordSize / 2;
1325 
1326   load_chr_insn str1_load_chr = str1_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu;
1327   load_chr_insn str2_load_chr = str2_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu;
1328 
1329   BLOCK_COMMENT("string_compare {");
1330 
1331   // Bizarrely, the counts are passed in bytes, regardless of whether they
1332   // are L or U strings, however the result is always in characters.
1333   if (!str1_isL) {
1334     sraiw(cnt1, cnt1, 1);
1335   }
1336   if (!str2_isL) {
1337     sraiw(cnt2, cnt2, 1);
1338   }
1339 
1340   // Compute the minimum of the string lengths and save the difference in result.
1341   sub(result, cnt1, cnt2);
1342   bgt(cnt1, cnt2, L);
1343   mv(cnt2, cnt1);
1344   bind(L);
1345 
1346   // A very short string
1347   mv(t0, minCharsInWord);
1348   ble(cnt2, t0, SHORT_STRING);
1349 
1350   // Compare longwords
1351   // load first parts of strings and finish initialization while loading
1352   {
1353     if (str1_isL == str2_isL) { // LL or UU
1354       // check if str1 and str2 is same pointer
1355       beq(str1, str2, DONE);
1356       // load 8 bytes once to compare
1357       ld(tmp1, Address(str1));
1358       ld(tmp2, Address(str2));
1359       mv(t0, STUB_THRESHOLD);
1360       bge(cnt2, t0, STUB);
1361       sub(cnt2, cnt2, minCharsInWord);
1362       beqz(cnt2, TAIL_CHECK);
1363       // convert cnt2 from characters to bytes
1364       if (!str1_isL) {
1365         slli(cnt2, cnt2, 1);
1366       }
1367       add(str2, str2, cnt2);
1368       add(str1, str1, cnt2);
1369       sub(cnt2, zr, cnt2);
1370     } else if (isLU) { // LU case
1371       lwu(tmp1, Address(str1));
1372       ld(tmp2, Address(str2));
1373       mv(t0, STUB_THRESHOLD);
1374       bge(cnt2, t0, STUB);
1375       addi(cnt2, cnt2, -4);
1376       add(str1, str1, cnt2);
1377       sub(cnt1, zr, cnt2);
1378       slli(cnt2, cnt2, 1);
1379       add(str2, str2, cnt2);
1380       inflate_lo32(tmp3, tmp1);
1381       mv(tmp1, tmp3);
1382       sub(cnt2, zr, cnt2);
1383       addi(cnt1, cnt1, 4);
1384     } else { // UL case
1385       ld(tmp1, Address(str1));
1386       lwu(tmp2, Address(str2));
1387       mv(t0, STUB_THRESHOLD);
1388       bge(cnt2, t0, STUB);
1389       addi(cnt2, cnt2, -4);
1390       slli(t0, cnt2, 1);
1391       sub(cnt1, zr, t0);
1392       add(str1, str1, t0);
1393       add(str2, str2, cnt2);
1394       inflate_lo32(tmp3, tmp2);
1395       mv(tmp2, tmp3);
1396       sub(cnt2, zr, cnt2);
1397       addi(cnt1, cnt1, 8);
1398     }
1399     addi(cnt2, cnt2, isUL ? 4 : 8);
1400     bne(tmp1, tmp2, DIFFERENCE);
1401     bgez(cnt2, TAIL);
1402 
1403     // main loop
1404     bind(NEXT_WORD);
1405     if (str1_isL == str2_isL) { // LL or UU
1406       add(t0, str1, cnt2);
1407       ld(tmp1, Address(t0));
1408       add(t0, str2, cnt2);
1409       ld(tmp2, Address(t0));
1410       addi(cnt2, cnt2, 8);
1411     } else if (isLU) { // LU case
1412       add(t0, str1, cnt1);
1413       lwu(tmp1, Address(t0));
1414       add(t0, str2, cnt2);
1415       ld(tmp2, Address(t0));
1416       addi(cnt1, cnt1, 4);
1417       inflate_lo32(tmp3, tmp1);
1418       mv(tmp1, tmp3);
1419       addi(cnt2, cnt2, 8);
1420     } else { // UL case
1421       add(t0, str2, cnt2);
1422       lwu(tmp2, Address(t0));
1423       add(t0, str1, cnt1);
1424       ld(tmp1, Address(t0));
1425       inflate_lo32(tmp3, tmp2);
1426       mv(tmp2, tmp3);
1427       addi(cnt1, cnt1, 8);
1428       addi(cnt2, cnt2, 4);
1429     }
1430     bne(tmp1, tmp2, DIFFERENCE);
1431     bltz(cnt2, NEXT_WORD);
1432     bind(TAIL);
1433     if (str1_isL == str2_isL) { // LL or UU
1434       load_long_misaligned(tmp1, Address(str1), tmp3, isLL ? 1 : 2);
1435       load_long_misaligned(tmp2, Address(str2), tmp3, isLL ? 1 : 2);
1436     } else if (isLU) { // LU case
1437       load_int_misaligned(tmp1, Address(str1), tmp3, false);
1438       load_long_misaligned(tmp2, Address(str2), tmp3, 2);
1439       inflate_lo32(tmp3, tmp1);
1440       mv(tmp1, tmp3);
1441     } else { // UL case
1442       load_int_misaligned(tmp2, Address(str2), tmp3, false);
1443       load_long_misaligned(tmp1, Address(str1), tmp3, 2);
1444       inflate_lo32(tmp3, tmp2);
1445       mv(tmp2, tmp3);
1446     }
1447     bind(TAIL_CHECK);
1448     beq(tmp1, tmp2, DONE);
1449 
1450     // Find the first different characters in the longwords and
1451     // compute their difference.
1452     bind(DIFFERENCE);
1453     xorr(tmp3, tmp1, tmp2);
1454     ctzc_bit(result, tmp3, isLL); // count zero from lsb to msb
1455     srl(tmp1, tmp1, result);
1456     srl(tmp2, tmp2, result);
1457     if (isLL) {
1458       andi(tmp1, tmp1, 0xFF);
1459       andi(tmp2, tmp2, 0xFF);
1460     } else {
1461       andi(tmp1, tmp1, 0xFFFF);
1462       andi(tmp2, tmp2, 0xFFFF);
1463     }
1464     sub(result, tmp1, tmp2);
1465     j(DONE);
1466   }
1467 
1468   bind(STUB);
1469   RuntimeAddress stub = nullptr;
1470   switch (ae) {
1471     case StrIntrinsicNode::LL:
1472       stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LL());
1473       break;
1474     case StrIntrinsicNode::UU:
1475       stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UU());
1476       break;
1477     case StrIntrinsicNode::LU:
1478       stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LU());
1479       break;
1480     case StrIntrinsicNode::UL:
1481       stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UL());
1482       break;
1483     default:
1484       ShouldNotReachHere();
1485   }
1486   assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
1487   address call = reloc_call(stub);
1488   if (call == nullptr) {
1489     DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
1490     ciEnv::current()->record_failure("CodeCache is full");
1491     return;
1492   }
1493   j(DONE);
1494 
1495   bind(SHORT_STRING);
1496   // Is the minimum length zero?
1497   beqz(cnt2, DONE);
1498   // arrange code to do most branches while loading and loading next characters
1499   // while comparing previous
1500   (this->*str1_load_chr)(tmp1, Address(str1), t0);
1501   addi(str1, str1, str1_chr_size);
1502   addi(cnt2, cnt2, -1);
1503   beqz(cnt2, SHORT_LAST_INIT);
1504   (this->*str2_load_chr)(cnt1, Address(str2), t0);
1505   addi(str2, str2, str2_chr_size);
1506   j(SHORT_LOOP_START);
1507   bind(SHORT_LOOP);
1508   addi(cnt2, cnt2, -1);
1509   beqz(cnt2, SHORT_LAST);
1510   bind(SHORT_LOOP_START);
1511   (this->*str1_load_chr)(tmp2, Address(str1), t0);
1512   addi(str1, str1, str1_chr_size);
1513   (this->*str2_load_chr)(t0, Address(str2), t0);
1514   addi(str2, str2, str2_chr_size);
1515   bne(tmp1, cnt1, SHORT_LOOP_TAIL);
1516   addi(cnt2, cnt2, -1);
1517   beqz(cnt2, SHORT_LAST2);
1518   (this->*str1_load_chr)(tmp1, Address(str1), t0);
1519   addi(str1, str1, str1_chr_size);
1520   (this->*str2_load_chr)(cnt1, Address(str2), t0);
1521   addi(str2, str2, str2_chr_size);
1522   beq(tmp2, t0, SHORT_LOOP);
1523   sub(result, tmp2, t0);
1524   j(DONE);
1525   bind(SHORT_LOOP_TAIL);
1526   sub(result, tmp1, cnt1);
1527   j(DONE);
1528   bind(SHORT_LAST2);
1529   beq(tmp2, t0, DONE);
1530   sub(result, tmp2, t0);
1531 
1532   j(DONE);
1533   bind(SHORT_LAST_INIT);
1534   (this->*str2_load_chr)(cnt1, Address(str2), t0);
1535   addi(str2, str2, str2_chr_size);
1536   bind(SHORT_LAST);
1537   beq(tmp1, cnt1, DONE);
1538   sub(result, tmp1, cnt1);
1539 
1540   bind(DONE);
1541 
1542   BLOCK_COMMENT("} string_compare");
1543 }
1544 
1545 void C2_MacroAssembler::arrays_equals(Register a1, Register a2,
1546                                       Register tmp1, Register tmp2, Register tmp3,
1547                                       Register result, int elem_size) {
1548   assert(elem_size == 1 || elem_size == 2, "must be char or byte");
1549   assert_different_registers(a1, a2, result, tmp1, tmp2, tmp3, t0);
1550 
1551   int elem_per_word = wordSize/elem_size;
1552   int log_elem_size = exact_log2(elem_size);
1553   int length_offset = arrayOopDesc::length_offset_in_bytes();
1554   int base_offset   = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
1555 
1556   Register cnt1 = tmp3;
1557   Register cnt2 = tmp1;  // cnt2 only used in array length compare
1558   Label DONE, SAME, NEXT_WORD, SHORT, TAIL03, TAIL01;
1559 
1560   BLOCK_COMMENT("arrays_equals {");
1561 
1562   // if (a1 == a2), return true
1563   beq(a1, a2, SAME);
1564 
1565   mv(result, false);
1566   // if (a1 == nullptr || a2 == nullptr)
1567   //     return false;
1568   beqz(a1, DONE);
1569   beqz(a2, DONE);
1570 
1571   // if (a1.length != a2.length)
1572   //      return false;
1573   lwu(cnt1, Address(a1, length_offset));
1574   lwu(cnt2, Address(a2, length_offset));
1575   bne(cnt1, cnt2, DONE);
1576 
1577   la(a1, Address(a1, base_offset));
1578   la(a2, Address(a2, base_offset));
1579   // Check for short strings, i.e. smaller than wordSize.
1580   addi(cnt1, cnt1, -elem_per_word);
1581   bltz(cnt1, SHORT);
1582 
1583   // Main 8 byte comparison loop.
1584   bind(NEXT_WORD); {
1585     ld(tmp1, Address(a1));
1586     ld(tmp2, Address(a2));
1587     addi(cnt1, cnt1, -elem_per_word);
1588     addi(a1, a1, wordSize);
1589     addi(a2, a2, wordSize);
1590     bne(tmp1, tmp2, DONE);
1591   } bgez(cnt1, NEXT_WORD);
1592 
1593   addi(tmp1, cnt1, elem_per_word);
1594   beqz(tmp1, SAME);
1595 
1596   bind(SHORT);
1597   test_bit(tmp1, cnt1, 2 - log_elem_size);
1598   beqz(tmp1, TAIL03); // 0-7 bytes left.
1599   {
1600     lwu(tmp1, Address(a1));
1601     lwu(tmp2, Address(a2));
1602     addi(a1, a1, 4);
1603     addi(a2, a2, 4);
1604     bne(tmp1, tmp2, DONE);
1605   }
1606 
1607   bind(TAIL03);
1608   test_bit(tmp1, cnt1, 1 - log_elem_size);
1609   beqz(tmp1, TAIL01); // 0-3 bytes left.
1610   {
1611     lhu(tmp1, Address(a1));
1612     lhu(tmp2, Address(a2));
1613     addi(a1, a1, 2);
1614     addi(a2, a2, 2);
1615     bne(tmp1, tmp2, DONE);
1616   }
1617 
1618   bind(TAIL01);
1619   if (elem_size == 1) { // Only needed when comparing byte arrays.
1620     test_bit(tmp1, cnt1, 0);
1621     beqz(tmp1, SAME); // 0-1 bytes left.
1622     {
1623       lbu(tmp1, Address(a1));
1624       lbu(tmp2, Address(a2));
1625       bne(tmp1, tmp2, DONE);
1626     }
1627   }
1628 
1629   bind(SAME);
1630   mv(result, true);
1631   // That's it.
1632   bind(DONE);
1633 
1634   BLOCK_COMMENT("} arrays_equals");
1635 }
1636 
1637 // Compare Strings
1638 
1639 // For Strings we're passed the address of the first characters in a1 and a2
1640 // and the length in cnt1. There are two implementations.
1641 // For arrays >= 8 bytes, all comparisons (except for the tail) are performed
1642 // 8 bytes at a time. For the tail, we compare a halfword, then a short, and then a byte.
1643 // For strings < 8 bytes, we compare a halfword, then a short, and then a byte.
1644 
1645 void C2_MacroAssembler::string_equals(Register a1, Register a2,
1646                                       Register result, Register cnt1)
1647 {
1648   Label SAME, DONE, SHORT, NEXT_WORD;
1649   Register tmp1 = t0;
1650   Register tmp2 = t1;
1651 
1652   assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2);
1653 
1654   BLOCK_COMMENT("string_equals {");
1655 
1656   mv(result, false);
1657 
1658   // Check for short strings, i.e. smaller than wordSize.
1659   addi(cnt1, cnt1, -wordSize);
1660   bltz(cnt1, SHORT);
1661 
1662   // Main 8 byte comparison loop.
1663   bind(NEXT_WORD); {
1664     ld(tmp1, Address(a1));
1665     ld(tmp2, Address(a2));
1666     addi(cnt1, cnt1, -wordSize);
1667     addi(a1, a1, wordSize);
1668     addi(a2, a2, wordSize);
1669     bne(tmp1, tmp2, DONE);
1670   } bgez(cnt1, NEXT_WORD);
1671 
1672   addi(tmp1, cnt1, wordSize);
1673   beqz(tmp1, SAME);
1674 
1675   bind(SHORT);
1676   Label TAIL03, TAIL01;
1677 
1678   // 0-7 bytes left.
1679   test_bit(tmp1, cnt1, 2);
1680   beqz(tmp1, TAIL03);
1681   {
1682     lwu(tmp1, Address(a1));
1683     lwu(tmp2, Address(a2));
1684     addi(a1, a1, 4);
1685     addi(a2, a2, 4);
1686     bne(tmp1, tmp2, DONE);
1687   }
1688 
1689   bind(TAIL03);
1690   // 0-3 bytes left.
1691   test_bit(tmp1, cnt1, 1);
1692   beqz(tmp1, TAIL01);
1693   {
1694     lhu(tmp1, Address(a1));
1695     lhu(tmp2, Address(a2));
1696     addi(a1, a1, 2);
1697     addi(a2, a2, 2);
1698     bne(tmp1, tmp2, DONE);
1699   }
1700 
1701   bind(TAIL01);
1702   // 0-1 bytes left.
1703   test_bit(tmp1, cnt1, 0);
1704   beqz(tmp1, SAME);
1705   {
1706     lbu(tmp1, Address(a1));
1707     lbu(tmp2, Address(a2));
1708     bne(tmp1, tmp2, DONE);
1709   }
1710 
1711   // Arrays are equal.
1712   bind(SAME);
1713   mv(result, true);
1714 
1715   // That's it.
1716   bind(DONE);
1717   BLOCK_COMMENT("} string_equals");
1718 }
1719 
1720 // jdk.internal.util.ArraysSupport.vectorizedHashCode
1721 void C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
1722                                         Register tmp1, Register tmp2, Register tmp3,
1723                                         Register tmp4, Register tmp5, Register tmp6,
1724                                         BasicType eltype)
1725 {
1726   assert_different_registers(ary, cnt, result, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, t0, t1);
1727 
1728   const int elsize = arrays_hashcode_elsize(eltype);
1729   const int chunks_end_shift = exact_log2(elsize);
1730 
1731   switch (eltype) {
1732   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
1733   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
1734   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
1735   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
1736   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
1737   default:
1738     ShouldNotReachHere();
1739   }
1740 
1741   const int stride = 4;
1742   const Register pow31_4 = tmp1;
1743   const Register pow31_3 = tmp2;
1744   const Register pow31_2 = tmp3;
1745   const Register chunks  = tmp4;
1746   const Register chunks_end = chunks;
1747 
1748   Label DONE, TAIL, TAIL_LOOP, WIDE_LOOP;
1749 
1750   // result has a value initially
1751 
1752   beqz(cnt, DONE);
1753 
1754   andi(chunks, cnt, ~(stride-1));
1755   beqz(chunks, TAIL);
1756 
1757   mv(pow31_4, 923521);           // [31^^4]
1758   mv(pow31_3,  29791);           // [31^^3]
1759   mv(pow31_2,    961);           // [31^^2]
1760 
1761   slli(chunks_end, chunks, chunks_end_shift);
1762   add(chunks_end, ary, chunks_end);
1763   andi(cnt, cnt, stride-1);      // don't forget about tail!
1764 
1765   bind(WIDE_LOOP);
1766   mulw(result, result, pow31_4); // 31^^4 * h
1767   arrays_hashcode_elload(t0,   Address(ary, 0 * elsize), eltype);
1768   arrays_hashcode_elload(t1,   Address(ary, 1 * elsize), eltype);
1769   arrays_hashcode_elload(tmp5, Address(ary, 2 * elsize), eltype);
1770   arrays_hashcode_elload(tmp6, Address(ary, 3 * elsize), eltype);
1771   mulw(t0, t0, pow31_3);         // 31^^3 * ary[i+0]
1772   addw(result, result, t0);
1773   mulw(t1, t1, pow31_2);         // 31^^2 * ary[i+1]
1774   addw(result, result, t1);
1775   slli(t0, tmp5, 5);             // optimize 31^^1 * ary[i+2]
1776   subw(tmp5, t0, tmp5);          // with ary[i+2]<<5 - ary[i+2]
1777   addw(result, result, tmp5);
1778   addw(result, result, tmp6);    // 31^^4 * h + 31^^3 * ary[i+0] + 31^^2 * ary[i+1]
1779                                  //           + 31^^1 * ary[i+2] + 31^^0 * ary[i+3]
1780   addi(ary, ary, elsize * stride);
1781   bne(ary, chunks_end, WIDE_LOOP);
1782   beqz(cnt, DONE);
1783 
1784   bind(TAIL);
1785   slli(chunks_end, cnt, chunks_end_shift);
1786   add(chunks_end, ary, chunks_end);
1787 
1788   bind(TAIL_LOOP);
1789   arrays_hashcode_elload(t0, Address(ary), eltype);
1790   slli(t1, result, 5);           // optimize 31 * result
1791   subw(result, t1, result);      // with result<<5 - result
1792   addw(result, result, t0);
1793   addi(ary, ary, elsize);
1794   bne(ary, chunks_end, TAIL_LOOP);
1795 
1796   bind(DONE);
1797   BLOCK_COMMENT("} // arrays_hashcode");
1798 }
1799 
1800 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
1801   switch (eltype) {
1802   case T_BOOLEAN: return sizeof(jboolean);
1803   case T_BYTE:    return sizeof(jbyte);
1804   case T_SHORT:   return sizeof(jshort);
1805   case T_CHAR:    return sizeof(jchar);
1806   case T_INT:     return sizeof(jint);
1807   default:
1808     ShouldNotReachHere();
1809     return -1;
1810   }
1811 }
1812 
1813 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
1814   switch (eltype) {
1815   // T_BOOLEAN used as surrogate for unsigned byte
1816   case T_BOOLEAN: lbu(dst, src);   break;
1817   case T_BYTE:     lb(dst, src);   break;
1818   case T_SHORT:    lh(dst, src);   break;
1819   case T_CHAR:    lhu(dst, src);   break;
1820   case T_INT:      lw(dst, src);   break;
1821   default:
1822     ShouldNotReachHere();
1823   }
1824 }
1825 
1826 typedef void (Assembler::*conditional_branch_insn)(Register op1, Register op2, Label& label, bool is_far);
1827 typedef void (MacroAssembler::*float_conditional_branch_insn)(FloatRegister op1, FloatRegister op2, Label& label,
1828                                                               bool is_far, bool is_unordered);
1829 
1830 static conditional_branch_insn conditional_branches[] =
1831 {
1832   /* SHORT branches */
1833   (conditional_branch_insn)&MacroAssembler::beq,
1834   (conditional_branch_insn)&MacroAssembler::bgt,
1835   nullptr, // BoolTest::overflow
1836   (conditional_branch_insn)&MacroAssembler::blt,
1837   (conditional_branch_insn)&MacroAssembler::bne,
1838   (conditional_branch_insn)&MacroAssembler::ble,
1839   nullptr, // BoolTest::no_overflow
1840   (conditional_branch_insn)&MacroAssembler::bge,
1841 
1842   /* UNSIGNED branches */
1843   (conditional_branch_insn)&MacroAssembler::beq,
1844   (conditional_branch_insn)&MacroAssembler::bgtu,
1845   nullptr,
1846   (conditional_branch_insn)&MacroAssembler::bltu,
1847   (conditional_branch_insn)&MacroAssembler::bne,
1848   (conditional_branch_insn)&MacroAssembler::bleu,
1849   nullptr,
1850   (conditional_branch_insn)&MacroAssembler::bgeu
1851 };
1852 
1853 static float_conditional_branch_insn float_conditional_branches[] =
1854 {
1855   /* FLOAT SHORT branches */
1856   (float_conditional_branch_insn)&MacroAssembler::float_beq,
1857   (float_conditional_branch_insn)&MacroAssembler::float_bgt,
1858   nullptr,  // BoolTest::overflow
1859   (float_conditional_branch_insn)&MacroAssembler::float_blt,
1860   (float_conditional_branch_insn)&MacroAssembler::float_bne,
1861   (float_conditional_branch_insn)&MacroAssembler::float_ble,
1862   nullptr, // BoolTest::no_overflow
1863   (float_conditional_branch_insn)&MacroAssembler::float_bge,
1864 
1865   /* DOUBLE SHORT branches */
1866   (float_conditional_branch_insn)&MacroAssembler::double_beq,
1867   (float_conditional_branch_insn)&MacroAssembler::double_bgt,
1868   nullptr,
1869   (float_conditional_branch_insn)&MacroAssembler::double_blt,
1870   (float_conditional_branch_insn)&MacroAssembler::double_bne,
1871   (float_conditional_branch_insn)&MacroAssembler::double_ble,
1872   nullptr,
1873   (float_conditional_branch_insn)&MacroAssembler::double_bge
1874 };
1875 
1876 void C2_MacroAssembler::cmp_branch(int cmpFlag, Register op1, Register op2, Label& label, bool is_far) {
1877   assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(conditional_branches) / sizeof(conditional_branches[0])),
1878          "invalid conditional branch index");
1879   (this->*conditional_branches[cmpFlag])(op1, op2, label, is_far);
1880 }
1881 
1882 // This is a function should only be used by C2. Flip the unordered when unordered-greater, C2 would use
1883 // unordered-lesser instead of unordered-greater. Finally, commute the result bits at function do_one_bytecode().
1884 void C2_MacroAssembler::float_cmp_branch(int cmpFlag, FloatRegister op1, FloatRegister op2, Label& label, bool is_far) {
1885   assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(float_conditional_branches) / sizeof(float_conditional_branches[0])),
1886          "invalid float conditional branch index");
1887   int booltest_flag = cmpFlag & ~(C2_MacroAssembler::double_branch_mask);
1888   (this->*float_conditional_branches[cmpFlag])(op1, op2, label, is_far,
1889     (booltest_flag == (BoolTest::ge) || booltest_flag == (BoolTest::gt)) ? false : true);
1890 }
1891 
1892 void C2_MacroAssembler::enc_cmpUEqNeLeGt_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
1893   switch (cmpFlag) {
1894     case BoolTest::eq:
1895     case BoolTest::le:
1896       beqz(op1, L, is_far);
1897       break;
1898     case BoolTest::ne:
1899     case BoolTest::gt:
1900       bnez(op1, L, is_far);
1901       break;
1902     default:
1903       ShouldNotReachHere();
1904   }
1905 }
1906 
1907 void C2_MacroAssembler::enc_cmpEqNe_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
1908   switch (cmpFlag) {
1909     case BoolTest::eq:
1910       beqz(op1, L, is_far);
1911       break;
1912     case BoolTest::ne:
1913       bnez(op1, L, is_far);
1914       break;
1915     default:
1916       ShouldNotReachHere();
1917   }
1918 }
1919 
1920 void C2_MacroAssembler::enc_cmove(int cmpFlag, Register op1, Register op2, Register dst, Register src) {
1921   Label L;
1922   cmp_branch(cmpFlag ^ (1 << neg_cond_bits), op1, op2, L);
1923   mv(dst, src);
1924   bind(L);
1925 }
1926 
1927 // Set dst to NaN if any NaN input.
1928 void C2_MacroAssembler::minmax_fp(FloatRegister dst, FloatRegister src1, FloatRegister src2,
1929                                   bool is_double, bool is_min) {
1930   assert_different_registers(dst, src1, src2);
1931 
1932   Label Done, Compare;
1933 
1934   is_double ? fclass_d(t0, src1)
1935             : fclass_s(t0, src1);
1936   is_double ? fclass_d(t1, src2)
1937             : fclass_s(t1, src2);
1938   orr(t0, t0, t1);
1939   andi(t0, t0, fclass_mask::nan); // if src1 or src2 is quiet or signaling NaN then return NaN
1940   beqz(t0, Compare);
1941   is_double ? fadd_d(dst, src1, src2)
1942             : fadd_s(dst, src1, src2);
1943   j(Done);
1944 
1945   bind(Compare);
1946   if (is_double) {
1947     is_min ? fmin_d(dst, src1, src2)
1948            : fmax_d(dst, src1, src2);
1949   } else {
1950     is_min ? fmin_s(dst, src1, src2)
1951            : fmax_s(dst, src1, src2);
1952   }
1953 
1954   bind(Done);
1955 }
1956 
1957 // According to Java SE specification, for floating-point round operations, if
1958 // the input is NaN, +/-infinity, or +/-0, the same input is returned as the
1959 // rounded result; this differs from behavior of RISC-V fcvt instructions (which
1960 // round out-of-range values to the nearest max or min value), therefore special
1961 // handling is needed by NaN, +/-Infinity, +/-0.
1962 void C2_MacroAssembler::round_double_mode(FloatRegister dst, FloatRegister src, int round_mode,
1963                                           Register tmp1, Register tmp2, Register tmp3) {
1964 
1965   assert_different_registers(dst, src);
1966   assert_different_registers(tmp1, tmp2, tmp3);
1967 
1968   // Set rounding mode for conversions
1969   // Here we use similar modes to double->long and long->double conversions
1970   // Different mode for long->double conversion matter only if long value was not representable as double,
1971   // we got long value as a result of double->long conversion so, it is definitely representable
1972   RoundingMode rm;
1973   switch (round_mode) {
1974     case RoundDoubleModeNode::rmode_ceil:
1975       rm = RoundingMode::rup;
1976       break;
1977     case RoundDoubleModeNode::rmode_floor:
1978       rm = RoundingMode::rdn;
1979       break;
1980     case RoundDoubleModeNode::rmode_rint:
1981       rm = RoundingMode::rne;
1982       break;
1983     default:
1984       ShouldNotReachHere();
1985   }
1986 
1987   // tmp1 - is a register to store double converted to long int
1988   // tmp2 - is a register to create constant for comparison
1989   // tmp3 - is a register where we store modified result of double->long conversion
1990   Label done, bad_val;
1991 
1992   // Conversion from double to long
1993   fcvt_l_d(tmp1, src, rm);
1994 
1995   // Generate constant (tmp2)
1996   // tmp2 = 100...0000
1997   addi(tmp2, zr, 1);
1998   slli(tmp2, tmp2, 63);
1999 
2000   // Prepare converted long (tmp1)
2001   // as a result when conversion overflow we got:
2002   // tmp1 = 011...1111 or 100...0000
2003   // Convert it to: tmp3 = 100...0000
2004   addi(tmp3, tmp1, 1);
2005   andi(tmp3, tmp3, -2);
2006   beq(tmp3, tmp2, bad_val);
2007 
2008   // Conversion from long to double
2009   fcvt_d_l(dst, tmp1, rm);
2010   // Add sign of input value to result for +/- 0 cases
2011   fsgnj_d(dst, dst, src);
2012   j(done);
2013 
2014   // If got conversion overflow return src
2015   bind(bad_val);
2016   fmv_d(dst, src);
2017 
2018   bind(done);
2019 }
2020 
2021 // According to Java SE specification, for floating-point signum operations, if
2022 // on input we have NaN or +/-0.0 value we should return it,
2023 // otherwise return +/- 1.0 using sign of input.
2024 // one - gives us a floating-point 1.0 (got from matching rule)
2025 // bool is_double - specifies single or double precision operations will be used.
2026 void C2_MacroAssembler::signum_fp(FloatRegister dst, FloatRegister one, bool is_double) {
2027   Label done;
2028 
2029   is_double ? fclass_d(t0, dst)
2030             : fclass_s(t0, dst);
2031 
2032   // check if input is -0, +0, signaling NaN or quiet NaN
2033   andi(t0, t0, fclass_mask::zero | fclass_mask::nan);
2034 
2035   bnez(t0, done);
2036 
2037   // use floating-point 1.0 with a sign of input
2038   is_double ? fsgnj_d(dst, one, dst)
2039             : fsgnj_s(dst, one, dst);
2040 
2041   bind(done);
2042 }
2043 
2044 static void float16_to_float_slow_path(C2_MacroAssembler& masm, C2GeneralStub<FloatRegister, Register, Register>& stub) {
2045 #define __ masm.
2046   FloatRegister dst = stub.data<0>();
2047   Register src = stub.data<1>();
2048   Register tmp = stub.data<2>();
2049   __ bind(stub.entry());
2050 
2051   // following instructions mainly focus on NaN, as riscv does not handle
2052   // NaN well with fcvt, but the code also works for Inf at the same time.
2053 
2054   // construct a NaN in 32 bits from the NaN in 16 bits,
2055   // we need the payloads of non-canonical NaNs to be preserved.
2056   __ mv(tmp, 0x7f800000);
2057   // sign-bit was already set via sign-extension if necessary.
2058   __ slli(t0, src, 13);
2059   __ orr(tmp, t0, tmp);
2060   __ fmv_w_x(dst, tmp);
2061 
2062   __ j(stub.continuation());
2063 #undef __
2064 }
2065 
2066 // j.l.Float.float16ToFloat
2067 void C2_MacroAssembler::float16_to_float(FloatRegister dst, Register src, Register tmp) {
2068   auto stub = C2CodeStub::make<FloatRegister, Register, Register>(dst, src, tmp, 20, float16_to_float_slow_path);
2069 
2070   // On riscv, NaN needs a special process as fcvt does not work in that case.
2071   // On riscv, Inf does not need a special process as fcvt can handle it correctly.
2072   // but we consider to get the slow path to process NaN and Inf at the same time,
2073   // as both of them are rare cases, and if we try to get the slow path to handle
2074   // only NaN case it would sacrifise the performance for normal cases,
2075   // i.e. non-NaN and non-Inf cases.
2076 
2077   // check whether it's a NaN or +/- Inf.
2078   mv(t0, 0x7c00);
2079   andr(tmp, src, t0);
2080   // jump to stub processing NaN and Inf cases.
2081   beq(t0, tmp, stub->entry());
2082 
2083   // non-NaN or non-Inf cases, just use built-in instructions.
2084   fmv_h_x(dst, src);
2085   fcvt_s_h(dst, dst);
2086 
2087   bind(stub->continuation());
2088 }
2089 
2090 static void float_to_float16_slow_path(C2_MacroAssembler& masm, C2GeneralStub<Register, FloatRegister, Register>& stub) {
2091 #define __ masm.
2092   Register dst = stub.data<0>();
2093   FloatRegister src = stub.data<1>();
2094   Register tmp = stub.data<2>();
2095   __ bind(stub.entry());
2096 
2097   __ fmv_x_w(dst, src);
2098 
2099   // preserve the payloads of non-canonical NaNs.
2100   __ srai(dst, dst, 13);
2101   // preserve the sign bit.
2102   __ srai(tmp, dst, 13);
2103   __ slli(tmp, tmp, 10);
2104   __ mv(t0, 0x3ff);
2105   __ orr(tmp, tmp, t0);
2106 
2107   // get the result by merging sign bit and payloads of preserved non-canonical NaNs.
2108   __ andr(dst, dst, tmp);
2109 
2110   __ j(stub.continuation());
2111 #undef __
2112 }
2113 
2114 // j.l.Float.floatToFloat16
2115 void C2_MacroAssembler::float_to_float16(Register dst, FloatRegister src, FloatRegister ftmp, Register xtmp) {
2116   auto stub = C2CodeStub::make<Register, FloatRegister, Register>(dst, src, xtmp, 130, float_to_float16_slow_path);
2117 
2118   // On riscv, NaN needs a special process as fcvt does not work in that case.
2119 
2120   // check whether it's a NaN.
2121   // replace fclass with feq as performance optimization.
2122   feq_s(t0, src, src);
2123   // jump to stub processing NaN cases.
2124   beqz(t0, stub->entry());
2125 
2126   // non-NaN cases, just use built-in instructions.
2127   fcvt_h_s(ftmp, src);
2128   fmv_x_h(dst, ftmp);
2129 
2130   bind(stub->continuation());
2131 }
2132 
2133 static void float16_to_float_v_slow_path(C2_MacroAssembler& masm, C2GeneralStub<VectorRegister, VectorRegister, uint>& stub) {
2134 #define __ masm.
2135   VectorRegister dst = stub.data<0>();
2136   VectorRegister src = stub.data<1>();
2137   uint vector_length = stub.data<2>();
2138   __ bind(stub.entry());
2139 
2140   // following instructions mainly focus on NaN, as riscv does not handle
2141   // NaN well with vfwcvt_f_f_v, but the code also works for Inf at the same time.
2142   //
2143   // construct NaN's in 32 bits from the NaN's in 16 bits,
2144   // we need the payloads of non-canonical NaNs to be preserved.
2145 
2146   // adjust vector type to 2 * SEW.
2147   __ vsetvli_helper(T_FLOAT, vector_length, Assembler::m1);
2148   // widen and sign-extend src data.
2149   __ vsext_vf2(dst, src, Assembler::v0_t);
2150   __ mv(t0, 0x7f800000);
2151   // sign-bit was already set via sign-extension if necessary.
2152   __ vsll_vi(dst, dst, 13, Assembler::v0_t);
2153   __ vor_vx(dst, dst, t0, Assembler::v0_t);
2154 
2155   __ j(stub.continuation());
2156 #undef __
2157 }
2158 
2159 // j.l.Float.float16ToFloat
2160 void C2_MacroAssembler::float16_to_float_v(VectorRegister dst, VectorRegister src, uint vector_length) {
2161   auto stub = C2CodeStub::make<VectorRegister, VectorRegister, uint>
2162               (dst, src, vector_length, 24, float16_to_float_v_slow_path);
2163   assert_different_registers(dst, src);
2164 
2165   // On riscv, NaN needs a special process as vfwcvt_f_f_v does not work in that case.
2166   // On riscv, Inf does not need a special process as vfwcvt_f_f_v can handle it correctly.
2167   // but we consider to get the slow path to process NaN and Inf at the same time,
2168   // as both of them are rare cases, and if we try to get the slow path to handle
2169   // only NaN case it would sacrifise the performance for normal cases,
2170   // i.e. non-NaN and non-Inf cases.
2171 
2172   vsetvli_helper(BasicType::T_SHORT, vector_length, Assembler::mf2);
2173 
2174   // check whether there is a NaN or +/- Inf.
2175   mv(t0, 0x7c00);
2176   vand_vx(v0, src, t0);
2177   // v0 will be used as mask in slow path.
2178   vmseq_vx(v0, v0, t0);
2179   vcpop_m(t0, v0);
2180 
2181   // For non-NaN or non-Inf cases, just use built-in instructions.
2182   vfwcvt_f_f_v(dst, src);
2183 
2184   // jump to stub processing NaN and Inf cases if there is any of them in the vector-wide.
2185   bnez(t0, stub->entry());
2186 
2187   bind(stub->continuation());
2188 }
2189 
2190 static void float_to_float16_v_slow_path(C2_MacroAssembler& masm,
2191                                          C2GeneralStub<VectorRegister, VectorRegister, VectorRegister>& stub) {
2192 #define __ masm.
2193   VectorRegister dst = stub.data<0>();
2194   VectorRegister src = stub.data<1>();
2195   VectorRegister tmp = stub.data<2>();
2196   __ bind(stub.entry());
2197 
2198   // mul is already set to mf2 in float_to_float16_v.
2199 
2200   // preserve the payloads of non-canonical NaNs.
2201   __ vnsra_wi(dst, src, 13, Assembler::v0_t);
2202 
2203   // preserve the sign bit.
2204   __ vnsra_wi(tmp, src, 26, Assembler::v0_t);
2205   __ vsll_vi(tmp, tmp, 10, Assembler::v0_t);
2206   __ mv(t0, 0x3ff);
2207   __ vor_vx(tmp, tmp, t0, Assembler::v0_t);
2208 
2209   // get the result by merging sign bit and payloads of preserved non-canonical NaNs.
2210   __ vand_vv(dst, dst, tmp, Assembler::v0_t);
2211 
2212   __ j(stub.continuation());
2213 #undef __
2214 }
2215 
2216 // j.l.Float.float16ToFloat
2217 void C2_MacroAssembler::float_to_float16_v(VectorRegister dst, VectorRegister src, VectorRegister vtmp,
2218                                            Register tmp, uint vector_length) {
2219   assert_different_registers(dst, src, vtmp);
2220 
2221   auto stub = C2CodeStub::make<VectorRegister, VectorRegister, VectorRegister>
2222               (dst, src, vtmp, 28, float_to_float16_v_slow_path);
2223 
2224   // On riscv, NaN needs a special process as vfncvt_f_f_w does not work in that case.
2225 
2226   vsetvli_helper(BasicType::T_FLOAT, vector_length, Assembler::m1);
2227 
2228   // check whether there is a NaN.
2229   // replace v_fclass with vmseq_vv as performance optimization.
2230   vmfne_vv(v0, src, src);
2231   vcpop_m(t0, v0);
2232 
2233   vsetvli_helper(BasicType::T_SHORT, vector_length, Assembler::mf2, tmp);
2234 
2235   // For non-NaN cases, just use built-in instructions.
2236   vfncvt_f_f_w(dst, src);
2237 
2238   // jump to stub processing NaN cases.
2239   bnez(t0, stub->entry());
2240 
2241   bind(stub->continuation());
2242 }
2243 
2244 void C2_MacroAssembler::signum_fp_v(VectorRegister dst, VectorRegister one, BasicType bt, int vlen) {
2245   vsetvli_helper(bt, vlen);
2246 
2247   // check if input is -0, +0, signaling NaN or quiet NaN
2248   vfclass_v(v0, dst);
2249   mv(t0, fclass_mask::zero | fclass_mask::nan);
2250   vand_vx(v0, v0, t0);
2251   vmseq_vi(v0, v0, 0);
2252 
2253   // use floating-point 1.0 with a sign of input
2254   vfsgnj_vv(dst, one, dst, v0_t);
2255 }
2256 
2257 void C2_MacroAssembler::compress_bits_v(Register dst, Register src, Register mask, bool is_long) {
2258   Assembler::SEW sew = is_long ? Assembler::e64 : Assembler::e32;
2259   // intrinsic is enabled when MaxVectorSize >= 16
2260   Assembler::LMUL lmul = is_long ? Assembler::m4 : Assembler::m2;
2261   long len = is_long ? 64 : 32;
2262 
2263   // load the src data(in bits) to be compressed.
2264   vsetivli(x0, 1, sew, Assembler::m1);
2265   vmv_s_x(v0, src);
2266   // reset the src data(in bytes) to zero.
2267   mv(t0, len);
2268   vsetvli(x0, t0, Assembler::e8, lmul);
2269   vmv_v_i(v4, 0);
2270   // convert the src data from bits to bytes.
2271   vmerge_vim(v4, v4, 1); // v0 as the implicit mask register
2272   // reset the dst data(in bytes) to zero.
2273   vmv_v_i(v8, 0);
2274   // load the mask data(in bits).
2275   vsetivli(x0, 1, sew, Assembler::m1);
2276   vmv_s_x(v0, mask);
2277   // compress the src data(in bytes) to dst(in bytes).
2278   vsetvli(x0, t0, Assembler::e8, lmul);
2279   vcompress_vm(v8, v4, v0);
2280   // convert the dst data from bytes to bits.
2281   vmseq_vi(v0, v8, 1);
2282   // store result back.
2283   vsetivli(x0, 1, sew, Assembler::m1);
2284   vmv_x_s(dst, v0);
2285 }
2286 
2287 void C2_MacroAssembler::compress_bits_i_v(Register dst, Register src, Register mask) {
2288   compress_bits_v(dst, src, mask, /* is_long */ false);
2289 }
2290 
2291 void C2_MacroAssembler::compress_bits_l_v(Register dst, Register src, Register mask) {
2292   compress_bits_v(dst, src, mask, /* is_long */ true);
2293 }
2294 
2295 void C2_MacroAssembler::expand_bits_v(Register dst, Register src, Register mask, bool is_long) {
2296   Assembler::SEW sew = is_long ? Assembler::e64 : Assembler::e32;
2297   // intrinsic is enabled when MaxVectorSize >= 16
2298   Assembler::LMUL lmul = is_long ? Assembler::m4 : Assembler::m2;
2299   long len = is_long ? 64 : 32;
2300 
2301   // load the src data(in bits) to be expanded.
2302   vsetivli(x0, 1, sew, Assembler::m1);
2303   vmv_s_x(v0, src);
2304   // reset the src data(in bytes) to zero.
2305   mv(t0, len);
2306   vsetvli(x0, t0, Assembler::e8, lmul);
2307   vmv_v_i(v4, 0);
2308   // convert the src data from bits to bytes.
2309   vmerge_vim(v4, v4, 1); // v0 as implicit mask register
2310   // reset the dst data(in bytes) to zero.
2311   vmv_v_i(v12, 0);
2312   // load the mask data(in bits).
2313   vsetivli(x0, 1, sew, Assembler::m1);
2314   vmv_s_x(v0, mask);
2315   // expand the src data(in bytes) to dst(in bytes).
2316   vsetvli(x0, t0, Assembler::e8, lmul);
2317   viota_m(v8, v0);
2318   vrgather_vv(v12, v4, v8, VectorMask::v0_t); // v0 as implicit mask register
2319   // convert the dst data from bytes to bits.
2320   vmseq_vi(v0, v12, 1);
2321   // store result back.
2322   vsetivli(x0, 1, sew, Assembler::m1);
2323   vmv_x_s(dst, v0);
2324 }
2325 
2326 void C2_MacroAssembler::expand_bits_i_v(Register dst, Register src, Register mask) {
2327   expand_bits_v(dst, src, mask, /* is_long */ false);
2328 }
2329 
2330 void C2_MacroAssembler::expand_bits_l_v(Register dst, Register src, Register mask) {
2331   expand_bits_v(dst, src, mask, /* is_long */ true);
2332 }
2333 
2334 void C2_MacroAssembler::element_compare(Register a1, Register a2, Register result, Register cnt, Register tmp1, Register tmp2,
2335                                         VectorRegister vr1, VectorRegister vr2, VectorRegister vrs, bool islatin, Label &DONE,
2336                                         Assembler::LMUL lmul) {
2337   Label loop;
2338   Assembler::SEW sew = islatin ? Assembler::e8 : Assembler::e16;
2339 
2340   bind(loop);
2341   vsetvli(tmp1, cnt, sew, lmul);
2342   vlex_v(vr1, a1, sew);
2343   vlex_v(vr2, a2, sew);
2344   vmsne_vv(vrs, vr1, vr2);
2345   vfirst_m(tmp2, vrs);
2346   bgez(tmp2, DONE);
2347   sub(cnt, cnt, tmp1);
2348   if (!islatin) {
2349     slli(tmp1, tmp1, 1); // get byte counts
2350   }
2351   add(a1, a1, tmp1);
2352   add(a2, a2, tmp1);
2353   bnez(cnt, loop);
2354 
2355   mv(result, true);
2356 }
2357 
2358 void C2_MacroAssembler::string_equals_v(Register a1, Register a2, Register result, Register cnt) {
2359   Label DONE;
2360   Register tmp1 = t0;
2361   Register tmp2 = t1;
2362 
2363   BLOCK_COMMENT("string_equals_v {");
2364 
2365   mv(result, false);
2366 
2367   element_compare(a1, a2, result, cnt, tmp1, tmp2, v2, v4, v2, true, DONE, Assembler::m2);
2368 
2369   bind(DONE);
2370   BLOCK_COMMENT("} string_equals_v");
2371 }
2372 
2373 // used by C2 ClearArray patterns.
2374 // base: Address of a buffer to be zeroed
2375 // cnt: Count in HeapWords
2376 //
2377 // base, cnt, v4, v5, v6, v7 and t0 are clobbered.
2378 void C2_MacroAssembler::clear_array_v(Register base, Register cnt) {
2379   Label loop;
2380 
2381   // making zero words
2382   vsetvli(t0, cnt, Assembler::e64, Assembler::m4);
2383   vxor_vv(v4, v4, v4);
2384 
2385   bind(loop);
2386   vsetvli(t0, cnt, Assembler::e64, Assembler::m4);
2387   vse64_v(v4, base);
2388   sub(cnt, cnt, t0);
2389   shadd(base, t0, base, t0, 3);
2390   bnez(cnt, loop);
2391 }
2392 
2393 void C2_MacroAssembler::arrays_equals_v(Register a1, Register a2, Register result,
2394                                         Register cnt1, int elem_size) {
2395   Label DONE;
2396   Register tmp1 = t0;
2397   Register tmp2 = t1;
2398   Register cnt2 = tmp2;
2399   int length_offset = arrayOopDesc::length_offset_in_bytes();
2400   int base_offset = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
2401 
2402   BLOCK_COMMENT("arrays_equals_v {");
2403 
2404   // if (a1 == a2), return true
2405   mv(result, true);
2406   beq(a1, a2, DONE);
2407 
2408   mv(result, false);
2409   // if a1 == null or a2 == null, return false
2410   beqz(a1, DONE);
2411   beqz(a2, DONE);
2412   // if (a1.length != a2.length), return false
2413   lwu(cnt1, Address(a1, length_offset));
2414   lwu(cnt2, Address(a2, length_offset));
2415   bne(cnt1, cnt2, DONE);
2416 
2417   la(a1, Address(a1, base_offset));
2418   la(a2, Address(a2, base_offset));
2419 
2420   element_compare(a1, a2, result, cnt1, tmp1, tmp2, v2, v4, v2, elem_size == 1, DONE, Assembler::m2);
2421 
2422   bind(DONE);
2423 
2424   BLOCK_COMMENT("} arrays_equals_v");
2425 }
2426 
2427 void C2_MacroAssembler::string_compare_v(Register str1, Register str2, Register cnt1, Register cnt2,
2428                                          Register result, Register tmp1, Register tmp2, int encForm) {
2429   Label DIFFERENCE, DONE, L, loop;
2430   bool encLL = encForm == StrIntrinsicNode::LL;
2431   bool encLU = encForm == StrIntrinsicNode::LU;
2432   bool encUL = encForm == StrIntrinsicNode::UL;
2433 
2434   bool str1_isL = encLL || encLU;
2435   bool str2_isL = encLL || encUL;
2436 
2437   int minCharsInWord = encLL ? wordSize : wordSize / 2;
2438 
2439   BLOCK_COMMENT("string_compare {");
2440 
2441   // for Latin strings, 1 byte for 1 character
2442   // for UTF16 strings, 2 bytes for 1 character
2443   if (!str1_isL)
2444     sraiw(cnt1, cnt1, 1);
2445   if (!str2_isL)
2446     sraiw(cnt2, cnt2, 1);
2447 
2448   // if str1 == str2, return the difference
2449   // save the minimum of the string lengths in cnt2.
2450   sub(result, cnt1, cnt2);
2451   bgt(cnt1, cnt2, L);
2452   mv(cnt2, cnt1);
2453   bind(L);
2454 
2455   // We focus on the optimization of small sized string.
2456   // Please check below document for string size distribution statistics.
2457   // https://cr.openjdk.org/~shade/density/string-density-report.pdf
2458   if (str1_isL == str2_isL) { // LL or UU
2459     // Below construction of v regs and lmul is based on test on 2 different boards,
2460     // vlen == 128 and vlen == 256 respectively.
2461     if (!encLL && MaxVectorSize == 16) { // UU
2462       element_compare(str1, str2, zr, cnt2, tmp1, tmp2, v4, v8, v4, encLL, DIFFERENCE, Assembler::m4);
2463     } else { // UU + MaxVectorSize or LL
2464       element_compare(str1, str2, zr, cnt2, tmp1, tmp2, v2, v4, v2, encLL, DIFFERENCE, Assembler::m2);
2465     }
2466 
2467     j(DONE);
2468   } else { // LU or UL
2469     Register strL = encLU ? str1 : str2;
2470     Register strU = encLU ? str2 : str1;
2471     VectorRegister vstr1 = encLU ? v8 : v4;
2472     VectorRegister vstr2 = encLU ? v4 : v8;
2473 
2474     bind(loop);
2475     vsetvli(tmp1, cnt2, Assembler::e8, Assembler::m2);
2476     vle8_v(vstr1, strL);
2477     vsetvli(tmp1, cnt2, Assembler::e16, Assembler::m4);
2478     vzext_vf2(vstr2, vstr1);
2479     vle16_v(vstr1, strU);
2480     vmsne_vv(v4, vstr2, vstr1);
2481     vfirst_m(tmp2, v4);
2482     bgez(tmp2, DIFFERENCE);
2483     sub(cnt2, cnt2, tmp1);
2484     add(strL, strL, tmp1);
2485     shadd(strU, tmp1, strU, tmp1, 1);
2486     bnez(cnt2, loop);
2487     j(DONE);
2488   }
2489 
2490   bind(DIFFERENCE);
2491   slli(tmp1, tmp2, 1);
2492   add(str1, str1, str1_isL ? tmp2 : tmp1);
2493   add(str2, str2, str2_isL ? tmp2 : tmp1);
2494   str1_isL ? lbu(tmp1, Address(str1, 0)) : lhu(tmp1, Address(str1, 0));
2495   str2_isL ? lbu(tmp2, Address(str2, 0)) : lhu(tmp2, Address(str2, 0));
2496   sub(result, tmp1, tmp2);
2497 
2498   bind(DONE);
2499 }
2500 
2501 void C2_MacroAssembler::byte_array_inflate_v(Register src, Register dst, Register len, Register tmp) {
2502   Label loop;
2503   assert_different_registers(src, dst, len, tmp, t0);
2504 
2505   BLOCK_COMMENT("byte_array_inflate_v {");
2506   bind(loop);
2507   vsetvli(tmp, len, Assembler::e8, Assembler::m2);
2508   vle8_v(v6, src);
2509   vsetvli(t0, len, Assembler::e16, Assembler::m4);
2510   vzext_vf2(v4, v6);
2511   vse16_v(v4, dst);
2512   sub(len, len, tmp);
2513   add(src, src, tmp);
2514   shadd(dst, tmp, dst, tmp, 1);
2515   bnez(len, loop);
2516   BLOCK_COMMENT("} byte_array_inflate_v");
2517 }
2518 
2519 // Compress char[] array to byte[].
2520 // Intrinsic for java.lang.StringUTF16.compress(char[] src, int srcOff, byte[] dst, int dstOff, int len)
2521 // result: the array length if every element in array can be encoded,
2522 // otherwise, the index of first non-latin1 (> 0xff) character.
2523 void C2_MacroAssembler::char_array_compress_v(Register src, Register dst, Register len,
2524                                               Register result, Register tmp) {
2525   encode_iso_array_v(src, dst, len, result, tmp, false);
2526 }
2527 
2528 // Intrinsic for
2529 //
2530 // - sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray
2531 //     return the number of characters copied.
2532 // - java/lang/StringUTF16.compress
2533 //     return index of non-latin1 character if copy fails, otherwise 'len'.
2534 //
2535 // This version always returns the number of characters copied. A successful
2536 // copy will complete with the post-condition: 'res' == 'len', while an
2537 // unsuccessful copy will exit with the post-condition: 0 <= 'res' < 'len'.
2538 //
2539 // Clobbers: src, dst, len, result, t0
2540 void C2_MacroAssembler::encode_iso_array_v(Register src, Register dst, Register len,
2541                                            Register result, Register tmp, bool ascii) {
2542   Label loop, fail, done;
2543 
2544   BLOCK_COMMENT("encode_iso_array_v {");
2545   mv(result, 0);
2546 
2547   bind(loop);
2548   mv(tmp, ascii ? 0x7f : 0xff);
2549   vsetvli(t0, len, Assembler::e16, Assembler::m2);
2550   vle16_v(v2, src);
2551 
2552   vmsgtu_vx(v1, v2, tmp);
2553   vfirst_m(tmp, v1);
2554   vmsbf_m(v0, v1);
2555   // compress char to byte
2556   vsetvli(t0, len, Assembler::e8);
2557   vncvt_x_x_w(v1, v2, Assembler::v0_t);
2558   vse8_v(v1, dst, Assembler::v0_t);
2559 
2560   // fail if char > 0x7f/0xff
2561   bgez(tmp, fail);
2562   add(result, result, t0);
2563   add(dst, dst, t0);
2564   sub(len, len, t0);
2565   shadd(src, t0, src, t0, 1);
2566   bnez(len, loop);
2567   j(done);
2568 
2569   bind(fail);
2570   add(result, result, tmp);
2571 
2572   bind(done);
2573   BLOCK_COMMENT("} encode_iso_array_v");
2574 }
2575 
2576 void C2_MacroAssembler::count_positives_v(Register ary, Register len, Register result, Register tmp) {
2577   Label LOOP, SET_RESULT, DONE;
2578 
2579   BLOCK_COMMENT("count_positives_v {");
2580   assert_different_registers(ary, len, result, tmp);
2581 
2582   mv(result, zr);
2583 
2584   bind(LOOP);
2585   vsetvli(t0, len, Assembler::e8, Assembler::m4);
2586   vle8_v(v4, ary);
2587   vmslt_vx(v4, v4, zr);
2588   vfirst_m(tmp, v4);
2589   bgez(tmp, SET_RESULT);
2590   // if tmp == -1, all bytes are positive
2591   add(result, result, t0);
2592 
2593   sub(len, len, t0);
2594   add(ary, ary, t0);
2595   bnez(len, LOOP);
2596   j(DONE);
2597 
2598   // add remaining positive bytes count
2599   bind(SET_RESULT);
2600   add(result, result, tmp);
2601 
2602   bind(DONE);
2603   BLOCK_COMMENT("} count_positives_v");
2604 }
2605 
2606 void C2_MacroAssembler::string_indexof_char_v(Register str1, Register cnt1,
2607                                               Register ch, Register result,
2608                                               Register tmp1, Register tmp2,
2609                                               bool isL) {
2610   mv(result, zr);
2611 
2612   Label loop, MATCH, DONE;
2613   Assembler::SEW sew = isL ? Assembler::e8 : Assembler::e16;
2614   bind(loop);
2615   vsetvli(tmp1, cnt1, sew, Assembler::m4);
2616   vlex_v(v4, str1, sew);
2617   vmseq_vx(v4, v4, ch);
2618   vfirst_m(tmp2, v4);
2619   bgez(tmp2, MATCH); // if equal, return index
2620 
2621   add(result, result, tmp1);
2622   sub(cnt1, cnt1, tmp1);
2623   if (!isL) slli(tmp1, tmp1, 1);
2624   add(str1, str1, tmp1);
2625   bnez(cnt1, loop);
2626 
2627   mv(result, -1);
2628   j(DONE);
2629 
2630   bind(MATCH);
2631   add(result, result, tmp2);
2632 
2633   bind(DONE);
2634 }
2635 
2636 // Set dst to NaN if any NaN input.
2637 void C2_MacroAssembler::minmax_fp_v(VectorRegister dst, VectorRegister src1, VectorRegister src2,
2638                                     BasicType bt, bool is_min, uint vector_length) {
2639   assert_different_registers(dst, src1, src2);
2640 
2641   vsetvli_helper(bt, vector_length);
2642 
2643   is_min ? vfmin_vv(dst, src1, src2)
2644          : vfmax_vv(dst, src1, src2);
2645 
2646   vmfne_vv(v0,  src1, src1);
2647   vfadd_vv(dst, src1, src1, Assembler::v0_t);
2648   vmfne_vv(v0,  src2, src2);
2649   vfadd_vv(dst, src2, src2, Assembler::v0_t);
2650 }
2651 
2652 // Set dst to NaN if any NaN input.
2653 // The destination vector register elements corresponding to masked-off elements
2654 // are handled with a mask-undisturbed policy.
2655 void C2_MacroAssembler::minmax_fp_masked_v(VectorRegister dst, VectorRegister src1, VectorRegister src2,
2656                                            VectorRegister vmask, VectorRegister tmp1, VectorRegister tmp2,
2657                                            BasicType bt, bool is_min, uint vector_length) {
2658   assert_different_registers(src1, src2, tmp1, tmp2);
2659   vsetvli_helper(bt, vector_length);
2660 
2661   // Check vector elements of src1 and src2 for NaN.
2662   vmfeq_vv(tmp1, src1, src1);
2663   vmfeq_vv(tmp2, src2, src2);
2664 
2665   vmandn_mm(v0, vmask, tmp1);
2666   vfadd_vv(dst, src1, src1, Assembler::v0_t);
2667   vmandn_mm(v0, vmask, tmp2);
2668   vfadd_vv(dst, src2, src2, Assembler::v0_t);
2669 
2670   vmand_mm(tmp2, tmp1, tmp2);
2671   vmand_mm(v0, vmask, tmp2);
2672   is_min ? vfmin_vv(dst, src1, src2, Assembler::v0_t)
2673          : vfmax_vv(dst, src1, src2, Assembler::v0_t);
2674 }
2675 
2676 // Set dst to NaN if any NaN input.
2677 void C2_MacroAssembler::reduce_minmax_fp_v(FloatRegister dst,
2678                                            FloatRegister src1, VectorRegister src2,
2679                                            VectorRegister tmp1, VectorRegister tmp2,
2680                                            bool is_double, bool is_min, uint vector_length, VectorMask vm) {
2681   assert_different_registers(dst, src1);
2682   assert_different_registers(src2, tmp1, tmp2);
2683 
2684   Label L_done, L_NaN_1, L_NaN_2;
2685   // Set dst to src1 if src1 is NaN
2686   is_double ? feq_d(t0, src1, src1)
2687             : feq_s(t0, src1, src1);
2688   beqz(t0, L_NaN_2);
2689 
2690   vsetvli_helper(is_double ? T_DOUBLE : T_FLOAT, vector_length);
2691   vfmv_s_f(tmp2, src1);
2692 
2693   is_min ? vfredmin_vs(tmp1, src2, tmp2, vm)
2694          : vfredmax_vs(tmp1, src2, tmp2, vm);
2695   vfmv_f_s(dst, tmp1);
2696 
2697   // Checking NaNs in src2
2698   vmfne_vv(tmp1, src2, src2, vm);
2699   vcpop_m(t0, tmp1, vm);
2700   beqz(t0, L_done);
2701 
2702   bind(L_NaN_1);
2703   vfredusum_vs(tmp1, src2, tmp2, vm);
2704   vfmv_f_s(dst, tmp1);
2705   j(L_done);
2706 
2707   bind(L_NaN_2);
2708   is_double ? fmv_d(dst, src1)
2709             : fmv_s(dst, src1);
2710   bind(L_done);
2711 }
2712 
2713 bool C2_MacroAssembler::in_scratch_emit_size() {
2714   if (ciEnv::current()->task() != nullptr) {
2715     PhaseOutput* phase_output = Compile::current()->output();
2716     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2717       return true;
2718     }
2719   }
2720   return MacroAssembler::in_scratch_emit_size();
2721 }
2722 
2723 void C2_MacroAssembler::reduce_integral_v(Register dst, Register src1,
2724                                           VectorRegister src2, VectorRegister tmp,
2725                                           int opc, BasicType bt, uint vector_length, VectorMask vm) {
2726   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2727   vsetvli_helper(bt, vector_length);
2728   vmv_s_x(tmp, src1);
2729   switch (opc) {
2730     case Op_AddReductionVI:
2731     case Op_AddReductionVL:
2732       vredsum_vs(tmp, src2, tmp, vm);
2733       break;
2734     case Op_AndReductionV:
2735       vredand_vs(tmp, src2, tmp, vm);
2736       break;
2737     case Op_OrReductionV:
2738       vredor_vs(tmp, src2, tmp, vm);
2739       break;
2740     case Op_XorReductionV:
2741       vredxor_vs(tmp, src2, tmp, vm);
2742       break;
2743     case Op_MaxReductionV:
2744       vredmax_vs(tmp, src2, tmp, vm);
2745       break;
2746     case Op_MinReductionV:
2747       vredmin_vs(tmp, src2, tmp, vm);
2748       break;
2749     default:
2750       ShouldNotReachHere();
2751   }
2752   vmv_x_s(dst, tmp);
2753 }
2754 
2755 // Set vl and vtype for full and partial vector operations.
2756 // (vma = mu, vta = tu, vill = false)
2757 void C2_MacroAssembler::vsetvli_helper(BasicType bt, uint vector_length, LMUL vlmul, Register tmp) {
2758   Assembler::SEW sew = Assembler::elemtype_to_sew(bt);
2759   if (vector_length <= 31) {
2760     vsetivli(tmp, vector_length, sew, vlmul);
2761   } else if (vector_length == (MaxVectorSize / type2aelembytes(bt))) {
2762     vsetvli(tmp, x0, sew, vlmul);
2763   } else {
2764     mv(tmp, vector_length);
2765     vsetvli(tmp, tmp, sew, vlmul);
2766   }
2767 }
2768 
2769 void C2_MacroAssembler::compare_integral_v(VectorRegister vd, VectorRegister src1, VectorRegister src2,
2770                                            int cond, BasicType bt, uint vector_length, VectorMask vm) {
2771   assert(is_integral_type(bt), "unsupported element type");
2772   assert(vm == Assembler::v0_t ? vd != v0 : true, "should be different registers");
2773   vsetvli_helper(bt, vector_length);
2774   vmclr_m(vd);
2775   switch (cond) {
2776     case BoolTest::eq: vmseq_vv(vd, src1, src2, vm); break;
2777     case BoolTest::ne: vmsne_vv(vd, src1, src2, vm); break;
2778     case BoolTest::le: vmsle_vv(vd, src1, src2, vm); break;
2779     case BoolTest::ge: vmsge_vv(vd, src1, src2, vm); break;
2780     case BoolTest::lt: vmslt_vv(vd, src1, src2, vm); break;
2781     case BoolTest::gt: vmsgt_vv(vd, src1, src2, vm); break;
2782     case BoolTest::ule: vmsleu_vv(vd, src1, src2, vm); break;
2783     case BoolTest::uge: vmsgeu_vv(vd, src1, src2, vm); break;
2784     case BoolTest::ult: vmsltu_vv(vd, src1, src2, vm); break;
2785     case BoolTest::ugt: vmsgtu_vv(vd, src1, src2, vm); break;
2786     default:
2787       assert(false, "unsupported compare condition");
2788       ShouldNotReachHere();
2789   }
2790 }
2791 
2792 void C2_MacroAssembler::compare_fp_v(VectorRegister vd, VectorRegister src1, VectorRegister src2,
2793                                      int cond, BasicType bt, uint vector_length, VectorMask vm) {
2794   assert(is_floating_point_type(bt), "unsupported element type");
2795   assert(vm == Assembler::v0_t ? vd != v0 : true, "should be different registers");
2796   vsetvli_helper(bt, vector_length);
2797   vmclr_m(vd);
2798   switch (cond) {
2799     case BoolTest::eq: vmfeq_vv(vd, src1, src2, vm); break;
2800     case BoolTest::ne: vmfne_vv(vd, src1, src2, vm); break;
2801     case BoolTest::le: vmfle_vv(vd, src1, src2, vm); break;
2802     case BoolTest::ge: vmfge_vv(vd, src1, src2, vm); break;
2803     case BoolTest::lt: vmflt_vv(vd, src1, src2, vm); break;
2804     case BoolTest::gt: vmfgt_vv(vd, src1, src2, vm); break;
2805     default:
2806       assert(false, "unsupported compare condition");
2807       ShouldNotReachHere();
2808   }
2809 }
2810 
2811 // In Matcher::scalable_predicate_reg_slots,
2812 // we assume each predicate register is one-eighth of the size of
2813 // scalable vector register, one mask bit per vector byte.
2814 void C2_MacroAssembler::spill_vmask(VectorRegister v, int offset) {
2815   vsetvli_helper(T_BYTE, MaxVectorSize >> 3);
2816   add(t0, sp, offset);
2817   vse8_v(v, t0);
2818 }
2819 
2820 void C2_MacroAssembler::unspill_vmask(VectorRegister v, int offset) {
2821   vsetvli_helper(T_BYTE, MaxVectorSize >> 3);
2822   add(t0, sp, offset);
2823   vle8_v(v, t0);
2824 }
2825 
2826 void C2_MacroAssembler::integer_extend_v(VectorRegister dst, BasicType dst_bt, uint vector_length,
2827                                          VectorRegister src, BasicType src_bt, bool is_signed) {
2828   assert(type2aelembytes(dst_bt) > type2aelembytes(src_bt) && type2aelembytes(dst_bt) <= 8 && type2aelembytes(src_bt) <= 4, "invalid element size");
2829   assert(dst_bt != T_FLOAT && dst_bt != T_DOUBLE && src_bt != T_FLOAT && src_bt != T_DOUBLE, "unsupported element type");
2830   // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#52-vector-operands
2831   // The destination EEW is greater than the source EEW, the source EMUL is at least 1,
2832   // and the overlap is in the highest-numbered part of the destination register group.
2833   // Since LMUL=1, vd and vs cannot be the same.
2834   assert_different_registers(dst, src);
2835 
2836   vsetvli_helper(dst_bt, vector_length);
2837   if (is_signed) {
2838     if (src_bt == T_BYTE) {
2839       switch (dst_bt) {
2840       case T_SHORT:
2841         vsext_vf2(dst, src);
2842         break;
2843       case T_INT:
2844         vsext_vf4(dst, src);
2845         break;
2846       case T_LONG:
2847         vsext_vf8(dst, src);
2848         break;
2849       default:
2850         ShouldNotReachHere();
2851       }
2852     } else if (src_bt == T_SHORT) {
2853       if (dst_bt == T_INT) {
2854         vsext_vf2(dst, src);
2855       } else {
2856         vsext_vf4(dst, src);
2857       }
2858     } else if (src_bt == T_INT) {
2859       vsext_vf2(dst, src);
2860     }
2861   } else {
2862     if (src_bt == T_BYTE) {
2863       switch (dst_bt) {
2864       case T_SHORT:
2865         vzext_vf2(dst, src);
2866         break;
2867       case T_INT:
2868         vzext_vf4(dst, src);
2869         break;
2870       case T_LONG:
2871         vzext_vf8(dst, src);
2872         break;
2873       default:
2874         ShouldNotReachHere();
2875       }
2876     } else if (src_bt == T_SHORT) {
2877       if (dst_bt == T_INT) {
2878         vzext_vf2(dst, src);
2879       } else {
2880         vzext_vf4(dst, src);
2881       }
2882     } else if (src_bt == T_INT) {
2883       vzext_vf2(dst, src);
2884     }
2885   }
2886 }
2887 
2888 // Vector narrow from src to dst with specified element sizes.
2889 // High part of dst vector will be filled with zero.
2890 void C2_MacroAssembler::integer_narrow_v(VectorRegister dst, BasicType dst_bt, uint vector_length,
2891                                          VectorRegister src, BasicType src_bt) {
2892   assert(type2aelembytes(dst_bt) < type2aelembytes(src_bt) && type2aelembytes(dst_bt) <= 4 && type2aelembytes(src_bt) <= 8, "invalid element size");
2893   assert(dst_bt != T_FLOAT && dst_bt != T_DOUBLE && src_bt != T_FLOAT && src_bt != T_DOUBLE, "unsupported element type");
2894   mv(t0, vector_length);
2895   if (src_bt == T_LONG) {
2896     // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#117-vector-narrowing-integer-right-shift-instructions
2897     // Future extensions might add support for versions that narrow to a destination that is 1/4 the width of the source.
2898     // So we can currently only scale down by 1/2 the width at a time.
2899     vsetvli(t0, t0, Assembler::e32, Assembler::mf2);
2900     vncvt_x_x_w(dst, src);
2901     if (dst_bt == T_SHORT || dst_bt == T_BYTE) {
2902       vsetvli(t0, t0, Assembler::e16, Assembler::mf2);
2903       vncvt_x_x_w(dst, dst);
2904       if (dst_bt == T_BYTE) {
2905         vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
2906         vncvt_x_x_w(dst, dst);
2907       }
2908     }
2909   } else if (src_bt == T_INT) {
2910     // T_SHORT
2911     vsetvli(t0, t0, Assembler::e16, Assembler::mf2);
2912     vncvt_x_x_w(dst, src);
2913     if (dst_bt == T_BYTE) {
2914       vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
2915       vncvt_x_x_w(dst, dst);
2916     }
2917   } else if (src_bt == T_SHORT) {
2918     vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
2919     vncvt_x_x_w(dst, src);
2920   }
2921 }
2922 
2923 #define VFCVT_SAFE(VFLOATCVT)                                                      \
2924 void C2_MacroAssembler::VFLOATCVT##_safe(VectorRegister dst, VectorRegister src) { \
2925   assert_different_registers(dst, src);                                            \
2926   vxor_vv(dst, dst, dst);                                                          \
2927   vmfeq_vv(v0, src, src);                                                          \
2928   VFLOATCVT(dst, src, Assembler::v0_t);                                            \
2929 }
2930 
2931 VFCVT_SAFE(vfcvt_rtz_x_f_v);
2932 
2933 #undef VFCVT_SAFE
2934 
2935 // Extract a scalar element from an vector at position 'idx'.
2936 // The input elements in src are expected to be of integral type.
2937 void C2_MacroAssembler::extract_v(Register dst, VectorRegister src, BasicType bt,
2938                                   int idx, VectorRegister tmp) {
2939   assert(is_integral_type(bt), "unsupported element type");
2940   assert(idx >= 0, "idx cannot be negative");
2941   // Only need the first element after vector slidedown
2942   vsetvli_helper(bt, 1);
2943   if (idx == 0) {
2944     vmv_x_s(dst, src);
2945   } else if (idx <= 31) {
2946     vslidedown_vi(tmp, src, idx);
2947     vmv_x_s(dst, tmp);
2948   } else {
2949     mv(t0, idx);
2950     vslidedown_vx(tmp, src, t0);
2951     vmv_x_s(dst, tmp);
2952   }
2953 }
2954 
2955 // Extract a scalar element from an vector at position 'idx'.
2956 // The input elements in src are expected to be of floating point type.
2957 void C2_MacroAssembler::extract_fp_v(FloatRegister dst, VectorRegister src, BasicType bt,
2958                                      int idx, VectorRegister tmp) {
2959   assert(is_floating_point_type(bt), "unsupported element type");
2960   assert(idx >= 0, "idx cannot be negative");
2961   // Only need the first element after vector slidedown
2962   vsetvli_helper(bt, 1);
2963   if (idx == 0) {
2964     vfmv_f_s(dst, src);
2965   } else if (idx <= 31) {
2966     vslidedown_vi(tmp, src, idx);
2967     vfmv_f_s(dst, tmp);
2968   } else {
2969     mv(t0, idx);
2970     vslidedown_vx(tmp, src, t0);
2971     vfmv_f_s(dst, tmp);
2972   }
2973 }