1 /*
   2  * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2020, 2022, Huawei Technologies Co., Ltd. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/assembler.hpp"
  28 #include "asm/assembler.inline.hpp"
  29 #include "opto/c2_MacroAssembler.hpp"
  30 #include "opto/compile.hpp"
  31 #include "opto/intrinsicnode.hpp"
  32 #include "opto/output.hpp"
  33 #include "opto/subnode.hpp"
  34 #include "runtime/stubRoutines.hpp"
  35 #include "utilities/globalDefinitions.hpp"
  36 
  37 #ifdef PRODUCT
  38 #define BLOCK_COMMENT(str) /* nothing */
  39 #define STOP(error) stop(error)
  40 #else
  41 #define BLOCK_COMMENT(str) block_comment(str)
  42 #define STOP(error) block_comment(error); stop(error)
  43 #endif
  44 
  45 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  46 
  47 void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg,
  48                                   Register tmp1Reg, Register tmp2Reg, Register tmp3Reg) {
  49   // Use t1 as the flag register to indicate the fast_lock result: zero for success; non-zero for failure.
  50   Register flag = t1;
  51   Register oop = objectReg;
  52   Register box = boxReg;
  53   Register disp_hdr = tmp1Reg;
  54   Register tmp = tmp2Reg;
  55   Label object_has_monitor;
  56   // Finish fast lock successfully. MUST branch to this label with flag == 0.
  57   Label locked;
  58   // Finish fast lock unsuccessfully. MUST branch to slow_path with flag != 0.
  59   Label slow_path;
  60 
  61   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  62   assert_different_registers(oop, box, tmp, disp_hdr, flag, tmp3Reg, t0);
  63 
  64   mv(flag, 1);
  65 
  66   // Load markWord from object into displaced_header.
  67   ld(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));
  68 
  69   if (DiagnoseSyncOnValueBasedClasses != 0) {
  70     load_klass(tmp, oop);
  71     lwu(tmp, Address(tmp, Klass::access_flags_offset()));
  72     test_bit(tmp, tmp, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS));
  73     bnez(tmp, slow_path);
  74   }
  75 
  76   // Check for existing monitor
  77   test_bit(tmp, disp_hdr, exact_log2(markWord::monitor_value));
  78   bnez(tmp, object_has_monitor);
  79 
  80   if (LockingMode == LM_MONITOR) {
  81     j(slow_path);
  82   } else {
  83     assert(LockingMode == LM_LEGACY, "must be");
  84     // Set tmp to be (markWord of object | UNLOCK_VALUE).
  85     ori(tmp, disp_hdr, markWord::unlocked_value);
  86 
  87     // Initialize the box. (Must happen before we update the object mark!)
  88     sd(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
  89 
  90     // Compare object markWord with an unlocked value (tmp) and if
  91     // equal exchange the stack address of our box with object markWord.
  92     // On failure disp_hdr contains the possibly locked markWord.
  93     cmpxchg(/*memory address*/oop, /*expected value*/tmp, /*new value*/box, Assembler::int64,
  94             Assembler::aq, Assembler::rl, /*result*/disp_hdr);
  95     beq(disp_hdr, tmp, locked);
  96 
  97     assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
  98 
  99     // If the compare-and-exchange succeeded, then we found an unlocked
 100     // object and have now locked it; we will continue at label locked.
 101     // Otherwise we did not see an unlocked object, so try the fast recursive case.
 102 
 103     // Check if the owner is self by comparing the value in the
 104     // markWord of object (disp_hdr) with the stack pointer.
 105     sub(disp_hdr, disp_hdr, sp);
 106     mv(tmp, (intptr_t) (~(os::vm_page_size()-1) | (uintptr_t)markWord::lock_mask_in_place));
 107     // If (mark & lock_mask) == 0 and (mark - sp) < page_size, the mark is a stack address
 108     // within our own frame, i.e. we already hold a stack lock on this object. In that case
 109     // we store 0 as the displaced header in the box, which marks it as a recursive lock.
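         // A rough C sketch of the recursive stack-lock test below (illustrative only; page_size
         // and lock_mask stand in for os::vm_page_size() and markWord::lock_mask_in_place):
         //   uintptr_t diff = mark - sp;
         //   bool recursive = (diff & (~(page_size - 1) | lock_mask)) == 0;
         //   if (recursive) { box->displaced_header = 0; goto locked; }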
 110     andr(tmp/*==0?*/, disp_hdr, tmp);
 111     sd(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 112     beqz(tmp, locked);
 113     j(slow_path);
 114   }
 115 
 116   // Handle existing monitor.
 117   bind(object_has_monitor);
 118   // The object's monitor m is unlocked iff m->owner == nullptr,
 119   // otherwise m->owner may contain a thread or a stack address.
 120   //
 121   // Try to CAS m->owner from null to current thread.
 122   add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset()) - markWord::monitor_value));
 123   cmpxchg(/*memory address*/tmp, /*expected value*/zr, /*new value*/xthread, Assembler::int64,
 124           Assembler::aq, Assembler::rl, /*result*/tmp3Reg); // cas succeeds if tmp3Reg == zr(expected)
 125 
 126   // Store a non-null value into the box to avoid looking like a re-entrant
 127   // lock. The fast-path monitor unlock code checks for
 128   // markWord::monitor_value so use markWord::unused_mark which has the
 129   // relevant bit set, and also matches ObjectSynchronizer::slow_enter.
 130   mv(tmp, (address)markWord::unused_mark().value());
 131   sd(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 132 
 133   beqz(tmp3Reg, locked); // CAS success means locking succeeded
 134 
 135   bne(tmp3Reg, xthread, slow_path); // Check for recursive locking
 136 
 137   // Recursive lock case
 138   increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1, tmp2Reg, tmp3Reg);
 139 
 140   bind(locked);
 141   mv(flag, zr);
 142   increment(Address(xthread, JavaThread::held_monitor_count_offset()), 1, tmp2Reg, tmp3Reg);
 143 
 144 #ifdef ASSERT
 145   // Check that locked label is reached with flag == 0.
 146   Label flag_correct;
 147   beqz(flag, flag_correct);
 148   stop("Fast Lock Flag != 0");
 149 #endif
 150 
 151   bind(slow_path);
 152 #ifdef ASSERT
 153   // Check that slow_path label is reached with flag != 0.
 154   bnez(flag, flag_correct);
 155   stop("Fast Lock Flag == 0");
 156   bind(flag_correct);
 157 #endif
 158   // C2 uses the value of flag (0 vs !0) to determine the continuation.
 159 }
 160 
 161 void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg,
 162                                     Register tmp1Reg, Register tmp2Reg) {
 163   // Use t1 as the flag register to indicate the fast_unlock result: zero for success; non-zero for failure.
 164   Register flag = t1;
 165   Register oop = objectReg;
 166   Register box = boxReg;
 167   Register disp_hdr = tmp1Reg;
 168   Register tmp = tmp2Reg;
 169   Label object_has_monitor;
 170   // Finish fast unlock successfully. MUST branch to this label with flag == 0.
 171   Label unlocked;
 172   // Finish fast unlock unsuccessfully. MUST branch to slow_path with flag != 0.
 173   Label slow_path;
 174 
 175   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 176   assert_different_registers(oop, box, tmp, disp_hdr, flag, t0);
 177 
 178   mv(flag, 1);
 179 
 180   if (LockingMode == LM_LEGACY) {
 181     // Find the lock address and load the displaced header from the stack.
 182     ld(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 183 
 184     // If the displaced header is 0, we have a recursive unlock.
 185     beqz(disp_hdr, unlocked);
 186   }
 187 
 188   // Handle existing monitor.
 189   ld(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
 190   test_bit(t0, tmp, exact_log2(markWord::monitor_value));
 191   bnez(t0, object_has_monitor);
 192 
 193   if (LockingMode == LM_MONITOR) {
 194     j(slow_path);
 195   } else {
 196     assert(LockingMode == LM_LEGACY, "must be");
 197     // Check if it is still a lightweight (stack) lock: this is true if we
 198     // see the stack address of the BasicLock in the markWord of the
 199     // object.
 200 
 201     cmpxchg(/*memory address*/oop, /*expected value*/box, /*new value*/disp_hdr, Assembler::int64,
 202             Assembler::relaxed, Assembler::rl, /*result*/tmp);
 203     beq(box, tmp, unlocked); // box == tmp if cas succeeds
 204     j(slow_path);
 205   }
 206 
 207   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
 208 
 209   // Handle existing monitor.
 210   bind(object_has_monitor);
 211   STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
 212   add(tmp, tmp, -(int)markWord::monitor_value); // monitor
 213 
 214   ld(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
 215 
 216   Label notRecursive;
 217   beqz(disp_hdr, notRecursive); // Will be 0 if not recursive.
 218 
 219   // Recursive lock
 220   addi(disp_hdr, disp_hdr, -1);
 221   sd(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
 222   j(unlocked);
 223 
 224   bind(notRecursive);
 225   ld(t0, Address(tmp, ObjectMonitor::EntryList_offset()));
 226   ld(disp_hdr, Address(tmp, ObjectMonitor::cxq_offset()));
 227   orr(t0, t0, disp_hdr); // Will be 0 if both are 0.
 228   bnez(t0, slow_path);
 229 
 230   // need a release store here
 231   la(tmp, Address(tmp, ObjectMonitor::owner_offset()));
 232   membar(MacroAssembler::LoadStore | MacroAssembler::StoreStore);
 233   sd(zr, Address(tmp)); // set unowned
 234 
 235   bind(unlocked);
 236   mv(flag, zr);
 237   decrement(Address(xthread, JavaThread::held_monitor_count_offset()), 1, tmp1Reg, tmp2Reg);
 238 
 239 #ifdef ASSERT
 240   // Check that unlocked label is reached with flag == 0.
 241   Label flag_correct;
 242   beqz(flag, flag_correct);
 243   stop("Fast Unlock Flag != 0");
 244 #endif
 245 
 246   bind(slow_path);
 247 #ifdef ASSERT
 248   // Check that slow_path label is reached with flag != 0.
 249   bnez(flag, flag_correct);
 250   stop("Fast Unlock Flag == 0");
 251   bind(flag_correct);
 252 #endif
 253   // C2 uses the value of flag (0 vs !0) to determine the continuation.
 254 }
 255 
 256 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register tmp1, Register tmp2, Register tmp3) {
 257   // Flag register, zero for success; non-zero for failure.
 258   Register flag = t1;
 259 
 260   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 261   assert_different_registers(obj, tmp1, tmp2, tmp3, flag, t0);
 262 
 263   mv(flag, 1);
 264 
 265   // Handle inflated monitor.
 266   Label inflated;
 267   // Finish fast lock successfully. MUST branch to this label with flag == 0.
 268   Label locked;
 269   // Finish fast lock unsuccessfully. MUST branch to slow_path with flag != 0.
 270   Label slow_path;
 271 
 272   if (DiagnoseSyncOnValueBasedClasses != 0) {
 273     load_klass(tmp1, obj);
 274     lwu(tmp1, Address(tmp1, Klass::access_flags_offset()));
 275     test_bit(tmp1, tmp1, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS));
 276     bnez(tmp1, slow_path);
 277   }
 278 
 279   const Register tmp1_mark = tmp1;
 280 
 281   { // Lightweight locking
 282 
 283     // Push lock to the lock stack and finish successfully. MUST branch to this label with flag == 0.
 284     Label push;
 285 
 286     const Register tmp2_top = tmp2;
 287     const Register tmp3_t = tmp3;
 288 
 289     // Check if lock-stack is full.
 290     lwu(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
 291     mv(tmp3_t, (unsigned)LockStack::end_offset());
 292     bge(tmp2_top, tmp3_t, slow_path);
 293 
 294     // Check if recursive.
 295     add(tmp3_t, xthread, tmp2_top);
 296     ld(tmp3_t, Address(tmp3_t, -oopSize));
 297     beq(obj, tmp3_t, push);
 298 
 299     // Relaxed normal load to check for monitor. Optimization for monitor case.
 300     ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 301     test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
 302     bnez(tmp3_t, inflated);
 303 
 304     // Not inflated
 305     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a la");
 306 
 307     // Try to lock. Transition lock-bits 0b01 => 0b00
 308     ori(tmp1_mark, tmp1_mark, markWord::unlocked_value);
 309     xori(tmp3_t, tmp1_mark, markWord::unlocked_value);
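         // Note: the ori above forms the expected (unlocked) mark by setting the unlocked bit;
         // the xori then clears that same bit again, producing the locked mark that the
         // cmpxchg below tries to install.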
 310     cmpxchg(/*addr*/ obj, /*expected*/ tmp1_mark, /*new*/ tmp3_t, Assembler::int64,
 311             /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ tmp3_t);
 312     bne(tmp1_mark, tmp3_t, slow_path);
 313 
 314     bind(push);
 315     // After successful lock, push object on lock-stack.
 316     add(tmp3_t, xthread, tmp2_top);
 317     sd(obj, Address(tmp3_t));
 318     addw(tmp2_top, tmp2_top, oopSize);
 319     sw(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
 320     j(locked);
 321   }
 322 
 323   { // Handle inflated monitor.
 324     bind(inflated);
 325 
 326     if (!UseObjectMonitorTable) {
 327       // mark contains the tagged ObjectMonitor*.
 328       const Register tmp1_tagged_monitor = tmp1_mark;
 329       const uintptr_t monitor_tag = markWord::monitor_value;
 330       const Register tmp2_owner_addr = tmp2;
 331       const Register tmp3_owner = tmp3;
 332 
 333       // Compute owner address.
 334       la(tmp2_owner_addr, Address(tmp1_tagged_monitor, (in_bytes(ObjectMonitor::owner_offset()) - monitor_tag)));
 335 
 336       // CAS owner (null => current thread).
 337       cmpxchg(/*addr*/ tmp2_owner_addr, /*expected*/ zr, /*new*/ xthread, Assembler::int64,
 338               /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ tmp3_owner);
 339       beqz(tmp3_owner, locked);
 340 
 341       // Check if recursive.
 342       bne(tmp3_owner, xthread, slow_path);
 343 
 344       // Recursive.
 345       increment(Address(tmp1_tagged_monitor, in_bytes(ObjectMonitor::recursions_offset()) - monitor_tag), 1, tmp2, tmp3);
 346     } else {
 347       // OMCache lookup not supported yet. Take the slowpath.
 348       j(slow_path);
 349     }
 350   }
 351 
 352   bind(locked);
 353   mv(flag, zr);
 354   increment(Address(xthread, JavaThread::held_monitor_count_offset()), 1, tmp2, tmp3);
 355 
 356 #ifdef ASSERT
 357   // Check that locked label is reached with flag == 0.
 358   Label flag_correct;
 359   beqz(flag, flag_correct);
 360   stop("Fast Lock Flag != 0");
 361 #endif
 362 
 363   bind(slow_path);
 364 #ifdef ASSERT
 365   // Check that slow_path label is reached with flag != 0.
 366   bnez(flag, flag_correct);
 367   stop("Fast Lock Flag == 0");
 368   bind(flag_correct);
 369 #endif
 370   // C2 uses the value of flag (0 vs !0) to determine the continuation.
 371 }
 372 
 373 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register tmp1, Register tmp2,
 374                                                 Register tmp3) {
 375   // Flag register, zero for success; non-zero for failure.
 376   Register flag = t1;
 377 
 378   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 379   assert_different_registers(obj, tmp1, tmp2, tmp3, flag, t0);
 380 
 381   mv(flag, 1);
 382 
 383   // Handle inflated monitor.
 384   Label inflated, inflated_load_monitor;
 385   // Finish fast unlock successfully. MUST branch to unlocked with flag == 0.
 386   Label unlocked;
 387   // Finish fast unlock unsuccessfully. MUST branch to slow_path with flag != 0.
 388   Label slow_path;
 389 
 390   const Register tmp1_mark = tmp1;
 391   const Register tmp2_top = tmp2;
 392   const Register tmp3_t = tmp3;
 393 
 394   { // Lightweight unlock
 395 
 396     // Check if obj is top of lock-stack.
 397     lwu(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
 398     subw(tmp2_top, tmp2_top, oopSize);
 399     add(tmp3_t, xthread, tmp2_top);
 400     ld(tmp3_t, Address(tmp3_t));
 401     // Top of lock stack was not obj. Must be monitor.
 402     bne(obj, tmp3_t, inflated_load_monitor);
 403 
 404     // Pop lock-stack.
 405     DEBUG_ONLY(add(tmp3_t, xthread, tmp2_top);)
 406     DEBUG_ONLY(sd(zr, Address(tmp3_t));)
 407     sw(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
 408 
 409     // Check if recursive.
 410     add(tmp3_t, xthread, tmp2_top);
 411     ld(tmp3_t, Address(tmp3_t, -oopSize));
 412     beq(obj, tmp3_t, unlocked);
 413 
 414     // Not recursive.
 415     // Load Mark.
 416     ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 417 
 418     // Check header for monitor (0b10).
 419     test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
 420     bnez(tmp3_t, inflated);
 421 
 422     // Try to unlock. Transition lock bits 0b00 => 0b01
 423     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
 424     ori(tmp3_t, tmp1_mark, markWord::unlocked_value);
 425     cmpxchg(/*addr*/ obj, /*expected*/ tmp1_mark, /*new*/ tmp3_t, Assembler::int64,
 426             /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, /*result*/ tmp3_t);
 427     beq(tmp1_mark, tmp3_t, unlocked);
 428 
 429     // Compare and exchange failed.
 430     // Restore lock-stack and handle the unlock in runtime.
 431     DEBUG_ONLY(add(tmp3_t, xthread, tmp2_top);)
 432     DEBUG_ONLY(sd(obj, Address(tmp3_t));)
 433     addw(tmp2_top, tmp2_top, oopSize);
 434     sw(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset())); // top is a 32-bit field, use sw not sd
 435     j(slow_path);
 436   }
 437 
 438   { // Handle inflated monitor.
 439     bind(inflated_load_monitor);
 440     ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 441 #ifdef ASSERT
 442     test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
 443     bnez(tmp3_t, inflated);
 444     stop("Fast Unlock not monitor");
 445 #endif
 446 
 447     bind(inflated);
 448 
 449 #ifdef ASSERT
 450     Label check_done;
 451     subw(tmp2_top, tmp2_top, oopSize);
 452     mv(tmp3_t, in_bytes(JavaThread::lock_stack_base_offset()));
 453     blt(tmp2_top, tmp3_t, check_done);
 454     add(tmp3_t, xthread, tmp2_top);
 455     ld(tmp3_t, Address(tmp3_t));
 456     bne(obj, tmp3_t, inflated);
 457     stop("Fast Unlock lock on stack");
 458     bind(check_done);
 459 #endif
 460 
 461     if (!UseObjectMonitorTable) {
 462       // mark contains the tagged ObjectMonitor*.
 463       const Register tmp1_monitor = tmp1_mark;
 464       const uintptr_t monitor_tag = markWord::monitor_value;
 465 
 466       // Untag the monitor.
 467       sub(tmp1_monitor, tmp1_mark, monitor_tag);
 468 
 469       const Register tmp2_recursions = tmp2;
 470       Label not_recursive;
 471 
 472       // Check if recursive.
 473       ld(tmp2_recursions, Address(tmp1_monitor, ObjectMonitor::recursions_offset()));
 474       beqz(tmp2_recursions, not_recursive);
 475 
 476       // Recursive unlock.
 477       addi(tmp2_recursions, tmp2_recursions, -1);
 478       sd(tmp2_recursions, Address(tmp1_monitor, ObjectMonitor::recursions_offset()));
 479       j(unlocked);
 480 
 481       bind(not_recursive);
 482 
 483       Label release;
 484       const Register tmp2_owner_addr = tmp2;
 485 
 486       // Compute owner address.
 487       la(tmp2_owner_addr, Address(tmp1_monitor, ObjectMonitor::owner_offset()));
 488 
 489       // Check if the entry lists are empty.
 490       ld(t0, Address(tmp1_monitor, ObjectMonitor::EntryList_offset()));
 491       ld(tmp3_t, Address(tmp1_monitor, ObjectMonitor::cxq_offset()));
 492       orr(t0, t0, tmp3_t);
 493       beqz(t0, release);
 494 
 495       // The owner may be anonymous and we removed the last obj entry in
 496       // the lock-stack. This loses the information about the owner.
 497       // Write the thread to the owner field so the runtime knows the owner.
 498       sd(xthread, Address(tmp2_owner_addr));
 499       j(slow_path);
 500 
 501       bind(release);
 502       // Set owner to null.
 503       membar(MacroAssembler::LoadStore | MacroAssembler::StoreStore);
 504       sd(zr, Address(tmp2_owner_addr));
 505     } else {
 506       // OMCache lookup not supported yet. Take the slowpath.
 507       j(slow_path);
 508     }
 509   }
 510 
 511   bind(unlocked);
 512   mv(flag, zr);
 513   decrement(Address(xthread, JavaThread::held_monitor_count_offset()), 1, tmp2, tmp3);
 514 
 515 #ifdef ASSERT
 516   // Check that unlocked label is reached with flag == 0.
 517   Label flag_correct;
 518   beqz(flag, flag_correct);
 519   stop("Fast Unlock Flag != 0");
 520 #endif
 521 
 522   bind(slow_path);
 523 #ifdef ASSERT
 524   // Check that slow_path label is reached with flag != 0.
 525   bnez(flag, flag_correct);
 526   stop("Fast Unlock Flag == 0");
 527   bind(flag_correct);
 528 #endif
 529   // C2 uses the value of flag (0 vs !0) to determine the continuation.
 530 }
 531 
 532 // short string
 533 // StringUTF16.indexOfChar
 534 // StringLatin1.indexOfChar
 535 void C2_MacroAssembler::string_indexof_char_short(Register str1, Register cnt1,
 536                                                   Register ch, Register result,
 537                                                   bool isL)
 538 {
 539   Register ch1 = t0;
 540   Register index = t1;
 541 
 542   BLOCK_COMMENT("string_indexof_char_short {");
 543 
 544   Label LOOP, LOOP1, LOOP4, LOOP8;
 545   Label MATCH,  MATCH1, MATCH2, MATCH3,
 546         MATCH4, MATCH5, MATCH6, MATCH7, NOMATCH;
 547 
 548   mv(result, -1);
 549   mv(index, zr);
 550 
 551   bind(LOOP);
 552   addi(t0, index, 8);
 553   ble(t0, cnt1, LOOP8);
 554   addi(t0, index, 4);
 555   ble(t0, cnt1, LOOP4);
 556   j(LOOP1);
 557 
 558   bind(LOOP8);
 559   isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0));
 560   beq(ch, ch1, MATCH);
 561   isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2));
 562   beq(ch, ch1, MATCH1);
 563   isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4));
 564   beq(ch, ch1, MATCH2);
 565   isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6));
 566   beq(ch, ch1, MATCH3);
 567   isL ? lbu(ch1, Address(str1, 4)) : lhu(ch1, Address(str1, 8));
 568   beq(ch, ch1, MATCH4);
 569   isL ? lbu(ch1, Address(str1, 5)) : lhu(ch1, Address(str1, 10));
 570   beq(ch, ch1, MATCH5);
 571   isL ? lbu(ch1, Address(str1, 6)) : lhu(ch1, Address(str1, 12));
 572   beq(ch, ch1, MATCH6);
 573   isL ? lbu(ch1, Address(str1, 7)) : lhu(ch1, Address(str1, 14));
 574   beq(ch, ch1, MATCH7);
 575   addi(index, index, 8);
 576   addi(str1, str1, isL ? 8 : 16);
 577   blt(index, cnt1, LOOP);
 578   j(NOMATCH);
 579 
 580   bind(LOOP4);
 581   isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0));
 582   beq(ch, ch1, MATCH);
 583   isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2));
 584   beq(ch, ch1, MATCH1);
 585   isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4));
 586   beq(ch, ch1, MATCH2);
 587   isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6));
 588   beq(ch, ch1, MATCH3);
 589   addi(index, index, 4);
 590   addi(str1, str1, isL ? 4 : 8);
 591   bge(index, cnt1, NOMATCH);
 592 
 593   bind(LOOP1);
 594   isL ? lbu(ch1, Address(str1)) : lhu(ch1, Address(str1));
 595   beq(ch, ch1, MATCH);
 596   addi(index, index, 1);
 597   addi(str1, str1, isL ? 1 : 2);
 598   blt(index, cnt1, LOOP1);
 599   j(NOMATCH);
 600 
 601   bind(MATCH1);
 602   addi(index, index, 1);
 603   j(MATCH);
 604 
 605   bind(MATCH2);
 606   addi(index, index, 2);
 607   j(MATCH);
 608 
 609   bind(MATCH3);
 610   addi(index, index, 3);
 611   j(MATCH);
 612 
 613   bind(MATCH4);
 614   addi(index, index, 4);
 615   j(MATCH);
 616 
 617   bind(MATCH5);
 618   addi(index, index, 5);
 619   j(MATCH);
 620 
 621   bind(MATCH6);
 622   addi(index, index, 6);
 623   j(MATCH);
 624 
 625   bind(MATCH7);
 626   addi(index, index, 7);
 627 
 628   bind(MATCH);
 629   mv(result, index);
 630   bind(NOMATCH);
 631   BLOCK_COMMENT("} string_indexof_char_short");
 632 }
 633 
 634 // StringUTF16.indexOfChar
 635 // StringLatin1.indexOfChar
 636 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
 637                                             Register ch, Register result,
 638                                             Register tmp1, Register tmp2,
 639                                             Register tmp3, Register tmp4,
 640                                             bool isL)
 641 {
 642   Label CH1_LOOP, HIT, NOMATCH, DONE, DO_LONG;
 643   Register ch1 = t0;
 644   Register orig_cnt = t1;
 645   Register mask1 = tmp3;
 646   Register mask2 = tmp2;
 647   Register match_mask = tmp1;
 648   Register trailing_char = tmp4;
 649   Register unaligned_elems = tmp4;
 650 
 651   BLOCK_COMMENT("string_indexof_char {");
 652   beqz(cnt1, NOMATCH);
 653 
 654   addi(t0, cnt1, isL ? -32 : -16);
 655   bgtz(t0, DO_LONG);
 656   string_indexof_char_short(str1, cnt1, ch, result, isL);
 657   j(DONE);
 658 
 659   bind(DO_LONG);
 660   mv(orig_cnt, cnt1);
 661   if (AvoidUnalignedAccesses) {
 662     Label ALIGNED;
 663     andi(unaligned_elems, str1, 0x7);
 664     beqz(unaligned_elems, ALIGNED);
 665     sub(unaligned_elems, unaligned_elems, 8);
 666     neg(unaligned_elems, unaligned_elems);
 667     if (!isL) {
 668       srli(unaligned_elems, unaligned_elems, 1);
 669     }
 670     // do unaligned part per element
 671     string_indexof_char_short(str1, unaligned_elems, ch, result, isL);
 672     bgez(result, DONE);
 673     mv(orig_cnt, cnt1);
 674     sub(cnt1, cnt1, unaligned_elems);
 675     bind(ALIGNED);
 676   }
 677 
 678   // duplicate ch
 679   if (isL) {
 680     slli(ch1, ch, 8);
 681     orr(ch, ch1, ch);
 682   }
 683   slli(ch1, ch, 16);
 684   orr(ch, ch1, ch);
 685   slli(ch1, ch, 32);
 686   orr(ch, ch1, ch);
 687 
 688   if (!isL) {
 689     slli(cnt1, cnt1, 1);
 690   }
 691 
 692   uint64_t mask0101 = UCONST64(0x0101010101010101);
 693   uint64_t mask0001 = UCONST64(0x0001000100010001);
 694   mv(mask1, isL ? mask0101 : mask0001);
 695   uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f);
 696   uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff);
 697   mv(mask2, isL ? mask7f7f : mask7fff);
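       // The loop below relies on compute_match_mask() to detect a matching character lane
       // without per-character branches. A rough C sketch of the intended SWAR trick
       // (illustrative; assumes the usual zero-lane test, see MacroAssembler::compute_match_mask):
       //   uint64_t x = chunk ^ dup_ch;               // a matching lane becomes all-zero
       //   uint64_t m = (x - mask1) & ~(x | mask2);   // top bit set in each zero lane
       //   if (m != 0) lane = count_trailing_zeros(m) / bits_per_lane;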
 698 
 699   bind(CH1_LOOP);
 700   ld(ch1, Address(str1));
 701   addi(str1, str1, 8);
 702   addi(cnt1, cnt1, -8);
 703   compute_match_mask(ch1, ch, match_mask, mask1, mask2);
 704   bnez(match_mask, HIT);
 705   bgtz(cnt1, CH1_LOOP);
 706   j(NOMATCH);
 707 
 708   bind(HIT);
 709   ctzc_bit(trailing_char, match_mask, isL, ch1, result);
 710   srli(trailing_char, trailing_char, 3);
 711   addi(cnt1, cnt1, 8);
 712   ble(cnt1, trailing_char, NOMATCH);
 713   // match case
 714   if (!isL) {
 715     srli(cnt1, cnt1, 1);
 716     srli(trailing_char, trailing_char, 1);
 717   }
 718 
 719   sub(result, orig_cnt, cnt1);
 720   add(result, result, trailing_char);
 721   j(DONE);
 722 
 723   bind(NOMATCH);
 724   mv(result, -1);
 725 
 726   bind(DONE);
 727   BLOCK_COMMENT("} string_indexof_char");
 728 }
 729 
 730 typedef void (MacroAssembler::* load_chr_insn)(Register rd, const Address &adr, Register temp);
 731 
 732 // Search for needle in haystack and return index or -1
 733 // x10: result
 734 // x11: haystack
 735 // x12: haystack_len
 736 // x13: needle
 737 // x14: needle_len
 738 void C2_MacroAssembler::string_indexof(Register haystack, Register needle,
 739                                        Register haystack_len, Register needle_len,
 740                                        Register tmp1, Register tmp2,
 741                                        Register tmp3, Register tmp4,
 742                                        Register tmp5, Register tmp6,
 743                                        Register result, int ae)
 744 {
 745   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
 746 
 747   Label LINEARSEARCH, LINEARSTUB, DONE, NOMATCH;
 748 
 749   Register ch1 = t0;
 750   Register ch2 = t1;
 751   Register nlen_tmp = tmp1; // needle len tmp
 752   Register hlen_tmp = tmp2; // haystack len tmp
 753   Register result_tmp = tmp4;
 754 
 755   bool isLL = ae == StrIntrinsicNode::LL;
 756 
 757   bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
 758   bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
 759   int needle_chr_shift = needle_isL ? 0 : 1;
 760   int haystack_chr_shift = haystack_isL ? 0 : 1;
 761   int needle_chr_size = needle_isL ? 1 : 2;
 762   int haystack_chr_size = haystack_isL ? 1 : 2;
 763   load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu :
 764                               (load_chr_insn)&MacroAssembler::lhu;
 765   load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu :
 766                                 (load_chr_insn)&MacroAssembler::lhu;
 767 
 768   BLOCK_COMMENT("string_indexof {");
 769 
 770   // Note, inline_string_indexOf() generates checks:
 771   // if (pattern.count > src.count) return -1;
 772   // if (pattern.count == 0) return 0;
 773 
 774   // We have two strings, a source string in haystack, haystack_len and a pattern string
 775   // in needle, needle_len. Find the first occurrence of pattern in source or return -1.
 776 
 777   // For larger pattern and source we use a simplified Boyer Moore algorithm.
 778   // With a small pattern and source we use linear scan.
 779 
 780   // needle_len >=8 && needle_len < 256 && needle_len < haystack_len/4, use bmh algorithm.
 781   sub(result_tmp, haystack_len, needle_len);
 782   // needle_len < 8, use linear scan
 783   sub(t0, needle_len, 8);
 784   bltz(t0, LINEARSEARCH);
 785   // needle_len >= 256, use linear scan
 786   sub(t0, needle_len, 256);
 787   bgez(t0, LINEARSTUB);
 788   // needle_len >= haystack_len/4, use linear scan
 789   srli(t0, haystack_len, 2);
 790   bge(needle_len, t0, LINEARSTUB);
 791 
 792   // Boyer-Moore-Horspool introduction:
 793   // The Boyer-Moore algorithm is based on the description here:
 794   //
 795   // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
 796   //
 797   // This describes an algorithm with 2 shift rules: the 'Bad Character' rule
 798   // and the 'Good Suffix' rule.
 799   //
 800   // These rules are essentially heuristics for how far we can shift the
 801   // pattern along the search string.
 802   //
 803   // The implementation here uses the 'Bad Character' rule only because of the
 804   // complexity of initialisation for the 'Good Suffix' rule.
 805   //
 806   // This is also known as the Boyer-Moore-Horspool algorithm:
 807   //
 808   // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
 809   //
 810   // #define ASIZE 256
 811   //
 812   //    int bm(unsigned char *pattern, int m, unsigned char *src, int n) {
 813   //      int i, j;
 814   //      unsigned c;
 815   //      unsigned char bc[ASIZE];
 816   //
 817   //      /* Preprocessing */
 818   //      for (i = 0; i < ASIZE; ++i)
 819   //        bc[i] = m;
 820   //      for (i = 0; i < m - 1; ) {
 821   //        c = pattern[i];
 822   //        ++i;
 823   //        // c < 256 for Latin1 string, so, no need for branch
 824   //        #ifdef PATTERN_STRING_IS_LATIN1
 825   //        bc[c] = m - i;
 826   //        #else
 827   //        if (c < ASIZE) bc[c] = m - i;
 828   //        #endif
 829   //      }
 830   //
 831   //      /* Searching */
 832   //      j = 0;
 833   //      while (j <= n - m) {
 834   //        c = src[j + m - 1];
 835   //        if (pattern[m-1] == c)
 836   //          int k;
 837   //          for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k);
 838   //          if (k < 0) return j;
 839   //          // c < 256 for Latin1 string, so, no need for branch
 840   //          #ifdef SOURCE_STRING_IS_LATIN1_AND_PATTERN_STRING_IS_LATIN1
 841   //          // LL case: (c< 256) always true. Remove branch
 842   //          j += bc[pattern[j+m-1]];
 843   //          #endif
 844   //          #ifdef SOURCE_STRING_IS_UTF_AND_PATTERN_STRING_IS_UTF
 845   //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
 846   //          if (c < ASIZE)
 847   //            j += bc[pattern[j+m-1]];
 848   //          else
 849   //            j += 1
 850   //          #endif
 851   //          #ifdef SOURCE_IS_UTF_AND_PATTERN_IS_LATIN1
 852   //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
 853   //          if (c < ASIZE)
 854   //            j += bc[pattern[j+m-1]];
 855   //          else
 856   //            j += m
 857   //          #endif
 858   //      }
 859   //      return -1;
 860   //    }
 861 
 862   // temp register:t0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, result
 863   Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
 864         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
 865 
 866   Register haystack_end = haystack_len;
 867   Register skipch = tmp2;
 868 
 869   // The pattern length is >= 8, so we can read at least one full register when no
 870   // UTF->Latin1 conversion is needed (8 chars for LL, 4 for UU) and half a register
 871   // in the UL case. We re-read the last character in the inner pre-loop code so that
 872   // a single load suffices in the outer pre-loop.
 873   const int firstStep = isLL ? 7 : 3;
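       // firstStep = characters covered by that single wide pre-load besides pattern[m-1]
       // (which is compared separately as skipch): 7 for LL, 3 for UU/UL.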
 874 
 875   const int ASIZE = 256;
 876   const int STORE_BYTES = 8; // 8 bytes stored per instruction(sd)
 877 
 878   sub(sp, sp, ASIZE);
 879 
 880   // init BC offset table with default value: needle_len
 881   slli(t0, needle_len, 8);
 882   orr(t0, t0, needle_len); // [63...16][needle_len][needle_len]
 883   slli(tmp1, t0, 16);
 884   orr(t0, tmp1, t0); // [63...32][needle_len][needle_len][needle_len][needle_len]
 885   slli(tmp1, t0, 32);
 886   orr(tmp5, tmp1, t0); // tmp5: 8 elements [needle_len]
 887 
 888   mv(ch1, sp);  // ch1 is t0
 889   mv(tmp6, ASIZE / STORE_BYTES); // loop iterations
 890 
 891   bind(BM_INIT_LOOP);
 892   // for (i = 0; i < ASIZE; ++i)
 893   //   bc[i] = m;
 894   for (int i = 0; i < 4; i++) {
 895     sd(tmp5, Address(ch1, i * wordSize));
 896   }
 897   add(ch1, ch1, 32);
 898   sub(tmp6, tmp6, 4);
 899   bgtz(tmp6, BM_INIT_LOOP);
 900 
 901   sub(nlen_tmp, needle_len, 1); // m - 1, index of the last element in pattern
 902   Register orig_haystack = tmp5;
 903   mv(orig_haystack, haystack);
 904   // result_tmp = tmp4
 905   shadd(haystack_end, result_tmp, haystack, haystack_end, haystack_chr_shift);
 906   sub(ch2, needle_len, 1); // bc offset init value, ch2 is t1
 907   mv(tmp3, needle);
 908 
 909   //  for (i = 0; i < m - 1; ) {
 910   //    c = pattern[i];
 911   //    ++i;
 912   //    // c < 256 for Latin1 string, so, no need for branch
 913   //    #ifdef PATTERN_STRING_IS_LATIN1
 914   //    bc[c] = m - i;
 915   //    #else
 916   //    if (c < ASIZE) bc[c] = m - i;
 917   //    #endif
 918   //  }
 919   bind(BCLOOP);
 920   (this->*needle_load_1chr)(ch1, Address(tmp3), noreg);
 921   add(tmp3, tmp3, needle_chr_size);
 922   if (!needle_isL) {
 923     // ae == StrIntrinsicNode::UU
 924     mv(tmp6, ASIZE);
 925     bgeu(ch1, tmp6, BCSKIP);
 926   }
 927   add(tmp4, sp, ch1);
 928   sb(ch2, Address(tmp4)); // store skip offset to BC offset table
 929 
 930   bind(BCSKIP);
 931   sub(ch2, ch2, 1); // for next pattern element, skip distance -1
 932   bgtz(ch2, BCLOOP);
 933 
 934   // tmp6: pattern end, address after needle
 935   shadd(tmp6, needle_len, needle, tmp6, needle_chr_shift);
 936   if (needle_isL == haystack_isL) {
 937     // load last 8 bytes (8LL/4UU symbols)
 938     ld(tmp6, Address(tmp6, -wordSize));
 939   } else {
 940     // UL: from UTF-16(source) search Latin1(pattern)
 941     lwu(tmp6, Address(tmp6, -wordSize / 2)); // load last 4 bytes(4 symbols)
 942     // convert Latin1 to UTF. eg: 0x0000abcd -> 0x0a0b0c0d
 943     // We'll have to wait until load completed, but it's still faster than per-character loads+checks
 944     srli(tmp3, tmp6, BitsPerByte * (wordSize / 2 - needle_chr_size)); // pattern[m-1], eg:0x0000000a
 945     slli(ch2, tmp6, XLEN - 24);
 946     srli(ch2, ch2, XLEN - 8); // pattern[m-2], 0x0000000b
 947     slli(ch1, tmp6, XLEN - 16);
 948     srli(ch1, ch1, XLEN - 8); // pattern[m-3], 0x0000000c
 949     andi(tmp6, tmp6, 0xff); // pattern[m-4], 0x0000000d
 950     slli(ch2, ch2, 16);
 951     orr(ch2, ch2, ch1); // 0x00000b0c
 952     slli(result, tmp3, 48); // use result as temp register
 953     orr(tmp6, tmp6, result); // 0x0a00000d
 954     slli(result, ch2, 16);
 955     orr(tmp6, tmp6, result); // UTF-16:0x0a0b0c0d
 956   }
 957 
 958   // i = m - 1;
 959   // skipch = j + i;
 960   // if (skipch == pattern[m - 1]
 961   //   for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k);
 962   // else
 963   //   move j with bad char offset table
 964   bind(BMLOOPSTR2);
 965   // compare pattern to source string backward
 966   shadd(result, nlen_tmp, haystack, result, haystack_chr_shift);
 967   (this->*haystack_load_1chr)(skipch, Address(result), noreg);
 968   sub(nlen_tmp, nlen_tmp, firstStep); // nlen_tmp is positive here, because needle_len >= 8
 969   if (needle_isL == haystack_isL) {
 970     // re-init tmp3. It's for free because it's executed in parallel with
 971     // load above. Alternative is to initialize it before loop, but it'll
 972     // affect performance on in-order systems with 2 or more ld/st pipelines
 973     srli(tmp3, tmp6, BitsPerByte * (wordSize - needle_chr_size)); // UU/LL: pattern[m-1]
 974   }
 975   if (!isLL) { // UU/UL case
 976     slli(ch2, nlen_tmp, 1); // offsets in bytes
 977   }
 978   bne(tmp3, skipch, BMSKIP); // if not equal, skipch is bad char
 979   add(result, haystack, isLL ? nlen_tmp : ch2);
 980   // load 8 bytes from source string
 981   // if isLL is false then read granularity can be 2
 982   load_long_misaligned(ch2, Address(result), ch1, isLL ? 1 : 2); // can use ch1 as temp register here as it will be trashed by next mv anyway
 983   mv(ch1, tmp6);
 984   if (isLL) {
 985     j(BMLOOPSTR1_AFTER_LOAD);
 986   } else {
 987     sub(nlen_tmp, nlen_tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
 988     j(BMLOOPSTR1_CMP);
 989   }
 990 
 991   bind(BMLOOPSTR1);
 992   shadd(ch1, nlen_tmp, needle, ch1, needle_chr_shift);
 993   (this->*needle_load_1chr)(ch1, Address(ch1), noreg);
 994   shadd(ch2, nlen_tmp, haystack, ch2, haystack_chr_shift);
 995   (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
 996 
 997   bind(BMLOOPSTR1_AFTER_LOAD);
 998   sub(nlen_tmp, nlen_tmp, 1);
 999   bltz(nlen_tmp, BMLOOPSTR1_LASTCMP);
1000 
1001   bind(BMLOOPSTR1_CMP);
1002   beq(ch1, ch2, BMLOOPSTR1);
1003 
1004   bind(BMSKIP);
1005   if (!isLL) {
1006     // if we've met a UTF-16 symbol while searching for a Latin1 pattern, then we can
1007     // skip needle_len symbols
1008     if (needle_isL != haystack_isL) {
1009       mv(result_tmp, needle_len);
1010     } else {
1011       mv(result_tmp, 1);
1012     }
1013     mv(t0, ASIZE);
1014     bgeu(skipch, t0, BMADV);
1015   }
1016   add(result_tmp, sp, skipch);
1017   lbu(result_tmp, Address(result_tmp)); // load skip offset
1018 
1019   bind(BMADV);
1020   sub(nlen_tmp, needle_len, 1);
1021   // move haystack after bad char skip offset
1022   shadd(haystack, result_tmp, haystack, result, haystack_chr_shift);
1023   ble(haystack, haystack_end, BMLOOPSTR2);
1024   add(sp, sp, ASIZE);
1025   j(NOMATCH);
1026 
1027   bind(BMLOOPSTR1_LASTCMP);
1028   bne(ch1, ch2, BMSKIP);
1029 
1030   bind(BMMATCH);
1031   sub(result, haystack, orig_haystack);
1032   if (!haystack_isL) {
1033     srli(result, result, 1);
1034   }
1035   add(sp, sp, ASIZE);
1036   j(DONE);
1037 
1038   bind(LINEARSTUB);
1039   sub(t0, needle_len, 16); // small patterns still should be handled by simple algorithm
1040   bltz(t0, LINEARSEARCH);
1041   mv(result, zr);
1042   RuntimeAddress stub = nullptr;
1043   if (isLL) {
1044     stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ll());
1045     assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
1046   } else if (needle_isL) {
1047     stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ul());
1048     assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
1049   } else {
1050     stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_uu());
1051     assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
1052   }
1053   address call = trampoline_call(stub);
1054   if (call == nullptr) {
1055     DEBUG_ONLY(reset_labels(LINEARSEARCH, DONE, NOMATCH));
1056     ciEnv::current()->record_failure("CodeCache is full");
1057     return;
1058   }
1059   j(DONE);
1060 
1061   bind(NOMATCH);
1062   mv(result, -1);
1063   j(DONE);
1064 
1065   bind(LINEARSEARCH);
1066   string_indexof_linearscan(haystack, needle, haystack_len, needle_len, tmp1, tmp2, tmp3, tmp4, -1, result, ae);
1067 
1068   bind(DONE);
1069   BLOCK_COMMENT("} string_indexof");
1070 }
1071 
1072 // string_indexof
1073 // result: x10
1074 // src: x11
1075 // src_count: x12
1076 // pattern: x13
1077 // pattern_count: x14 or 1/2/3/4
1078 void C2_MacroAssembler::string_indexof_linearscan(Register haystack, Register needle,
1079                                                Register haystack_len, Register needle_len,
1080                                                Register tmp1, Register tmp2,
1081                                                Register tmp3, Register tmp4,
1082                                                int needle_con_cnt, Register result, int ae)
1083 {
1084   // Note:
1085   // needle_con_cnt > 0 means the needle_len register is invalid and the needle length is constant.
1086   // For UU/LL: needle_con_cnt is in [1, 4]; for UL: needle_con_cnt == 1.
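       // For example (illustrative only), a constant two-character LL needle is handled with
       // needle_con_cnt == 2 and takes the DO2 path below.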
1087   assert(needle_con_cnt <= 4, "Invalid needle constant count");
1088   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
1089 
1090   Register ch1 = t0;
1091   Register ch2 = t1;
1092   Register hlen_neg = haystack_len, nlen_neg = needle_len;
1093   Register nlen_tmp = tmp1, hlen_tmp = tmp2, result_tmp = tmp4;
1094 
1095   bool isLL = ae == StrIntrinsicNode::LL;
1096 
1097   bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
1098   bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
1099   int needle_chr_shift = needle_isL ? 0 : 1;
1100   int haystack_chr_shift = haystack_isL ? 0 : 1;
1101   int needle_chr_size = needle_isL ? 1 : 2;
1102   int haystack_chr_size = haystack_isL ? 1 : 2;
1103 
1104   load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu :
1105                               (load_chr_insn)&MacroAssembler::lhu;
1106   load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu :
1107                                 (load_chr_insn)&MacroAssembler::lhu;
1108   load_chr_insn load_2chr = isLL ? (load_chr_insn)&MacroAssembler::lhu : (load_chr_insn)&MacroAssembler::lwu;
1109   load_chr_insn load_4chr = isLL ? (load_chr_insn)&MacroAssembler::lwu : (load_chr_insn)&MacroAssembler::ld;
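       // load_2chr/load_4chr read two or four consecutive characters with a single load:
       // lhu/lwu when both strings are Latin1 (LL), lwu/ld when both are UTF-16 (UU).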
1110 
1111   Label DO1, DO2, DO3, MATCH, NOMATCH, DONE;
1112 
1113   Register first = tmp3;
1114 
1115   if (needle_con_cnt == -1) {
1116     Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
1117 
1118     sub(t0, needle_len, needle_isL == haystack_isL ? 4 : 2);
1119     bltz(t0, DOSHORT);
1120 
1121     (this->*needle_load_1chr)(first, Address(needle), noreg);
1122     slli(t0, needle_len, needle_chr_shift);
1123     add(needle, needle, t0);
1124     neg(nlen_neg, t0);
1125     slli(t0, result_tmp, haystack_chr_shift);
1126     add(haystack, haystack, t0);
1127     neg(hlen_neg, t0);
1128 
1129     bind(FIRST_LOOP);
1130     add(t0, haystack, hlen_neg);
1131     (this->*haystack_load_1chr)(ch2, Address(t0), noreg);
1132     beq(first, ch2, STR1_LOOP);
1133 
1134     bind(STR2_NEXT);
1135     add(hlen_neg, hlen_neg, haystack_chr_size);
1136     blez(hlen_neg, FIRST_LOOP);
1137     j(NOMATCH);
1138 
1139     bind(STR1_LOOP);
1140     add(nlen_tmp, nlen_neg, needle_chr_size);
1141     add(hlen_tmp, hlen_neg, haystack_chr_size);
1142     bgez(nlen_tmp, MATCH);
1143 
1144     bind(STR1_NEXT);
1145     add(ch1, needle, nlen_tmp);
1146     (this->*needle_load_1chr)(ch1, Address(ch1), noreg);
1147     add(ch2, haystack, hlen_tmp);
1148     (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
1149     bne(ch1, ch2, STR2_NEXT);
1150     add(nlen_tmp, nlen_tmp, needle_chr_size);
1151     add(hlen_tmp, hlen_tmp, haystack_chr_size);
1152     bltz(nlen_tmp, STR1_NEXT);
1153     j(MATCH);
1154 
1155     bind(DOSHORT);
1156     if (needle_isL == haystack_isL) {
1157       sub(t0, needle_len, 2);
1158       bltz(t0, DO1);
1159       bgtz(t0, DO3);
1160     }
1161   }
1162 
1163   if (needle_con_cnt == 4) {
1164     Label CH1_LOOP;
1165     (this->*load_4chr)(ch1, Address(needle), noreg);
1166     sub(result_tmp, haystack_len, 4);
1167     slli(tmp3, result_tmp, haystack_chr_shift); // result as tmp
1168     add(haystack, haystack, tmp3);
1169     neg(hlen_neg, tmp3);
1170     if (AvoidUnalignedAccesses) {
1171       // Preload the first value; then read one character per iteration instead of four,
1172       // shifting the previous ch2 right by one character's worth of bits.
1173       add(tmp3, haystack, hlen_neg);
1174       (this->*load_4chr)(ch2, Address(tmp3), noreg);
1175       if (isLL) {
1176         // need to erase 1 most significant byte in 32-bit value of ch2
1177         slli(ch2, ch2, 40);
1178         srli(ch2, ch2, 32);
1179       } else {
1180         slli(ch2, ch2, 16); // 2 most significant bytes will be erased by this operation
1181       }
1182     }
1183 
1184     bind(CH1_LOOP);
1185     add(tmp3, haystack, hlen_neg);
1186     if (AvoidUnalignedAccesses) {
1187       srli(ch2, ch2, isLL ? 8 : 16);
1188       (this->*haystack_load_1chr)(tmp3, Address(tmp3, isLL ? 3 : 6), noreg);
1189       slli(tmp3, tmp3, isLL ? 24 : 48);
1190       add(ch2, ch2, tmp3);
1191     } else {
1192       (this->*load_4chr)(ch2, Address(tmp3), noreg);
1193     }
1194     beq(ch1, ch2, MATCH);
1195     add(hlen_neg, hlen_neg, haystack_chr_size);
1196     blez(hlen_neg, CH1_LOOP);
1197     j(NOMATCH);
1198   }
1199 
1200   if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 2) {
1201     Label CH1_LOOP;
1202     BLOCK_COMMENT("string_indexof DO2 {");
1203     bind(DO2);
1204     (this->*load_2chr)(ch1, Address(needle), noreg);
1205     if (needle_con_cnt == 2) {
1206       sub(result_tmp, haystack_len, 2);
1207     }
1208     slli(tmp3, result_tmp, haystack_chr_shift);
1209     add(haystack, haystack, tmp3);
1210     neg(hlen_neg, tmp3);
1211     if (AvoidUnalignedAccesses) {
1212       // Preload the first value; then read one character per iteration instead of two,
1213       // shifting the previous ch2 right by one character's worth of bits.
1214       add(tmp3, haystack, hlen_neg);
1215       (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg);
1216       slli(ch2, ch2, isLL ? 8 : 16);
1217     }
1218     bind(CH1_LOOP);
1219     add(tmp3, haystack, hlen_neg);
1220     if (AvoidUnalignedAccesses) {
1221       srli(ch2, ch2, isLL ? 8 : 16);
1222       (this->*haystack_load_1chr)(tmp3, Address(tmp3, isLL ? 1 : 2), noreg);
1223       slli(tmp3, tmp3, isLL ? 8 : 16);
1224       add(ch2, ch2, tmp3);
1225     } else {
1226       (this->*load_2chr)(ch2, Address(tmp3), noreg);
1227     }
1228     beq(ch1, ch2, MATCH);
1229     add(hlen_neg, hlen_neg, haystack_chr_size);
1230     blez(hlen_neg, CH1_LOOP);
1231     j(NOMATCH);
1232     BLOCK_COMMENT("} string_indexof DO2");
1233   }
1234 
1235   if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 3) {
1236     Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
1237     BLOCK_COMMENT("string_indexof DO3 {");
1238 
1239     bind(DO3);
1240     (this->*load_2chr)(first, Address(needle), noreg);
1241     (this->*needle_load_1chr)(ch1, Address(needle, 2 * needle_chr_size), noreg);
1242     if (needle_con_cnt == 3) {
1243       sub(result_tmp, haystack_len, 3);
1244     }
1245     slli(hlen_tmp, result_tmp, haystack_chr_shift);
1246     add(haystack, haystack, hlen_tmp);
1247     neg(hlen_neg, hlen_tmp);
1248 
1249     bind(FIRST_LOOP);
1250     add(ch2, haystack, hlen_neg);
1251     if (AvoidUnalignedAccesses) {
1252       (this->*haystack_load_1chr)(tmp2, Address(ch2, isLL ? 1 : 2), noreg); // we need a temp register, we can safely use hlen_tmp here, which is a synonym for tmp2
1253       (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
1254       slli(tmp2, tmp2, isLL ? 8 : 16);
1255       add(ch2, ch2, tmp2);
1256     } else {
1257       (this->*load_2chr)(ch2, Address(ch2), noreg);
1258     }
1259     beq(first, ch2, STR1_LOOP);
1260 
1261     bind(STR2_NEXT);
1262     add(hlen_neg, hlen_neg, haystack_chr_size);
1263     blez(hlen_neg, FIRST_LOOP);
1264     j(NOMATCH);
1265 
1266     bind(STR1_LOOP);
1267     add(hlen_tmp, hlen_neg, 2 * haystack_chr_size);
1268     add(ch2, haystack, hlen_tmp);
1269     (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
1270     bne(ch1, ch2, STR2_NEXT);
1271     j(MATCH);
1272     BLOCK_COMMENT("} string_indexof DO3");
1273   }
1274 
1275   if (needle_con_cnt == -1 || needle_con_cnt == 1) {
1276     Label DO1_LOOP;
1277 
1278     BLOCK_COMMENT("string_indexof DO1 {");
1279     bind(DO1);
1280     (this->*needle_load_1chr)(ch1, Address(needle), noreg);
1281     sub(result_tmp, haystack_len, 1);
1282     slli(tmp3, result_tmp, haystack_chr_shift);
1283     add(haystack, haystack, tmp3);
1284     neg(hlen_neg, tmp3);
1285 
1286     bind(DO1_LOOP);
1287     add(tmp3, haystack, hlen_neg);
1288     (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg);
1289     beq(ch1, ch2, MATCH);
1290     add(hlen_neg, hlen_neg, haystack_chr_size);
1291     blez(hlen_neg, DO1_LOOP);
1292     BLOCK_COMMENT("} string_indexof DO1");
1293   }
1294 
1295   bind(NOMATCH);
1296   mv(result, -1);
1297   j(DONE);
1298 
1299   bind(MATCH);
1300   srai(t0, hlen_neg, haystack_chr_shift);
1301   add(result, result_tmp, t0);
1302 
1303   bind(DONE);
1304 }
1305 
1306 // Compare strings.
1307 void C2_MacroAssembler::string_compare(Register str1, Register str2,
1308                                        Register cnt1, Register cnt2, Register result,
1309                                        Register tmp1, Register tmp2, Register tmp3,
1310                                        int ae)
1311 {
1312   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
1313         DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
1314         SHORT_LOOP_START, TAIL_CHECK, L;
1315 
1316   const int STUB_THRESHOLD = 64 + 8;
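       // Strings whose minimum length (in characters) reaches STUB_THRESHOLD are compared
       // via the out-of-line compare_long_string stub rather than the inline loop.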
1317   bool isLL = ae == StrIntrinsicNode::LL;
1318   bool isLU = ae == StrIntrinsicNode::LU;
1319   bool isUL = ae == StrIntrinsicNode::UL;
1320 
1321   bool str1_isL = isLL || isLU;
1322   bool str2_isL = isLL || isUL;
1323 
1324   // for L strings, 1 byte for 1 character
1325   // for U strings, 2 bytes for 1 character
1326   int str1_chr_size = str1_isL ? 1 : 2;
1327   int str2_chr_size = str2_isL ? 1 : 2;
1328   int minCharsInWord = isLL ? wordSize : wordSize / 2;
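       // minCharsInWord: characters that fit in one 64-bit word for the wider of the two
       // encodings (8 for LL, 4 otherwise); used below as the very-short-string threshold.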
1329 
1330   load_chr_insn str1_load_chr = str1_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu;
1331   load_chr_insn str2_load_chr = str2_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu;
1332 
1333   BLOCK_COMMENT("string_compare {");
1334 
1335   // Bizarrely, the counts are passed in bytes, regardless of whether they
1336   // are L or U strings, however the result is always in characters.
1337   if (!str1_isL) {
1338     sraiw(cnt1, cnt1, 1);
1339   }
1340   if (!str2_isL) {
1341     sraiw(cnt2, cnt2, 1);
1342   }
1343 
1344   // Compute the minimum of the string lengths and save the difference in result.
1345   sub(result, cnt1, cnt2);
1346   bgt(cnt1, cnt2, L);
1347   mv(cnt2, cnt1);
1348   bind(L);
1349 
1350   // A very short string
1351   mv(t0, minCharsInWord);
1352   ble(cnt2, t0, SHORT_STRING);
1353 
1354   // Compare longwords
1355   // load first parts of strings and finish initialization while loading
1356   {
1357     if (str1_isL == str2_isL) { // LL or UU
1358       // check if str1 and str2 are the same pointer
1359       beq(str1, str2, DONE);
1360       // load 8 bytes once to compare
1361       ld(tmp1, Address(str1));
1362       ld(tmp2, Address(str2));
1363       mv(t0, STUB_THRESHOLD);
1364       bge(cnt2, t0, STUB);
1365       sub(cnt2, cnt2, minCharsInWord);
1366       beqz(cnt2, TAIL_CHECK);
1367       // convert cnt2 from characters to bytes
1368       if (!str1_isL) {
1369         slli(cnt2, cnt2, 1);
1370       }
1371       add(str2, str2, cnt2);
1372       add(str1, str1, cnt2);
1373       sub(cnt2, zr, cnt2);
1374     } else if (isLU) { // LU case
1375       lwu(tmp1, Address(str1));
1376       ld(tmp2, Address(str2));
1377       mv(t0, STUB_THRESHOLD);
1378       bge(cnt2, t0, STUB);
1379       addi(cnt2, cnt2, -4);
1380       add(str1, str1, cnt2);
1381       sub(cnt1, zr, cnt2);
1382       slli(cnt2, cnt2, 1);
1383       add(str2, str2, cnt2);
1384       inflate_lo32(tmp3, tmp1);
1385       mv(tmp1, tmp3);
1386       sub(cnt2, zr, cnt2);
1387       addi(cnt1, cnt1, 4);
1388     } else { // UL case
1389       ld(tmp1, Address(str1));
1390       lwu(tmp2, Address(str2));
1391       mv(t0, STUB_THRESHOLD);
1392       bge(cnt2, t0, STUB);
1393       addi(cnt2, cnt2, -4);
1394       slli(t0, cnt2, 1);
1395       sub(cnt1, zr, t0);
1396       add(str1, str1, t0);
1397       add(str2, str2, cnt2);
1398       inflate_lo32(tmp3, tmp2);
1399       mv(tmp2, tmp3);
1400       sub(cnt2, zr, cnt2);
1401       addi(cnt1, cnt1, 8);
1402     }
1403     addi(cnt2, cnt2, isUL ? 4 : 8);
1404     bne(tmp1, tmp2, DIFFERENCE);
1405     bgez(cnt2, TAIL);
1406 
1407     // main loop
1408     bind(NEXT_WORD);
1409     if (str1_isL == str2_isL) { // LL or UU
1410       add(t0, str1, cnt2);
1411       ld(tmp1, Address(t0));
1412       add(t0, str2, cnt2);
1413       ld(tmp2, Address(t0));
1414       addi(cnt2, cnt2, 8);
1415     } else if (isLU) { // LU case
1416       add(t0, str1, cnt1);
1417       lwu(tmp1, Address(t0));
1418       add(t0, str2, cnt2);
1419       ld(tmp2, Address(t0));
1420       addi(cnt1, cnt1, 4);
1421       inflate_lo32(tmp3, tmp1);
1422       mv(tmp1, tmp3);
1423       addi(cnt2, cnt2, 8);
1424     } else { // UL case
1425       add(t0, str2, cnt2);
1426       lwu(tmp2, Address(t0));
1427       add(t0, str1, cnt1);
1428       ld(tmp1, Address(t0));
1429       inflate_lo32(tmp3, tmp2);
1430       mv(tmp2, tmp3);
1431       addi(cnt1, cnt1, 8);
1432       addi(cnt2, cnt2, 4);
1433     }
1434     bne(tmp1, tmp2, DIFFERENCE);
1435     bltz(cnt2, NEXT_WORD);
1436     bind(TAIL);
1437     if (str1_isL == str2_isL) { // LL or UU
1438       load_long_misaligned(tmp1, Address(str1), tmp3, isLL ? 1 : 2);
1439       load_long_misaligned(tmp2, Address(str2), tmp3, isLL ? 1 : 2);
1440     } else if (isLU) { // LU case
1441       load_int_misaligned(tmp1, Address(str1), tmp3, false);
1442       load_long_misaligned(tmp2, Address(str2), tmp3, 2);
1443       inflate_lo32(tmp3, tmp1);
1444       mv(tmp1, tmp3);
1445     } else { // UL case
1446       load_int_misaligned(tmp2, Address(str2), tmp3, false);
1447       load_long_misaligned(tmp1, Address(str1), tmp3, 2);
1448       inflate_lo32(tmp3, tmp2);
1449       mv(tmp2, tmp3);
1450     }
1451     bind(TAIL_CHECK);
1452     beq(tmp1, tmp2, DONE);
1453 
1454     // Find the first different characters in the longwords and
1455     // compute their difference.
1456     bind(DIFFERENCE);
1457     xorr(tmp3, tmp1, tmp2);
1458     ctzc_bit(result, tmp3, isLL); // count zero from lsb to msb
1459     srl(tmp1, tmp1, result);
1460     srl(tmp2, tmp2, result);
1461     if (isLL) {
1462       andi(tmp1, tmp1, 0xFF);
1463       andi(tmp2, tmp2, 0xFF);
1464     } else {
1465       andi(tmp1, tmp1, 0xFFFF);
1466       andi(tmp2, tmp2, 0xFFFF);
1467     }
1468     sub(result, tmp1, tmp2);
1469     j(DONE);
1470   }
1471 
1472   bind(STUB);
1473   RuntimeAddress stub = nullptr;
1474   switch (ae) {
1475     case StrIntrinsicNode::LL:
1476       stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LL());
1477       break;
1478     case StrIntrinsicNode::UU:
1479       stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UU());
1480       break;
1481     case StrIntrinsicNode::LU:
1482       stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LU());
1483       break;
1484     case StrIntrinsicNode::UL:
1485       stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UL());
1486       break;
1487     default:
1488       ShouldNotReachHere();
1489   }
1490   assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
1491   address call = trampoline_call(stub);
1492   if (call == nullptr) {
1493     DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
1494     ciEnv::current()->record_failure("CodeCache is full");
1495     return;
1496   }
1497   j(DONE);
1498 
1499   bind(SHORT_STRING);
1500   // Is the minimum length zero?
1501   beqz(cnt2, DONE);
1502   // Arrange the code to do most branches while loading the next characters
1503   // and comparing the previous ones.
1504   (this->*str1_load_chr)(tmp1, Address(str1), t0);
1505   addi(str1, str1, str1_chr_size);
1506   addi(cnt2, cnt2, -1);
1507   beqz(cnt2, SHORT_LAST_INIT);
1508   (this->*str2_load_chr)(cnt1, Address(str2), t0);
1509   addi(str2, str2, str2_chr_size);
1510   j(SHORT_LOOP_START);
1511   bind(SHORT_LOOP);
1512   addi(cnt2, cnt2, -1);
1513   beqz(cnt2, SHORT_LAST);
1514   bind(SHORT_LOOP_START);
1515   (this->*str1_load_chr)(tmp2, Address(str1), t0);
1516   addi(str1, str1, str1_chr_size);
1517   (this->*str2_load_chr)(t0, Address(str2), t0);
1518   addi(str2, str2, str2_chr_size);
1519   bne(tmp1, cnt1, SHORT_LOOP_TAIL);
1520   addi(cnt2, cnt2, -1);
1521   beqz(cnt2, SHORT_LAST2);
1522   (this->*str1_load_chr)(tmp1, Address(str1), t0);
1523   addi(str1, str1, str1_chr_size);
1524   (this->*str2_load_chr)(cnt1, Address(str2), t0);
1525   addi(str2, str2, str2_chr_size);
1526   beq(tmp2, t0, SHORT_LOOP);
1527   sub(result, tmp2, t0);
1528   j(DONE);
1529   bind(SHORT_LOOP_TAIL);
1530   sub(result, tmp1, cnt1);
1531   j(DONE);
1532   bind(SHORT_LAST2);
1533   beq(tmp2, t0, DONE);
1534   sub(result, tmp2, t0);
1535 
1536   j(DONE);
1537   bind(SHORT_LAST_INIT);
1538   (this->*str2_load_chr)(cnt1, Address(str2), t0);
1539   addi(str2, str2, str2_chr_size);
1540   bind(SHORT_LAST);
1541   beq(tmp1, cnt1, DONE);
1542   sub(result, tmp1, cnt1);
1543 
1544   bind(DONE);
1545 
1546   BLOCK_COMMENT("} string_compare");
1547 }
1548 
1549 void C2_MacroAssembler::arrays_equals(Register a1, Register a2,
1550                                       Register tmp1, Register tmp2, Register tmp3,
1551                                       Register result, int elem_size) {
1552   assert(elem_size == 1 || elem_size == 2, "must be char or byte");
1553   assert_different_registers(a1, a2, result, tmp1, tmp2, tmp3, t0);
1554 
1555   int elem_per_word = wordSize/elem_size;
1556   int log_elem_size = exact_log2(elem_size);
1557   int length_offset = arrayOopDesc::length_offset_in_bytes();
1558   int base_offset   = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
1559 
1560   Register cnt1 = tmp3;
1561   Register cnt2 = tmp1;  // cnt2 only used in array length compare
1562   Label DONE, SAME, NEXT_WORD, SHORT, TAIL03, TAIL01;
1563 
1564   BLOCK_COMMENT("arrays_equals {");
1565 
1566   // if (a1 == a2), return true
1567   beq(a1, a2, SAME);
1568 
1569   mv(result, false);
1570   // if (a1 == nullptr || a2 == nullptr)
1571   //     return false;
1572   beqz(a1, DONE);
1573   beqz(a2, DONE);
1574 
1575   // if (a1.length != a2.length)
1576   //      return false;
1577   lwu(cnt1, Address(a1, length_offset));
1578   lwu(cnt2, Address(a2, length_offset));
1579   bne(cnt1, cnt2, DONE);
1580 
1581   la(a1, Address(a1, base_offset));
1582   la(a2, Address(a2, base_offset));
1583   // Check for short strings, i.e. smaller than wordSize.
1584   addi(cnt1, cnt1, -elem_per_word);
1585   bltz(cnt1, SHORT);
1586 
1587   // Main 8 byte comparison loop.
1588   bind(NEXT_WORD); {
1589     ld(tmp1, Address(a1));
1590     ld(tmp2, Address(a2));
1591     addi(cnt1, cnt1, -elem_per_word);
1592     addi(a1, a1, wordSize);
1593     addi(a2, a2, wordSize);
1594     bne(tmp1, tmp2, DONE);
1595   } bgez(cnt1, NEXT_WORD);
1596 
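       // cnt1 is now in [-elem_per_word, 0); cnt1 + elem_per_word is the number of tail
       // elements still to compare, and zero means the arrays are equal.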
1597   addi(tmp1, cnt1, elem_per_word);
1598   beqz(tmp1, SAME);
1599 
1600   bind(SHORT);
1601   test_bit(tmp1, cnt1, 2 - log_elem_size);
1602   beqz(tmp1, TAIL03); // 0-7 bytes left.
1603   {
1604     lwu(tmp1, Address(a1));
1605     lwu(tmp2, Address(a2));
1606     addi(a1, a1, 4);
1607     addi(a2, a2, 4);
1608     bne(tmp1, tmp2, DONE);
1609   }
1610 
1611   bind(TAIL03);
1612   test_bit(tmp1, cnt1, 1 - log_elem_size);
1613   beqz(tmp1, TAIL01); // 0-3 bytes left.
1614   {
1615     lhu(tmp1, Address(a1));
1616     lhu(tmp2, Address(a2));
1617     addi(a1, a1, 2);
1618     addi(a2, a2, 2);
1619     bne(tmp1, tmp2, DONE);
1620   }
1621 
1622   bind(TAIL01);
1623   if (elem_size == 1) { // Only needed when comparing byte arrays.
1624     test_bit(tmp1, cnt1, 0);
1625     beqz(tmp1, SAME); // 0-1 bytes left.
1626     {
1627       lbu(tmp1, Address(a1));
1628       lbu(tmp2, Address(a2));
1629       bne(tmp1, tmp2, DONE);
1630     }
1631   }
1632 
1633   bind(SAME);
1634   mv(result, true);
1635   // That's it.
1636   bind(DONE);
1637 
1638   BLOCK_COMMENT("} arrays_equals");
1639 }
1640 
1641 // Compare Strings
1642 
1643 // For Strings we're passed the address of the first characters in a1 and a2
1644 // and the length in cnt1. There are two implementations.
1645 // For arrays >= 8 bytes, all comparisons (except for the tail) are performed
1646 // 8 bytes at a time. For the tail, we compare a word, then a halfword, and then a byte.
1647 // For strings < 8 bytes, we compare a word, then a halfword, and then a byte.
1648 
1649 void C2_MacroAssembler::string_equals(Register a1, Register a2,
1650                                       Register result, Register cnt1)
1651 {
1652   Label SAME, DONE, SHORT, NEXT_WORD;
1653   Register tmp1 = t0;
1654   Register tmp2 = t1;
1655 
1656   assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2);
1657 
1658   BLOCK_COMMENT("string_equals {");
1659 
1660   mv(result, false);
1661 
1662   // Check for short strings, i.e. smaller than wordSize.
1663   addi(cnt1, cnt1, -wordSize);
1664   bltz(cnt1, SHORT);
1665 
1666   // Main 8 byte comparison loop.
1667   bind(NEXT_WORD); {
1668     ld(tmp1, Address(a1));
1669     ld(tmp2, Address(a2));
1670     addi(cnt1, cnt1, -wordSize);
1671     addi(a1, a1, wordSize);
1672     addi(a2, a2, wordSize);
1673     bne(tmp1, tmp2, DONE);
1674   } bgez(cnt1, NEXT_WORD);
1675 
1676   addi(tmp1, cnt1, wordSize);
1677   beqz(tmp1, SAME);
1678 
1679   bind(SHORT);
1680   Label TAIL03, TAIL01;
1681 
1682   // 0-7 bytes left.
1683   test_bit(tmp1, cnt1, 2);
1684   beqz(tmp1, TAIL03);
1685   {
1686     lwu(tmp1, Address(a1));
1687     lwu(tmp2, Address(a2));
1688     addi(a1, a1, 4);
1689     addi(a2, a2, 4);
1690     bne(tmp1, tmp2, DONE);
1691   }
1692 
1693   bind(TAIL03);
1694   // 0-3 bytes left.
1695   test_bit(tmp1, cnt1, 1);
1696   beqz(tmp1, TAIL01);
1697   {
1698     lhu(tmp1, Address(a1));
1699     lhu(tmp2, Address(a2));
1700     addi(a1, a1, 2);
1701     addi(a2, a2, 2);
1702     bne(tmp1, tmp2, DONE);
1703   }
1704 
1705   bind(TAIL01);
1706   // 0-1 bytes left.
1707   test_bit(tmp1, cnt1, 0);
1708   beqz(tmp1, SAME);
1709   {
1710     lbu(tmp1, Address(a1));
1711     lbu(tmp2, Address(a2));
1712     bne(tmp1, tmp2, DONE);
1713   }
1714 
1715   // Arrays are equal.
1716   bind(SAME);
1717   mv(result, true);
1718 
1719   // That's it.
1720   bind(DONE);
1721   BLOCK_COMMENT("} string_equals");
1722 }
1723 
1724 // jdk.internal.util.ArraysSupport.vectorizedHashCode
1725 void C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
1726                                         Register tmp1, Register tmp2, Register tmp3,
1727                                         Register tmp4, Register tmp5, Register tmp6,
1728                                         BasicType eltype)
1729 {
1730   assert_different_registers(ary, cnt, result, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, t0, t1);
1731 
1732   const int elsize = arrays_hashcode_elsize(eltype);
1733   const int chunks_end_shift = exact_log2(elsize);
1734 
1735   switch (eltype) {
1736   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
1737   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
1738   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
1739   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
1740   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
1741   default:
1742     ShouldNotReachHere();
1743   }
1744 
1745   const int stride = 4;
1746   const Register pow31_4 = tmp1;
1747   const Register pow31_3 = tmp2;
1748   const Register pow31_2 = tmp3;
1749   const Register chunks  = tmp4;
1750   const Register chunks_end = chunks;
1751 
1752   Label DONE, TAIL, TAIL_LOOP, WIDE_LOOP;
1753 
1754   // result already holds the initial hash value on entry
1755 
1756   beqz(cnt, DONE);
1757 
1758   andi(chunks, cnt, ~(stride-1));
1759   beqz(chunks, TAIL);
1760 
1761   mv(pow31_4, 923521);           // [31^^4]
1762   mv(pow31_3,  29791);           // [31^^3]
1763   mv(pow31_2,    961);           // [31^^2]
1764 
1765   slli(chunks_end, chunks, chunks_end_shift);
1766   add(chunks_end, ary, chunks_end);
1767   andi(cnt, cnt, stride-1);      // don't forget about tail!
1768 
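       // Each WIDE_LOOP iteration folds 'stride' (4) elements into the hash using the
       // precomputed powers of 31, equivalent to applying h = 31 * h + ary[i] four times:
       //   h = 31^^4 * h + 31^^3 * ary[i] + 31^^2 * ary[i+1] + 31 * ary[i+2] + ary[i+3]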
1769   bind(WIDE_LOOP);
1770   mulw(result, result, pow31_4); // 31^^4 * h
1771   arrays_hashcode_elload(t0,   Address(ary, 0 * elsize), eltype);
1772   arrays_hashcode_elload(t1,   Address(ary, 1 * elsize), eltype);
1773   arrays_hashcode_elload(tmp5, Address(ary, 2 * elsize), eltype);
1774   arrays_hashcode_elload(tmp6, Address(ary, 3 * elsize), eltype);
1775   mulw(t0, t0, pow31_3);         // 31^^3 * ary[i+0]
1776   addw(result, result, t0);
1777   mulw(t1, t1, pow31_2);         // 31^^2 * ary[i+1]
1778   addw(result, result, t1);
1779   slli(t0, tmp5, 5);             // optimize 31^^1 * ary[i+2]
1780   subw(tmp5, t0, tmp5);          // with ary[i+2]<<5 - ary[i+2]
1781   addw(result, result, tmp5);
1782   addw(result, result, tmp6);    // 31^^4 * h + 31^^3 * ary[i+0] + 31^^2 * ary[i+1]
1783                                  //           + 31^^1 * ary[i+2] + 31^^0 * ary[i+3]
1784   addi(ary, ary, elsize * stride);
1785   bne(ary, chunks_end, WIDE_LOOP);
1786   beqz(cnt, DONE);
1787 
1788   bind(TAIL);
1789   slli(chunks_end, cnt, chunks_end_shift);
1790   add(chunks_end, ary, chunks_end);
1791 
1792   bind(TAIL_LOOP);
1793   arrays_hashcode_elload(t0, Address(ary), eltype);
1794   slli(t1, result, 5);           // optimize 31 * result
1795   subw(result, t1, result);      // with result<<5 - result
1796   addw(result, result, t0);
1797   addi(ary, ary, elsize);
1798   bne(ary, chunks_end, TAIL_LOOP);
1799 
1800   bind(DONE);
1801   BLOCK_COMMENT("} // arrays_hashcode");
1802 }
1803 
1804 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
1805   switch (eltype) {
1806   case T_BOOLEAN: return sizeof(jboolean);
1807   case T_BYTE:    return sizeof(jbyte);
1808   case T_SHORT:   return sizeof(jshort);
1809   case T_CHAR:    return sizeof(jchar);
1810   case T_INT:     return sizeof(jint);
1811   default:
1812     ShouldNotReachHere();
1813     return -1;
1814   }
1815 }
1816 
1817 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
1818   switch (eltype) {
1819   // T_BOOLEAN used as surrogate for unsigned byte
1820   case T_BOOLEAN: lbu(dst, src);   break;
1821   case T_BYTE:     lb(dst, src);   break;
1822   case T_SHORT:    lh(dst, src);   break;
1823   case T_CHAR:    lhu(dst, src);   break;
1824   case T_INT:      lw(dst, src);   break;
1825   default:
1826     ShouldNotReachHere();
1827   }
1828 }
1829 
1830 typedef void (Assembler::*conditional_branch_insn)(Register op1, Register op2, Label& label, bool is_far);
1831 typedef void (MacroAssembler::*float_conditional_branch_insn)(FloatRegister op1, FloatRegister op2, Label& label,
1832                                                               bool is_far, bool is_unordered);
1833 
1834 static conditional_branch_insn conditional_branches[] =
1835 {
1836   /* SHORT branches */
1837   (conditional_branch_insn)&MacroAssembler::beq,
1838   (conditional_branch_insn)&MacroAssembler::bgt,
1839   nullptr, // BoolTest::overflow
1840   (conditional_branch_insn)&MacroAssembler::blt,
1841   (conditional_branch_insn)&MacroAssembler::bne,
1842   (conditional_branch_insn)&MacroAssembler::ble,
1843   nullptr, // BoolTest::no_overflow
1844   (conditional_branch_insn)&MacroAssembler::bge,
1845 
1846   /* UNSIGNED branches */
1847   (conditional_branch_insn)&MacroAssembler::beq,
1848   (conditional_branch_insn)&MacroAssembler::bgtu,
1849   nullptr,
1850   (conditional_branch_insn)&MacroAssembler::bltu,
1851   (conditional_branch_insn)&MacroAssembler::bne,
1852   (conditional_branch_insn)&MacroAssembler::bleu,
1853   nullptr,
1854   (conditional_branch_insn)&MacroAssembler::bgeu
1855 };
1856 
1857 static float_conditional_branch_insn float_conditional_branches[] =
1858 {
1859   /* FLOAT SHORT branches */
1860   (float_conditional_branch_insn)&MacroAssembler::float_beq,
1861   (float_conditional_branch_insn)&MacroAssembler::float_bgt,
1862   nullptr,  // BoolTest::overflow
1863   (float_conditional_branch_insn)&MacroAssembler::float_blt,
1864   (float_conditional_branch_insn)&MacroAssembler::float_bne,
1865   (float_conditional_branch_insn)&MacroAssembler::float_ble,
1866   nullptr, // BoolTest::no_overflow
1867   (float_conditional_branch_insn)&MacroAssembler::float_bge,
1868 
1869   /* DOUBLE SHORT branches */
1870   (float_conditional_branch_insn)&MacroAssembler::double_beq,
1871   (float_conditional_branch_insn)&MacroAssembler::double_bgt,
1872   nullptr,
1873   (float_conditional_branch_insn)&MacroAssembler::double_blt,
1874   (float_conditional_branch_insn)&MacroAssembler::double_bne,
1875   (float_conditional_branch_insn)&MacroAssembler::double_ble,
1876   nullptr,
1877   (float_conditional_branch_insn)&MacroAssembler::double_bge
1878 };
1879 
1880 void C2_MacroAssembler::cmp_branch(int cmpFlag, Register op1, Register op2, Label& label, bool is_far) {
1881   assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(conditional_branches) / sizeof(conditional_branches[0])),
1882          "invalid conditional branch index");
1883   (this->*conditional_branches[cmpFlag])(op1, op2, label, is_far);
1884 }
1885 
1886 // This function should only be used by C2. Flip the unordered bit when the condition is unordered-greater;
1887 // C2 would use unordered-lesser instead of unordered-greater. Finally, the result bits are commuted in do_one_bytecode().
1888 void C2_MacroAssembler::float_cmp_branch(int cmpFlag, FloatRegister op1, FloatRegister op2, Label& label, bool is_far) {
1889   assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(float_conditional_branches) / sizeof(float_conditional_branches[0])),
1890          "invalid float conditional branch index");
1891   int booltest_flag = cmpFlag & ~(C2_MacroAssembler::double_branch_mask);
1892   (this->*float_conditional_branches[cmpFlag])(op1, op2, label, is_far,
1893     (booltest_flag == (BoolTest::ge) || booltest_flag == (BoolTest::gt)) ? false : true);
1894 }
1895 
1896 void C2_MacroAssembler::enc_cmpUEqNeLeGt_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
1897   switch (cmpFlag) {
1898     case BoolTest::eq:
1899     case BoolTest::le:
1900       beqz(op1, L, is_far);
1901       break;
1902     case BoolTest::ne:
1903     case BoolTest::gt:
1904       bnez(op1, L, is_far);
1905       break;
1906     default:
1907       ShouldNotReachHere();
1908   }
1909 }
1910 
1911 void C2_MacroAssembler::enc_cmpEqNe_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
1912   switch (cmpFlag) {
1913     case BoolTest::eq:
1914       beqz(op1, L, is_far);
1915       break;
1916     case BoolTest::ne:
1917       bnez(op1, L, is_far);
1918       break;
1919     default:
1920       ShouldNotReachHere();
1921   }
1922 }
1923 
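     // Conditional move: branch around the move using the negated condition, so that
     // dst is overwritten with src only when the original condition holds.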
1924 void C2_MacroAssembler::enc_cmove(int cmpFlag, Register op1, Register op2, Register dst, Register src) {
1925   Label L;
1926   cmp_branch(cmpFlag ^ (1 << neg_cond_bits), op1, op2, L);
1927   mv(dst, src);
1928   bind(L);
1929 }
1930 
1931 // Set dst to NaN if any NaN input.
1932 void C2_MacroAssembler::minmax_fp(FloatRegister dst, FloatRegister src1, FloatRegister src2,
1933                                   bool is_double, bool is_min) {
1934   assert_different_registers(dst, src1, src2);
1935 
1936   Label Done, Compare;
1937 
1938   is_double ? fclass_d(t0, src1)
1939             : fclass_s(t0, src1);
1940   is_double ? fclass_d(t1, src2)
1941             : fclass_s(t1, src2);
1942   orr(t0, t0, t1);
1943   andi(t0, t0, fclass_mask::nan); // if src1 or src2 is quiet or signaling NaN then return NaN
1944   beqz(t0, Compare);
1945   is_double ? fadd_d(dst, src1, src2)
1946             : fadd_s(dst, src1, src2);
1947   j(Done);
1948 
1949   bind(Compare);
1950   if (is_double) {
1951     is_min ? fmin_d(dst, src1, src2)
1952            : fmax_d(dst, src1, src2);
1953   } else {
1954     is_min ? fmin_s(dst, src1, src2)
1955            : fmax_s(dst, src1, src2);
1956   }
1957 
1958   bind(Done);
1959 }
1960 
1961 // According to the Java SE specification, for floating-point round operations, if
1962 // the input is NaN, +/-infinity, or +/-0, the same input is returned as the
1963 // rounded result; this differs from the behavior of the RISC-V fcvt instructions
1964 // (which round out-of-range values to the nearest max or min value), so special
1965 // handling is needed for NaN, +/-Infinity, and +/-0.
1966 void C2_MacroAssembler::round_double_mode(FloatRegister dst, FloatRegister src, int round_mode,
1967                                           Register tmp1, Register tmp2, Register tmp3) {
1968 
1969   assert_different_registers(dst, src);
1970   assert_different_registers(tmp1, tmp2, tmp3);
1971 
1972   // Set the rounding mode for the conversions.
1973   // We use the same rounding mode for both the double->long and long->double conversions.
1974   // The mode for the long->double conversion would only matter if the long value were not
1975   // representable as a double; since it is the result of a double->long conversion, it is definitely representable.
1976   RoundingMode rm;
1977   switch (round_mode) {
1978     case RoundDoubleModeNode::rmode_ceil:
1979       rm = RoundingMode::rup;
1980       break;
1981     case RoundDoubleModeNode::rmode_floor:
1982       rm = RoundingMode::rdn;
1983       break;
1984     case RoundDoubleModeNode::rmode_rint:
1985       rm = RoundingMode::rne;
1986       break;
1987     default:
1988       ShouldNotReachHere();
1989   }
1990 
1991   // tmp1 - holds the double converted to a long int
1992   // tmp2 - holds the constant used for the comparison
1993   // tmp3 - holds the modified result of the double->long conversion
1994   Label done, bad_val;
1995 
1996   // Conversion from double to long
1997   fcvt_l_d(tmp1, src, rm);
1998 
1999   // Generate constant (tmp2)
2000   // tmp2 = 100...0000
2001   addi(tmp2, zr, 1);
2002   slli(tmp2, tmp2, 63);
2003 
2004   // Prepare the converted long (tmp1).
2005   // When the conversion overflows, the result is either:
2006   // tmp1 = 011...1111 or 100...0000
2007   // Convert either value to: tmp3 = 100...0000
2008   addi(tmp3, tmp1, 1);
2009   andi(tmp3, tmp3, -2);
2010   beq(tmp3, tmp2, bad_val);
2011 
2012   // Conversion from long to double
2013   fcvt_d_l(dst, tmp1, rm);
2014   // Add sign of input value to result for +/- 0 cases
2015   fsgnj_d(dst, dst, src);
2016   j(done);
2017 
2018   // If the conversion overflowed, return src
2019   bind(bad_val);
2020   fmv_d(dst, src);
2021 
2022   bind(done);
2023 }
2024 
2025 // According to the Java SE specification, for floating-point signum operations, if
2026 // the input is NaN or +/-0.0, it should be returned unchanged;
2027 // otherwise return +/-1.0 using the sign of the input.
2028 // one - gives us a floating-point 1.0 (obtained from the matching rule)
2029 // bool is_double - specifies whether single or double precision operations will be used.
2030 void C2_MacroAssembler::signum_fp(FloatRegister dst, FloatRegister one, bool is_double) {
2031   Label done;
2032 
2033   is_double ? fclass_d(t0, dst)
2034             : fclass_s(t0, dst);
2035 
2036   // check if input is -0, +0, signaling NaN or quiet NaN
2037   andi(t0, t0, fclass_mask::zero | fclass_mask::nan);
2038 
2039   bnez(t0, done);
2040 
2041   // use floating-point 1.0 with a sign of input
2042   is_double ? fsgnj_d(dst, one, dst)
2043             : fsgnj_s(dst, one, dst);
2044 
2045   bind(done);
2046 }
2047 
2048 static void float16_to_float_slow_path(C2_MacroAssembler& masm, C2GeneralStub<FloatRegister, Register, Register>& stub) {
2049 #define __ masm.
2050   FloatRegister dst = stub.data<0>();
2051   Register src = stub.data<1>();
2052   Register tmp = stub.data<2>();
2053   __ bind(stub.entry());
2054 
2055   // The following instructions mainly focus on NaN, as riscv fcvt does not
2056   // handle NaN well, but the code also works for Inf at the same time.
2057 
2058   // Construct a 32-bit NaN from the 16-bit NaN;
2059   // the payloads of non-canonical NaNs need to be preserved.
2060   __ mv(tmp, 0x7f800000);
2061   // sign-bit was already set via sign-extension if necessary.
2062   __ slli(t0, src, 13);
2063   __ orr(tmp, t0, tmp);
2064   __ fmv_w_x(dst, tmp);
2065 
2066   __ j(stub.continuation());
2067 #undef __
2068 }
2069 
2070 // j.l.Float.float16ToFloat
2071 void C2_MacroAssembler::float16_to_float(FloatRegister dst, Register src, Register tmp) {
2072   auto stub = C2CodeStub::make<FloatRegister, Register, Register>(dst, src, tmp, 20, float16_to_float_slow_path);
2073 
2074   // On riscv, NaN needs special handling as fcvt does not work in that case.
2075   // On riscv, Inf does not need special handling as fcvt can handle it correctly.
2076   // However, we let the slow path process NaN and Inf at the same time,
2077   // as both of them are rare cases, and having the slow path handle
2078   // only the NaN case would sacrifice the performance of the normal cases,
2079   // i.e. the non-NaN and non-Inf cases.
2080 
2081   // check whether it's a NaN or +/- Inf.
2082   mv(t0, 0x7c00);
2083   andr(tmp, src, t0);
2084   // jump to stub processing NaN and Inf cases.
2085   beq(t0, tmp, stub->entry());
2086 
2087   // non-NaN or non-Inf cases, just use built-in instructions.
2088   fmv_h_x(dst, src);
2089   fcvt_s_h(dst, dst);
2090 
2091   bind(stub->continuation());
2092 }
2093 
2094 static void float_to_float16_slow_path(C2_MacroAssembler& masm, C2GeneralStub<Register, FloatRegister, Register>& stub) {
2095 #define __ masm.
2096   Register dst = stub.data<0>();
2097   FloatRegister src = stub.data<1>();
2098   Register tmp = stub.data<2>();
2099   __ bind(stub.entry());
2100 
2101   __ fmv_x_w(dst, src);
2102 
2103   // preserve the payloads of non-canonical NaNs.
2104   __ srai(dst, dst, 13);
2105   // preserve the sign bit.
2106   __ srai(tmp, dst, 13);
2107   __ slli(tmp, tmp, 10);
2108   __ mv(t0, 0x3ff);
2109   __ orr(tmp, tmp, t0);
2110 
2111   // get the result by merging sign bit and payloads of preserved non-canonical NaNs.
2112   __ andr(dst, dst, tmp);
2113 
2114   __ j(stub.continuation());
2115 #undef __
2116 }
2117 
2118 // j.l.Float.floatToFloat16
2119 void C2_MacroAssembler::float_to_float16(Register dst, FloatRegister src, FloatRegister ftmp, Register xtmp) {
2120   auto stub = C2CodeStub::make<Register, FloatRegister, Register>(dst, src, xtmp, 130, float_to_float16_slow_path);
2121 
2122   // On riscv, NaN needs special handling as fcvt does not work in that case.
2123 
2124   // check whether it's a NaN.
2125   // replace fclass with feq as performance optimization.
2126   feq_s(t0, src, src);
2127   // jump to stub processing NaN cases.
2128   beqz(t0, stub->entry());
2129 
2130   // non-NaN cases, just use built-in instructions.
2131   fcvt_h_s(ftmp, src);
2132   fmv_x_h(dst, ftmp);
2133 
2134   bind(stub->continuation());
2135 }
2136 
2137 static void float16_to_float_v_slow_path(C2_MacroAssembler& masm, C2GeneralStub<VectorRegister, VectorRegister, uint>& stub) {
2138 #define __ masm.
2139   VectorRegister dst = stub.data<0>();
2140   VectorRegister src = stub.data<1>();
2141   uint vector_length = stub.data<2>();
2142   __ bind(stub.entry());
2143 
2144   // The following instructions mainly focus on NaN, as riscv vfwcvt_f_f_v does not
2145   // handle NaN well, but the code also works for Inf at the same time.
2146   //
2147   // Construct 32-bit NaNs from the 16-bit NaNs;
2148   // the payloads of non-canonical NaNs need to be preserved.
2149 
2150   // adjust vector type to 2 * SEW.
2151   __ vsetvli_helper(T_FLOAT, vector_length, Assembler::m1);
2152   // widen and sign-extend src data.
2153   __ vsext_vf2(dst, src, Assembler::v0_t);
2154   __ mv(t0, 0x7f800000);
2155   // sign-bit was already set via sign-extension if necessary.
2156   __ vsll_vi(dst, dst, 13, Assembler::v0_t);
2157   __ vor_vx(dst, dst, t0, Assembler::v0_t);
2158 
2159   __ j(stub.continuation());
2160 #undef __
2161 }
2162 
2163 // j.l.Float.float16ToFloat
2164 void C2_MacroAssembler::float16_to_float_v(VectorRegister dst, VectorRegister src, uint vector_length) {
2165   auto stub = C2CodeStub::make<VectorRegister, VectorRegister, uint>
2166               (dst, src, vector_length, 24, float16_to_float_v_slow_path);
2167   assert_different_registers(dst, src);
2168 
2169   // On riscv, NaN needs special handling as vfwcvt_f_f_v does not work in that case.
2170   // On riscv, Inf does not need special handling as vfwcvt_f_f_v can handle it correctly.
2171   // However, we let the slow path process NaN and Inf at the same time,
2172   // as both of them are rare cases, and having the slow path handle
2173   // only the NaN case would sacrifice the performance of the normal cases,
2174   // i.e. the non-NaN and non-Inf cases.
2175 
2176   vsetvli_helper(BasicType::T_SHORT, vector_length, Assembler::mf2);
2177 
2178   // check whether there is a NaN or +/- Inf.
2179   mv(t0, 0x7c00);
2180   vand_vx(v0, src, t0);
2181   // v0 will be used as mask in slow path.
2182   vmseq_vx(v0, v0, t0);
2183   vcpop_m(t0, v0);
2184 
2185   // For non-NaN or non-Inf cases, just use built-in instructions.
2186   vfwcvt_f_f_v(dst, src);
2187 
2188   // jump to stub processing NaN and Inf cases if there is any of them in the vector-wide.
2189   bnez(t0, stub->entry());
2190 
2191   bind(stub->continuation());
2192 }
2193 
2194 static void float_to_float16_v_slow_path(C2_MacroAssembler& masm,
2195                                          C2GeneralStub<VectorRegister, VectorRegister, VectorRegister>& stub) {
2196 #define __ masm.
2197   VectorRegister dst = stub.data<0>();
2198   VectorRegister src = stub.data<1>();
2199   VectorRegister tmp = stub.data<2>();
2200   __ bind(stub.entry());
2201 
2202   // LMUL is already set to mf2 in float_to_float16_v.
2203 
2204   // preserve the payloads of non-canonical NaNs.
2205   __ vnsra_wi(dst, src, 13, Assembler::v0_t);
2206 
2207   // preserve the sign bit.
2208   __ vnsra_wi(tmp, src, 26, Assembler::v0_t);
2209   __ vsll_vi(tmp, tmp, 10, Assembler::v0_t);
2210   __ mv(t0, 0x3ff);
2211   __ vor_vx(tmp, tmp, t0, Assembler::v0_t);
2212 
2213   // get the result by merging sign bit and payloads of preserved non-canonical NaNs.
2214   __ vand_vv(dst, dst, tmp, Assembler::v0_t);
2215 
2216   __ j(stub.continuation());
2217 #undef __
2218 }
2219 
2220 // j.l.Float.floatToFloat16
2221 void C2_MacroAssembler::float_to_float16_v(VectorRegister dst, VectorRegister src, VectorRegister vtmp,
2222                                            Register tmp, uint vector_length) {
2223   assert_different_registers(dst, src, vtmp);
2224 
2225   auto stub = C2CodeStub::make<VectorRegister, VectorRegister, VectorRegister>
2226               (dst, src, vtmp, 28, float_to_float16_v_slow_path);
2227 
2228   // On riscv, NaN needs special handling as vfncvt_f_f_w does not work in that case.
2229 
2230   vsetvli_helper(BasicType::T_FLOAT, vector_length, Assembler::m1);
2231 
2232   // check whether there is a NaN.
2233   // replace vfclass_v with vmfne_vv as a performance optimization.
2234   vmfne_vv(v0, src, src);
2235   vcpop_m(t0, v0);
2236 
2237   vsetvli_helper(BasicType::T_SHORT, vector_length, Assembler::mf2, tmp);
2238 
2239   // For non-NaN cases, just use built-in instructions.
2240   vfncvt_f_f_w(dst, src);
2241 
2242   // jump to stub processing NaN cases.
2243   bnez(t0, stub->entry());
2244 
2245   bind(stub->continuation());
2246 }
2247 
2248 void C2_MacroAssembler::signum_fp_v(VectorRegister dst, VectorRegister one, BasicType bt, int vlen) {
2249   vsetvli_helper(bt, vlen);
2250 
2251   // check if input is -0, +0, signaling NaN or quiet NaN
2252   vfclass_v(v0, dst);
2253   mv(t0, fclass_mask::zero | fclass_mask::nan);
2254   vand_vx(v0, v0, t0);
2255   vmseq_vi(v0, v0, 0);
2256 
2257   // use floating-point 1.0 with a sign of input
2258   vfsgnj_vv(dst, one, dst, v0_t);
2259 }
2260 
2261 void C2_MacroAssembler::compress_bits_v(Register dst, Register src, Register mask, bool is_long) {
2262   Assembler::SEW sew = is_long ? Assembler::e64 : Assembler::e32;
2263   // intrinsic is enabled when MaxVectorSize >= 16
2264   Assembler::LMUL lmul = is_long ? Assembler::m4 : Assembler::m2;
2265   long len = is_long ? 64 : 32;
2266 
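       // Overall approach: expand each bit of src into one byte, use vcompress.vm with
       // the mask bits in v0 to pack the selected bytes together, and then convert the
       // packed bytes back into a bit vector for the scalar result.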
2267   // load the src data(in bits) to be compressed.
2268   vsetivli(x0, 1, sew, Assembler::m1);
2269   vmv_s_x(v0, src);
2270   // reset the src data(in bytes) to zero.
2271   mv(t0, len);
2272   vsetvli(x0, t0, Assembler::e8, lmul);
2273   vmv_v_i(v4, 0);
2274   // convert the src data from bits to bytes.
2275   vmerge_vim(v4, v4, 1); // v0 as the implicit mask register
2276   // reset the dst data(in bytes) to zero.
2277   vmv_v_i(v8, 0);
2278   // load the mask data(in bits).
2279   vsetivli(x0, 1, sew, Assembler::m1);
2280   vmv_s_x(v0, mask);
2281   // compress the src data(in bytes) to dst(in bytes).
2282   vsetvli(x0, t0, Assembler::e8, lmul);
2283   vcompress_vm(v8, v4, v0);
2284   // convert the dst data from bytes to bits.
2285   vmseq_vi(v0, v8, 1);
2286   // store result back.
2287   vsetivli(x0, 1, sew, Assembler::m1);
2288   vmv_x_s(dst, v0);
2289 }
2290 
2291 void C2_MacroAssembler::compress_bits_i_v(Register dst, Register src, Register mask) {
2292   compress_bits_v(dst, src, mask, /* is_long */ false);
2293 }
2294 
2295 void C2_MacroAssembler::compress_bits_l_v(Register dst, Register src, Register mask) {
2296   compress_bits_v(dst, src, mask, /* is_long */ true);
2297 }
2298 
2299 void C2_MacroAssembler::expand_bits_v(Register dst, Register src, Register mask, bool is_long) {
2300   Assembler::SEW sew = is_long ? Assembler::e64 : Assembler::e32;
2301   // intrinsic is enabled when MaxVectorSize >= 16
2302   Assembler::LMUL lmul = is_long ? Assembler::m4 : Assembler::m2;
2303   long len = is_long ? 64 : 32;
2304 
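       // Overall approach: expand each bit of src into one byte, then for each set mask
       // bit use viota.m/vrgather.vv to pick the next source byte in order, and finally
       // convert the result back into a bit vector.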
2305   // load the src data(in bits) to be expanded.
2306   vsetivli(x0, 1, sew, Assembler::m1);
2307   vmv_s_x(v0, src);
2308   // reset the src data(in bytes) to zero.
2309   mv(t0, len);
2310   vsetvli(x0, t0, Assembler::e8, lmul);
2311   vmv_v_i(v4, 0);
2312   // convert the src data from bits to bytes.
2313   vmerge_vim(v4, v4, 1); // v0 as implicit mask register
2314   // reset the dst data(in bytes) to zero.
2315   vmv_v_i(v12, 0);
2316   // load the mask data(in bits).
2317   vsetivli(x0, 1, sew, Assembler::m1);
2318   vmv_s_x(v0, mask);
2319   // expand the src data(in bytes) to dst(in bytes).
2320   vsetvli(x0, t0, Assembler::e8, lmul);
2321   viota_m(v8, v0);
2322   vrgather_vv(v12, v4, v8, VectorMask::v0_t); // v0 as implicit mask register
2323   // convert the dst data from bytes to bits.
2324   vmseq_vi(v0, v12, 1);
2325   // store result back.
2326   vsetivli(x0, 1, sew, Assembler::m1);
2327   vmv_x_s(dst, v0);
2328 }
2329 
2330 void C2_MacroAssembler::expand_bits_i_v(Register dst, Register src, Register mask) {
2331   expand_bits_v(dst, src, mask, /* is_long */ false);
2332 }
2333 
2334 void C2_MacroAssembler::expand_bits_l_v(Register dst, Register src, Register mask) {
2335   expand_bits_v(dst, src, mask, /* is_long */ true);
2336 }
2337 
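     // Compare 'cnt' elements of a1 and a2 using vector loads. On the first mismatch,
     // branch to DONE with the in-vector index of the mismatch in tmp2 and 'result'
     // left unchanged; otherwise fall through with 'result' set to true.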
2338 void C2_MacroAssembler::element_compare(Register a1, Register a2, Register result, Register cnt, Register tmp1, Register tmp2,
2339                                         VectorRegister vr1, VectorRegister vr2, VectorRegister vrs, bool islatin, Label &DONE) {
2340   Label loop;
2341   Assembler::SEW sew = islatin ? Assembler::e8 : Assembler::e16;
2342 
2343   bind(loop);
2344   vsetvli(tmp1, cnt, sew, Assembler::m2);
2345   vlex_v(vr1, a1, sew);
2346   vlex_v(vr2, a2, sew);
2347   vmsne_vv(vrs, vr1, vr2);
2348   vfirst_m(tmp2, vrs);
2349   bgez(tmp2, DONE);
2350   sub(cnt, cnt, tmp1);
2351   if (!islatin) {
2352     slli(tmp1, tmp1, 1); // get byte counts
2353   }
2354   add(a1, a1, tmp1);
2355   add(a2, a2, tmp1);
2356   bnez(cnt, loop);
2357 
2358   mv(result, true);
2359 }
2360 
2361 void C2_MacroAssembler::string_equals_v(Register a1, Register a2, Register result, Register cnt) {
2362   Label DONE;
2363   Register tmp1 = t0;
2364   Register tmp2 = t1;
2365 
2366   BLOCK_COMMENT("string_equals_v {");
2367 
2368   mv(result, false);
2369 
2370   element_compare(a1, a2, result, cnt, tmp1, tmp2, v2, v4, v2, true, DONE);
2371 
2372   bind(DONE);
2373   BLOCK_COMMENT("} string_equals_v");
2374 }
2375 
2376 // used by C2 ClearArray patterns.
2377 // base: Address of a buffer to be zeroed
2378 // cnt: Count in HeapWords
2379 //
2380 // base, cnt, v4, v5, v6, v7 and t0 are clobbered.
2381 void C2_MacroAssembler::clear_array_v(Register base, Register cnt) {
2382   Label loop;
2383 
2384   // making zero words
2385   vsetvli(t0, cnt, Assembler::e64, Assembler::m4);
2386   vxor_vv(v4, v4, v4);
2387 
2388   bind(loop);
2389   vsetvli(t0, cnt, Assembler::e64, Assembler::m4);
2390   vse64_v(v4, base);
2391   sub(cnt, cnt, t0);
2392   shadd(base, t0, base, t0, 3);
2393   bnez(cnt, loop);
2394 }
2395 
2396 void C2_MacroAssembler::arrays_equals_v(Register a1, Register a2, Register result,
2397                                         Register cnt1, int elem_size) {
2398   Label DONE;
2399   Register tmp1 = t0;
2400   Register tmp2 = t1;
2401   Register cnt2 = tmp2;
2402   int length_offset = arrayOopDesc::length_offset_in_bytes();
2403   int base_offset = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
2404 
2405   BLOCK_COMMENT("arrays_equals_v {");
2406 
2407   // if (a1 == a2), return true
2408   mv(result, true);
2409   beq(a1, a2, DONE);
2410 
2411   mv(result, false);
2412   // if a1 == null or a2 == null, return false
2413   beqz(a1, DONE);
2414   beqz(a2, DONE);
2415   // if (a1.length != a2.length), return false
2416   lwu(cnt1, Address(a1, length_offset));
2417   lwu(cnt2, Address(a2, length_offset));
2418   bne(cnt1, cnt2, DONE);
2419 
2420   la(a1, Address(a1, base_offset));
2421   la(a2, Address(a2, base_offset));
2422 
2423   element_compare(a1, a2, result, cnt1, tmp1, tmp2, v2, v4, v2, elem_size == 1, DONE);
2424 
2425   bind(DONE);
2426 
2427   BLOCK_COMMENT("} arrays_equals_v");
2428 }
2429 
2430 void C2_MacroAssembler::string_compare_v(Register str1, Register str2, Register cnt1, Register cnt2,
2431                                          Register result, Register tmp1, Register tmp2, int encForm) {
2432   Label DIFFERENCE, DONE, L, loop;
2433   bool encLL = encForm == StrIntrinsicNode::LL;
2434   bool encLU = encForm == StrIntrinsicNode::LU;
2435   bool encUL = encForm == StrIntrinsicNode::UL;
2436 
2437   bool str1_isL = encLL || encLU;
2438   bool str2_isL = encLL || encUL;
2439 
2440   int minCharsInWord = encLL ? wordSize : wordSize / 2;
2441 
2442   BLOCK_COMMENT("string_compare {");
2443 
2444   // for Latin strings, 1 byte for 1 character
2445   // for UTF16 strings, 2 bytes for 1 character
2446   if (!str1_isL)
2447     sraiw(cnt1, cnt1, 1);
2448   if (!str2_isL)
2449     sraiw(cnt2, cnt2, 1);
2450 
2451   // if the strings are equal up to the shorter length, return the length difference;
2452   // save the minimum of the string lengths in cnt2.
2453   sub(result, cnt1, cnt2);
2454   bgt(cnt1, cnt2, L);
2455   mv(cnt2, cnt1);
2456   bind(L);
2457 
2458   if (str1_isL == str2_isL) { // LL or UU
2459     element_compare(str1, str2, zr, cnt2, tmp1, tmp2, v2, v4, v2, encLL, DIFFERENCE);
2460     j(DONE);
2461   } else { // LU or UL
2462     Register strL = encLU ? str1 : str2;
2463     Register strU = encLU ? str2 : str1;
2464     VectorRegister vstr1 = encLU ? v8 : v4;
2465     VectorRegister vstr2 = encLU ? v4 : v8;
2466 
2467     bind(loop);
2468     vsetvli(tmp1, cnt2, Assembler::e8, Assembler::m2);
2469     vle8_v(vstr1, strL);
2470     vsetvli(tmp1, cnt2, Assembler::e16, Assembler::m4);
2471     vzext_vf2(vstr2, vstr1);
2472     vle16_v(vstr1, strU);
2473     vmsne_vv(v4, vstr2, vstr1);
2474     vfirst_m(tmp2, v4);
2475     bgez(tmp2, DIFFERENCE);
2476     sub(cnt2, cnt2, tmp1);
2477     add(strL, strL, tmp1);
2478     shadd(strU, tmp1, strU, tmp1, 1);
2479     bnez(cnt2, loop);
2480     j(DONE);
2481   }
2482 
2483   bind(DIFFERENCE);
2484   slli(tmp1, tmp2, 1);
2485   add(str1, str1, str1_isL ? tmp2 : tmp1);
2486   add(str2, str2, str2_isL ? tmp2 : tmp1);
2487   str1_isL ? lbu(tmp1, Address(str1, 0)) : lhu(tmp1, Address(str1, 0));
2488   str2_isL ? lbu(tmp2, Address(str2, 0)) : lhu(tmp2, Address(str2, 0));
2489   sub(result, tmp1, tmp2);
2490 
2491   bind(DONE);
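       BLOCK_COMMENT("} string_compare_v");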
2492 }
2493 
2494 void C2_MacroAssembler::byte_array_inflate_v(Register src, Register dst, Register len, Register tmp) {
2495   Label loop;
2496   assert_different_registers(src, dst, len, tmp, t0);
2497 
2498   BLOCK_COMMENT("byte_array_inflate_v {");
2499   bind(loop);
2500   vsetvli(tmp, len, Assembler::e8, Assembler::m2);
2501   vle8_v(v6, src);
2502   vsetvli(t0, len, Assembler::e16, Assembler::m4);
2503   vzext_vf2(v4, v6);
2504   vse16_v(v4, dst);
2505   sub(len, len, tmp);
2506   add(src, src, tmp);
2507   shadd(dst, tmp, dst, tmp, 1);
2508   bnez(len, loop);
2509   BLOCK_COMMENT("} byte_array_inflate_v");
2510 }
2511 
2512 // Compress char[] array to byte[].
2513 // Intrinsic for java.lang.StringUTF16.compress(char[] src, int srcOff, byte[] dst, int dstOff, int len)
2514 // result: the array length if every element in array can be encoded,
2515 // otherwise, the index of first non-latin1 (> 0xff) character.
2516 void C2_MacroAssembler::char_array_compress_v(Register src, Register dst, Register len,
2517                                               Register result, Register tmp) {
2518   encode_iso_array_v(src, dst, len, result, tmp, false);
2519 }
2520 
2521 // Intrinsic for
2522 //
2523 // - sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray
2524 //     return the number of characters copied.
2525 // - java/lang/StringUTF16.compress
2526 //     return index of non-latin1 character if copy fails, otherwise 'len'.
2527 //
2528 // This version always returns the number of characters copied. A successful
2529 // copy will complete with the post-condition: 'res' == 'len', while an
2530 // unsuccessful copy will exit with the post-condition: 0 <= 'res' < 'len'.
2531 //
2532 // Clobbers: src, dst, len, result, t0
2533 void C2_MacroAssembler::encode_iso_array_v(Register src, Register dst, Register len,
2534                                            Register result, Register tmp, bool ascii) {
2535   Label loop, fail, done;
2536 
2537   BLOCK_COMMENT("encode_iso_array_v {");
2538   mv(result, 0);
2539 
2540   bind(loop);
2541   mv(tmp, ascii ? 0x7f : 0xff);
2542   vsetvli(t0, len, Assembler::e16, Assembler::m2);
2543   vle16_v(v2, src);
2544 
2545   vmsgtu_vx(v1, v2, tmp);
2546   vfirst_m(tmp, v1);
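       // set-before-first: v0 masks the elements before the first too-large char, so
       // only chars that fit are narrowed and stored below.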
2547   vmsbf_m(v0, v1);
2548   // compress char to byte
2549   vsetvli(t0, len, Assembler::e8);
2550   vncvt_x_x_w(v1, v2, Assembler::v0_t);
2551   vse8_v(v1, dst, Assembler::v0_t);
2552 
2553   // fail if char > 0x7f/0xff
2554   bgez(tmp, fail);
2555   add(result, result, t0);
2556   add(dst, dst, t0);
2557   sub(len, len, t0);
2558   shadd(src, t0, src, t0, 1);
2559   bnez(len, loop);
2560   j(done);
2561 
2562   bind(fail);
2563   add(result, result, tmp);
2564 
2565   bind(done);
2566   BLOCK_COMMENT("} encode_iso_array_v");
2567 }
2568 
2569 void C2_MacroAssembler::count_positives_v(Register ary, Register len, Register result, Register tmp) {
2570   Label LOOP, SET_RESULT, DONE;
2571 
2572   BLOCK_COMMENT("count_positives_v {");
2573   assert_different_registers(ary, len, result, tmp);
2574 
2575   mv(result, zr);
2576 
2577   bind(LOOP);
2578   vsetvli(t0, len, Assembler::e8, Assembler::m4);
2579   vle8_v(v4, ary);
2580   vmslt_vx(v4, v4, zr);
2581   vfirst_m(tmp, v4);
2582   bgez(tmp, SET_RESULT);
2583   // if tmp == -1, all bytes are positive
2584   add(result, result, t0);
2585 
2586   sub(len, len, t0);
2587   add(ary, ary, t0);
2588   bnez(len, LOOP);
2589   j(DONE);
2590 
2591   // add remaining positive bytes count
2592   bind(SET_RESULT);
2593   add(result, result, tmp);
2594 
2595   bind(DONE);
2596   BLOCK_COMMENT("} count_positives_v");
2597 }
2598 
2599 void C2_MacroAssembler::string_indexof_char_v(Register str1, Register cnt1,
2600                                               Register ch, Register result,
2601                                               Register tmp1, Register tmp2,
2602                                               bool isL) {
2603   mv(result, zr);
2604 
2605   Label loop, MATCH, DONE;
2606   Assembler::SEW sew = isL ? Assembler::e8 : Assembler::e16;
2607   bind(loop);
2608   vsetvli(tmp1, cnt1, sew, Assembler::m4);
2609   vlex_v(v4, str1, sew);
2610   vmseq_vx(v4, v4, ch);
2611   vfirst_m(tmp2, v4);
2612   bgez(tmp2, MATCH); // if equal, return index
2613 
2614   add(result, result, tmp1);
2615   sub(cnt1, cnt1, tmp1);
2616   if (!isL) slli(tmp1, tmp1, 1);
2617   add(str1, str1, tmp1);
2618   bnez(cnt1, loop);
2619 
2620   mv(result, -1);
2621   j(DONE);
2622 
2623   bind(MATCH);
2624   add(result, result, tmp2);
2625 
2626   bind(DONE);
2627 }
2628 
2629 // Set dst to NaN if any NaN input.
2630 void C2_MacroAssembler::minmax_fp_v(VectorRegister dst, VectorRegister src1, VectorRegister src2,
2631                                     BasicType bt, bool is_min, uint vector_length) {
2632   assert_different_registers(dst, src1, src2);
2633 
2634   vsetvli_helper(bt, vector_length);
2635 
2636   is_min ? vfmin_vv(dst, src1, src2)
2637          : vfmax_vv(dst, src1, src2);
2638 
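       // Propagate NaNs: for lanes where an input is NaN (x != x compares true),
       // adding the input to itself under the mask overwrites dst with NaN.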
2639   vmfne_vv(v0,  src1, src1);
2640   vfadd_vv(dst, src1, src1, Assembler::v0_t);
2641   vmfne_vv(v0,  src2, src2);
2642   vfadd_vv(dst, src2, src2, Assembler::v0_t);
2643 }
2644 
2645 // Set dst to NaN if any NaN input.
2646 // The destination vector register elements corresponding to masked-off elements
2647 // are handled with a mask-undisturbed policy.
2648 void C2_MacroAssembler::minmax_fp_masked_v(VectorRegister dst, VectorRegister src1, VectorRegister src2,
2649                                            VectorRegister vmask, VectorRegister tmp1, VectorRegister tmp2,
2650                                            BasicType bt, bool is_min, uint vector_length) {
2651   assert_different_registers(src1, src2, tmp1, tmp2);
2652   vsetvli_helper(bt, vector_length);
2653 
2654   // Check vector elements of src1 and src2 for NaN.
2655   vmfeq_vv(tmp1, src1, src1);
2656   vmfeq_vv(tmp2, src2, src2);
2657 
2658   vmandn_mm(v0, vmask, tmp1);
2659   vfadd_vv(dst, src1, src1, Assembler::v0_t);
2660   vmandn_mm(v0, vmask, tmp2);
2661   vfadd_vv(dst, src2, src2, Assembler::v0_t);
2662 
2663   vmand_mm(tmp2, tmp1, tmp2);
2664   vmand_mm(v0, vmask, tmp2);
2665   is_min ? vfmin_vv(dst, src1, src2, Assembler::v0_t)
2666          : vfmax_vv(dst, src1, src2, Assembler::v0_t);
2667 }
2668 
2669 // Set dst to NaN if any NaN input.
2670 void C2_MacroAssembler::reduce_minmax_fp_v(FloatRegister dst,
2671                                            FloatRegister src1, VectorRegister src2,
2672                                            VectorRegister tmp1, VectorRegister tmp2,
2673                                            bool is_double, bool is_min, uint vector_length, VectorMask vm) {
2674   assert_different_registers(dst, src1);
2675   assert_different_registers(src2, tmp1, tmp2);
2676 
2677   Label L_done, L_NaN_1, L_NaN_2;
2678   // Set dst to src1 if src1 is NaN
2679   is_double ? feq_d(t0, src1, src1)
2680             : feq_s(t0, src1, src1);
2681   beqz(t0, L_NaN_2);
2682 
2683   vsetvli_helper(is_double ? T_DOUBLE : T_FLOAT, vector_length);
2684   vfmv_s_f(tmp2, src1);
2685 
2686   is_min ? vfredmin_vs(tmp1, src2, tmp2, vm)
2687          : vfredmax_vs(tmp1, src2, tmp2, vm);
2688   vfmv_f_s(dst, tmp1);
2689 
2690   // Checking NaNs in src2
2691   vmfne_vv(tmp1, src2, src2, vm);
2692   vcpop_m(t0, tmp1, vm);
2693   beqz(t0, L_done);
2694 
2695   bind(L_NaN_1);
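       // src2 contains at least one NaN: an unordered sum reduction propagates it
       // into the scalar result.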
2696   vfredusum_vs(tmp1, src2, tmp2, vm);
2697   vfmv_f_s(dst, tmp1);
2698   j(L_done);
2699 
2700   bind(L_NaN_2);
2701   is_double ? fmv_d(dst, src1)
2702             : fmv_s(dst, src1);
2703   bind(L_done);
2704 }
2705 
2706 bool C2_MacroAssembler::in_scratch_emit_size() {
2707   if (ciEnv::current()->task() != nullptr) {
2708     PhaseOutput* phase_output = Compile::current()->output();
2709     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2710       return true;
2711     }
2712   }
2713   return MacroAssembler::in_scratch_emit_size();
2714 }
2715 
2716 void C2_MacroAssembler::reduce_integral_v(Register dst, Register src1,
2717                                           VectorRegister src2, VectorRegister tmp,
2718                                           int opc, BasicType bt, uint vector_length, VectorMask vm) {
2719   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2720   vsetvli_helper(bt, vector_length);
2721   vmv_s_x(tmp, src1);
2722   switch (opc) {
2723     case Op_AddReductionVI:
2724     case Op_AddReductionVL:
2725       vredsum_vs(tmp, src2, tmp, vm);
2726       break;
2727     case Op_AndReductionV:
2728       vredand_vs(tmp, src2, tmp, vm);
2729       break;
2730     case Op_OrReductionV:
2731       vredor_vs(tmp, src2, tmp, vm);
2732       break;
2733     case Op_XorReductionV:
2734       vredxor_vs(tmp, src2, tmp, vm);
2735       break;
2736     case Op_MaxReductionV:
2737       vredmax_vs(tmp, src2, tmp, vm);
2738       break;
2739     case Op_MinReductionV:
2740       vredmin_vs(tmp, src2, tmp, vm);
2741       break;
2742     default:
2743       ShouldNotReachHere();
2744   }
2745   vmv_x_s(dst, tmp);
2746 }
2747 
2748 // Set vl and vtype for full and partial vector operations.
2749 // (vma = mu, vta = tu, vill = false)
2750 void C2_MacroAssembler::vsetvli_helper(BasicType bt, uint vector_length, LMUL vlmul, Register tmp) {
2751   Assembler::SEW sew = Assembler::elemtype_to_sew(bt);
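       // vsetivli encodes the application vector length as a 5-bit unsigned immediate,
       // so it can only be used for lengths up to 31.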
2752   if (vector_length <= 31) {
2753     vsetivli(tmp, vector_length, sew, vlmul);
2754   } else if (vector_length == (MaxVectorSize / type2aelembytes(bt))) {
2755     vsetvli(tmp, x0, sew, vlmul);
2756   } else {
2757     mv(tmp, vector_length);
2758     vsetvli(tmp, tmp, sew, vlmul);
2759   }
2760 }
2761 
2762 void C2_MacroAssembler::compare_integral_v(VectorRegister vd, VectorRegister src1, VectorRegister src2,
2763                                            int cond, BasicType bt, uint vector_length, VectorMask vm) {
2764   assert(is_integral_type(bt), "unsupported element type");
2765   assert(vm == Assembler::v0_t ? vd != v0 : true, "should be different registers");
2766   vsetvli_helper(bt, vector_length);
2767   vmclr_m(vd);
2768   switch (cond) {
2769     case BoolTest::eq: vmseq_vv(vd, src1, src2, vm); break;
2770     case BoolTest::ne: vmsne_vv(vd, src1, src2, vm); break;
2771     case BoolTest::le: vmsle_vv(vd, src1, src2, vm); break;
2772     case BoolTest::ge: vmsge_vv(vd, src1, src2, vm); break;
2773     case BoolTest::lt: vmslt_vv(vd, src1, src2, vm); break;
2774     case BoolTest::gt: vmsgt_vv(vd, src1, src2, vm); break;
2775     case BoolTest::ule: vmsleu_vv(vd, src1, src2, vm); break;
2776     case BoolTest::uge: vmsgeu_vv(vd, src1, src2, vm); break;
2777     case BoolTest::ult: vmsltu_vv(vd, src1, src2, vm); break;
2778     case BoolTest::ugt: vmsgtu_vv(vd, src1, src2, vm); break;
2779     default:
2780       assert(false, "unsupported compare condition");
2781       ShouldNotReachHere();
2782   }
2783 }
2784 
2785 void C2_MacroAssembler::compare_fp_v(VectorRegister vd, VectorRegister src1, VectorRegister src2,
2786                                      int cond, BasicType bt, uint vector_length, VectorMask vm) {
2787   assert(is_floating_point_type(bt), "unsupported element type");
2788   assert(vm == Assembler::v0_t ? vd != v0 : true, "should be different registers");
2789   vsetvli_helper(bt, vector_length);
2790   vmclr_m(vd);
2791   switch (cond) {
2792     case BoolTest::eq: vmfeq_vv(vd, src1, src2, vm); break;
2793     case BoolTest::ne: vmfne_vv(vd, src1, src2, vm); break;
2794     case BoolTest::le: vmfle_vv(vd, src1, src2, vm); break;
2795     case BoolTest::ge: vmfge_vv(vd, src1, src2, vm); break;
2796     case BoolTest::lt: vmflt_vv(vd, src1, src2, vm); break;
2797     case BoolTest::gt: vmfgt_vv(vd, src1, src2, vm); break;
2798     default:
2799       assert(false, "unsupported compare condition");
2800       ShouldNotReachHere();
2801   }
2802 }
2803 
2804 // In Matcher::scalable_predicate_reg_slots,
2805 // we assume each predicate register is one-eighth the size of a
2806 // scalable vector register, i.e. one mask bit per vector byte.
2807 void C2_MacroAssembler::spill_vmask(VectorRegister v, int offset) {
2808   vsetvli_helper(T_BYTE, MaxVectorSize >> 3);
2809   add(t0, sp, offset);
2810   vse8_v(v, t0);
2811 }
2812 
2813 void C2_MacroAssembler::unspill_vmask(VectorRegister v, int offset) {
2814   vsetvli_helper(T_BYTE, MaxVectorSize >> 3);
2815   add(t0, sp, offset);
2816   vle8_v(v, t0);
2817 }
2818 
2819 void C2_MacroAssembler::integer_extend_v(VectorRegister dst, BasicType dst_bt, uint vector_length,
2820                                          VectorRegister src, BasicType src_bt, bool is_signed) {
2821   assert(type2aelembytes(dst_bt) > type2aelembytes(src_bt) && type2aelembytes(dst_bt) <= 8 && type2aelembytes(src_bt) <= 4, "invalid element size");
2822   assert(dst_bt != T_FLOAT && dst_bt != T_DOUBLE && src_bt != T_FLOAT && src_bt != T_DOUBLE, "unsupported element type");
2823   // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#52-vector-operands
2824   // The destination EEW is greater than the source EEW, the source EMUL is at least 1,
2825   // and the overlap is in the highest-numbered part of the destination register group.
2826   // Since LMUL=1, vd and vs cannot be the same.
2827   assert_different_registers(dst, src);
2828 
2829   vsetvli_helper(dst_bt, vector_length);
2830   if (is_signed) {
2831     if (src_bt == T_BYTE) {
2832       switch (dst_bt) {
2833       case T_SHORT:
2834         vsext_vf2(dst, src);
2835         break;
2836       case T_INT:
2837         vsext_vf4(dst, src);
2838         break;
2839       case T_LONG:
2840         vsext_vf8(dst, src);
2841         break;
2842       default:
2843         ShouldNotReachHere();
2844       }
2845     } else if (src_bt == T_SHORT) {
2846       if (dst_bt == T_INT) {
2847         vsext_vf2(dst, src);
2848       } else {
2849         vsext_vf4(dst, src);
2850       }
2851     } else if (src_bt == T_INT) {
2852       vsext_vf2(dst, src);
2853     }
2854   } else {
2855     if (src_bt == T_BYTE) {
2856       switch (dst_bt) {
2857       case T_SHORT:
2858         vzext_vf2(dst, src);
2859         break;
2860       case T_INT:
2861         vzext_vf4(dst, src);
2862         break;
2863       case T_LONG:
2864         vzext_vf8(dst, src);
2865         break;
2866       default:
2867         ShouldNotReachHere();
2868       }
2869     } else if (src_bt == T_SHORT) {
2870       if (dst_bt == T_INT) {
2871         vzext_vf2(dst, src);
2872       } else {
2873         vzext_vf4(dst, src);
2874       }
2875     } else if (src_bt == T_INT) {
2876       vzext_vf2(dst, src);
2877     }
2878   }
2879 }
2880 
2881 // Vector narrow from src to dst with specified element sizes.
2882 // High part of dst vector will be filled with zero.
2883 void C2_MacroAssembler::integer_narrow_v(VectorRegister dst, BasicType dst_bt, uint vector_length,
2884                                          VectorRegister src, BasicType src_bt) {
2885   assert(type2aelembytes(dst_bt) < type2aelembytes(src_bt) && type2aelembytes(dst_bt) <= 4 && type2aelembytes(src_bt) <= 8, "invalid element size");
2886   assert(dst_bt != T_FLOAT && dst_bt != T_DOUBLE && src_bt != T_FLOAT && src_bt != T_DOUBLE, "unsupported element type");
2887   mv(t0, vector_length);
2888   if (src_bt == T_LONG) {
2889     // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#117-vector-narrowing-integer-right-shift-instructions
2890     // Future extensions might add support for versions that narrow to a destination that is 1/4 the width of the source.
2891     // So we can currently only scale down by 1/2 the width at a time.
2892     vsetvli(t0, t0, Assembler::e32, Assembler::mf2);
2893     vncvt_x_x_w(dst, src);
2894     if (dst_bt == T_SHORT || dst_bt == T_BYTE) {
2895       vsetvli(t0, t0, Assembler::e16, Assembler::mf2);
2896       vncvt_x_x_w(dst, dst);
2897       if (dst_bt == T_BYTE) {
2898         vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
2899         vncvt_x_x_w(dst, dst);
2900       }
2901     }
2902   } else if (src_bt == T_INT) {
2903     // T_SHORT
2904     vsetvli(t0, t0, Assembler::e16, Assembler::mf2);
2905     vncvt_x_x_w(dst, src);
2906     if (dst_bt == T_BYTE) {
2907       vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
2908       vncvt_x_x_w(dst, dst);
2909     }
2910   } else if (src_bt == T_SHORT) {
2911     vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
2912     vncvt_x_x_w(dst, src);
2913   }
2914 }
2915 
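     // This zeroes dst first and converts only the lanes where src is not NaN, so NaN
     // inputs yield 0 (matching Java's float->integer conversion of NaN).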
2916 #define VFCVT_SAFE(VFLOATCVT)                                                      \
2917 void C2_MacroAssembler::VFLOATCVT##_safe(VectorRegister dst, VectorRegister src) { \
2918   assert_different_registers(dst, src);                                            \
2919   vxor_vv(dst, dst, dst);                                                          \
2920   vmfeq_vv(v0, src, src);                                                          \
2921   VFLOATCVT(dst, src, Assembler::v0_t);                                            \
2922 }
2923 
2924 VFCVT_SAFE(vfcvt_rtz_x_f_v);
2925 
2926 #undef VFCVT_SAFE
2927 
2928 // Extract a scalar element from a vector at position 'idx'.
2929 // The input elements in src are expected to be of integral type.
2930 void C2_MacroAssembler::extract_v(Register dst, VectorRegister src, BasicType bt,
2931                                   int idx, VectorRegister tmp) {
2932   assert(is_integral_type(bt), "unsupported element type");
2933   assert(idx >= 0, "idx cannot be negative");
2934   // Only need the first element after vector slidedown
2935   vsetvli_helper(bt, 1);
2936   if (idx == 0) {
2937     vmv_x_s(dst, src);
2938   } else if (idx <= 31) {
2939     vslidedown_vi(tmp, src, idx);
2940     vmv_x_s(dst, tmp);
2941   } else {
2942     mv(t0, idx);
2943     vslidedown_vx(tmp, src, t0);
2944     vmv_x_s(dst, tmp);
2945   }
2946 }
2947 
2948 // Extract a scalar element from a vector at position 'idx'.
2949 // The input elements in src are expected to be of floating point type.
2950 void C2_MacroAssembler::extract_fp_v(FloatRegister dst, VectorRegister src, BasicType bt,
2951                                      int idx, VectorRegister tmp) {
2952   assert(is_floating_point_type(bt), "unsupported element type");
2953   assert(idx >= 0, "idx cannot be negative");
2954   // Only need the first element after vector slidedown
2955   vsetvli_helper(bt, 1);
2956   if (idx == 0) {
2957     vfmv_f_s(dst, src);
2958   } else if (idx <= 31) {
2959     vslidedown_vi(tmp, src, idx);
2960     vfmv_f_s(dst, tmp);
2961   } else {
2962     mv(t0, idx);
2963     vslidedown_vx(tmp, src, t0);
2964     vfmv_f_s(dst, tmp);
2965   }
2966 }